bblean 0.6.0b2__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cp312-win_amd64.pyd +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1850 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b2.dist-info/METADATA +288 -0
- bblean-0.6.0b2.dist-info/RECORD +31 -0
- bblean-0.6.0b2.dist-info/WHEEL +5 -0
- bblean-0.6.0b2.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b2.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b2.dist-info/top_level.txt +1 -0
bblean/_legacy/bb_uint8.py
@@ -0,0 +1,1144 @@
# type: ignore
# BitBIRCH is an open-source clustering module based on iSIM
#
# Please, cite the BitBIRCH paper: https://doi.org/10.1039/D5DD00030K
#
# BitBIRCH is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# BitBIRCH is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# BitBIRCH License: GPL-3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
#
# Memory-efficient BitBIRCH authors: Ramon Alain Miranda Quintana <ramirandaq@gmail.com>, <quintana@chem.ufl.edu>
#                                    Krizstina Zsigmond <kzsigmond@ufl.edu>
#
### Part of the tree-management code was derived from https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
### Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
###          Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
###          Joel Nothman <joel.nothman@gmail.com>
### License: BSD 3 clause
# Parts of the BitBIRCH algorithm were previously released under the LGPL-3.0 license by:
# Ramon Alain Miranda Quintana <ramirandaq@gmail.com>, <quintana@chem.ufl.edu>
# Vicky (Vic) Jung <jungvicky@ufl.edu>
# Kenneth Lopez Perez <klopezperez@chem.ufl.edu>
# Kate Huddleston <kdavis2@chem.ufl.edu>
from pathlib import Path

import numpy as np
from scipy import sparse

def safe_sum(nmax, np1, np2):
    if nmax >= 4294967294:
        return np1.astype("uint64") + np2.astype("uint64")
    elif nmax >= 65534:
        return np1.astype("uint32") + np2.astype("uint32")
    elif nmax >= 254:
        return np1.astype("uint16") + np2.astype("uint16")
    else:
        return np1 + np2

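safe_sum promotes the accumulator dtype before adding two linear sums: the cutoffs sit just below the uint8, uint16 and uint32 maxima (255, 65535 and 4294967295), so a sum over nmax binary fingerprints cannot wrap around. A minimal sketch of the promotion (illustrative, not part of the packaged file):

    import numpy as np

    a = np.full(2048, 200, dtype=np.uint8)
    b = np.full(2048, 100, dtype=np.uint8)
    s = safe_sum(300, a, b)       # 300 > 254, so the sum is carried out in uint16
    assert s.dtype == np.uint16 and int(s[0]) == 300   # no uint8 wrap-around
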
def _copy_or_unpack(x, n_features, input_is_packed: bool = True):
    return unpack_fingerprints(x, n_features) if input_is_packed else x.copy()


def unpack_fingerprints(a, n_features: int):
    """Unpacks bit-packed uint8 arrays into 0/1 arrays of shape (..., n_features)"""
    # n_features is required to discard padded zeros if it is not a multiple of 8
    return np.unpackbits(a, axis=-1, count=n_features)

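Packed fingerprints store eight bits per uint8 byte; count=n_features drops the padding bits that np.packbits adds when the fingerprint length is not a multiple of 8. A round-trip sketch (illustrative):

    import numpy as np

    fps = np.random.default_rng(0).integers(0, 2, size=(4, 10), dtype=np.uint8)
    packed = np.packbits(fps, axis=-1)              # shape (4, 2): 16 bit slots
    restored = unpack_fingerprints(packed, n_features=10)
    assert restored.shape == (4, 10) and (restored == fps).all()
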
# Utility function to validate the n_features argument for packed inputs
def _validate_n_features(X, input_is_packed: bool, n_features: int | None) -> int:
    if input_is_packed:
        if n_features is None:
            raise ValueError("n_features is required for packed inputs")
        return n_features

    x_n_features = X.shape[1]
    if n_features is not None:
        if n_features != x_n_features:
            raise ValueError(
                "n_features is redundant for non-packed inputs;"
                " if passed, it must be equal to X.shape[1]."
                f" For passed X, X.shape[1] = {X.shape[1]}."
                " If this value is not what you expected,"
                " make sure the passed X is actually unpacked."
            )
    return x_n_features

def set_merge(merge_criterion, tolerance=0.05):
    """
    Sets the merge_accept function used by merge_subcluster, based on the
    user-specified merge criterion.

    Radius: merge subclusters based on comparison to the centroid of the cluster
    Diameter: merge subclusters based on the instant (iSIM) Tanimoto similarity of the cluster
    Tolerance: applies a tolerance threshold to the diameter criterion, which
               merges subclusters with a stricter threshold for newly added molecules

    Parameters:
    -----------
    merge_criterion: str
        radius, diameter or tolerance
    tolerance: float
        penalty applied to the similarity threshold when calling the tolerance criteria

    Returns:
    --------
    merge_accept(): function
        returns True if the nominee cluster should be merged under the chosen criterion
    """
    if merge_criterion == "radius":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_sim = jt_isim(new_ls + new_centroid, new_n + 1) * (new_n + 1) - jt_isim(
                new_ls, new_n
            ) * (new_n - 1)
            return jt_sim >= threshold * 2

    elif merge_criterion == "diameter":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            return jt_radius >= threshold

    elif merge_criterion == "tolerance_tough":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            else:
                if old_n == 1 and nom_n == 1:
                    return True
                elif nom_n == 1:
                    return (
                        jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                        - jt_isim(old_ls, old_n) * (old_n - 1)
                    ) / 2 >= jt_isim(old_ls, old_n) - tolerance and (
                        jt_radius >= threshold
                    )
                else:
                    return (
                        jt_isim(old_ls + nom_ls, old_n + nom_n)
                        * (old_n + nom_n)
                        * (old_n + nom_n - 1)
                        - jt_isim(old_ls, old_n) * old_n * (old_n - 1)
                        - jt_isim(nom_ls, nom_n) * nom_n * (nom_n - 1)
                    ) / (2 * old_n * nom_n) >= jt_isim(old_ls, old_n) - tolerance and (
                        jt_radius >= threshold
                    )

    elif merge_criterion in ["tolerance", "tolerance-legacy"]:

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            else:
                if old_n == 1 and nom_n == 1:
                    return True
                elif nom_n == 1:
                    return (
                        jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                        - jt_isim(old_ls, old_n) * (old_n - 1)
                    ) / 2 >= jt_isim(old_ls, old_n) - tolerance and (
                        jt_radius >= threshold
                    )
                else:
                    return True

    else:
        # An unknown criterion would otherwise silently leave merge_accept
        # undefined until fitting fails with a NameError.
        raise ValueError(f"Unknown merge_criterion: {merge_criterion!r}")

    globals()["merge_accept"] = merge_accept

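Note that set_merge installs merge_accept into the module globals rather than returning it; _BFSubcluster.merge_subcluster resolves the name at call time, so set_merge must run once before any fitting. A usage sketch (illustrative; assuming this import path for the installed wheel):

    from bblean._legacy import bb_uint8 as bb

    bb.set_merge("diameter")     # installs bb.merge_accept as a module global
    brc = bb.BitBirch(threshold=0.65, branching_factor=50)
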
def jt_isim(c_total, n_objects):
    """iSIM Tanimoto calculation

    https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b

    Parameters
    ----------
    c_total : np.ndarray
        Sum of the elements column-wise

    n_objects : int
        Number of elements

    Returns
    -------
    isim : float
        iSIM Jaccard-Tanimoto value
    """
    x = c_total.astype("uint64")
    sum_kq = np.sum(x)
    sum_kqsq = np.dot(x, x)
    a = (sum_kqsq - sum_kq) / 2
    return a / (a + n_objects * sum_kq - sum_kqsq)

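For binary data the iSIM value equals the average pairwise Tanimoto of the set, computed from the column sums alone. A small worked check (illustrative): for rows [1,1,0], [1,0,1], [1,1,1] the three pairwise Tanimotos are 1/3, 2/3 and 2/3, whose mean is 5/9.

    import numpy as np

    fps = np.array([[1, 1, 0], [1, 0, 1], [1, 1, 1]], dtype=np.uint8)
    isim = jt_isim(fps.sum(axis=0), 3)   # only needs column sums and the count
    assert abs(isim - 5 / 9) < 1e-12
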
def max_separation(Y):
    """Finds two objects in Y that are very separated

    This is an approximation (not guaranteed to find
    the two absolutely most separated objects), but it is
    a very robust O(N) implementation. Quality of clustering
    does not diminish in the end.

    Algorithm:
    a) Find the centroid of Y
    b) mol1 is the molecule most distant from the centroid
    c) mol2 is the molecule most distant from mol1

    Returns
    -------
    (mol1, mol2) : (int, int)
        indices of mol1 and mol2
    sims_mol1 : np.ndarray
        Similarities to mol1
    sims_mol2 : np.ndarray
        Similarities to mol2
    These are used as node1_dist and node2_dist in _split_node; note that
    they are similarities, not distances.
    """
    # Get the centroid of the set
    X = Y.astype("uint64")
    n_samples = len(X)
    linear_sum = np.sum(X, axis=0)
    centroid = calc_centroid(linear_sum, n_samples)

    # Get the similarity of each molecule to the centroid
    pop_counts = np.sum(X, axis=1)
    a_centroid = np.dot(X, centroid)
    sims_med = a_centroid / (pop_counts + np.sum(centroid) - a_centroid)

    # Get the least similar molecule to the centroid
    mol1 = np.argmin(sims_med)

    # Get the similarity of each molecule to mol1
    a_mol1 = np.dot(X, X[mol1])
    sims_mol1 = a_mol1 / (pop_counts + pop_counts[mol1] - a_mol1)

    # Get the least similar molecule to mol1
    mol2 = np.argmin(sims_mol1)

    # Get the similarity of each molecule to mol2
    a_mol2 = np.dot(X, X[mol2])
    sims_mol2 = a_mol2 / (pop_counts + pop_counts[mol2] - a_mol2)

    return (mol1, mol2), sims_mol1, sims_mol2

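A quick sketch of the two-pass heuristic on three tiny fingerprints (illustrative): the fully disjoint pair is recovered without ever forming the full pairwise similarity matrix.

    import numpy as np

    fps = np.array([[1, 1, 1, 0], [1, 1, 0, 0], [0, 0, 1, 1]], dtype=np.uint8)
    (mol1, mol2), sims1, sims2 = max_separation(fps)
    assert (mol1, mol2) == (2, 1)        # [0,0,1,1] vs [1,1,0,0], Tanimoto 0
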
def calc_centroid(linear_sum, n_samples):
    """Calculates centroid

    Parameters
    ----------
    linear_sum : np.ndarray
        Sum of the elements column-wise
    n_samples : int
        Number of samples

    Returns
    -------
    centroid : np.ndarray
        Centroid fingerprint of the given set
    """
    cent = np.where(linear_sum >= n_samples * 0.5, 1, 0)
    return cent.astype("bool")

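The centroid is a majority vote: bit j is set when at least half of the members set it. For example (illustrative):

    import numpy as np

    linear_sum = np.array([5, 3, 2, 0])   # column sums over 5 fingerprints
    assert calc_centroid(linear_sum, 5).tolist() == [True, True, False, False]
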
def _iterate_sparse_X(X):
    """This little hack returns a densified row when iterating over a sparse
    matrix, instead of constructing a sparse matrix for every row, which is
    expensive.
    """
    n_samples, n_features = X.shape
    X_indices = X.indices
    X_data = X.data
    X_indptr = X.indptr

    for i in range(n_samples):
        row = np.zeros(n_features)
        startptr, endptr = X_indptr[i], X_indptr[i + 1]
        nonzero_indices = X_indices[startptr:endptr]
        row[nonzero_indices] = X_data[startptr:endptr]
        yield row

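A sketch of the generator on a CSR matrix (illustrative): each row comes back as a dense 1-D array, so the fitting loops below can treat sparse and dense inputs uniformly.

    import numpy as np
    from scipy import sparse

    X = sparse.csr_matrix(np.eye(3, dtype=np.uint8))
    rows = list(_iterate_sparse_X(X))
    assert rows[1].tolist() == [0.0, 1.0, 0.0]
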
def _split_node(node, threshold, branching_factor):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.
    """
    new_subcluster1 = _BFSubcluster()
    new_subcluster2 = _BFSubcluster()
    new_node1 = _BFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_node2 = _BFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    if node.is_leaf:
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    # O(N) implementation of max separation
    farthest_idx, node1_dist, node2_dist = max_separation(node.centroids_)
    # Notice that max_separation is returning similarities and not distances
    node1_closer = node1_dist > node2_dist
    # Make sure node1 is closest to itself even if all distances are equal.
    # This can only happen when all node.centroids_ are duplicates leading to all
    # distances between centroids being zero.
    node1_closer[farthest_idx[0]] = True

    for idx, subcluster in enumerate(node.subclusters_):
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
            # if not singly:
            #     subcluster.parent_ = new_subcluster1
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
            # if not singly:
            #     subcluster.parent_ = new_subcluster2
    return new_subcluster1, new_subcluster2

class _BFNode:
    """Each node in a BFTree is called a BFNode.

    The BFNode can have a maximum of branching_factor
    number of BFSubclusters.

    Parameters
    ----------
    threshold : float
        Threshold needed for a new subcluster to enter a BFSubcluster.

    branching_factor : int
        Maximum number of BF subclusters in each node.

    is_leaf : bool
        We need to know if the BFNode is a leaf or not, in order to
        retrieve the final subclusters.

    n_features : int
        The number of features.

    Attributes
    ----------
    subclusters_ : list
        List of subclusters for a particular BFNode.

    prev_leaf_ : _BFNode
        Useful only if is_leaf is True.

    next_leaf_ : _BFNode
        Useful only if is_leaf is True.

    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Manipulate ``init_centroids_`` throughout rather than ``centroids_``
        since the centroids are just a view of the ``init_centroids_``.

    centroids_ : ndarray of shape (branching_factor + 1, n_features)
        View of ``init_centroids_``.
    """

    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.is_leaf = is_leaf
        self.n_features = n_features

        # The list of subclusters, centroids and squared norms
        # to manipulate throughout.
        self.subclusters_ = []
        self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
        self.prev_leaf_ = None
        self.next_leaf_ = None

    def append_subcluster(self, subcluster):
        n_samples = len(self.subclusters_)
        self.subclusters_.append(subcluster)
        self.init_centroids_[n_samples] = subcluster.centroid_

        # Keep centroids_ as a view; updating init_centroids_ is then sufficient.
        self.centroids_ = self.init_centroids_[: n_samples + 1, :]

    def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
        """Remove a subcluster from a node and update it with the
        split subclusters.
        """
        ind = self.subclusters_.index(subcluster)
        self.subclusters_[ind] = new_subcluster1
        self.init_centroids_[ind] = new_subcluster1.centroid_
        self.centroids_[ind] = new_subcluster1.centroid_
        self.append_subcluster(new_subcluster2)

    def insert_bf_subcluster(self, subcluster, set_bits):
        """Insert a new subcluster into the node."""
        if not self.subclusters_:
            self.append_subcluster(subcluster)
            return False

        threshold = self.threshold
        branching_factor = self.branching_factor
        # We need to find the closest subcluster among all the
        # subclusters so that we can insert our new subcluster.
        sub_centroids = self.centroids_.astype("uint16")
        in_centroid = subcluster.centroid_.astype("uint16")
        a = np.dot(sub_centroids, in_centroid)
        sim_matrix = a / (np.sum(sub_centroids, axis=1) + set_bits - a)
        closest_index = np.argmax(sim_matrix)
        closest_subcluster = self.subclusters_[closest_index]

        # If the subcluster has a child, we need a recursive strategy.
        if closest_subcluster.child_ is not None:
            split_child = closest_subcluster.child_.insert_bf_subcluster(
                subcluster, set_bits
            )

            if not split_child:
                # If it is determined that the child need not be split, we
                # can just update the closest_subcluster
                closest_subcluster.update(subcluster)
                self.init_centroids_[closest_index] = self.subclusters_[
                    closest_index
                ].centroid_
                self.centroids_[closest_index] = self.subclusters_[
                    closest_index
                ].centroid_
                return False

            # things not too good. we need to redistribute the subclusters in
            # our child node, and add a new subcluster in the parent
            # subcluster to accommodate the new child.
            else:
                new_subcluster1, new_subcluster2 = _split_node(
                    closest_subcluster.child_, threshold, branching_factor
                )
                self.update_split_subclusters(
                    closest_subcluster, new_subcluster1, new_subcluster2
                )

                if len(self.subclusters_) > self.branching_factor:
                    return True
                return False

        # good to go!
        else:
            merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
            if merged:
                self.centroids_[closest_index] = closest_subcluster.centroid_
                self.init_centroids_[closest_index] = closest_subcluster.centroid_

                return False

            # not close to any other subclusters, and we still
            # have space, so add.
            elif len(self.subclusters_) < self.branching_factor:
                self.append_subcluster(subcluster)

                return False

            # We do not have enough space nor is it closer to an
            # other subcluster. We need to split.
            else:
                self.append_subcluster(subcluster)
                return True

class _BFSubcluster:
    """Each subcluster in a BFNode is called a BFSubcluster.

    A BFSubcluster can have a BFNode as its child.

    Parameters
    ----------
    linear_sum : ndarray of shape (n_features,), default=None
        Sample. This is kept optional to allow initialization of empty
        subclusters.

    Attributes
    ----------
    n_samples_ : int
        Number of samples that belong to each subcluster.

    linear_sum_ : ndarray
        Linear sum of all the samples in a subcluster. Prevents holding
        all sample data in memory.

    centroid_ : ndarray of shape (n_features,)
        Centroid of the subcluster. Prevents recomputing of centroids when
        ``BFNode.centroids_`` is called.

    mol_indices : list, default=[]
        List of indices of molecules included in the given cluster.

    child_ : _BFNode
        Child node of the subcluster. Once a given _BFNode is set as the child
        of the _BFNode, it is set to ``self.child_``.
    """

    def __init__(self, *, linear_sum=None, mol_indices=[]):
        if linear_sum is None:
            self.n_samples_ = 0
            self.centroid_ = self.linear_sum_ = np.zeros((2048,), dtype="bool")
            self.mol_indices = []
        else:
            self.n_samples_ = 1
            self.centroid_ = self.linear_sum_ = linear_sum
            self.mol_indices = mol_indices

        self.child_ = None
        self.parent_ = None

    def update(self, subcluster):
        self.n_samples_ += np.uint64(subcluster.n_samples_)
        self.linear_sum_ = safe_sum(
            self.n_samples_, self.linear_sum_, subcluster.linear_sum_
        )
        self.mol_indices += subcluster.mol_indices
        self.centroid_ = calc_centroid(self.linear_sum_, self.n_samples_)

    def merge_subcluster(self, nominee_cluster, threshold):
        """Check if a cluster is worthy enough to be merged. If
        yes then merge.
        """
        new_n = np.uint64(self.n_samples_) + np.uint64(nominee_cluster.n_samples_)
        new_ls = safe_sum(new_n, self.linear_sum_, nominee_cluster.linear_sum_)
        new_centroid = calc_centroid(new_ls, new_n)

        if merge_accept(
            threshold,
            new_ls,
            new_centroid,
            new_n,
            self.linear_sum_,
            nominee_cluster.linear_sum_,
            self.n_samples_,
            nominee_cluster.n_samples_,
        ):
            (self.n_samples_, self.linear_sum_, self.centroid_, self.mol_indices) = (
                new_n,
                new_ls,
                new_centroid,
                self.mol_indices + nominee_cluster.mol_indices,
            )
            return True
        return False

class BitBirch:
    """Implements the BitBIRCH clustering algorithm.

    BitBIRCH paper: https://doi.org/10.1039/D5DD00030K

    Memory- and time-efficient, online-learning algorithm.
    It constructs a tree data structure with the cluster centroids being read off the leaves.

    Parameters
    ----------
    threshold : float, default=0.5
        The similarity radius of the subcluster obtained by merging a new sample and the
        closest subcluster should be greater than the threshold. Otherwise a new
        subcluster is started. Setting this value to be very low promotes
        splitting and vice-versa.

    branching_factor : int, default=50
        Maximum number of BF subclusters in each node. If a new sample enters
        such that the number of subclusters exceeds the branching_factor then
        that node is split into two nodes with the subclusters redistributed
        in each. The parent subcluster of that node is removed and two new
        subclusters are added as parents of the two split nodes.

    Attributes
    ----------
    root_ : _BFNode
        Root of the BFTree.

    dummy_leaf_ : _BFNode
        Start pointer to all the leaves.

    subcluster_centers_ : ndarray
        Centroids of all subclusters read directly from the leaves.

    Notes
    -----
    The tree data structure consists of nodes with each node consisting of
    a number of subclusters. The maximum number of subclusters in a node
    is determined by the branching factor. Each subcluster maintains a
    linear sum, mol_indices and the number of samples in that subcluster.
    In addition, each subcluster can also have a node as its child, if the
    subcluster is not a member of a leaf node.

    For a new point entering the root, it is merged with the subcluster closest
    to it and the linear sum, mol_indices and the number of samples of that
    subcluster are updated. This is done recursively till the properties of
    the leaf node are updated.
    """

    def __init__(
        self,
        *,
        threshold=0.5,
        branching_factor=50,
    ):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.index_tracker = 0
        self.first_call = True

    def fit(
        self,
        X,
        store_centroids: bool = False,
        input_is_packed: bool = True,
        n_features: int | None = None,
        max_fps: int | None = None,
    ):
        """
        Build a BF Tree for the input data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features),
            or a Path to a .npy file, which will be memory-mapped.
            Input data.
        store_centroids : bool, default=False
            If True, collect ``subcluster_centers_`` from the leaves after fitting.
        input_is_packed : bool, default=True
            Whether the rows of X are bit-packed (see unpack_fingerprints).
        n_features : int, optional
            Number of features; required when the input is packed.
        max_fps : int, optional
            If given, only the first max_fps fingerprints are used.

        Returns
        -------
        self
            Fitted estimator.
        """
        if isinstance(X, Path):
            X = np.load(X, mmap_mode="r")[:max_fps]
        else:
            X = X[:max_fps]
        threshold = self.threshold
        branching_factor = self.branching_factor
        n_features = _validate_n_features(X, input_is_packed, n_features)
        d_type = X.dtype

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if self.first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample in iter_func(X):
            unpack = _copy_or_unpack(sample, n_features, input_is_packed)
            set_bits = np.sum(unpack.astype("uint64"))
            subcluster = _BFSubcluster(
                linear_sum=unpack, mol_indices=[self.index_tracker]
            )
            split = self.root_.insert_bf_subcluster(subcluster, set_bits)

            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _BFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=d_type,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

            self.index_tracker += 1
        if store_centroids:
            centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
            self.subcluster_centers_ = centroids
            self._n_features_out = self.subcluster_centers_.shape[0]

        self.first_call = False
        return self

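A minimal end-to-end sketch (illustrative; the threshold value is arbitrary). set_merge must run first so that merge_accept exists when fit starts merging:

    import numpy as np

    set_merge("diameter")
    fps = np.random.default_rng(42).integers(0, 2, size=(100, 2048), dtype=np.uint8)

    brc = BitBirch(threshold=0.65, branching_factor=50)
    brc.fit(np.packbits(fps, axis=-1), input_is_packed=True, n_features=2048)
    clusters = brc.get_cluster_mol_ids()   # lists of molecule indices, largest first
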
    def fit_np(self, X):
        """Build or extend the tree from rows of [linear_sum..., n_samples]
        (the serialized format produced by bf_to_np).
        """
        threshold = self.threshold
        branching_factor = self.branching_factor

        n_features = X.shape[1] - 1
        d_type = X.dtype

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if self.first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample in iter_func(X):
            sample_copy = sample.copy()
            subcluster = _BFSubcluster(
                linear_sum=sample_copy[:-1], mol_indices=[self.index_tracker]
            )
            n_samples = sample_copy[-1]
            if n_samples > 1:
                subcluster.n_samples_ = n_samples
                subcluster.centroid_ = calc_centroid(sample_copy[:-1], n_samples)
            set_bits = np.sum(subcluster.centroid_.astype("uint64"))
            split = self.root_.insert_bf_subcluster(subcluster, set_bits)

            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _BFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=d_type,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

            self.index_tracker += 1

        self.first_call = False
        return self

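The fit_np variants consume the serialized format produced by bf_to_np/bf_to_np_refine: each row is a linear sum with the cluster size appended as the last column. A sketch for a single fresh fingerprint (illustrative):

    import numpy as np

    set_merge("diameter")
    fp = np.array([1, 0, 1, 1], dtype=np.uint8)
    row = np.append(fp, 1).reshape(1, -1).astype(np.uint8)   # count column = 1
    BitBirch(threshold=0.65).fit_np(row)
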
    def fit_np_reinsert(self, X, reinsert_indices):
        threshold = self.threshold
        branching_factor = self.branching_factor

        n_features = X.shape[1] - 1
        d_type = X.dtype

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if self.first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample, mol_inds in zip(iter_func(X), reinsert_indices):
            sample_copy = sample.copy()
            subcluster = _BFSubcluster(
                linear_sum=sample_copy[:-1], mol_indices=mol_inds
            )
            n_samples = sample_copy[-1]
            if n_samples > 1:
                subcluster.n_samples_ = n_samples
                subcluster.centroid_ = calc_centroid(sample_copy[:-1], n_samples)
            set_bits = np.sum(subcluster.centroid_.astype("uint64"))
            split = self.root_.insert_bf_subcluster(subcluster, set_bits)

            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _BFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=d_type,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

        self.first_call = False
        return self

    def fit_reinsert(
        self,
        X,
        reinsert_indices,
        store_centroids: bool = False,
        input_is_packed: bool = True,
        n_features: int | None = None,
    ):
        """X corresponds only to the molecules that will be reinserted into the
        tree; reinsert_indices are the indices of those molecules.
        """
        threshold = self.threshold
        branching_factor = self.branching_factor
        n_features = _validate_n_features(X, input_is_packed, n_features)
        d_type = X.dtype

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if self.first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample, mol_ind in zip(iter_func(X), reinsert_indices):
            unpack = _copy_or_unpack(sample, n_features, input_is_packed)
            set_bits = np.sum(unpack.astype("uint64"))
            subcluster = _BFSubcluster(linear_sum=unpack, mol_indices=[mol_ind])
            split = self.root_.insert_bf_subcluster(subcluster, set_bits)
            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _BFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=d_type,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

        if store_centroids:
            centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
            self.subcluster_centers_ = centroids
            self._n_features_out = self.subcluster_centers_.shape[0]

        self.first_call = False
        return self

    def _get_leaves(self):
        """
        Retrieve the leaves of the BF Node.

        Returns
        -------
        leaves : list of shape (n_leaves,)
            List of the leaf nodes.
        """
        leaf_ptr = self.dummy_leaf_.next_leaf_
        leaves = []
        while leaf_ptr is not None:
            leaves.append(leaf_ptr)
            leaf_ptr = leaf_ptr.next_leaf_
        return leaves

    def get_centroids_mol_ids(self):
        """Method to return a dictionary containing the centroids and mol indices of the leaves"""
        if self.first_call:
            raise ValueError("The model has not been fitted yet.")

        centroids = []
        mol_ids = []
        for leaf in self._get_leaves():
            for subcluster in leaf.subclusters_:
                centroids.append(subcluster.centroid_)
                mol_ids.append(subcluster.mol_indices)

        dict_centroids_mol_ids = {"centroids": centroids, "mol_ids": mol_ids}

        return dict_centroids_mol_ids

    def get_centroids(self):
        """Method to return a list of Numpy arrays containing the centroids' fingerprints"""
        if self.first_call:
            raise ValueError("The model has not been fitted yet.")

        centroids = []
        for leaf in self._get_leaves():
            for subcluster in leaf.subclusters_:
                centroids.append(subcluster.centroid_)

        return centroids

    def get_cluster_mol_ids(self):
        """Method to return the indices of molecules in each cluster"""
        if self.first_call:
            raise ValueError("The model has not been fitted yet.")

        clusters_mol_id = []
        for leaf in self._get_leaves():
            for subcluster in leaf.subclusters_:
                clusters_mol_id.append(subcluster.mol_indices)

        # Sort the clusters by the number of samples in the cluster
        clusters_mol_id = sorted(clusters_mol_id, key=lambda x: len(x), reverse=True)

        return clusters_mol_id

    def _get_BFs(self):
        """Method to return the BitFeatures of the leaves"""
        if self.first_call:
            raise ValueError("The model has not been fitted yet.")

        BFs = []
        for leaf in self._get_leaves():
            for subcluster in leaf.subclusters_:
                BFs.append(subcluster)

        # Sort the BitFeatures by the number of samples in the cluster
        BFs = sorted(BFs, key=lambda x: x.n_samples_, reverse=True)

        return BFs

    def bf_to_np_refine(
        self,
        fps,
        initial_mol=0,
        input_is_packed: bool = True,
        n_features: int | None = None,
    ):
        """Method to prepare the BitFeatures for refinement: the largest cluster
        is decomposed back into its individual molecules (read from fps), while
        the rest of the clusters are kept as aggregated features.
        """
        if self.first_call:
            raise ValueError("The model has not been fitted yet.")
        n_features = _validate_n_features(fps, input_is_packed, n_features)
        BFs = self._get_BFs()
        big, rest = BFs[0], BFs[1:]

        fp_64 = []
        fp_32 = []
        fp_16 = []
        fp_8 = []

        mols_64 = []
        mols_32 = []
        mols_16 = []
        mols_8 = []

        for BF in rest:
            if BF.n_samples_ >= 4294967294:
                # Append the count as a trailing column, as in the smaller buckets
                fp_64.append(list(BF.linear_sum_))
                fp_64[-1].append(BF.n_samples_)
                mols_64.append(BF.mol_indices)
            elif BF.n_samples_ >= 65534:
                fp_32.append(list(BF.linear_sum_))
                fp_32[-1].append(BF.n_samples_)
                mols_32.append(BF.mol_indices)
            elif BF.n_samples_ >= 254:
                fp_16.append(list(BF.linear_sum_))
                fp_16[-1].append(BF.n_samples_)
                mols_16.append(BF.mol_indices)
            else:
                fp_8.append(list(BF.linear_sum_))
                fp_8[-1].append(BF.n_samples_)
                mols_8.append(BF.mol_indices)

        for mol in big.mol_indices:
            if input_is_packed:
                fp_8.append(
                    list(unpack_fingerprints(fps[mol - initial_mol], n_features))
                )
            else:
                fp_8.append(list(fps[mol - initial_mol]))
            fp_8[-1].append(1)
            # Each reinserted molecule becomes its own single-molecule feature
            mols_8.append([mol])

        fps_bfs = []
        mols_bfs = []
        if len(fp_64) != 0:
            fp_64 = np.array(fp_64, dtype=np.uint64)
            fps_bfs.append(fp_64)
            mols_bfs.append(mols_64)
        if len(fp_32) != 0:
            fp_32 = np.array(fp_32, dtype=np.uint32)
            fps_bfs.append(fp_32)
            mols_bfs.append(mols_32)
        if len(fp_16) != 0:
            fp_16 = np.array(fp_16, dtype=np.uint16)
            fps_bfs.append(fp_16)
            mols_bfs.append(mols_16)
        if len(fp_8) != 0:
            fp_8 = np.array(fp_8, dtype=np.uint8)
            fps_bfs.append(fp_8)
            mols_bfs.append(mols_8)

        return fps_bfs, mols_bfs

    def bf_to_np(self):
        """Method to convert the BitFeatures of all leaf clusters into numpy
        arrays, bucketed by the unsigned dtype needed to hold each cluster's
        counts; each row is [linear_sum..., n_samples].
        """
        if self.first_call:
            raise ValueError("The model has not been fitted yet.")

        BFs = self._get_BFs()

        fp_64 = []
        fp_32 = []
        fp_16 = []
        fp_8 = []

        mols_64 = []
        mols_32 = []
        mols_16 = []
        mols_8 = []

        for BF in BFs:
            if BF.n_samples_ >= 4294967294:
                # Append the count as a trailing column, as in the smaller buckets
                fp_64.append(list(BF.linear_sum_))
                fp_64[-1].append(BF.n_samples_)
                mols_64.append(BF.mol_indices)
            elif BF.n_samples_ >= 65534:
                fp_32.append(list(BF.linear_sum_))
                fp_32[-1].append(BF.n_samples_)
                mols_32.append(BF.mol_indices)
            elif BF.n_samples_ >= 254:
                fp_16.append(list(BF.linear_sum_))
                fp_16[-1].append(BF.n_samples_)
                mols_16.append(BF.mol_indices)
            else:
                fp_8.append(list(BF.linear_sum_))
                fp_8[-1].append(BF.n_samples_)
                mols_8.append(BF.mol_indices)

        fps_bfs = []
        mols_bfs = []
        if len(fp_64) != 0:
            fp_64 = np.array(fp_64, dtype=np.uint64)
            fps_bfs.append(fp_64)
            mols_bfs.append(mols_64)
        if len(fp_32) != 0:
            fp_32 = np.array(fp_32, dtype=np.uint32)
            fps_bfs.append(fp_32)
            mols_bfs.append(mols_32)
        if len(fp_16) != 0:
            fp_16 = np.array(fp_16, dtype=np.uint16)
            fps_bfs.append(fp_16)
            mols_bfs.append(mols_16)
        if len(fp_8) != 0:
            fp_8 = np.array(fp_8, dtype=np.uint8)
            fps_bfs.append(fp_8)
            mols_bfs.append(mols_8)

        return fps_bfs, mols_bfs

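Together, bf_to_np_refine and fit_np_reinsert support multi-round clustering: serialize the leaves of one tree, then rebuild a new tree from the aggregated features plus the singled-out members of the largest cluster. A rough sketch (illustrative; continuing the earlier examples, with set_merge already called and fps the unpacked array):

    import numpy as np

    packed_fps = np.packbits(fps, axis=-1)
    first = BitBirch(threshold=0.65)
    first.fit(packed_fps, n_features=2048)
    fps_bfs, mols_bfs = first.bf_to_np_refine(packed_fps, n_features=2048)

    refined = BitBirch(threshold=0.65)
    for arr, mols in zip(fps_bfs, mols_bfs):
        refined.fit_np_reinsert(arr, mols)   # rows carry counts, mols carry indices
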
    def get_assignments(self, n_mols):
        """Return a 1-based cluster label for each of the n_mols fitted
        molecules, with label 1 corresponding to the largest cluster.
        """
        clustered_ids = self.get_cluster_mol_ids()

        assignments = np.full(n_mols, -1, dtype=int)
        for i, cluster in enumerate(clustered_ids):
            assignments[cluster] = i + 1

        # Check that there are no unassigned molecules
        assert np.all(assignments != -1)

        return assignments
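Usage sketch (illustrative), continuing from the fitted brc above; the returned vector is index-aligned with the fitted fingerprints:

    labels = brc.get_assignments(n_mols=100)   # labels[i] is the cluster of molecule i
    assert labels.min() >= 1                   # labels are 1-based, largest cluster = 1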