bblean 0.6.0b2__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cp311-win_amd64.pyd +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1850 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b2.dist-info/METADATA +288 -0
- bblean-0.6.0b2.dist-info/RECORD +31 -0
- bblean-0.6.0b2.dist-info/WHEEL +5 -0
- bblean-0.6.0b2.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b2.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1252 @@
|
|
|
1
|
+
# type: ignore
|
|
2
|
+
# BitBIRCH is an open-source clustering module based on iSIM
|
|
3
|
+
#
|
|
4
|
+
# Please, cite the BitBIRCH paper: https://www.biorxiv.org/content/10.1101/2024.08.10.607459v1
|
|
5
|
+
#
|
|
6
|
+
# BitBIRCH is free software; you can redistribute it and/or modify
|
|
7
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
|
8
|
+
# the Free Software Foundation, version 3.
|
|
9
|
+
#
|
|
10
|
+
# BitBIRCH is distributed in the hope that it will be useful,
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
+
# GNU Lesser General Public License for more details.
|
|
14
|
+
#
|
|
15
|
+
# BitBIRCH authors (PYTHON): Ramon Alain Miranda Quintana <ramirandaq@gmail.com>, <quintana@chem.ufl.edu>
|
|
16
|
+
# Vicky (Vic) Jung <jungvicky@ufl.edu>
|
|
17
|
+
# Kenneth Lopez Perez <klopezperez@chem.ufl.edu>
|
|
18
|
+
# Kate Huddleston <kdavis2@chem.ufl.edu>
|
|
19
|
+
#
|
|
20
|
+
# BitBIRCH License: LGPL-3.0 https://www.gnu.org/licenses/lgpl-3.0.en.html#license-text
|
|
21
|
+
#
|
|
22
|
+
### Part of the tree-management code was derived from https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
|
|
23
|
+
### Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
|
|
24
|
+
### Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
|
25
|
+
### Joel Nothman <joel.nothman@gmail.com>
|
|
26
|
+
### License: BSD 3 clause
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
from scipy import sparse
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Utility function to validate the n_features argument (packed inputs are rejected by this int64 implementation)
|
|
34
|
+
def _validate_n_features(X, input_is_packed: bool, n_features: int | None) -> int:
|
|
35
|
+
if input_is_packed:
|
|
36
|
+
raise ValueError("Packed inputs not supported for BitBirch-int64")
|
|
37
|
+
|
|
38
|
+
x_n_features = X.shape[1]
|
|
39
|
+
if n_features is not None:
|
|
40
|
+
if n_features != x_n_features:
|
|
41
|
+
raise ValueError(
|
|
42
|
+
"n_features is redundant for non-packed inputs"
|
|
43
|
+
" if passed, it must be equal to X.shape[1]."
|
|
44
|
+
f" For passed X, X.shape[1] = {X.shape[1]}."
|
|
45
|
+
" If this value is not what you expected,"
|
|
46
|
+
" make sure the passed X is actually unpacked."
|
|
47
|
+
)
|
|
48
|
+
return x_n_features
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def set_merge(merge_criterion, tolerance=0.05):
    """
    Install the module-level ``merge_accept`` function used by ``merge_subcluster``.

    The chosen function is stored in this module's globals under the name
    ``merge_accept``; nothing is returned.

    Every ``merge_accept`` has the signature
    ``(threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n)``
    where ``new_*`` describe the would-be merged cluster, ``old_*`` the
    existing cluster and ``nom_*`` the nominee being added.

    Criteria
    --------
    radius
        Accept when ``jt_isim(new_ls + new_centroid, new_n + 1) * (new_n + 1)
        - jt_isim(new_ls, new_n) * (new_n - 1) >= 2 * threshold``.
    diameter
        Accept when ``jt_isim(new_ls, new_n) >= threshold``.
    tolerance / tolerance-legacy
        Diameter test first; then, when a single object (``nom_n == 1``) is
        added to a cluster with more than one member, additionally require
        its contribution to stay within ``tolerance`` of the old cluster's
        iSIM. Merges of larger nominees are accepted on the diameter test alone.
    tolerance_tough
        Like ``tolerance``, but the extra tolerance test is also applied when
        the nominee has more than one member.

    Parameters:
    -----------
    merge_criterion: str
        One of ``"radius"``, ``"diameter"``, ``"tolerance"``,
        ``"tolerance-legacy"`` or ``"tolerance_tough"``.
    tolerance: float
        Penalty subtracted from the old cluster's iSIM in the tolerance
        criteria. Ignored by ``radius`` and ``diameter``.

    Raises:
    -------
    ValueError
        If ``merge_criterion`` is not one of the names listed above
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if merge_criterion == "radius":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_sim = jt_isim(new_ls + new_centroid, new_n + 1) * (new_n + 1) - jt_isim(
                new_ls, new_n
            ) * (new_n - 1)
            return jt_sim >= threshold * 2

    elif merge_criterion == "diameter":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            return jt_radius >= threshold

    elif merge_criterion == "tolerance_tough":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            if old_n == 1 and nom_n == 1:
                # Two singletons: the diameter test is the whole story.
                return True
            if nom_n == 1:
                return (
                    jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                    - jt_isim(old_ls, old_n) * (old_n - 1)
                ) / 2 >= jt_isim(old_ls, old_n) - tolerance and (
                    jt_radius >= threshold
                )
            return (
                jt_isim(old_ls + nom_ls, old_n + nom_n)
                * (old_n + nom_n)
                * (old_n + nom_n - 1)
                - jt_isim(old_ls, old_n) * old_n * (old_n - 1)
                - jt_isim(nom_ls, nom_n) * nom_n * (nom_n - 1)
            ) / (2 * old_n * nom_n) >= jt_isim(old_ls, old_n) - tolerance and (
                jt_radius >= threshold
            )

    elif merge_criterion in ["tolerance", "tolerance-legacy"]:

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            if old_n == 1 and nom_n == 1:
                return True
            if nom_n == 1:
                return (
                    jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                    - jt_isim(old_ls, old_n) * (old_n - 1)
                ) / 2 >= jt_isim(old_ls, old_n) - tolerance and (
                    jt_radius >= threshold
                )
            return True

    else:
        # Fail loudly and descriptively instead of tripping over an unbound
        # local when the name below is stored.
        raise ValueError(
            f"Unknown merge_criterion {merge_criterion!r}; expected one of"
            " 'radius', 'diameter', 'tolerance', 'tolerance-legacy',"
            " 'tolerance_tough'"
        )

    globals()["merge_accept"] = merge_accept
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def jt_isim(c_total, n_objects):
    """iSIM Jaccard-Tanimoto value computed from column-wise totals.

    https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b

    Parameters
    ----------
    c_total : np.ndarray
        Sum of the elements column-wise

    n_objects : int
        Number of elements

    Returns
    ----------
    isim : float
        ``a / (a + n_objects * sum(c_total) - dot(c_total, c_total))`` with
        ``a = (dot(c_total, c_total) - sum(c_total)) / 2``.
    """
    column_sum = np.sum(c_total)
    column_sq_sum = np.dot(c_total, c_total)
    numerator = (column_sq_sum - column_sum) / 2
    denominator = numerator + n_objects * column_sum - column_sq_sum
    return numerator / denominator
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def max_separation(X):
    """Pick two rows of X that are far apart (approximate farthest pair).

    The search is a heuristic built from three matrix-vector products and is
    not guaranteed to return the two absolutely most separated rows.

    Algorithm:
    a) Compute the centroid of X
    b) mol1 is the row least similar to the centroid
    c) mol2 is the row least similar to mol1

    Returns
    -------
    (mol1, mol2) : (int, int)
        indices of mol1 and mol2
    sims_mol1 : np.ndarray
        Similarity of every row to mol1
    sims_mol2 : np.ndarray
        Similarity of every row to mol2

    Note that the two arrays hold similarities, not distances; they are what
    _split_node compares to assign subclusters to either side.
    """
    n_rows = len(X)
    centroid = calc_centroid(np.sum(X, axis=0), n_rows)

    # Number of set bits per row, reused by every similarity below.
    bits_per_row = np.sum(X, axis=1)

    def _similarity_to(reference, reference_bits):
        # shared / (bits_row + bits_reference - shared) for every row of X
        shared = np.dot(X, reference)
        return shared / (bits_per_row + reference_bits - shared)

    sims_centroid = _similarity_to(centroid, np.sum(centroid))
    mol1 = np.argmin(sims_centroid)

    sims_mol1 = _similarity_to(X[mol1], bits_per_row[mol1])
    mol2 = np.argmin(sims_mol1)

    sims_mol2 = _similarity_to(X[mol2], bits_per_row[mol2])

    return (mol1, mol2), sims_mol1, sims_mol2
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def calc_centroid(linear_sum, n_samples):
    """Majority-vote centroid of a set described by its column sums.

    Parameters
    ----------

    linear_sum : np.ndarray
        Sum of the elements column-wise
    n_samples : int
        Number of samples

    Returns
    -------
    centroid : np.ndarray
        1 where ``linear_sum >= n_samples * 0.5`` (ties count as 1), else 0.
    """
    half = n_samples * 0.5
    is_majority = linear_sum >= half
    return np.where(is_majority, 1, 0)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _iterate_sparse_X(X):
    """Yield each row of a sparse matrix as a freshly allocated dense array.

    Reads the ``indices`` / ``data`` / ``indptr`` buffers of X directly, which
    avoids building a one-row sparse matrix per iteration.
    """
    n_rows, n_cols = X.shape
    col_index = X.indices
    values = X.data
    row_ptr = X.indptr

    for row_no in range(n_rows):
        lo = row_ptr[row_no]
        hi = row_ptr[row_no + 1]
        dense = np.zeros(n_cols)
        dense[col_index[lo:hi]] = values[lo:hi]
        yield dense
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _split_node(node, threshold, branching_factor, singly):
    """Split a full node into two nodes, each wrapped in a new parent subcluster.

    Steps:
    1. Create two empty subclusters and two empty nodes (same leaf flag,
       feature count and centroid dtype as ``node``) and link each node as the
       child of one subcluster.
    2. If ``node`` is a leaf, splice the two new nodes into the leaf linked
       list in its place.
    3. Pick two far-apart centroids with ``max_separation`` and send every
       subcluster of ``node`` to the side it is more similar to.
    4. When ``singly`` is falsy, point each moved subcluster's ``parent_`` at
       the new subcluster that now owns it.

    Returns the two new parent subclusters.
    """

    def _empty_like_node():
        return _BFNode(
            threshold=threshold,
            branching_factor=branching_factor,
            is_leaf=node.is_leaf,
            n_features=node.n_features,
            dtype=node.init_centroids_.dtype,
        )

    first_parent, second_parent = _BFSubcluster(), _BFSubcluster()
    first_node, second_node = _empty_like_node(), _empty_like_node()
    first_parent.child_ = first_node
    second_parent.child_ = second_node

    if node.is_leaf:
        before, after = node.prev_leaf_, node.next_leaf_
        if before is not None:
            before.next_leaf_ = first_node
        first_node.prev_leaf_ = before
        first_node.next_leaf_ = second_node
        second_node.prev_leaf_ = first_node
        second_node.next_leaf_ = after
        if after is not None:
            after.prev_leaf_ = second_node

    # max_separation returns similarities (not distances), hence the ">".
    (seed_first, _seed_second), sim_first, sim_second = max_separation(node.centroids_)
    goes_first = sim_first > sim_second
    # The first seed must land on its own side even when every similarity
    # ties (e.g. all centroids are duplicates).
    goes_first[seed_first] = True

    for to_first, member in zip(goes_first, node.subclusters_):
        if to_first:
            dest_node, dest_parent = first_node, first_parent
        else:
            dest_node, dest_parent = second_node, second_parent
        dest_node.append_subcluster(member)
        dest_parent.update(member)
        if not singly:
            member.parent_ = dest_parent

    return first_parent, second_parent
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
class _BFNode:
    """A node of the BF tree, holding at most ``branching_factor`` subclusters
    (one extra slot is reserved so an overflowing node can exist until split).

    Parameters
    ----------
    threshold : float
        Threshold passed to ``merge_subcluster`` when a new subcluster tries
        to enter an existing one.

    branching_factor : int
        Maximum number of BF subclusters in each node.

    is_leaf : bool
        Whether this node is a leaf; leaves are chained through
        ``prev_leaf_`` / ``next_leaf_``.

    n_features : int
        The number of features.

    dtype
        dtype of the centroid buffer.

    Attributes
    ----------
    subclusters_ : list
        Subclusters stored in this node.

    prev_leaf_ : _BFNode
        Previous leaf. Useful only if is_leaf is True.

    next_leaf_ : _BFNode
        Next leaf. Useful only if is_leaf is True.

    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Backing buffer for the centroids.

    centroids_ : ndarray
        View of the first ``len(subclusters_)`` rows of ``init_centroids_``.
        Only exists once a subcluster has been appended.
    """

    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.is_leaf = is_leaf
        self.n_features = n_features

        # Subclusters plus the pre-allocated centroid buffer.
        self.subclusters_ = []
        self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)

        # Leaf-chain links; filled in by whoever wires the leaves together.
        self.prev_leaf_ = None
        self.next_leaf_ = None

    def append_subcluster(self, subcluster):
        """Store ``subcluster`` in the next free slot and refresh the view."""
        slot = len(self.subclusters_)
        self.subclusters_.append(subcluster)
        self.init_centroids_[slot] = subcluster.centroid_

        # centroids_ is a view, so later writes to init_centroids_ show through.
        self.centroids_ = self.init_centroids_[: slot + 1, :]

    def update_split_subclusters(
        self, subcluster, new_subcluster1, new_subcluster2, singly
    ):
        """Replace ``subcluster`` with ``new_subcluster1`` in place and append
        ``new_subcluster2``.
        """
        if not singly:
            # Both halves inherit the parent recorded on this node's first entry.
            inherited_parent = self.subclusters_[0].parent_
            new_subcluster1.parent_ = inherited_parent
            new_subcluster2.parent_ = inherited_parent

        position = self.subclusters_.index(subcluster)
        self.subclusters_[position] = new_subcluster1
        self.init_centroids_[position] = new_subcluster1.centroid_
        self.centroids_[position] = new_subcluster1.centroid_
        self.append_subcluster(new_subcluster2)

    def insert_bf_subcluster(self, subcluster, set_bits, ps, singly):
        """Insert a new subcluster into the node.

        ``set_bits`` is the number of set bits of ``subcluster.centroid_`` and
        ``ps`` the parent subcluster to record when ``singly`` is falsy.

        Returns True when this node now needs to be split by the caller,
        False otherwise.
        """
        if not self.subclusters_:
            self.append_subcluster(subcluster)
            return False

        # Similarity of the newcomer to every stored centroid:
        # shared / (bits_centroid + bits_new - shared).
        shared = np.dot(self.centroids_, subcluster.centroid_)
        similarities = shared / (np.sum(self.centroids_, axis=1) + set_bits - shared)
        best = np.argmax(similarities)
        nearest = self.subclusters_[best]

        if nearest.child_ is not None:
            # Internal entry: descend, with ``nearest`` acting as the parent.
            child_overflowed = nearest.child_.insert_bf_subcluster(
                subcluster, set_bits, nearest, singly
            )

            if not child_overflowed:
                # The child absorbed it; fold the newcomer into this entry.
                nearest.update(subcluster)
                self.init_centroids_[best] = nearest.centroid_
                self.centroids_[best] = nearest.centroid_
                return False

            # The child is over capacity: split it and make room here for
            # the extra parent entry.
            first_half, second_half = _split_node(
                nearest.child_, self.threshold, self.branching_factor, singly
            )
            self.update_split_subclusters(nearest, first_half, second_half, singly)
            return len(self.subclusters_) > self.branching_factor

        # Leaf-level entry: try to merge into the nearest subcluster.
        if nearest.merge_subcluster(subcluster, self.threshold):
            self.centroids_[best] = nearest.centroid_
            self.init_centroids_[best] = nearest.centroid_
            if not singly:
                nearest.parent_ = ps
            return False

        # Not merged: store it as a new entry. If there was no room left the
        # node now holds branching_factor + 1 entries and must be split.
        had_room = len(self.subclusters_) < self.branching_factor
        self.append_subcluster(subcluster)
        if not had_room:
            return True
        if not singly:
            # NOTE(review): this sets parent_ on the nearest existing entry,
            # not on the subcluster just appended — confirm this is intended.
            nearest.parent_ = ps
        return False
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
class _BFSubcluster:
    """Each subcluster in a BFNode is called a BFSubcluster.

    A BFSubcluster can have a BFNode as its child.

    Parameters
    ----------
    linear_sum : ndarray of shape (n_features,), default=None
        Sample. This is kept optional to allow initialization of empty
        subclusters.

    mol_indices : list, default=None
        Indices of the molecules in the subcluster. Only used when
        ``linear_sum`` is given; a passed list is stored by reference, and
        ``None`` yields a fresh empty list per instance.

    Attributes
    ----------
    n_samples_ : int
        Number of samples that belong to each subcluster.

    linear_sum_ : ndarray
        Linear sum of all the samples in a subcluster. Prevents holding
        all sample data in memory. It is the integer 0 for an empty subcluster.

    centroid_ : ndarray
        Centroid of the subcluster, as produced by ``calc_centroid`` (for a
        freshly created single-sample subcluster it is the sample itself).

    mol_indices : list
        List of indices of molecules included in the given cluster.

    child_ : _BFNode
        Child node of the subcluster, or None.

    parent_ : _BFSubcluster
        Parent subcluster, or None when not tracked.
    """

    def __init__(self, *, linear_sum=None, mol_indices=None):
        if linear_sum is None:
            self.n_samples_ = 0
            self.centroid_ = self.linear_sum_ = 0
            self.mol_indices = []
        else:
            self.n_samples_ = 1
            self.centroid_ = self.linear_sum_ = linear_sum
            # A shared mutable default would leak indices between instances,
            # so the empty list is created per instance.
            self.mol_indices = [] if mol_indices is None else mol_indices

        self.child_ = None
        self.parent_ = None

    def update(self, subcluster):
        """Absorb ``subcluster`` unconditionally and recompute the centroid."""
        self.n_samples_ += subcluster.n_samples_
        self.linear_sum_ += subcluster.linear_sum_
        self.mol_indices += subcluster.mol_indices
        self.centroid_ = calc_centroid(self.linear_sum_, self.n_samples_)

    def merge_subcluster(self, nominee_cluster, threshold):
        """Merge ``nominee_cluster`` into this one if ``merge_accept`` allows it.

        ``merge_accept`` is the module-level function installed by
        ``set_merge``. Returns True when the merge happened, False when this
        subcluster was left untouched.
        """
        new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
        new_n = self.n_samples_ + nominee_cluster.n_samples_
        new_centroid = calc_centroid(new_ls, new_n)

        if merge_accept(
            threshold,
            new_ls,
            new_centroid,
            new_n,
            self.linear_sum_,
            nominee_cluster.linear_sum_,
            self.n_samples_,
            nominee_cluster.n_samples_,
        ):
            (self.n_samples_, self.linear_sum_, self.centroid_, self.mol_indices) = (
                new_n,
                new_ls,
                new_centroid,
                self.mol_indices + nominee_cluster.mol_indices,
            )
            return True
        return False
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
class BitBirch:
|
|
549
|
+
"""Implements the BitBIRCH clustering algorithm.
|
|
550
|
+
|
|
551
|
+
BitBIRCH paper:
|
|
552
|
+
|
|
553
|
+
Memory- and time-efficient, online-learning algorithm.
|
|
554
|
+
It constructs a tree data structure with the cluster centroids being read off the leaf.
|
|
555
|
+
|
|
556
|
+
Parameters
|
|
557
|
+
----------
|
|
558
|
+
threshold : float, default=0.5
|
|
559
|
+
The similarity radius of the subcluster obtained by merging a new sample and the
|
|
560
|
+
closest subcluster should be greater than the threshold. Otherwise a new
|
|
561
|
+
subcluster is started. Setting this value to be very low promotes
|
|
562
|
+
splitting and vice-versa.
|
|
563
|
+
|
|
564
|
+
branching_factor : int, default=50
|
|
565
|
+
Maximum number of BF subclusters in each node. If a new samples enters
|
|
566
|
+
such that the number of subclusters exceed the branching_factor then
|
|
567
|
+
that node is split into two nodes with the subclusters redistributed
|
|
568
|
+
in each. The parent subcluster of that node is removed and two new
|
|
569
|
+
subclusters are added as parents of the 2 split nodes.
|
|
570
|
+
|
|
571
|
+
Attributes
|
|
572
|
+
----------
|
|
573
|
+
root_ : _BFNode
|
|
574
|
+
Root of the BFTree.
|
|
575
|
+
|
|
576
|
+
dummy_leaf_ : _BFNode
|
|
577
|
+
Start pointer to all the leaves.
|
|
578
|
+
|
|
579
|
+
subcluster_centers_ : ndarray
|
|
580
|
+
Centroids of all subclusters read directly from the leaves.
|
|
581
|
+
|
|
582
|
+
Notes
|
|
583
|
+
-----
|
|
584
|
+
The tree data structure consists of nodes with each node consisting of
|
|
585
|
+
a number of subclusters. The maximum number of subclusters in a node
|
|
586
|
+
is determined by the branching factor. Each subcluster maintains a
|
|
587
|
+
linear sum, mol_indices and the number of samples in that subcluster.
|
|
588
|
+
In addition, each subcluster can also have a node as its child, if the
|
|
589
|
+
subcluster is not a member of a leaf node.
|
|
590
|
+
|
|
591
|
+
For a new point entering the root, it is merged with the subcluster closest
|
|
592
|
+
to it and the linear sum, mol_indices and the number of samples of that
|
|
593
|
+
subcluster are updated. This is done recursively till the properties of
|
|
594
|
+
the leaf node are updated.
|
|
595
|
+
"""
|
|
596
|
+
|
|
597
|
+
def __init__(self, *, threshold=0.5, branching_factor=50):
    """Store the clustering parameters and reset the fitting state.

    ``threshold`` and ``branching_factor`` are keyword-only and are forwarded
    to the tree nodes when fitting. ``index_tracker`` starts at 0 and
    ``first_call`` at True.
    """
    # User-facing parameters.
    self.threshold = threshold
    self.branching_factor = branching_factor

    # Fitting state.
    self.index_tracker = 0
    self.first_call = True
|
|
607
|
+
|
|
608
|
+
def fit(
    self,
    X,
    singly=True,
    store_centroids=False,
    input_is_packed: bool = False,
    n_features: int | None = None,
    max_fps: int | None = None,
):
    """
    Build a BF Tree for the input data.

    On the first call a new tree is created; later calls keep inserting into
    the existing tree.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features), or Path
        Input data. A ``Path`` is opened with ``np.load(..., mmap_mode="r")``.
    singly : bool
        Forwarded to the tree insertion and split routines; when falsy the
        ``parent_`` links of subclusters are maintained.
    store_centroids : bool
        When truthy, the leaf centroids are gathered into
        ``subcluster_centers_`` after insertion.
    input_is_packed, n_features
        Checked by ``_validate_n_features``.
    max_fps : int or None
        Only the first ``max_fps`` rows of X are used (all rows when None).

    Returns
    -------
    self
        Fitted estimator.
    """
    source = np.load(X, mmap_mode="r") if isinstance(X, Path) else X
    X = source[:max_fps]

    sim_threshold = self.threshold
    max_entries = self.branching_factor
    n_features = _validate_n_features(X, input_is_packed, n_features)
    fp_dtype = X.dtype

    # Arguments shared by every node created in this call.
    node_kwargs = dict(
        threshold=sim_threshold,
        branching_factor=max_entries,
        n_features=n_features,
        dtype=fp_dtype,
    )

    if self.first_call:
        # The tree starts as a single leaf that is also the root.
        self.root_ = _BFNode(is_leaf=True, **node_kwargs)

        # Sentinel in front of the leaf chain, used to walk the leaves.
        self.dummy_leaf_ = _BFNode(is_leaf=True, **node_kwargs)
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    # Sparse input is densified row by row; anything else is iterated as is.
    row_iter = _iterate_sparse_X if sparse.issparse(X) else iter

    for fingerprint in row_iter(X):
        on_bits = np.sum(fingerprint)
        candidate = _BFSubcluster(
            linear_sum=fingerprint, mol_indices=[self.index_tracker]
        )
        root_overflowed = self.root_.insert_bf_subcluster(
            candidate, on_bits, candidate.parent_, singly
        )

        if root_overflowed:
            # Grow the tree by one level: split the root and hang both
            # halves under a new non-leaf root.
            half_a, half_b = _split_node(
                self.root_, sim_threshold, max_entries, singly
            )
            del self.root_
            self.root_ = _BFNode(is_leaf=False, **node_kwargs)
            self.root_.append_subcluster(half_a)
            self.root_.append_subcluster(half_b)

            if not singly:
                for half in (half_a, half_b):
                    for member in half.child_.subclusters_:
                        member.parent_ = half

        self.index_tracker += 1

    if store_centroids:
        self.subcluster_centers_ = np.concatenate(
            [leaf.centroids_ for leaf in self._get_leaves()]
        )
        self._n_features_out = self.subcluster_centers_.shape[0]

    self.first_call = False
    return self
|
|
706
|
+
|
|
707
|
+
def fit_reinsert(
    self,
    X,
    reinsert_indices,
    singly=False,
    store_centroids=False,
    input_is_packed: bool = False,
    n_features: int | None = None,
):
    """Insert the rows of X into the tree under caller-supplied indices.

    X holds only the molecules to be reinserted, and ``reinsert_indices``
    gives, in the same order, the index to record for each of them. Rows and
    indices are paired with ``zip``. ``index_tracker`` is not advanced.

    Returns ``self``.
    """
    sim_threshold = self.threshold
    max_entries = self.branching_factor
    n_features = _validate_n_features(X, input_is_packed, n_features)
    fp_dtype = X.dtype

    # Arguments shared by every node created in this call.
    node_kwargs = dict(
        threshold=sim_threshold,
        branching_factor=max_entries,
        n_features=n_features,
        dtype=fp_dtype,
    )

    if self.first_call:
        # The tree starts as a single leaf that is also the root.
        self.root_ = _BFNode(is_leaf=True, **node_kwargs)

        # Sentinel in front of the leaf chain, used to walk the leaves.
        self.dummy_leaf_ = _BFNode(is_leaf=True, **node_kwargs)
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    # Sparse input is densified row by row; anything else is iterated as is.
    row_iter = _iterate_sparse_X if sparse.issparse(X) else iter

    for fingerprint, mol_index in zip(row_iter(X), reinsert_indices):
        on_bits = np.sum(fingerprint)
        candidate = _BFSubcluster(linear_sum=fingerprint, mol_indices=[mol_index])
        # NOTE(review): insertion and split always receive singly=False here,
        # while the re-parenting below honours the ``singly`` argument —
        # confirm this asymmetry is intended.
        root_overflowed = self.root_.insert_bf_subcluster(
            candidate, on_bits, candidate.parent_, False
        )
        if root_overflowed:
            half_a, half_b = _split_node(
                self.root_, sim_threshold, max_entries, False
            )
            del self.root_
            self.root_ = _BFNode(is_leaf=False, **node_kwargs)
            self.root_.append_subcluster(half_a)
            self.root_.append_subcluster(half_b)

            if not singly:
                for half in (half_a, half_b):
                    for member in half.child_.subclusters_:
                        member.parent_ = half

    if store_centroids:
        self.subcluster_centers_ = np.concatenate(
            [leaf.centroids_ for leaf in self._get_leaves()]
        )
        self._n_features_out = self.subcluster_centers_.shape[0]

    self.first_call = False
    return self
|
|
786
|
+
|
|
787
|
+
def fit_BFs(self, X, store_centroids=False):
    """
    Fit a BitBirch model from a list of precomputed BitFeatures.

    Parameters:
    -----------
    X : list of BitFeatures
        Non-empty list. Each BitFeature is indexed as
        ``(n_samples, linear_sum, mol_indices)``; the first entry's
        ``linear_sum`` must support ``len()`` and expose ``dtype``.

    store_centroids : bool, default=False
        When True, the centroids of all leaves are concatenated and stored
        in ``self.subcluster_centers_``.

    Returns:
    --------
    self

    Raises:
    -------
    ValueError
        If X is not a non-empty list whose first entry has three elements.
    """

    # Validate before touching X[0]: an empty list previously escaped as an
    # IndexError instead of the documented ValueError.
    if not isinstance(X, list) or not X or len(X[0]) != 3:
        raise ValueError("The input must be a list of BitFeatures")

    threshold = self.threshold
    branching_factor = self.branching_factor

    # Feature count and dtype are taken from the first BitFeature's linear sum.
    n_features = len(X[0][1])
    d_type = X[0][1].dtype

    # If partial_fit is called for the first time or fit is called, we
    # start a new tree.
    if self.first_call:
        # The first root is the leaf. Manipulate this object throughout.
        self.root_ = _BFNode(
            threshold=threshold,
            branching_factor=branching_factor,
            is_leaf=True,
            n_features=n_features,
            dtype=d_type,
        )

        # To enable getting back subclusters.
        self.dummy_leaf_ = _BFNode(
            threshold=threshold,
            branching_factor=branching_factor,
            is_leaf=True,
            n_features=n_features,
            dtype=d_type,
        )
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    for sample in X:
        # Rebuild a subcluster from the stored (count, sum, members) triple.
        cluster = _BFSubcluster()
        cluster.n_samples_ = sample[0]
        cluster.linear_sum_ = sample[1]
        cluster.mol_indices = sample[2]
        cluster.centroid_ = calc_centroid(cluster.linear_sum_, cluster.n_samples_)

        set_bits = np.sum(cluster.centroid_)
        split = self.root_.insert_bf_subcluster(
            cluster, set_bits, cluster.parent_, True
        )

        if split:
            # The root overflowed: split it and hang both halves under a
            # fresh non-leaf root.
            new_subcluster1, new_subcluster2 = _split_node(
                self.root_, threshold, branching_factor, True
            )
            del self.root_
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=False,
                n_features=n_features,
                dtype=d_type,
            )
            self.root_.append_subcluster(new_subcluster1)
            self.root_.append_subcluster(new_subcluster2)
        self.index_tracker += 1

    if store_centroids:
        centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
        self.subcluster_centers_ = centroids
        self._n_features_out = self.subcluster_centers_.shape[0]

    self.first_call = False
    return self
|
|
870
|
+
|
|
871
|
+
def _get_leaves(self):
    """
    Collect the leaf nodes by walking the leaf linked list.

    Returns
    -------
    leaves : list of shape (n_leaves,)
        Leaf nodes in linked-list order, starting with the node that
        follows ``self.dummy_leaf_``.
    """
    node = self.dummy_leaf_
    collected = []
    while True:
        # The dummy leaf itself is never part of the result.
        node = node.next_leaf_
        if node is None:
            return collected
        collected.append(node)
|
|
886
|
+
|
|
887
|
+
def get_centroids_mol_ids(self):
    """Return the centroids and molecule indices of every leaf subcluster.

    The result is a dict with the keys ``"centroids"`` and ``"mol_ids"``;
    both lists follow leaf order and are aligned entry by entry.
    Raises ValueError while ``self.first_call`` is still set.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    leaf_subclusters = [
        sub for leaf in self._get_leaves() for sub in leaf.subclusters_
    ]
    return {
        "centroids": [sub.centroid_ for sub in leaf_subclusters],
        "mol_ids": [sub.mol_indices for sub in leaf_subclusters],
    }
|
|
902
|
+
|
|
903
|
+
def get_centroids(self):
    """Return the centroid fingerprint of every leaf subcluster, in leaf order.

    Raises ValueError while ``self.first_call`` is still set.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    return [
        sub.centroid_ for leaf in self._get_leaves() for sub in leaf.subclusters_
    ]
|
|
914
|
+
|
|
915
|
+
def get_cluster_mol_ids(self):
    """Return the molecule indices of each cluster, largest cluster first.

    Clusters of equal size keep their leaf order (the sort is stable).
    Raises ValueError while ``self.first_call`` is still set.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    members = [
        sub.mol_indices for leaf in self._get_leaves() for sub in leaf.subclusters_
    ]
    # Order by cluster population, biggest first.
    members.sort(key=len, reverse=True)
    return members
|
|
929
|
+
|
|
930
|
+
def _get_BFs(self):
    """Return every leaf subcluster (BitFeature), most populated first.

    Ordering uses each subcluster's ``n_samples_``; ties keep leaf order.
    Raises ValueError while ``self.first_call`` is still set.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    bit_features = [
        sub for leaf in self._get_leaves() for sub in leaf.subclusters_
    ]
    bit_features.sort(key=lambda bf: bf.n_samples_, reverse=True)
    return bit_features
|
|
944
|
+
|
|
945
|
+
def prepare_data_BFs(self, fps, initial_mol=0):
    """Split the fitted clusters into re-insertable BitFeature lists.

    Parameters
    ----------
    fps : indexable of fingerprints
        ``fps[mol - initial_mol]`` must return the fingerprint of molecule
        ``mol`` and support ``.astype``.
    initial_mol : int, default=0
        Offset subtracted from a molecule index before indexing ``fps``.

    Returns
    -------
    data : list
        One ``[n_samples, linear_sum as int64, mol_indices]`` entry per
        cluster except the largest one.
    bigs : list
        One singleton ``[1, fingerprint as int64, [mol]]`` entry per
        molecule of the largest cluster.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    ranked = self._get_BFs()
    largest = ranked[0]

    data = [
        [bf.n_samples_, bf.linear_sum_.astype(np.int64), bf.mol_indices]
        for bf in ranked[1:]
    ]
    # The largest cluster is broken back up into single-molecule features.
    bigs = [
        [1, fps[mol - initial_mol].astype(np.int64), [mol]]
        for mol in largest.mol_indices
    ]
    return data, bigs
|
|
964
|
+
|
|
965
|
+
def get_assignments(self, n_mols):
    """Return a 1-based cluster label for each of ``n_mols`` molecules.

    Labels follow the order of ``self.get_cluster_mol_ids()``: the first
    cluster it returns gets label 1, the next label 2, and so on. An
    assertion checks that every position received a label.
    """
    clusters = self.get_cluster_mol_ids()

    labels = np.full(n_mols, -1, dtype=int)
    for label, members in enumerate(clusters, start=1):
        labels[members] = label

    # Check that there are no unassigned molecules
    assert np.all(labels != -1)

    return labels
|
|
976
|
+
|
|
977
|
+
# Refinement functionality
|
|
978
|
+
|
|
979
|
+
def _get_prune_indices(self):
    """Remove the largest leaf subcluster and return its molecule indices.

    The largest subcluster (by ``len(mol_indices)``, first one on ties) is
    located by leaf position and subcluster position, then handed to
    ``lazyPrune``, whose return value is passed through. Intended for use
    with ``fit_reinsert``.
    """
    best = None
    for leaf_pos, leaf in enumerate(self._get_leaves()):
        for sub_pos, sub in enumerate(leaf.subclusters_):
            size = len(sub.mol_indices)
            # Strict comparison keeps the first of several equally large clusters.
            if best is None or size > best[2]:
                best = (leaf_pos, sub_pos, size)

    if best is None:
        # Default if no clusters are found
        best = (None, None, 0)

    return lazyPrune(best[0], best[1], self)
|
|
992
|
+
|
|
993
|
+
def prune(self, X):
    """
    Take the molecules of the largest cluster out of the tree and re-insert them.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input data; indexed with the pruned molecule indices.

    Returns
    -------
    self
        Fitted estimator.
    """
    pruned_ids = self._get_prune_indices()
    self.fit_reinsert(X[pruned_ids], pruned_ids)
    return self
|
|
1011
|
+
|
|
1012
|
+
def _calc_centroid(self, X, cluster):
    """Compute the centroid of one cluster from the raw fingerprints.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input data.
    cluster : np.array of the molecule indices within the cluster

    Returns
    -------
    centroid : np.ndarray
        Result of ``calc_centroid`` applied to the column-wise sum of the
        selected rows and the number of selected rows.
    """
    members = X[cluster]
    return calc_centroid(np.sum(members, axis=0), len(members))
|
|
1031
|
+
|
|
1032
|
+
def _get_top_cluster_params(self, X, top):
    """Gather member ids, centroids and fingerprints of the ``top`` largest clusters.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input data.

    top: int
        number of largest clusters to collect

    Returns
    -------
    top_clusters: sorted list
        at most ``top`` entries; the molecule ids of each selected cluster,
        largest cluster first

    centroids: np.array; shape (top, n_features)
        centroids of the top clusters, computed with ``self._calc_centroid``

    mol_indices: list
        molecule ids of all top clusters, concatenated in cluster order

    data_top_clusters: np.array; shape (n_molecules, n_features)
        fingerprints of the molecules in the top clusters, row-aligned
        with ``mol_indices``
    """
    ranked = sorted(self.get_cluster_mol_ids(), key=len, reverse=True)
    top_clusters = ranked[:top]

    centroid_rows = []
    mol_indices = []
    for members in top_clusters:
        centroid_rows.append(self._calc_centroid(X, members))
        mol_indices.extend(members)
    centroids = np.array(centroid_rows)

    data_top_clusters = np.array([X[mol] for mol in mol_indices])
    assert data_top_clusters.shape[0] == len(mol_indices)

    return top_clusters, centroids, mol_indices, data_top_clusters
|
|
1063
|
+
|
|
1064
|
+
def _get_sim_matrix(self, data_top_clusters, centroids):
    """Find, for every molecule, the most Tanimoto-similar top-cluster centroid.

    Parameters
    ----------
    data_top_clusters: np.array; shape (n_molecules, n_features)
        fingerprints of the molecules in the top clusters

    centroids: np.array; shape (top, n_features)
        centroids of the top clusters

    Returns
    -------
    row_max: np.array; shape(n_molecules)
        Each entry is the index of the cluster centroid to which the molecule was closest
    """
    # Pairwise dot products between molecules and centroids.
    shared_bits = np.einsum("ij,kj->ik", data_top_clusters, centroids)
    centroid_bits = np.sum(centroids, axis=1)
    # Column vector so it broadcasts against the per-centroid counts.
    mol_bits = np.sum(data_top_clusters, axis=1).reshape(-1, 1)

    union_bits = centroid_bits + mol_bits - shared_bits
    row_max = np.argmax(shared_bits / union_bits, axis=1)
    assert row_max.shape == data_top_clusters.shape[0:1]

    return row_max
|
|
1088
|
+
|
|
1089
|
+
def reassign(self, X, top=20, quick=False):
    """
    Reassign molecules across the ``top`` largest clusters in the tree, moving
    each molecule to the cluster whose centroid it is most Tanimoto-similar to.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input data.

    top: int
        default: 20; specifies number of top largest clusters to reassign

    quick: boolean
        default: False; if quick is specified, the tree is not modified and a
        dictionary of the reassigned top clusters is returned instead

    Return
    ------
    quick return: dict
        maps the position of a top cluster to its new list of molecule ids,
        ordered from largest to smallest; clusters that received no molecule
        are absent

    self
        Fitted estimator (when ``quick`` is False).
    """
    top_clusters, centroids, mol_indices, data_top_clusters = (
        self._get_top_cluster_params(X, top)
    )
    row_max = self._get_sim_matrix(data_top_clusters, centroids)

    final_clusters = {i: [] for i in np.unique(row_max)}
    for mol, cluster_idx in zip(mol_indices, row_max):
        final_clusters[cluster_idx].append(mol)

    if quick:
        # Return dict sorted by largest to smallest newly reassigned top clusters
        return dict(
            sorted(
                final_clusters.items(), key=lambda item: len(item[1]), reverse=True
            )
        )

    sub_clusters = [sc for leaf in self._get_leaves() for sc in leaf.subclusters_]
    assert len(sub_clusters) == len(self.get_cluster_mol_ids())

    top_sub_clusters = sorted(
        sub_clusters, key=lambda c: c.n_samples_, reverse=True
    )[:top]

    for sc, tc in zip(top_sub_clusters, top_clusters):
        assert sc.n_samples_ == len(tc)

    # row_max holds positions into `centroids`, i.e. into top_sub_clusters, so
    # look the new members up by position. Zipping against the dict values
    # shifts every later cluster when one position received no molecule.
    for position, cluster in enumerate(top_sub_clusters):
        c_ids = final_clusters.get(position, [])
        if not c_ids:
            # All molecules of this cluster moved elsewhere.
            # NOTE(review): the emptied subcluster stays in the tree with zero
            # population -- confirm whether it should be removed instead.
            cluster.n_samples_ = 0
            cluster.linear_sum_ = np.zeros_like(cluster.linear_sum_)
            cluster.mol_indices = []
            cluster.centroid_ = np.zeros_like(cluster.centroid_)
            continue

        cluster.n_samples_ = len(c_ids)
        # Store into `linear_sum_` (the attribute read below); the previous
        # `linear_sum` spelling left the centroid computed from the old sum.
        cluster.linear_sum_ = np.sum(X[c_ids], axis=0)
        cluster.mol_indices = c_ids
        cluster.centroid_ = calc_centroid(cluster.linear_sum_, cluster.n_samples_)
    return self
|
|
1149
|
+
|
|
1150
|
+
|
|
1151
|
+
def lazyPrune(leaf_index, subc_index, brc):
    """Detach one leaf subcluster from the tree and return its molecule indices.

    Parameters
    ----------
    leaf_index : int
        Position of the leaf in ``brc._get_leaves()``.
    subc_index : int
        Position of the subcluster inside that leaf's ``subclusters_``.
    brc : object
        Tree owner exposing ``root_`` and ``_get_leaves()``
        (presumably the fitted BitBirch instance -- confirm with callers).

    Returns
    -------
    The ``mol_indices`` of the removed subcluster.

    Notes
    -----
    ``parent_sub.parent_`` is read unconditionally, so the subclusters of the
    target leaf must have a non-None ``parent_``; a tree whose root is itself
    the leaf is not handled here.
    """
    # from bitbirch_tools import bitbirch_double_link_tolerance as bitbirch
    # path from leaf level to root, indices are marked
    # (pSubs[0] is the index inside the leaf, the last entry the index inside the root)
    pSubs = []
    pSubPath(brc._get_leaves()[leaf_index].subclusters_[subc_index], pSubs, brc)

    # traverse all the way down to the leaf node
    # (reversed path without the leaf-level entry: root index first)
    node = brc.root_
    for index in pSubs[:0:-1]:
        node = node.subclusters_[index].child_

    # information needed to update all preceding subclusters
    cluster_ls = node.subclusters_[subc_index].linear_sum_
    cluster_n = node.subclusters_[subc_index].n_samples_
    cluster_mol_indices = node.subclusters_[subc_index].mol_indices

    """# IMPORTANT: tree must have at least three levels, at least that is what is assumed when inputting dataset
    parent_sub = node.subclusters_[0].parent_ # needed to traverse node upwards
    parent_node = parent_sub.parent_.child_ # needed to reassign leaf nodes"""

    # parent_sub: subcluster (one level up) that owns the leaf node;
    # parent_node: the node holding parent_sub (the root when parent_sub has no parent)
    parent_sub = node.subclusters_[0].parent_
    if parent_sub.parent_ is None:
        parent_node = brc.root_
    else:
        parent_node = parent_sub.parent_.child_

    # eliminate the biggest subcluster from the leaf node
    node.subclusters_.pop(subc_index)
    node.centroids_ = np.delete(node.centroids_, subc_index, 0)
    # init_centroids_ is compacted in place (rows shifted up, last row zeroed)
    shift_delete(node.init_centroids_, subc_index)

    # an emptied leaf is unhooked from the subcluster that pointed to it
    if len(node.subclusters_) == 0:
        parent_sub.child_ = None

    # rearrange leaf pointers if need to

    # if the leaf node disappears, we assign parent node to replace leaf node
    num_childs = 0
    for i in parent_node.subclusters_:
        if i.child_ is not None:
            num_childs += 1
    if num_childs == 0:
        # no subcluster of parent_node has a child any more: parent_node takes
        # the removed leaf's place in the doubly linked leaf list
        parent_node.is_leaf = True
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = parent_node
        parent_node.prev_leaf_ = node.prev_leaf_
        parent_node.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = parent_node

    # leaf node disappears and there are no reassignments
    elif len(node.subclusters_) == 0:
        # simply bridge the neighbours over the removed leaf
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = node.prev_leaf_

    # restart one level above the leaf: the node that contains parent_sub
    if parent_sub.parent_ is None:
        node = brc.root_
    else:
        node = parent_sub.parent_.child_

    # here, we update all the preceding subclusters all the way up to the root
    for index in pSubs[1:]:
        sub = node.subclusters_[index]

        # subtract the removed cluster's contribution from this ancestor
        sub.linear_sum_ -= cluster_ls
        sub.n_samples_ -= cluster_n
        sub.centroid_ = calc_centroid(sub.linear_sum_, sub.n_samples_)
        sub.mol_indices = [x for x in sub.mol_indices if x not in cluster_mol_indices]

        # keep the node-level centroid tables in sync with the subcluster
        node.centroids_[index] = sub.centroid_
        node.init_centroids_[index] = sub.centroid_

        # climb one level: stop at a root-level subcluster, otherwise move to
        # the node that contains this subcluster's parent
        if sub.parent_ is None:
            break
        elif sub.parent_.parent_ is None:
            node = brc.root_
        else:
            node = sub.parent_.parent_.child_

    return cluster_mol_indices
|
|
1233
|
+
|
|
1234
|
+
|
|
1235
|
+
def shift_delete(array, index):
    """Drop ``array[index]`` in place without changing the array's length.

    Every entry after ``index`` moves one slot towards the front and the
    last slot is overwritten with 0. Nothing is returned.
    """
    for src in range(index + 1, len(array)):
        array[src - 1] = array[src]
    # The final slot now holds a stale duplicate; clear it.
    array[-1] = 0
|
|
1239
|
+
|
|
1240
|
+
|
|
1241
|
+
def pSubPath(subcluster, pSubs, brc):
    """Append to ``pSubs`` the index path from ``subcluster`` up to the root.

    Starting at ``subcluster`` and following ``parent_`` links, the position
    of each subcluster inside the node that contains it is appended to
    ``pSubs`` (leaf-most index first). A subcluster without a parent is looked
    up in ``brc.root_.subclusters_``. Passing ``None`` appends nothing.
    """
    current = subcluster
    while current is not None:
        owner = current.parent_
        if owner is None:
            # Top level: the containing node is the root.
            siblings = brc.root_.subclusters_
        else:
            siblings = owner.child_.subclusters_
        pSubs.append(siblings.index(current))
        current = owner
|