bblean 0.6.0b2__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1144 @@
1
+ # type: ignore
2
+ # BitBIRCH is an open-source clustering module based on iSIM
3
+ #
4
+ # Please, cite the BitBIRCH paper: https://doi.org/10.1039/D5DD00030K
5
+ #
6
+ # BitBIRCH is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, version 3.
9
+ #
10
+ # BitBIRCH is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # BitBIRCH License: GPL-3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
16
+ #
17
+ # Memory-efficient BitBIRCH authors: Ramon Alain Miranda Quintana <ramirandaq@gmail.com>, <quintana@chem.ufl.edu>
18
+ # Krizstina Zsigmond <kzsigmond@ufl.edu>
19
+ #
20
+ ### Part of the tree-management code was derived from https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
21
+ ### Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
22
+ ### Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
23
+ ### Joel Nothman <joel.nothman@gmail.com>
24
+ ### License: BSD 3 clause
25
+ # Parts of the BitBIRCH algorithm were previously released under the LGPL-3.0 license by:
26
+ # Ramon Alain Miranda Quintana <ramirandaq@gmail.com>, <quintana@chem.ufl.edu>
27
+ # Vicky (Vic) Jung <jungvicky@ufl.edu>
28
+ # Kenneth Lopez Perez <klopezperez@chem.ufl.edu>
29
+ # Kate Huddleston <kdavis2@chem.ufl.edu>
30
+ from pathlib import Path
31
+
32
+ import numpy as np
33
+ from scipy import sparse
34
+
35
+
36
def safe_sum(nmax, np1, np2):
    """Add two arrays, widening their dtype according to ``nmax``.

    Parameters
    ----------
    nmax : int
        Bound used to pick the working dtype of the sum.
    np1, np2 : np.ndarray
        Arrays to add element-wise.

    Returns
    -------
    np.ndarray
        ``np1 + np2``. Both operands are cast to ``uint64``, ``uint32`` or
        ``uint16`` first when ``nmax`` reaches the matching bound; below the
        smallest bound the operands are added with their own dtypes.
    """
    # Ordered from widest to narrowest so the first matching bound wins.
    widening_steps = (
        (4294967294, "uint64"),
        (65534, "uint32"),
        (254, "uint16"),
    )
    for lower_bound, dtype in widening_steps:
        if nmax >= lower_bound:
            return np1.astype(dtype) + np2.astype(dtype)
    # Small totals: no cast, the inputs' own dtypes decide the result.
    return np1 + np2
45
+
46
+
47
def _copy_or_unpack(x, n_features, input_is_packed: bool = True):
    """Return an unpacked version of ``x``, or a copy if it is not packed.

    Packed input is expanded with ``unpack_fingerprints`` to ``n_features``
    bits; unpacked input is returned as ``x.copy()`` and ``n_features`` is
    not used.
    """
    if not input_is_packed:
        return x.copy()
    return unpack_fingerprints(x, n_features)
49
+
50
+
51
def unpack_fingerprints(a, n_features: int):
    """Expand bit-packed ``uint8`` fingerprints into one element per bit.

    Parameters
    ----------
    a : np.ndarray
        Packed fingerprints; the last axis holds the packed bytes.
    n_features : int
        Number of bits to keep along the last axis. Needed to drop the
        padding zeros when the bit count is not a multiple of 8.

    Returns
    -------
    np.ndarray
        Result of ``np.unpackbits`` (0/1 values of dtype ``uint8``).
    """
    unpacked = np.unpackbits(a, axis=-1, count=n_features)
    return unpacked
55
+
56
+
57
+ # Utility function to validate the n_features argument for packed inputs
58
+ def _validate_n_features(X, input_is_packed: bool, n_features: int | None) -> int:
59
+ if input_is_packed:
60
+ if n_features is None:
61
+ raise ValueError("n_features is required for packed inputs")
62
+ return n_features
63
+
64
+ x_n_features = X.shape[1]
65
+ if n_features is not None:
66
+ if n_features != x_n_features:
67
+ raise ValueError(
68
+ "n_features is redundant for non-packed inputs"
69
+ " if passed, it must be equal to X.shape[1]."
70
+ f" For passed X, X.shape[1] = {X.shape[1]}."
71
+ " If this value is not what you expected,"
72
+ " make sure the passed X is actually unpacked."
73
+ )
74
+ return x_n_features
75
+
76
+
77
def set_merge(merge_criterion, tolerance=0.05):
    """Install the module-level ``merge_accept`` function.

    ``_BFSubcluster.merge_subcluster`` calls the global ``merge_accept`` to
    decide whether a nominee subcluster may be merged; this function builds
    the predicate for the requested criterion and stores it in the module
    globals.

    Criteria
    --------
    radius
        Accept based on the iSIM change obtained by adding the merged
        centroid to the merged cluster.
    diameter
        Accept when the iSIM of the merged cluster reaches the threshold.
    tolerance, tolerance-legacy
        Diameter test, plus (when a single molecule joins a cluster of more
        than one molecule) a check that the newcomer's average similarity to
        the old cluster is not more than ``tolerance`` below the old
        cluster's iSIM.
    tolerance_tough
        Same as ``tolerance`` but the extra check is also applied when the
        nominee contains more than one molecule.

    Parameters
    ----------
    merge_criterion : str
        One of ``"radius"``, ``"diameter"``, ``"tolerance"``,
        ``"tolerance-legacy"`` or ``"tolerance_tough"``.
    tolerance : float
        Penalty applied to the similarity threshold by the tolerance
        criteria.

    Raises
    ------
    ValueError
        If ``merge_criterion`` is not one of the supported names.
    """
    if merge_criterion == "radius":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_sim = jt_isim(new_ls + new_centroid, new_n + 1) * (new_n + 1) - jt_isim(
                new_ls, new_n
            ) * (new_n - 1)
            return jt_sim >= threshold * 2

    elif merge_criterion == "diameter":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            return jt_radius >= threshold

    elif merge_criterion == "tolerance_tough":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            if old_n == 1 and nom_n == 1:
                return True

            old_isim = jt_isim(old_ls, old_n)
            if nom_n == 1:
                # Average similarity of the single newcomer to the old cluster.
                newcomer_sim = (
                    jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                    - old_isim * (old_n - 1)
                ) / 2
                return newcomer_sim >= old_isim - tolerance and (
                    jt_radius >= threshold
                )

            # Average cross-similarity between the old and nominee clusters.
            cross_sim = (
                jt_isim(old_ls + nom_ls, old_n + nom_n)
                * (old_n + nom_n)
                * (old_n + nom_n - 1)
                - old_isim * old_n * (old_n - 1)
                - jt_isim(nom_ls, nom_n) * nom_n * (nom_n - 1)
            ) / (2 * old_n * nom_n)
            return cross_sim >= old_isim - tolerance and (jt_radius >= threshold)

    elif merge_criterion in ["tolerance", "tolerance-legacy"]:

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            if old_n == 1 and nom_n == 1:
                return True
            if nom_n == 1:
                old_isim = jt_isim(old_ls, old_n)
                # Average similarity of the single newcomer to the old cluster.
                newcomer_sim = (
                    jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                    - old_isim * (old_n - 1)
                ) / 2
                return newcomer_sim >= old_isim - tolerance and (
                    jt_radius >= threshold
                )
            # Multi-molecule nominees only need to pass the diameter test.
            return True

    else:
        # Without this branch the assignment below fails with an opaque
        # UnboundLocalError for a misspelled criterion.
        raise ValueError(
            f"Unknown merge_criterion {merge_criterion!r}; expected one of"
            " 'radius', 'diameter', 'tolerance', 'tolerance-legacy',"
            " 'tolerance_tough'."
        )

    globals()["merge_accept"] = merge_accept
166
+
167
+
168
def jt_isim(c_total, n_objects):
    """Compute the iSIM Jaccard-Tanimoto value of a set of fingerprints.

    https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b

    Parameters
    ----------
    c_total : np.ndarray
        Column-wise sum of the fingerprints.
    n_objects : int
        Number of fingerprints that were summed.

    Returns
    -------
    isim : float
        iSIM Jaccard-Tanimoto value.
    """
    counts = c_total.astype("uint64")
    total = np.sum(counts)
    total_sq = np.dot(counts, counts)
    # Numerator term: sum over columns of k * (k - 1) / 2.
    pair_matches = (total_sq - total) / 2
    denominator = pair_matches + n_objects * total - total_sq
    return pair_matches / denominator
191
+
192
+
193
def max_separation(Y):
    """Find two rows of ``Y`` that are far apart, in O(N).

    This is an approximation: the pair returned is not guaranteed to be the
    most separated one.

    Algorithm:
    a) Compute the centroid of ``Y``.
    b) mol1 is the row least similar to the centroid.
    c) mol2 is the row least similar to mol1.

    Returns
    -------
    (mol1, mol2) : (int, int)
        Indices of the two selected rows.
    sims_mol1 : np.ndarray
        Tanimoto similarity of every row to mol1.
    sims_mol2 : np.ndarray
        Tanimoto similarity of every row to mol2.

    Note that similarities, not distances, are returned; ``_split_node``
    compares them accordingly.
    """
    fps = Y.astype("uint64")
    bit_counts = np.sum(fps, axis=1)

    def _similarity_to(reference, reference_count):
        # Tanimoto similarity of every row to ``reference``.
        shared = np.dot(fps, reference)
        return shared / (bit_counts + reference_count - shared)

    centroid = calc_centroid(np.sum(fps, axis=0), len(fps))

    # Row least similar to the centroid.
    mol1 = np.argmin(_similarity_to(centroid, np.sum(centroid)))

    # Row least similar to mol1.
    sims_mol1 = _similarity_to(fps[mol1], bit_counts[mol1])
    mol2 = np.argmin(sims_mol1)

    sims_mol2 = _similarity_to(fps[mol2], bit_counts[mol2])
    return (mol1, mol2), sims_mol1, sims_mol2
241
+
242
+
243
def calc_centroid(linear_sum, n_samples):
    """Compute the majority-vote centroid of a set of fingerprints.

    Parameters
    ----------
    linear_sum : np.ndarray
        Column-wise sum of the fingerprints.
    n_samples : int
        Number of fingerprints that were summed.

    Returns
    -------
    centroid : np.ndarray
        Boolean array that is True where a column's sum is at least half
        of ``n_samples``.
    """
    is_majority = linear_sum >= n_samples * 0.5
    return np.asarray(is_majority, dtype="bool")
261
+
262
+
263
def _iterate_sparse_X(X):
    """Yield the rows of a sparse matrix as dense 1-D arrays.

    Rows are rebuilt directly from ``X.indices``, ``X.data`` and
    ``X.indptr`` to avoid constructing a sparse matrix per row, which is
    expensive.
    """
    row_count, col_count = X.shape
    indices, data, indptr = X.indices, X.data, X.indptr

    for row_idx in range(row_count):
        lo = indptr[row_idx]
        hi = indptr[row_idx + 1]
        dense_row = np.zeros(col_count)
        dense_row[indices[lo:hi]] = data[lo:hi]
        yield dense_row
279
+
280
+
281
def _split_node(node, threshold, branching_factor):
    """Split a full node into two nodes, each wrapped in a new subcluster.

    1. Two empty subclusters and two empty nodes are created; each node
       becomes the child of one of the subclusters.
    2. If ``node`` is a leaf, the two new nodes take its place in the
       doubly linked list of leaves.
    3. Two far-apart centroids are picked with ``max_separation``.
    4. Every subcluster of ``node`` is moved to the new node whose seed
       centroid it is more similar to, and the wrapping subcluster is
       updated with it.

    Returns
    -------
    (new_subcluster1, new_subcluster2)
        The two subclusters that own the new nodes.
    """
    node_options = {
        "threshold": threshold,
        "branching_factor": branching_factor,
        "is_leaf": node.is_leaf,
        "n_features": node.n_features,
        "dtype": node.init_centroids_.dtype,
    }
    first_parent, second_parent = _BFSubcluster(), _BFSubcluster()
    first_node, second_node = _BFNode(**node_options), _BFNode(**node_options)
    first_parent.child_ = first_node
    second_parent.child_ = second_node

    if node.is_leaf:
        # Replace ``node`` by ``first_node <-> second_node`` in the leaf chain.
        before, after = node.prev_leaf_, node.next_leaf_
        if before is not None:
            before.next_leaf_ = first_node
        first_node.prev_leaf_ = before
        first_node.next_leaf_ = second_node
        second_node.prev_leaf_ = first_node
        second_node.next_leaf_ = after
        if after is not None:
            after.prev_leaf_ = second_node

    # O(N) approximation of the most separated pair. The two arrays hold
    # similarities (not distances), so "closer" means the larger value.
    seeds, sims_to_first, sims_to_second = max_separation(node.centroids_)
    goes_to_first = sims_to_first > sims_to_second
    # The first seed must land in the first node even when every comparison
    # ties, which happens when all centroids are duplicates.
    goes_to_first[seeds[0]] = True

    for to_first, member in zip(goes_to_first, node.subclusters_):
        if to_first:
            target_node, target_parent = first_node, first_parent
        else:
            target_node, target_parent = second_node, second_parent
        target_node.append_subcluster(member)
        target_parent.update(member)

    return first_parent, second_parent
341
+
342
+
343
class _BFNode:
    """A node of the BF tree.

    A node holds at most ``branching_factor`` subclusters (one extra slot is
    allocated so an overflowing subcluster can be stored before a split).

    Parameters
    ----------
    threshold : float
        Threshold passed to ``merge_subcluster`` when trying to merge an
        incoming subcluster.

    branching_factor : int
        Maximum number of subclusters in the node.

    is_leaf : bool
        Whether the node is a leaf; leaves are chained so the final
        subclusters can be retrieved.

    n_features : int
        The number of features.

    dtype
        dtype of the centroid storage array.

    Attributes
    ----------
    subclusters_ : list
        Subclusters stored in this node.

    prev_leaf_ : _BFNode
        Previous leaf. Useful only if is_leaf is True.

    next_leaf_ : _BFNode
        Next leaf. Useful only if is_leaf is True.

    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Backing storage for the centroids.

    centroids_ : ndarray of shape (n_subclusters, n_features)
        View of the used rows of ``init_centroids_``. Set on the first call
        to ``append_subcluster``.
    """

    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
        self.threshold, self.branching_factor = threshold, branching_factor
        self.is_leaf, self.n_features = is_leaf, n_features

        self.subclusters_ = []
        # One spare row so a node can temporarily hold branching_factor + 1
        # subclusters right before it is split.
        self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
        self.prev_leaf_ = self.next_leaf_ = None

    def append_subcluster(self, subcluster):
        """Store ``subcluster`` in the next free slot and refresh the view."""
        slot = len(self.subclusters_)
        self.subclusters_.append(subcluster)
        self.init_centroids_[slot] = subcluster.centroid_
        # ``centroids_`` is a view, so later writes to ``init_centroids_``
        # are visible through it.
        self.centroids_ = self.init_centroids_[: slot + 1, :]

    def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
        """Replace ``subcluster`` by the two subclusters produced by a split.

        ``new_subcluster1`` takes the slot of ``subcluster``;
        ``new_subcluster2`` is appended.
        """
        slot = self.subclusters_.index(subcluster)
        self.subclusters_[slot] = new_subcluster1
        for storage in (self.init_centroids_, self.centroids_):
            storage[slot] = new_subcluster1.centroid_
        self.append_subcluster(new_subcluster2)

    def insert_bf_subcluster(self, subcluster, set_bits):
        """Insert ``subcluster`` into the node.

        Parameters
        ----------
        subcluster : _BFSubcluster
            Subcluster to insert.
        set_bits
            Number of set bits of ``subcluster.centroid_``, supplied by the
            caller and used in the similarity denominator.

        Returns
        -------
        bool
            True if this node now holds more than ``branching_factor``
            subclusters and must be split by the caller.
        """
        if not self.subclusters_:
            self.append_subcluster(subcluster)
            return False

        # Tanimoto similarity of the incoming centroid to every stored one.
        stored = self.centroids_.astype("uint16")
        incoming = subcluster.centroid_.astype("uint16")
        shared = np.dot(stored, incoming)
        similarities = shared / (np.sum(stored, axis=1) + set_bits - shared)
        best = np.argmax(similarities)
        nearest = self.subclusters_[best]

        if nearest.child_ is None:
            # No child node: try to merge into the closest subcluster.
            if nearest.merge_subcluster(subcluster, self.threshold):
                self.centroids_[best] = nearest.centroid_
                self.init_centroids_[best] = nearest.centroid_
                return False
            # Not mergeable: store it; if there was no room left the node is
            # now overfull and has to be split.
            had_room = len(self.subclusters_) < self.branching_factor
            self.append_subcluster(subcluster)
            return not had_room

        # The closest subcluster has a child node: descend into it.
        if nearest.child_.insert_bf_subcluster(subcluster, set_bits):
            # The child overflowed: redistribute its subclusters over two
            # new nodes and register both here.
            halves = _split_node(nearest.child_, self.threshold, self.branching_factor)
            self.update_split_subclusters(nearest, *halves)
            return len(self.subclusters_) > self.branching_factor

        # The child absorbed the subcluster; refresh the summary kept here.
        nearest.update(subcluster)
        refreshed = self.subclusters_[best].centroid_
        self.init_centroids_[best] = refreshed
        self.centroids_[best] = refreshed
        return False
490
+
491
+
492
class _BFSubcluster:
    """A subcluster stored in a ``_BFNode``.

    A subcluster can have a ``_BFNode`` as its child.

    Parameters
    ----------
    linear_sum : ndarray of shape (n_features,), default=None
        Sample. This is kept optional to allow initialization of empty
        subclusters.

    mol_indices : list, default=None
        Indices of the molecules in the subcluster. ``None`` means an empty
        list; a fresh list is created per instance.

    Attributes
    ----------
    n_samples_ : int
        Number of samples that belong to the subcluster.

    linear_sum_ : ndarray
        Linear sum of all the samples in the subcluster. Prevents holding
        all sample data in memory.

    centroid_ : ndarray of shape (n_features,)
        Centroid of the subcluster, cached so it does not have to be
        recomputed.

    mol_indices : list
        Indices of the molecules included in the subcluster.

    child_ : _BFNode
        Child node of the subcluster, or None.

    parent_
        Initialized to None; not set anywhere in this class.
    """

    def __init__(self, *, linear_sum=None, mol_indices=None):
        if linear_sum is None:
            # Empty placeholder; ``update`` reshapes it to the width of the
            # first subcluster it absorbs.
            self.n_samples_ = 0
            self.centroid_ = self.linear_sum_ = np.zeros((2048,), dtype="bool")
            self.mol_indices = []
        else:
            self.n_samples_ = 1
            self.centroid_ = self.linear_sum_ = linear_sum
            # A ``[]`` default would be shared by every instance created
            # without ``mol_indices``.
            self.mol_indices = [] if mol_indices is None else mol_indices

        self.child_ = None
        self.parent_ = None

    def update(self, subcluster):
        """Absorb ``subcluster`` into this one unconditionally."""
        incoming_ls = subcluster.linear_sum_
        if self.n_samples_ == 0 and np.shape(self.linear_sum_) != np.shape(incoming_ls):
            # The empty placeholder is created with 2048 bits; match the
            # incoming width so other fingerprint sizes can be summed.
            self.linear_sum_ = np.zeros(np.shape(incoming_ls), dtype="bool")

        # Same explicit uint64 arithmetic as ``merge_subcluster``.
        self.n_samples_ = np.uint64(self.n_samples_) + np.uint64(subcluster.n_samples_)
        self.linear_sum_ = safe_sum(self.n_samples_, self.linear_sum_, incoming_ls)
        self.mol_indices += subcluster.mol_indices
        self.centroid_ = calc_centroid(self.linear_sum_, self.n_samples_)

    def merge_subcluster(self, nominee_cluster, threshold):
        """Merge ``nominee_cluster`` into this subcluster if it is accepted.

        The decision is delegated to the module-level ``merge_accept``
        function, which must have been installed with ``set_merge``.

        Returns
        -------
        bool
            True if the merge was performed, False otherwise (in which case
            this subcluster is left untouched).
        """
        new_n = np.uint64(self.n_samples_) + np.uint64(nominee_cluster.n_samples_)
        new_ls = safe_sum(new_n, self.linear_sum_, nominee_cluster.linear_sum_)
        new_centroid = calc_centroid(new_ls, new_n)

        accepted = merge_accept(
            threshold,
            new_ls,
            new_centroid,
            new_n,
            self.linear_sum_,
            nominee_cluster.linear_sum_,
            self.n_samples_,
            nominee_cluster.n_samples_,
        )
        if not accepted:
            return False

        self.n_samples_ = new_n
        self.linear_sum_ = new_ls
        self.centroid_ = new_centroid
        self.mol_indices = self.mol_indices + nominee_cluster.mol_indices
        return True
571
+
572
+
573
+ class BitBirch:
574
+ """Implements the BitBIRCH clustering algorithm.
575
+
576
+ BitBIRCH paper:
577
+
578
+ Memory- and time-efficient, online-learning algorithm.
579
+ It constructs a tree data structure with the cluster centroids being read off the leaf.
580
+
581
+ Parameters
582
+ ----------
583
+ threshold : float, default=0.5
584
+ The similarity radius of the subcluster obtained by merging a new sample and the
585
+ closest subcluster should be greater than the threshold. Otherwise a new
586
+ subcluster is started. Setting this value to be very low promotes
587
+ splitting and vice-versa.
588
+
589
+ branching_factor : int, default=50
590
+ Maximum number of BF subclusters in each node. If a new samples enters
591
+ such that the number of subclusters exceed the branching_factor then
592
+ that node is split into two nodes with the subclusters redistributed
593
+ in each. The parent subcluster of that node is removed and two new
594
+ subclusters are added as parents of the 2 split nodes.
595
+
596
+ Attributes
597
+ ----------
598
+ root_ : _BFNode
599
+ Root of the BFTree.
600
+
601
+ dummy_leaf_ : _BFNode
602
+ Start pointer to all the leaves.
603
+
604
+ subcluster_centers_ : ndarray
605
+ Centroids of all subclusters read directly from the leaves.
606
+
607
+ Notes
608
+ -----
609
+ The tree data structure consists of nodes with each node consisting of
610
+ a number of subclusters. The maximum number of subclusters in a node
611
+ is determined by the branching factor. Each subcluster maintains a
612
+ linear sum, mol_indices and the number of samples in that subcluster.
613
+ In addition, each subcluster can also have a node as its child, if the
614
+ subcluster is not a member of a leaf node.
615
+
616
+ For a new point entering the root, it is merged with the subcluster closest
617
+ to it and the linear sum, mol_indices and the number of samples of that
618
+ subcluster are updated. This is done recursively till the properties of
619
+ the leaf node are updated.
620
+ """
621
+
622
def __init__(self, *, threshold=0.5, branching_factor=50):
    """Store the clustering parameters; the tree is built by the fit methods.

    Parameters
    ----------
    threshold : float, default=0.5
        Threshold handed to every node of the tree.
    branching_factor : int, default=50
        Maximum number of subclusters per node.
    """
    # No tree exists until one of the fit methods has run.
    self.first_call = True
    # Index assigned to the next inserted molecule.
    self.index_tracker = 0
    self.branching_factor = branching_factor
    self.threshold = threshold
632
+
633
def fit(
    self,
    X,
    store_centroids: bool = False,
    input_is_packed: bool = True,
    n_features: int | None = None,
    max_fps: int | None = None,
):
    """Insert the fingerprints of ``X`` into the BF tree.

    A new tree is started on the first call; later calls keep inserting
    into the existing tree.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features), or Path
        Input data. A ``Path`` is loaded with ``np.load`` in memory-mapped
        read mode.
    store_centroids : bool
        If True, the leaf centroids are concatenated and stored in
        ``subcluster_centers_`` after insertion.
    input_is_packed : bool
        Whether the rows of ``X`` are bit-packed fingerprints.
    n_features : int or None
        Number of features; required for packed input.
    max_fps : int or None
        Only the first ``max_fps`` rows of ``X`` are used.

    Returns
    -------
    self
        Fitted estimator.
    """
    loaded = np.load(X, mmap_mode="r") if isinstance(X, Path) else X
    X = loaded[:max_fps]

    threshold, branching_factor = self.threshold, self.branching_factor
    n_features = _validate_n_features(X, input_is_packed, n_features)
    node_options = {
        "threshold": threshold,
        "branching_factor": branching_factor,
        "n_features": n_features,
        "dtype": X.dtype,
    }

    if self.first_call:
        # The first root is a leaf; the dummy leaf is the entry point of
        # the leaf chain used to read the subclusters back.
        self.root_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    rows = _iterate_sparse_X(X) if sparse.issparse(X) else iter(X)
    for row in rows:
        bits = _copy_or_unpack(row, n_features, input_is_packed)
        bit_count = np.sum(bits.astype("uint64"))
        candidate = _BFSubcluster(linear_sum=bits, mol_indices=[self.index_tracker])

        if self.root_.insert_bf_subcluster(candidate, bit_count):
            # The root overflowed: split it and grow the tree by one level.
            left, right = _split_node(self.root_, threshold, branching_factor)
            del self.root_
            self.root_ = _BFNode(is_leaf=False, **node_options)
            self.root_.append_subcluster(left)
            self.root_.append_subcluster(right)

        self.index_tracker += 1

    if store_centroids:
        self.subcluster_centers_ = np.concatenate(
            [leaf.centroids_ for leaf in self._get_leaves()]
        )
        self._n_features_out = self.subcluster_centers_.shape[0]

    self.first_call = False
    return self
723
+
724
def fit_np(self, X):
    """Insert pre-aggregated subclusters into the BF tree.

    Each row of ``X`` is read as a linear sum (all columns but the last)
    followed by the number of samples it represents (last column). Rows are
    given consecutive molecule indices from ``index_tracker``.

    Returns
    -------
    self
        Fitted estimator.
    """
    threshold, branching_factor = self.threshold, self.branching_factor
    node_options = {
        "threshold": threshold,
        "branching_factor": branching_factor,
        # The last column is the sample count, not a feature.
        "n_features": X.shape[1] - 1,
        "dtype": X.dtype,
    }

    if self.first_call:
        # The first root is a leaf; the dummy leaf is the entry point of
        # the leaf chain used to read the subclusters back.
        self.root_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    rows = _iterate_sparse_X(X) if sparse.issparse(X) else iter(X)
    for row in rows:
        record = row.copy()
        features, count = record[:-1], record[-1]
        candidate = _BFSubcluster(
            linear_sum=features, mol_indices=[self.index_tracker]
        )
        if count > 1:
            # The row summarizes several samples: restore count and centroid.
            candidate.n_samples_ = count
            candidate.centroid_ = calc_centroid(features, count)
        bit_count = np.sum(candidate.centroid_.astype("uint64"))

        if self.root_.insert_bf_subcluster(candidate, bit_count):
            # The root overflowed: split it and grow the tree by one level.
            left, right = _split_node(self.root_, threshold, branching_factor)
            del self.root_
            self.root_ = _BFNode(is_leaf=False, **node_options)
            self.root_.append_subcluster(left)
            self.root_.append_subcluster(right)

        self.index_tracker += 1

    self.first_call = False
    return self
791
+
792
def fit_np_reinsert(self, X, reinsert_indices):
    """Insert pre-aggregated subclusters with caller-supplied indices.

    Each row of ``X`` is read as a linear sum (all columns but the last)
    followed by the number of samples it represents (last column). The
    matching entry of ``reinsert_indices`` is used as the ``mol_indices``
    of the inserted subcluster; ``index_tracker`` is not advanced.

    Returns
    -------
    self
        Fitted estimator.
    """
    threshold, branching_factor = self.threshold, self.branching_factor
    node_options = {
        "threshold": threshold,
        "branching_factor": branching_factor,
        # The last column is the sample count, not a feature.
        "n_features": X.shape[1] - 1,
        "dtype": X.dtype,
    }

    if self.first_call:
        # The first root is a leaf; the dummy leaf is the entry point of
        # the leaf chain used to read the subclusters back.
        self.root_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    rows = _iterate_sparse_X(X) if sparse.issparse(X) else iter(X)
    for row, mol_inds in zip(rows, reinsert_indices):
        record = row.copy()
        features, count = record[:-1], record[-1]
        candidate = _BFSubcluster(linear_sum=features, mol_indices=mol_inds)
        if count > 1:
            # The row summarizes several samples: restore count and centroid.
            candidate.n_samples_ = count
            candidate.centroid_ = calc_centroid(features, count)
        bit_count = np.sum(candidate.centroid_.astype("uint64"))

        if self.root_.insert_bf_subcluster(candidate, bit_count):
            # The root overflowed: split it and grow the tree by one level.
            left, right = _split_node(self.root_, threshold, branching_factor)
            del self.root_
            self.root_ = _BFNode(is_leaf=False, **node_options)
            self.root_.append_subcluster(left)
            self.root_.append_subcluster(right)

    self.first_call = False
    return self
857
+
858
def fit_reinsert(
    self,
    X,
    reinsert_indices,
    store_centroids: bool = False,
    input_is_packed: bool = True,
    n_features: int | None = None,
):
    """Insert fingerprints into the BF tree with caller-supplied indices.

    ``X`` holds only the molecules to be reinserted and
    ``reinsert_indices`` holds, in the same order, the index recorded for
    each of them. ``index_tracker`` is not advanced.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Fingerprints to insert.
    reinsert_indices : iterable
        One molecule index per row of ``X``.
    store_centroids : bool
        If True, the leaf centroids are concatenated and stored in
        ``subcluster_centers_`` after insertion.
    input_is_packed : bool
        Whether the rows of ``X`` are bit-packed fingerprints.
    n_features : int or None
        Number of features; required for packed input.

    Returns
    -------
    self
        Fitted estimator.
    """
    threshold, branching_factor = self.threshold, self.branching_factor
    n_features = _validate_n_features(X, input_is_packed, n_features)
    node_options = {
        "threshold": threshold,
        "branching_factor": branching_factor,
        "n_features": n_features,
        "dtype": X.dtype,
    }

    if self.first_call:
        # The first root is a leaf; the dummy leaf is the entry point of
        # the leaf chain used to read the subclusters back.
        self.root_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_ = _BFNode(is_leaf=True, **node_options)
        self.dummy_leaf_.next_leaf_ = self.root_
        self.root_.prev_leaf_ = self.dummy_leaf_

    rows = _iterate_sparse_X(X) if sparse.issparse(X) else iter(X)
    for row, mol_ind in zip(rows, reinsert_indices):
        bits = _copy_or_unpack(row, n_features, input_is_packed)
        bit_count = np.sum(bits.astype("uint64"))
        candidate = _BFSubcluster(linear_sum=bits, mol_indices=[mol_ind])

        if self.root_.insert_bf_subcluster(candidate, bit_count):
            # The root overflowed: split it and grow the tree by one level.
            left, right = _split_node(self.root_, threshold, branching_factor)
            del self.root_
            self.root_ = _BFNode(is_leaf=False, **node_options)
            self.root_.append_subcluster(left)
            self.root_.append_subcluster(right)

    if store_centroids:
        self.subcluster_centers_ = np.concatenate(
            [leaf.centroids_ for leaf in self._get_leaves()]
        )
        self._n_features_out = self.subcluster_centers_.shape[0]

    self.first_call = False
    return self
930
+
931
def _get_leaves(self):
    """Collect the leaf nodes by walking the leaf chain.

    Returns
    -------
    leaves : list of shape (n_leaves,)
        Leaf nodes in chain order, starting after ``dummy_leaf_``.
    """

    def _walk(node):
        # Follow ``next_leaf_`` links until the chain ends.
        while node is not None:
            yield node
            node = node.next_leaf_

    return list(_walk(self.dummy_leaf_.next_leaf_))
946
+
947
def get_centroids_mol_ids(self):
    """Return the centroids and molecule indices of all leaf subclusters.

    Returns
    -------
    dict
        ``{"centroids": [...], "mol_ids": [...]}`` with one entry per leaf
        subcluster, in leaf order.

    Raises
    ------
    ValueError
        If no fit method has been run yet.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    leaf_subclusters = [
        subcluster
        for leaf in self._get_leaves()
        for subcluster in leaf.subclusters_
    ]
    return {
        "centroids": [sc.centroid_ for sc in leaf_subclusters],
        "mol_ids": [sc.mol_indices for sc in leaf_subclusters],
    }
962
+
963
def get_centroids(self):
    """Return the centroid fingerprint of every leaf subcluster.

    Returns
    -------
    list
        The ``centroid_`` of each subcluster, in leaf order and then
        subcluster order within each leaf.

    Raises
    ------
    ValueError
        If ``self.first_call`` is still set, i.e. the model is not fitted.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    return [
        sub.centroid_
        for leaf in self._get_leaves()
        for sub in leaf.subclusters_
    ]
974
+
975
def get_cluster_mol_ids(self):
    """Return the molecule indices of each cluster, largest cluster first.

    Returns
    -------
    list
        One ``mol_indices`` entry per leaf subcluster, ordered by
        decreasing length. Clusters of equal size keep their traversal
        order because the sort is stable.

    Raises
    ------
    ValueError
        If ``self.first_call`` is still set, i.e. the model is not fitted.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    clusters = [
        sub.mol_indices
        for leaf in self._get_leaves()
        for sub in leaf.subclusters_
    ]
    # Biggest clusters first.
    clusters.sort(key=len, reverse=True)
    return clusters
989
+
990
def _get_BFs(self):
    """Return every leaf subcluster (BitFeature), most populated first.

    Returns
    -------
    list
        The subcluster objects from all leaves, ordered by decreasing
        ``n_samples_``. Ties keep their traversal order because the sort
        is stable.

    Raises
    ------
    ValueError
        If ``self.first_call`` is still set, i.e. the model is not fitted.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    bit_features = [
        sub
        for leaf in self._get_leaves()
        for sub in leaf.subclusters_
    ]
    # Most populated BitFeatures first.
    bit_features.sort(key=lambda bf: bf.n_samples_, reverse=True)
    return bit_features
1004
+
1005
def bf_to_np_refine(
    self,
    fps,
    initial_mol=0,
    input_is_packed: bool = True,
    n_features: int | None = None,
):
    """Export the BitFeatures as arrays, breaking the largest cluster apart.

    Every cluster except the largest one is exported as a single row made
    of its ``linear_sum_`` followed by its ``n_samples_``. The largest
    cluster is dissolved: each of its molecules is exported as its own row
    (its fingerprint followed by a count of 1).

    Parameters
    ----------
    fps
        Indexable collection of fingerprints; the row for molecule ``mol``
        is read from ``fps[mol - initial_mol]``.
    initial_mol
        Offset subtracted from a molecule index before indexing ``fps``.
    input_is_packed
        When true, each fingerprint taken from ``fps`` is passed through
        ``unpack_fingerprints`` before being exported.
    n_features
        Forwarded to ``_validate_n_features`` together with ``fps`` and
        ``input_is_packed``; the value it returns is the one used for
        unpacking.

    Returns
    -------
    fps_bfs : list of numpy.ndarray
        One 2D array per non-empty bucket, ordered uint64, uint32, uint16,
        uint8. A cluster's bucket is chosen from its ``n_samples_``.
    mols_bfs : list of list
        Parallel to ``fps_bfs``; entry ``k`` holds, for each row of
        ``fps_bfs[k]``, the molecule indices that row stands for.

    Raises
    ------
    ValueError
        If ``self.first_call`` is still set, i.e. the model is not fitted.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")
    n_features = _validate_n_features(fps, input_is_packed, n_features)
    BFs = self._get_BFs()
    big, rest = BFs[0], BFs[1:]

    # Rows and molecule indices, keyed by the bit width of the output dtype.
    rows = {64: [], 32: [], 16: [], 8: []}
    mols = {64: [], 32: [], 16: [], 8: []}

    for BF in rest:
        n_samples = BF.n_samples_
        if n_samples >= 4294967294:
            width = 64
        elif n_samples >= 65534:
            width = 32
        elif n_samples >= 254:
            width = 16
        else:
            width = 8
        # Each BitFeature is ONE row: its linear sum with the sample count
        # appended. Appending the count as a separate list item (as the
        # wide branches used to) produces a ragged list that cannot be
        # converted to a 2D array.
        rows[width].append([*BF.linear_sum_, n_samples])
        mols[width].append(BF.mol_indices)

    for mol in big.mol_indices:
        fp = fps[mol - initial_mol]
        if input_is_packed:
            fp = unpack_fingerprints(fp, n_features)
        rows[8].append([*fp, 1])
        # A singleton row represents exactly this molecule.
        mols[8].append([mol])

    fps_bfs = []
    mols_bfs = []
    for width, dtype in ((64, np.uint64), (32, np.uint32), (16, np.uint16), (8, np.uint8)):
        if rows[width]:
            fps_bfs.append(np.array(rows[width], dtype=dtype))
            mols_bfs.append(mols[width])

    return fps_bfs, mols_bfs
1077
+
1078
def bf_to_np(self):
    """Export every BitFeature as a row of a dtype-bucketed array.

    Each cluster becomes one row made of its ``linear_sum_`` followed by
    its ``n_samples_``. Rows are grouped by the integer width chosen from
    ``n_samples_`` so that small clusters are stored compactly.

    Returns
    -------
    fps_bfs : list of numpy.ndarray
        One 2D array per non-empty bucket, ordered uint64, uint32, uint16,
        uint8.
    mols_bfs : list of list
        Parallel to ``fps_bfs``; entry ``k`` holds the ``mol_indices`` of
        the cluster behind each row of ``fps_bfs[k]``.

    Raises
    ------
    ValueError
        If ``self.first_call`` is still set, i.e. the model is not fitted.
    """
    if self.first_call:
        raise ValueError("The model has not been fitted yet.")

    # Rows and molecule indices, keyed by the bit width of the output dtype.
    rows = {64: [], 32: [], 16: [], 8: []}
    mols = {64: [], 32: [], 16: [], 8: []}

    for BF in self._get_BFs():
        n_samples = BF.n_samples_
        if n_samples >= 4294967294:
            width = 64
        elif n_samples >= 65534:
            width = 32
        elif n_samples >= 254:
            width = 16
        else:
            width = 8
        # Each BitFeature is ONE row: its linear sum with the sample count
        # appended. Appending the count as a separate list item (as the
        # wide branches used to) produces a ragged list that cannot be
        # converted to a 2D array.
        rows[width].append([*BF.linear_sum_, n_samples])
        mols[width].append(BF.mol_indices)

    fps_bfs = []
    mols_bfs = []
    for width, dtype in ((64, np.uint64), (32, np.uint32), (16, np.uint16), (8, np.uint8)):
        if rows[width]:
            fps_bfs.append(np.array(rows[width], dtype=dtype))
            mols_bfs.append(mols[width])

    return fps_bfs, mols_bfs
1133
+
1134
def get_assignments(self, n_mols):
    """Return a per-molecule array of 1-based cluster labels.

    Clusters are taken from ``self.get_cluster_mol_ids()``; the first
    cluster it returns gets label 1, the second label 2, and so on.

    Parameters
    ----------
    n_mols
        Length of the returned array; every index in ``range(n_mols)`` is
        expected to belong to some cluster.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_mols`` with the cluster label of each
        molecule.

    Raises
    ------
    AssertionError
        If at least one molecule is not covered by any cluster.
    """
    clustered_ids = self.get_cluster_mol_ids()

    # -1 marks "not assigned yet"; real labels start at 1.
    assignments = np.full(n_mols, -1, dtype=int)
    for label, cluster in enumerate(clustered_ids, start=1):
        assignments[cluster] = label

    # Explicit check instead of a bare `assert`, so it still runs under
    # `python -O`. The exception type is kept for backward compatibility.
    n_unassigned = int(np.count_nonzero(assignments == -1))
    if n_unassigned:
        raise AssertionError(
            f"{n_unassigned} of {n_mols} molecules are unassigned to any cluster"
        )

    return assignments