bblean 0.6.0b2__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1252 @@
1
+ # type: ignore
2
+ # BitBIRCH is an open-source clustering module based on iSIM
3
+ #
4
+ # Please, cite the BitBIRCH paper: https://www.biorxiv.org/content/10.1101/2024.08.10.607459v1
5
+ #
6
+ # BitBIRCH is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU Lesser General Public License as published by
8
+ # the Free Software Foundation, version 3.
9
+ #
10
+ # BitBIRCH is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU Lesser General Public License for more details.
14
+ #
15
+ # BitBIRCH authors (PYTHON): Ramon Alain Miranda Quintana <ramirandaq@gmail.com>, <quintana@chem.ufl.edu>
16
+ # Vicky (Vic) Jung <jungvicky@ufl.edu>
17
+ # Kenneth Lopez Perez <klopezperez@chem.ufl.edu>
18
+ # Kate Huddleston <kdavis2@chem.ufl.edu>
19
+ #
20
+ # BitBIRCH License: LGPL-3.0 https://www.gnu.org/licenses/lgpl-3.0.en.html#license-text
21
+ #
22
+ ### Part of the tree-management code was derived from https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
23
+ ### Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
24
+ ### Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
25
+ ### Joel Nothman <joel.nothman@gmail.com>
26
+ ### License: BSD 3 clause
27
+ from pathlib import Path
28
+
29
+ import numpy as np
30
+ from scipy import sparse
31
+
32
+
33
+ # Utility function to validate the n_features argument for packed inputs
34
+ def _validate_n_features(X, input_is_packed: bool, n_features: int | None) -> int:
35
+ if input_is_packed:
36
+ raise ValueError("Packed inputs not supported for BitBirch-int64")
37
+
38
+ x_n_features = X.shape[1]
39
+ if n_features is not None:
40
+ if n_features != x_n_features:
41
+ raise ValueError(
42
+ "n_features is redundant for non-packed inputs"
43
+ " if passed, it must be equal to X.shape[1]."
44
+ f" For passed X, X.shape[1] = {X.shape[1]}."
45
+ " If this value is not what you expected,"
46
+ " make sure the passed X is actually unpacked."
47
+ )
48
+ return x_n_features
49
+
50
+
51
def set_merge(merge_criterion, tolerance=0.05):
    """
    Sets merge_accept function for merge_subcluster, based on user specified merge_criteria.

    Radius: merge subcluster based on comparison to centroid of the cluster
    Diameter: merge subcluster based on instant Tanimoto similarity of cluster
    Tolerance: applies tolerance threshold to diameter merge criteria, which will merge subcluster with stricter threshold for newly added molecules

    Parameters:
    -----------
    merge_criterion: str;
        one of "radius", "diameter", "tolerance", "tolerance-legacy" or "tolerance_tough"
    tolerance: float;
        sets penalty value for similarity threshold when calling tolerance merge criteria

    Raises:
    -------
    ValueError
        if merge_criterion is not one of the recognized criteria
        (previously an unrecognized value crashed with UnboundLocalError).

    Returns:
    --------
    merge_accept(): function
        if cluster is accepted to merge, merge the cluster based on the criteria specified
        (installed into module globals as ``merge_accept``; used by
        ``_BFSubcluster.merge_subcluster``)
    """
    if merge_criterion == "radius":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            # Average similarity of the merged set to its centroid, extracted
            # from iSIM values of the set with and without the centroid.
            jt_sim = jt_isim(new_ls + new_centroid, new_n + 1) * (new_n + 1) - jt_isim(
                new_ls, new_n
            ) * (new_n - 1)
            return jt_sim >= threshold * 2

    elif merge_criterion == "diameter":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            # Instant iSIM of the merged set acts as a diameter criterion.
            jt_radius = jt_isim(new_ls, new_n)
            return jt_radius >= threshold

    elif merge_criterion == "tolerance_tough":

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            else:
                if old_n == 1 and nom_n == 1:
                    return True
                elif nom_n == 1:
                    # Cross-similarity of the nominee molecule to the existing
                    # cluster must stay within `tolerance` of the cluster iSIM.
                    return (
                        jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                        - jt_isim(old_ls, old_n) * (old_n - 1)
                    ) / 2 >= jt_isim(old_ls, old_n) - tolerance and (
                        jt_radius >= threshold
                    )
                else:
                    # General cluster-vs-cluster cross similarity check.
                    return (
                        jt_isim(old_ls + nom_ls, old_n + nom_n)
                        * (old_n + nom_n)
                        * (old_n + nom_n - 1)
                        - jt_isim(old_ls, old_n) * old_n * (old_n - 1)
                        - jt_isim(nom_ls, nom_n) * nom_n * (nom_n - 1)
                    ) / (2 * old_n * nom_n) >= jt_isim(old_ls, old_n) - tolerance and (
                        jt_radius >= threshold
                    )

    elif merge_criterion in ["tolerance", "tolerance-legacy"]:

        def merge_accept(
            threshold, new_ls, new_centroid, new_n, old_ls, nom_ls, old_n, nom_n
        ):
            jt_radius = jt_isim(new_ls, new_n)
            if jt_radius < threshold:
                return False
            else:
                if old_n == 1 and nom_n == 1:
                    return True
                elif nom_n == 1:
                    return (
                        jt_isim(old_ls + nom_ls, old_n + 1) * (old_n + 1)
                        - jt_isim(old_ls, old_n) * (old_n - 1)
                    ) / 2 >= jt_isim(old_ls, old_n) - tolerance and (
                        jt_radius >= threshold
                    )
                else:
                    # Legacy behavior: cluster-cluster merges are always
                    # accepted once the diameter check passed.
                    return True

    else:
        # Previously an unknown criterion fell through every branch and the
        # final assignment raised UnboundLocalError; fail loudly instead.
        raise ValueError(
            f"Unknown merge_criterion: {merge_criterion!r}. Expected 'radius',"
            " 'diameter', 'tolerance', 'tolerance-legacy' or 'tolerance_tough'."
        )

    # Module-level hook consumed by _BFSubcluster.merge_subcluster.
    globals()["merge_accept"] = merge_accept
140
+
141
+
142
def jt_isim(c_total, n_objects):
    """iSIM Tanimoto calculation

    https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b

    Parameters
    ----------
    c_total : np.ndarray
        Column-wise sum of the binary fingerprints in the set

    n_objects : int
        Number of fingerprints summed into ``c_total``

    Returns
    ----------
    isim : float
        iSIM Jaccard-Tanimoto value
    """
    # Total on-bit count and its squared counterpart over all columns.
    linear = np.sum(c_total)
    quadratic = np.dot(c_total, c_total)
    # Pairwise "common on-bits" term of the average Tanimoto.
    common = (quadratic - linear) / 2

    return common / (common + n_objects * linear - quadratic)
165
+
166
+
167
def max_separation(X):
    """Finds two objects in X that are very separated
    This is an approximation (not guaranteed to find
    the two absolutely most separated objects), but it is
    a very robust O(N) implementation. Quality of clustering
    does not diminish in the end.

    Algorithm:
    a) Find centroid of X
    b) mol1 is the molecule most distant from the centroid
    c) mol2 is the molecule most distant from mol1

    Returns
    -------
    (mol1, mol2) : (int, int)
        indices of mol1 and mol2
    sims_mol1 : np.ndarray
        Tanimoto similarities to mol1
    sims_mol2 : np.ndarray
        Tanimoto similarities to mol2
    Note: similarities (not distances) are returned; callers compare them
    directly for node1_dist and node2_dist in _split_node.
    """
    # Centroid of the whole set.
    total = np.sum(X, axis=0)
    centroid = calc_centroid(total, len(X))

    # Per-row popcounts are reused by every Tanimoto computation below.
    popcounts = np.sum(X, axis=1)

    # Tanimoto similarity of every row to the centroid; the least similar
    # row becomes the first anchor.
    common_centroid = np.dot(X, centroid)
    sims_centroid = common_centroid / (popcounts + np.sum(centroid) - common_centroid)
    anchor1 = np.argmin(sims_centroid)

    # Second anchor: the row least similar to the first anchor.
    common_a1 = np.dot(X, X[anchor1])
    sims_anchor1 = common_a1 / (popcounts + popcounts[anchor1] - common_a1)
    anchor2 = np.argmin(sims_anchor1)

    # Similarities of every row to the second anchor.
    common_a2 = np.dot(X, X[anchor2])
    sims_anchor2 = common_a2 / (popcounts + popcounts[anchor2] - common_a2)

    return (anchor1, anchor2), sims_anchor1, sims_anchor2
214
+
215
+
216
def calc_centroid(linear_sum, n_samples):
    """Calculates centroid

    A bit is set in the centroid when it is on in at least half of the
    samples (majority vote with ties rounded up).

    Parameters
    ----------

    linear_sum : np.ndarray
        Sum of the elements column-wise
    n_samples : int
        Number of samples

    Returns
    -------
    centroid : np.ndarray
        Centroid fingerprints of the given set
    """
    majority = n_samples * 0.5
    return np.where(linear_sum >= majority, 1, 0)
233
+
234
+
235
+ def _iterate_sparse_X(X):
236
+ """This little hack returns a densified row when iterating over a sparse
237
+ matrix, instead of constructing a sparse matrix for every row that is
238
+ expensive.
239
+ """
240
+ n_samples, n_features = X.shape
241
+ X_indices = X.indices
242
+ X_data = X.data
243
+ X_indptr = X.indptr
244
+
245
+ for i in range(n_samples):
246
+ row = np.zeros(n_features)
247
+ startptr, endptr = X_indptr[i], X_indptr[i + 1]
248
+ nonzero_indices = X_indices[startptr:endptr]
249
+ row[nonzero_indices] = X_data[startptr:endptr]
250
+ yield row
251
+
252
+
253
def _split_node(node, threshold, branching_factor, singly):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
    according to the nearest distance between the subclusters to the
    pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.

    Returns the two new parent subclusters; when ``singly`` is False the
    reassigned subclusters also get their ``parent_`` pointers updated.
    """
    new_subcluster1 = _BFSubcluster()
    new_subcluster2 = _BFSubcluster()
    # Both replacement nodes inherit the split node's leaf status, feature
    # count and centroid dtype.
    new_node1 = _BFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_node2 = _BFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    # Splice the two new nodes into the doubly linked list of leaves in
    # place of the node being split.
    if node.is_leaf:
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    # O(N) implementation of max separation
    farthest_idx, node1_dist, node2_dist = max_separation(node.centroids_)
    # Notice that max_separation is returning similarities and not distances
    node1_closer = node1_dist > node2_dist
    # Make sure node1 is closest to itself even if all distances are equal.
    # This can only happen when all node.centroids_ are duplicates leading to all
    # distances between centroids being zero.
    node1_closer[farthest_idx[0]] = True

    # Partition every subcluster of the split node to whichever side it is
    # more similar to; parent_ tracking is only maintained when not `singly`.
    for idx, subcluster in enumerate(node.subclusters_):
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
            if not singly:
                subcluster.parent_ = new_subcluster1
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
            if not singly:
                subcluster.parent_ = new_subcluster2
    return new_subcluster1, new_subcluster2
313
+
314
+
315
class _BFNode:
    """Each node in a BFTree is called a BFNode.

    The BFNode can have a maximum of branching_factor
    number of BFSubclusters.

    Parameters
    ----------
    threshold : float
        Threshold needed for a new subcluster to enter a BFSubcluster.

    branching_factor : int
        Maximum number of BF subclusters in each node.

    is_leaf : bool
        We need to know if the BFNode is a leaf or not, in order to
        retrieve the final subclusters.

    n_features : int
        The number of features.

    dtype : numpy dtype
        dtype of the centroid storage (``init_centroids_``).

    Attributes
    ----------
    subclusters_ : list
        List of subclusters for a particular BFNode.

    prev_leaf_ : _BFNode
        Useful only if is_leaf is True.

    next_leaf_ : _BFNode
        next_leaf. Useful only if is_leaf is True.
        the final subclusters.

    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Manipulate ``init_centroids_`` throughout rather than centroids_ since
        the centroids are just a view of the ``init_centroids_`` .

    centroids_ : ndarray of shape (branching_factor + 1, n_features)
        View of ``init_centroids_``.

    """

    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.is_leaf = is_leaf
        self.n_features = n_features

        # The list of subclusters, centroids and squared norms
        # to manipulate throughout.
        # One spare row (branching_factor + 1) so a node can temporarily hold
        # the overflowing subcluster that triggers a split.
        self.subclusters_ = []
        self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
        self.prev_leaf_ = None
        self.next_leaf_ = None

    def append_subcluster(self, subcluster):
        """Append ``subcluster`` to this node and refresh the centroid view."""
        n_samples = len(self.subclusters_)
        self.subclusters_.append(subcluster)
        self.init_centroids_[n_samples] = subcluster.centroid_

        # Keep centroids as views. In this way
        # if we change init_centroids, it is sufficient
        self.centroids_ = self.init_centroids_[: n_samples + 1, :]

    def update_split_subclusters(
        self, subcluster, new_subcluster1, new_subcluster2, singly
    ):
        """Remove a subcluster from a node and update it with the
        split subclusters.
        """
        # The two replacements inherit the parent of the node's subclusters
        # (all subclusters of one node share the same parent).
        if not singly:
            new_subcluster1.parent_ = self.subclusters_[0].parent_
            new_subcluster2.parent_ = self.subclusters_[0].parent_

        ind = self.subclusters_.index(subcluster)
        self.subclusters_[ind] = new_subcluster1
        self.init_centroids_[ind] = new_subcluster1.centroid_
        self.centroids_[ind] = new_subcluster1.centroid_
        self.append_subcluster(new_subcluster2)

    def insert_bf_subcluster(self, subcluster, set_bits, ps, singly):
        """Insert a new subcluster into the node.

        Returns True when this node overflowed (len(subclusters_) exceeds
        branching_factor) and must be split by the caller.
        ``set_bits`` is the popcount of the new subcluster's centroid;
        ``ps`` is the parent subcluster of this node (tracked only when
        not ``singly``).
        """
        if not self.subclusters_:
            self.append_subcluster(subcluster)
            return False

        threshold = self.threshold
        branching_factor = self.branching_factor
        # We need to find the closest subcluster among all the
        # subclusters so that we can insert our new subcluster.
        # Vectorized Tanimoto similarity of the new centroid vs. every
        # subcluster centroid in this node.
        a = np.dot(self.centroids_, subcluster.centroid_)
        sim_matrix = a / (np.sum(self.centroids_, axis=1) + set_bits - a)
        closest_index = np.argmax(sim_matrix)
        closest_subcluster = self.subclusters_[closest_index]

        # If the subcluster has a child, we need a recursive strategy.
        if closest_subcluster.child_ is not None:
            ps = closest_subcluster
            split_child = closest_subcluster.child_.insert_bf_subcluster(
                subcluster, set_bits, ps, singly
            )

            if not split_child:
                # If it is determined that the child need not be split, we
                # can just update the closest_subcluster
                closest_subcluster.update(subcluster)
                self.init_centroids_[closest_index] = self.subclusters_[
                    closest_index
                ].centroid_
                self.centroids_[closest_index] = self.subclusters_[
                    closest_index
                ].centroid_
                return False

            # things not too good. we need to redistribute the subclusters in
            # our child node, and add a new subcluster in the parent
            # subcluster to accommodate the new child.
            else:
                new_subcluster1, new_subcluster2 = _split_node(
                    closest_subcluster.child_, threshold, branching_factor, singly
                )
                self.update_split_subclusters(
                    closest_subcluster, new_subcluster1, new_subcluster2, singly
                )

                if len(self.subclusters_) > self.branching_factor:
                    return True
                return False

        # good to go!
        else:
            # Relies on the module-level merge_accept installed by set_merge().
            merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
            if merged:
                self.centroids_[closest_index] = closest_subcluster.centroid_
                self.init_centroids_[closest_index] = closest_subcluster.centroid_
                if not singly:
                    closest_subcluster.parent_ = ps
                return False

            # not close to any other subclusters, and we still
            # have space, so add.
            elif len(self.subclusters_) < self.branching_factor:
                self.append_subcluster(subcluster)
                if not singly:
                    # NOTE(review): this updates the *closest* subcluster's
                    # parent, not the newly appended one — confirm intended.
                    closest_subcluster.parent_ = ps
                return False

            # We do not have enough space nor is it closer to an
            # other subcluster. We need to split.
            else:
                self.append_subcluster(subcluster)
                return True
467
+
468
+
469
+ class _BFSubcluster:
470
+ """Each subcluster in a BFNode is called a BFSubcluster.
471
+
472
+ A BFSubcluster can have a BFNode has its child.
473
+
474
+ Parameters
475
+ ----------
476
+ linear_sum : ndarray of shape (n_features,), default=None
477
+ Sample. This is kept optional to allow initialization of empty
478
+ subclusters.
479
+
480
+ Attributes
481
+ ----------
482
+ n_samples_ : int
483
+ Number of samples that belong to each subcluster.
484
+
485
+ linear_sum_ : ndarray
486
+ Linear sum of all the samples in a subcluster. Prevents holding
487
+ all sample data in memory.
488
+
489
+ centroid_ : ndarray of shape (branching_factor + 1, n_features)
490
+ Centroid of the subcluster. Prevent recomputing of centroids when
491
+ ``BFNode.centroids_`` is called.
492
+
493
+ mol_indices : list, default=[]
494
+ List of indices of molecules included in the given cluster.
495
+
496
+ child_ : _BFNode
497
+ Child Node of the subcluster. Once a given _BFNode is set as the child
498
+ of the _BFNode, it is set to ``self.child_``.
499
+ """
500
+
501
+ def __init__(self, *, linear_sum=None, mol_indices=[]):
502
+ if linear_sum is None:
503
+ self.n_samples_ = 0
504
+ self.centroid_ = self.linear_sum_ = 0
505
+ self.mol_indices = []
506
+ else:
507
+ self.n_samples_ = 1
508
+ self.centroid_ = self.linear_sum_ = linear_sum
509
+ self.mol_indices = mol_indices
510
+
511
+ self.child_ = None
512
+ self.parent_ = None
513
+
514
+ def update(self, subcluster):
515
+ self.n_samples_ += subcluster.n_samples_
516
+ self.linear_sum_ += subcluster.linear_sum_
517
+ self.mol_indices += subcluster.mol_indices
518
+ self.centroid_ = calc_centroid(self.linear_sum_, self.n_samples_)
519
+
520
+ def merge_subcluster(self, nominee_cluster, threshold):
521
+ """Check if a cluster is worthy enough to be merged. If
522
+ yes then merge.
523
+ """
524
+ new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
525
+ new_n = self.n_samples_ + nominee_cluster.n_samples_
526
+ new_centroid = calc_centroid(new_ls, new_n)
527
+
528
+ if merge_accept(
529
+ threshold,
530
+ new_ls,
531
+ new_centroid,
532
+ new_n,
533
+ self.linear_sum_,
534
+ nominee_cluster.linear_sum_,
535
+ self.n_samples_,
536
+ nominee_cluster.n_samples_,
537
+ ):
538
+ (self.n_samples_, self.linear_sum_, self.centroid_, self.mol_indices) = (
539
+ new_n,
540
+ new_ls,
541
+ new_centroid,
542
+ self.mol_indices + nominee_cluster.mol_indices,
543
+ )
544
+ return True
545
+ return False
546
+
547
+
548
+ class BitBirch:
549
+ """Implements the BitBIRCH clustering algorithm.
550
+
551
+ BitBIRCH paper:
552
+
553
+ Memory- and time-efficient, online-learning algorithm.
554
+ It constructs a tree data structure with the cluster centroids being read off the leaf.
555
+
556
+ Parameters
557
+ ----------
558
+ threshold : float, default=0.5
559
+ The similarity radius of the subcluster obtained by merging a new sample and the
560
+ closest subcluster should be greater than the threshold. Otherwise a new
561
+ subcluster is started. Setting this value to be very low promotes
562
+ splitting and vice-versa.
563
+
564
+ branching_factor : int, default=50
565
+ Maximum number of BF subclusters in each node. If a new samples enters
566
+ such that the number of subclusters exceed the branching_factor then
567
+ that node is split into two nodes with the subclusters redistributed
568
+ in each. The parent subcluster of that node is removed and two new
569
+ subclusters are added as parents of the 2 split nodes.
570
+
571
+ Attributes
572
+ ----------
573
+ root_ : _BFNode
574
+ Root of the BFTree.
575
+
576
+ dummy_leaf_ : _BFNode
577
+ Start pointer to all the leaves.
578
+
579
+ subcluster_centers_ : ndarray
580
+ Centroids of all subclusters read directly from the leaves.
581
+
582
+ Notes
583
+ -----
584
+ The tree data structure consists of nodes with each node consisting of
585
+ a number of subclusters. The maximum number of subclusters in a node
586
+ is determined by the branching factor. Each subcluster maintains a
587
+ linear sum, mol_indices and the number of samples in that subcluster.
588
+ In addition, each subcluster can also have a node as its child, if the
589
+ subcluster is not a member of a leaf node.
590
+
591
+ For a new point entering the root, it is merged with the subcluster closest
592
+ to it and the linear sum, mol_indices and the number of samples of that
593
+ subcluster are updated. This is done recursively till the properties of
594
+ the leaf node are updated.
595
+ """
596
+
597
+ def __init__(
598
+ self,
599
+ *,
600
+ threshold=0.5,
601
+ branching_factor=50,
602
+ ):
603
+ self.threshold = threshold
604
+ self.branching_factor = branching_factor
605
+ self.index_tracker = 0
606
+ self.first_call = True
607
+
608
    def fit(
        self,
        X,
        singly=True,
        store_centroids=False,
        input_is_packed: bool = False,
        n_features: int | None = None,
        max_fps: int | None = None,
    ):
        """
        Build a BF Tree for the input data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data. A ``pathlib.Path`` is also accepted and is loaded
            with ``np.load(..., mmap_mode="r")``.

        singly : bool, default=True
            When False, parent_ back-pointers are maintained on subclusters
            during insertion/splitting.

        store_centroids : bool, default=False
            When True, ``subcluster_centers_`` and ``_n_features_out`` are
            populated from the leaves after fitting.

        input_is_packed : bool, default=False
            Must be False; packed inputs are rejected by
            ``_validate_n_features``.

        n_features : int or None, default=None
            Optional explicit feature count; must match ``X.shape[1]``.

        max_fps : int or None, default=None
            When given, only the first ``max_fps`` rows of X are used.

        Returns
        -------
        self
            Fitted estimator.
        """
        if isinstance(X, Path):
            X = np.load(X, mmap_mode="r")[:max_fps]
        else:
            # X[:None] is a no-op slice, so max_fps=None keeps all rows.
            X = X[:max_fps]
        threshold = self.threshold
        branching_factor = self.branching_factor
        n_features = _validate_n_features(X, input_is_packed, n_features)
        d_type = X.dtype

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if self.first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample in iter_func(X):
            # Popcount of the fingerprint; reused for Tanimoto denominators.
            set_bits = np.sum(sample)
            subcluster = _BFSubcluster(
                linear_sum=sample, mol_indices=[self.index_tracker]
            )
            split = self.root_.insert_bf_subcluster(
                subcluster, set_bits, subcluster.parent_, singly
            )

            # Root overflowed: split it and grow the tree one level.
            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor, singly
                )
                del self.root_
                self.root_ = _BFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=d_type,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

                # Re-point children at their new parent subclusters.
                if not singly:
                    for i in new_subcluster1.child_.subclusters_:
                        i.parent_ = new_subcluster1
                    for i in new_subcluster2.child_.subclusters_:
                        i.parent_ = new_subcluster2

            self.index_tracker += 1
        if store_centroids:
            centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
            self.subcluster_centers_ = centroids
            self._n_features_out = self.subcluster_centers_.shape[0]

        self.first_call = False
        return self
706
+
707
    def fit_reinsert(
        self,
        X,
        reinsert_indices,
        singly=False,
        store_centroids=False,
        input_is_packed: bool = False,
        n_features: int | None = None,
    ):
        """X corresponds to only the molecules that will be reinserted into the tree
        reinsert indices are the indices of the molecules that will be reinserted into the tree

        Unlike ``fit``, molecule indices come from ``reinsert_indices``
        (``index_tracker`` is not advanced) and insertion always tracks
        parent_ pointers (``False`` is passed to insert/split regardless of
        ``singly``; ``singly`` only controls the post-split parent re-pointing).
        Returns self.
        """
        threshold = self.threshold
        branching_factor = self.branching_factor
        n_features = _validate_n_features(X, input_is_packed, n_features)
        d_type = X.dtype

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if self.first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _BFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=d_type,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample, mol_ind in zip(iter_func(X), reinsert_indices):
            set_bits = np.sum(sample)
            subcluster = _BFSubcluster(linear_sum=sample, mol_indices=[mol_ind])
            split = self.root_.insert_bf_subcluster(
                subcluster, set_bits, subcluster.parent_, False
            )
            # Root overflowed: split it and grow the tree one level.
            if split:
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor, False
                )
                del self.root_
                self.root_ = _BFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=d_type,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

                # Re-point children at their new parent subclusters.
                if not singly:
                    for i in new_subcluster1.child_.subclusters_:
                        i.parent_ = new_subcluster1
                    for i in new_subcluster2.child_.subclusters_:
                        i.parent_ = new_subcluster2
        if store_centroids:
            centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
            self.subcluster_centers_ = centroids
            self._n_features_out = self.subcluster_centers_.shape[0]

        self.first_call = False
        return self
786
+
787
+ def fit_BFs(self, X, store_centroids=False):
788
+ """
789
+ Method to fit a BitBirch model with the given BitFeatyres.
790
+
791
+ Parameters:
792
+ -----------
793
+ X : list of BitFeatures
794
+
795
+ Returns:
796
+ --------
797
+ self
798
+ """
799
+
800
+ # Check that the input is a list of BitFeatures
801
+ if type(X) != list or len(X[0]) != 3:
802
+ raise ValueError("The input must be a list of BitFeatures")
803
+
804
+ threshold = self.threshold
805
+ branching_factor = self.branching_factor
806
+
807
+ n_features = len(X[0][1])
808
+ d_type = X[0][1].dtype
809
+
810
+ # If partial_fit is called for the first time or fit is called, we
811
+ # start a new tree.
812
+ if self.first_call:
813
+ # The first root is the leaf. Manipulate this object throughout.
814
+ self.root_ = _BFNode(
815
+ threshold=threshold,
816
+ branching_factor=branching_factor,
817
+ is_leaf=True,
818
+ n_features=n_features,
819
+ dtype=d_type,
820
+ )
821
+
822
+ # To enable getting back subclusters.
823
+ self.dummy_leaf_ = _BFNode(
824
+ threshold=threshold,
825
+ branching_factor=branching_factor,
826
+ is_leaf=True,
827
+ n_features=n_features,
828
+ dtype=d_type,
829
+ )
830
+ self.dummy_leaf_.next_leaf_ = self.root_
831
+ self.root_.prev_leaf_ = self.dummy_leaf_
832
+
833
+ for sample in iter(X):
834
+
835
+ cluster = _BFSubcluster()
836
+ cluster.n_samples_, cluster.linear_sum_, cluster.mol_indices = (
837
+ sample[0],
838
+ sample[1],
839
+ sample[2],
840
+ )
841
+ cluster.centroid_ = calc_centroid(cluster.linear_sum_, cluster.n_samples_)
842
+
843
+ set_bits = np.sum(cluster.centroid_)
844
+ split = self.root_.insert_bf_subcluster(
845
+ cluster, set_bits, cluster.parent_, True
846
+ )
847
+
848
+ if split:
849
+ new_subcluster1, new_subcluster2 = _split_node(
850
+ self.root_, threshold, branching_factor, True
851
+ )
852
+ del self.root_
853
+ self.root_ = _BFNode(
854
+ threshold=threshold,
855
+ branching_factor=branching_factor,
856
+ is_leaf=False,
857
+ n_features=n_features,
858
+ dtype=d_type,
859
+ )
860
+ self.root_.append_subcluster(new_subcluster1)
861
+ self.root_.append_subcluster(new_subcluster2)
862
+ self.index_tracker += 1
863
+ if store_centroids:
864
+ centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
865
+ self.subcluster_centers_ = centroids
866
+ self._n_features_out = self.subcluster_centers_.shape[0]
867
+
868
+ self.first_call = False
869
+ return self
870
+
871
+ def _get_leaves(self):
872
+ """
873
+ Retrieve the leaves of the BF Node.
874
+
875
+ Returns
876
+ -------
877
+ leaves : list of shape (n_leaves,)
878
+ List of the leaf nodes.
879
+ """
880
+ leaf_ptr = self.dummy_leaf_.next_leaf_
881
+ leaves = []
882
+ while leaf_ptr is not None:
883
+ leaves.append(leaf_ptr)
884
+ leaf_ptr = leaf_ptr.next_leaf_
885
+ return leaves
886
+
887
+ def get_centroids_mol_ids(self):
888
+ """Method to return a dictionary containing the centroids and mol indices of the leaves"""
889
+ if self.first_call:
890
+ raise ValueError("The model has not been fitted yet.")
891
+
892
+ centroids = []
893
+ mol_ids = []
894
+ for leaf in self._get_leaves():
895
+ for subcluster in leaf.subclusters_:
896
+ centroids.append(subcluster.centroid_)
897
+ mol_ids.append(subcluster.mol_indices)
898
+
899
+ dict_centroids_mol_ids = {"centroids": centroids, "mol_ids": mol_ids}
900
+
901
+ return dict_centroids_mol_ids
902
+
903
+ def get_centroids(self):
904
+ """Method to return a list of Numpy arrays containing the centroids' fingerprints"""
905
+ if self.first_call:
906
+ raise ValueError("The model has not been fitted yet.")
907
+
908
+ centroids = []
909
+ for leaf in self._get_leaves():
910
+ for subcluster in leaf.subclusters_:
911
+ centroids.append(subcluster.centroid_)
912
+
913
+ return centroids
914
+
915
+ def get_cluster_mol_ids(self):
916
+ """Method to return the indices of molecules in each cluster"""
917
+ if self.first_call:
918
+ raise ValueError("The model has not been fitted yet.")
919
+
920
+ clusters_mol_id = []
921
+ for leaf in self._get_leaves():
922
+ for subcluster in leaf.subclusters_:
923
+ clusters_mol_id.append(subcluster.mol_indices)
924
+
925
+ # Sort the clusters by the number of samples in the cluster
926
+ clusters_mol_id = sorted(clusters_mol_id, key=lambda x: len(x), reverse=True)
927
+
928
+ return clusters_mol_id
929
+
930
+ def _get_BFs(self):
931
+ """Method to return the BitFeatures of the leaves"""
932
+ if self.first_call:
933
+ raise ValueError("The model has not been fitted yet.")
934
+
935
+ BFs = []
936
+ for leaf in self._get_leaves():
937
+ for subcluster in leaf.subclusters_:
938
+ BFs.append(subcluster)
939
+
940
+ # Sort the BitFeatures by the number of samples in the cluster
941
+ BFs = sorted(BFs, key=lambda x: x.n_samples_, reverse=True)
942
+
943
+ return BFs
944
+
945
+ def prepare_data_BFs(self, fps, initial_mol=0):
946
+ """Method to prepare the BitFeatures of the largest cluster and the rest of the clusters"""
947
+ if self.first_call:
948
+ raise ValueError("The model has not been fitted yet.")
949
+
950
+ BFs = self._get_BFs()
951
+ big, rest = BFs[0], BFs[1:]
952
+
953
+ data = []
954
+ for BF in rest:
955
+ data.append(
956
+ [BF.n_samples_, BF.linear_sum_.astype(np.int64), BF.mol_indices]
957
+ )
958
+
959
+ bigs = []
960
+ for mol in big.mol_indices:
961
+ bigs.append([1, fps[mol - initial_mol].astype(np.int64), [mol]])
962
+
963
+ return data, bigs
964
+
965
+ def get_assignments(self, n_mols):
966
+ clustered_ids = self.get_cluster_mol_ids()
967
+
968
+ assignments = np.full(n_mols, -1, dtype=int)
969
+ for i, cluster in enumerate(clustered_ids):
970
+ assignments[cluster] = i + 1
971
+
972
+ # Check that there are no unassigned molecules
973
+ assert np.all(assignments != -1)
974
+
975
+ return assignments
976
+
977
+ # Refinement functionality
978
+
979
+ def _get_prune_indices(self):
980
+ """Method to return the indices of molecules in the largest cluster, specifically to be used in fit_reinsert."""
981
+ largest_cluster = max(
982
+ (
983
+ (leaf_idx, cluster_idx, len(cluster.mol_indices))
984
+ for leaf_idx, leaf in enumerate(self._get_leaves())
985
+ for cluster_idx, cluster in enumerate(leaf.subclusters_)
986
+ ),
987
+ key=lambda x: x[2], # Sort by the cluster size
988
+ default=(None, None, 0), # Default if no clusters are found
989
+ )
990
+ prune_indices = lazyPrune(largest_cluster[0], largest_cluster[1], self)
991
+ return prune_indices
992
+
993
+ def prune(self, X):
994
+ """
995
+ Retrieves the molecules in the largest cluster and "prunes" the tree by redistributing these molecules.
996
+
997
+ Parameters
998
+ ----------
999
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
1000
+ Input data.
1001
+
1002
+ Returns
1003
+ -------
1004
+ self
1005
+ Fitted estimator.
1006
+ """
1007
+ mol_indices = self._get_prune_indices()
1008
+ cluster_fps = X[mol_indices]
1009
+ self.fit_reinsert(cluster_fps, mol_indices)
1010
+ return self
1011
+
1012
+ def _calc_centroid(self, X, cluster):
1013
+ """Calculates centroid
1014
+
1015
+ Parameters
1016
+ ----------
1017
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
1018
+ Input data.
1019
+ cluster: np.array of the molecule indices within the cluster
1020
+
1021
+ Returns
1022
+ -------
1023
+ centroid : np.ndarray
1024
+ Centroid fingerprints of the given set
1025
+ """
1026
+
1027
+ full_fp = X[cluster]
1028
+ linear_sum = np.sum(full_fp, axis=0)
1029
+ n_samples = len(full_fp)
1030
+ return calc_centroid(linear_sum, n_samples)
1031
+
1032
+ def _get_top_cluster_params(self, X, top):
1033
+ """Method to recieve the cluster mol indices, centroids, and fingerprints of the top user-specified clusters.
1034
+
1035
+ Parameters
1036
+ ----------
1037
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
1038
+ Input data.
1039
+
1040
+ top: int
1041
+ default: 20; specifies number of top largest clusters to reassign
1042
+
1043
+ Returns
1044
+ -------
1045
+ top_clusters: sorted list
1046
+ list of len top; containing the cluster mol ids of given clusters
1047
+
1048
+ centroids: np.array; shape (top, n_features)
1049
+ centroids of the top clusters
1050
+
1051
+ mol_indices: list of indices with the top clusters; len of number of molecules in all top clusters
1052
+
1053
+ data_top_clusters: np.array; shape (n_molecules, n_features)
1054
+ fingerprints of the molecules in the top clusters
1055
+ """
1056
+ top_clusters = sorted(self.get_cluster_mol_ids(), key=len, reverse=True)[:top]
1057
+ centroids = np.array([self._calc_centroid(X, c) for c in top_clusters])
1058
+ mol_indices = [i for c in top_clusters for i in c]
1059
+ data_top_clusters = np.array([X[i] for i in mol_indices])
1060
+ assert data_top_clusters.shape[0] == len(mol_indices)
1061
+
1062
+ return top_clusters, centroids, mol_indices, data_top_clusters
1063
+
1064
+ def _get_sim_matrix(self, data_top_clusters, centroids):
1065
+ """Method to receive the similarity matrix of the user-sepcified top clusters for reassign.
1066
+
1067
+ Parameters
1068
+ ----------
1069
+ data_top_clusters: np.array; shape (n_molecules, n_features)
1070
+ fingerprints of the molecules in the top clusters
1071
+
1072
+ centroids: np.array; shape (top, n_features)
1073
+ centroids of the top clusters
1074
+
1075
+ Returns
1076
+ -------
1077
+ row_max: np.array; shape(n_molecules)
1078
+ Each entry is the index of the cluster centroid to which the molecule was closest
1079
+ """
1080
+ pop_counts = np.sum(centroids, axis=1)
1081
+ pop_mols = np.sum(data_top_clusters, axis=1).reshape(-1, 1)
1082
+ ei = np.einsum("ij,kj->ik", data_top_clusters, centroids)
1083
+ tanis = ei / (pop_counts + pop_mols - ei)
1084
+ row_max = np.argmax(tanis, axis=1)
1085
+ assert row_max.shape == data_top_clusters.shape[0:1]
1086
+
1087
+ return row_max
1088
+
1089
+ def reassign(self, X, top=20, quick=False):
1090
+ """
1091
+ Reassign molecules across the user-specified (default 20) top largest clusters in the tree based on the tanimoto similarity matrix.
1092
+
1093
+ Parameters
1094
+ ----------
1095
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
1096
+ Input data.
1097
+
1098
+ top: int
1099
+ default: 20; specifies number of top largest clusters to reassign
1100
+
1101
+ quick: boolean
1102
+ default: False; if quick specifed, the whole tree will not be return, but a dictionary of top cluster data will be returned
1103
+
1104
+ Return
1105
+ ------
1106
+ quick return: dict
1107
+ dictionary sorted by largest to smallest newly reassigned top clusters
1108
+
1109
+ self
1110
+ Fitted estimator.
1111
+ """
1112
+ top_clusters, centroids, mol_indices, data_top_clusters = (
1113
+ self._get_top_cluster_params(X, top)
1114
+ )
1115
+ row_max = self._get_sim_matrix(data_top_clusters, centroids)
1116
+
1117
+ final_clusters = {i: [] for i in np.unique(row_max)}
1118
+ for mol, cluster_idx in zip(mol_indices, row_max):
1119
+ final_clusters[cluster_idx].append(mol)
1120
+
1121
+ if quick:
1122
+ return dict(
1123
+ sorted(
1124
+ final_clusters.items(), key=lambda item: len(item[1]), reverse=True
1125
+ )
1126
+ ) # Return dict sorted by largest to smallest newly reassigned top clusters
1127
+
1128
+ else:
1129
+ sub_clusters = [
1130
+ sc for leaf in self._get_leaves() for sc in leaf.subclusters_
1131
+ ]
1132
+ assert len(sub_clusters) == len(self.get_cluster_mol_ids())
1133
+
1134
+ top_sub_clusters = sorted(
1135
+ sub_clusters, key=lambda c: c.n_samples_, reverse=True
1136
+ )[:top]
1137
+
1138
+ for sc, tc in zip(top_sub_clusters, top_clusters):
1139
+ assert sc.n_samples_ == len(tc)
1140
+
1141
+ for cluster, c_ids in zip(top_sub_clusters, final_clusters.values()):
1142
+ cluster.n_samples_ = len(c_ids)
1143
+ cluster.linear_sum = np.sum(X[c_ids], axis=0)
1144
+ cluster.mol_indices = c_ids
1145
+ cluster.centroid_ = calc_centroid(
1146
+ cluster.linear_sum_, cluster.n_samples_
1147
+ )
1148
+ return self
1149
+
1150
+
1151
def lazyPrune(leaf_index, subc_index, brc):
    """Remove subcluster ``subc_index`` of leaf ``leaf_index`` from the tree
    ``brc`` and subtract its statistics (linear sum, sample count, molecule
    indices) from every ancestor subcluster up to the root.

    Returns the ``mol_indices`` of the removed subcluster so the caller
    (``_get_prune_indices``) can reinsert those molecules.
    """
    # from bitbirch_tools import bitbirch_double_link_tolerance as bitbirch
    # path from leaf level to root, indices are marked
    pSubs = []
    pSubPath(brc._get_leaves()[leaf_index].subclusters_[subc_index], pSubs, brc)

    # traverse all the way down to the leaf node
    # (pSubs[0] is the index inside the leaf itself, so it is skipped here)
    node = brc.root_
    for index in pSubs[:0:-1]:
        node = node.subclusters_[index].child_

    # information needed to update all preceding subclusters
    cluster_ls = node.subclusters_[subc_index].linear_sum_
    cluster_n = node.subclusters_[subc_index].n_samples_
    cluster_mol_indices = node.subclusters_[subc_index].mol_indices

    """# IMPORTANT: tree must have at least three levels, at least that is what is assumed when inputting dataset
    parent_sub = node.subclusters_[0].parent_ # needed to traverse node upwards
    parent_node = parent_sub.parent_.child_ # needed to reassign leaf nodes"""

    parent_sub = node.subclusters_[0].parent_
    # When the parent subcluster hangs directly off the root, the parent node
    # is the root itself
    if parent_sub.parent_ is None:
        parent_node = brc.root_
    else:
        parent_node = parent_sub.parent_.child_

    # eliminate the biggest subcluster from the leaf node
    node.subclusters_.pop(subc_index)
    node.centroids_ = np.delete(node.centroids_, subc_index, 0)
    shift_delete(node.init_centroids_, subc_index)

    if len(node.subclusters_) == 0:
        parent_sub.child_ = None

    # rearrange leaf pointers if need to

    # if the leaf node disappears, we assign parent node to replace leaf node
    num_childs = 0
    for i in parent_node.subclusters_:
        if i.child_ is not None:
            num_childs += 1
    if num_childs == 0:
        # The parent lost all children: it becomes a leaf and takes over the
        # removed node's position in the doubly linked leaf list
        parent_node.is_leaf = True
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = parent_node
        parent_node.prev_leaf_ = node.prev_leaf_
        parent_node.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = parent_node

    # leaf node disappears and there are no reassignments
    elif len(node.subclusters_) == 0:
        # Unlink the emptied leaf from the doubly linked leaf list
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = node.prev_leaf_

    # Step one level up before walking the recorded path toward the root
    if parent_sub.parent_ is None:
        node = brc.root_
    else:
        node = parent_sub.parent_.child_

    # here, we update all the preceding subclusters all the way up to the root
    for index in pSubs[1:]:
        sub = node.subclusters_[index]

        # Subtract the removed cluster's contribution from this ancestor
        sub.linear_sum_ -= cluster_ls
        sub.n_samples_ -= cluster_n
        sub.centroid_ = calc_centroid(sub.linear_sum_, sub.n_samples_)
        sub.mol_indices = [x for x in sub.mol_indices if x not in cluster_mol_indices]

        node.centroids_[index] = sub.centroid_
        node.init_centroids_[index] = sub.centroid_

        # Move up one level; stop at the root
        if sub.parent_ is None:
            break
        elif sub.parent_.parent_ is None:
            node = brc.root_
        else:
            node = sub.parent_.parent_.child_

    return cluster_mol_indices
1233
+
1234
+
1235
def shift_delete(array, index):
    """Delete entry ``index`` in-place by shifting the later entries one slot
    to the left, then zeroing the final slot (keeps ``array``'s length)."""
    last = len(array) - 1
    for pos in range(index, last):
        array[pos] = array[pos + 1]
    array[last] = 0
1239
+
1240
+
1241
def pSubPath(subcluster, pSubs, brc):
    """Append to ``pSubs`` the child-index path from ``subcluster`` up to the
    root of ``brc`` (leaf level first, root level last)."""
    current = subcluster
    while current is not None:
        # Siblings live either directly on the root node or on the parent
        # subcluster's child node
        if current.parent_ is None:
            siblings = brc.root_.subclusters_
        else:
            siblings = current.parent_.child_.subclusters_
        pSubs.append(siblings.index(current))
        current = current.parent_