bblean-0.6.0b2-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cp312-win_amd64.pyd +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1850 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b2.dist-info/METADATA +288 -0
- bblean-0.6.0b2.dist-info/RECORD +31 -0
- bblean-0.6.0b2.dist-info/WHEEL +5 -0
- bblean-0.6.0b2.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b2.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b2.dist-info/top_level.txt +1 -0
bblean/similarity.py
ADDED
@@ -0,0 +1,304 @@
"""Optimized molecular similarity calculators"""

import os
import warnings

from numpy.typing import NDArray
import numpy as np

# NOTE: The most expensive calculation is *jt_sim_packed*, followed by _popcount_2d,
# centroid_from_sum, packing and unpacking
# TODO: Packing and unpacking *should be done in C++ using a lookup table*
__all__ = [
    # JT sim between two (sets of) fingerprints, and average tanimoto (using iSIM)
    "jt_isim_from_sum",
    "jt_isim",
    "jt_sim_packed",
    "jt_most_dissimilar_packed",
    # Radius and diameter from sum
    "jt_isim_radius_from_sum",
    "jt_isim_radius_compl_from_sum",
    "jt_isim_diameter_from_sum",
    # Radius and diameter from fps (packed and unpacked)
    "jt_isim_radius",
    "jt_isim_radius_compl",
    "jt_isim_diameter",
    # Centroid and medoid
    "centroid_from_sum",
    "centroid",
    "jt_isim_medoid",
    # Complementary similarity
    "jt_compl_isim",
    "jt_stratified_sampling",
    "jt_sim_matrix_packed",
]

from bblean._py_similarity import (
    centroid_from_sum,
    centroid,
    jt_compl_isim,
    jt_isim_medoid,
)

# jt_isim_packed and jt_isim_unpacked are not exposed, only used within functions for
# speed

if os.getenv("BITBIRCH_NO_EXTENSIONS"):
    from bblean._py_similarity import (
        jt_isim_from_sum,
        jt_isim_unpacked,
        jt_isim_packed,
        _jt_sim_arr_vec_packed,
        jt_most_dissimilar_packed,
    )
else:
    try:
        from bblean._cpp_similarity import (  # type: ignore
            jt_isim_from_sum,
            _jt_sim_arr_vec_packed,
            jt_isim_unpacked_u8,
            jt_isim_packed_u8,
            jt_most_dissimilar_packed,
            unpack_fingerprints,
        )

        # Wrap these two to dispatch on dtype before calling into C++
        def jt_isim_unpacked(arr: NDArray[np.integer]) -> float:
            # Wrapping like this is slightly faster than letting pybind11 autocast
            if arr.dtype == np.uint64:
                return jt_isim_from_sum(
                    np.sum(arr, axis=0, dtype=np.uint64), len(arr)  # type: ignore
                )
            return jt_isim_unpacked_u8(arr)

        # Probably a mypy bug
        def jt_isim_packed(  # type: ignore
            arr: NDArray[np.integer], n_features: int | None = None
        ) -> float:
            # Wrapping like this is slightly faster than letting pybind11 autocast
            if arr.dtype == np.uint64:
                return jt_isim_from_sum(
                    np.sum(
                        unpack_fingerprints(arr, n_features),  # type: ignore
                        axis=0,
                        dtype=np.uint64,
                    ),
                    len(arr),
                )
            return jt_isim_packed_u8(arr)

    except ImportError:
        from bblean._py_similarity import (  # type: ignore
            jt_isim_from_sum,
            jt_isim_unpacked,
            jt_isim_packed,
            _jt_sim_arr_vec_packed,
            jt_most_dissimilar_packed,
        )

        warnings.warn(
            "C++ optimized similarity calculations not available,"
            " falling back to python implementation"
        )


def jt_isim(
    fps: NDArray[np.integer],
    input_is_packed: bool = True,
    n_features: int | None = None,
) -> float:
    r"""Average Tanimoto, using iSIM

    iSIM Tanimoto was first proposed in:
    https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b

    :math:`iSIM_{JT}(X)` is an excellent :math:`O(N)` approximation of the average
    Tanimoto similarity of a set of fingerprints.

    It is also equivalent to the complement of the Tanimoto diameter:
    :math:`iSIM_{JT}(X) = 1 - D_{JT}(X)`.

    Parameters
    ----------
    fps : np.ndarray
        2D fingerprint array

    input_is_packed : bool
        Whether the input array has packed fingerprints

    n_features : int | None
        Number of features when unpacking fingerprints. Only required if the number
        of features is not a multiple of 8

    Returns
    -------
    isim : float
        iSIM Jaccard-Tanimoto value
    """
    if input_is_packed:
        return jt_isim_packed(fps, n_features)
    return jt_isim_unpacked(fps)


def jt_isim_diameter(
    arr: NDArray[np.integer],
    input_is_packed: bool = True,
    n_features: int | None = None,
) -> float:
    r"""Calculate the Tanimoto diameter of a set of fingerprints"""
    return jt_isim_diameter_from_sum(
        np.sum(
            unpack_fingerprints(arr, n_features) if input_is_packed else arr,
            axis=0,
            dtype=np.uint64,
        ),  # type: ignore
        len(arr),
    )


def jt_isim_radius(
    arr: NDArray[np.integer],
    input_is_packed: bool = True,
    n_features: int | None = None,
) -> float:
    r"""Calculate the Tanimoto radius of a set of fingerprints"""
    return jt_isim_radius_from_sum(
        np.sum(
            unpack_fingerprints(arr, n_features) if input_is_packed else arr,
            axis=0,
            dtype=np.uint64,
        ),  # type: ignore
        len(arr),
    )


def jt_isim_radius_compl(
    arr: NDArray[np.integer],
    input_is_packed: bool = True,
    n_features: int | None = None,
) -> float:
    r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
    return jt_isim_radius_compl_from_sum(
        np.sum(
            unpack_fingerprints(arr, n_features) if input_is_packed else arr,
            axis=0,
            dtype=np.uint64,
        ),  # type: ignore
        len(arr),
    )


def jt_isim_radius_compl_from_sum(ls: NDArray[np.integer], n: int) -> float:
    r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
    # Calculates 1 - R = Rc
    # NOTE: Use uint64 sum since jt_isim_from_sum casts to uint64 internally
    # This prevents multiple casts
    new_unpacked_centroid = centroid_from_sum(ls, n, pack=False)
    new_ls_1 = np.add(ls, new_unpacked_centroid, dtype=np.uint64)
    new_n_1 = n + 1
    new_jt = jt_isim_from_sum(ls, n)
    new_jt_1 = jt_isim_from_sum(new_ls_1, new_n_1)
    return (new_jt_1 * new_n_1 - new_jt * (n - 1)) / 2


def jt_isim_radius_from_sum(ls: NDArray[np.integer], n: int) -> float:
    r"""Calculate the Tanimoto radius of a set of fingerprints"""
    return 1 - jt_isim_radius_compl_from_sum(ls, n)


def jt_isim_diameter_from_sum(ls: NDArray[np.integer], n: int) -> float:
    r"""Calculate the Tanimoto diameter of a set of fingerprints.

    Equivalent to ``1 - jt_isim_from_sum(ls, n)``"""
    return 1 - jt_isim_from_sum(ls, n)


# General wrapper that works both in C++ and python
def jt_sim_packed(
    x: NDArray[np.uint8],
    y: NDArray[np.uint8],
) -> NDArray[np.float64]:
    r"""Tanimoto similarity between packed fingerprints

    Either both inputs are vectors of shape (F,) (a NumPy scalar is returned), or
    one is a vector (F,) and the other an array of shape (N, F) (a NumPy array of
    shape (N,) is returned).
    """
    if x.ndim == 1 and y.ndim == 1:
        return _jt_sim_arr_vec_packed(x.reshape(1, -1), y)[0]
    if x.ndim == 2:
        return _jt_sim_arr_vec_packed(x, y)
    if y.ndim == 2:
        return _jt_sim_arr_vec_packed(y, x)
    raise ValueError(
        "Expected either two 1D vectors, or one 1D vector and one 2D array"
    )


def jt_sim_matrix_packed(arr: NDArray[np.uint8]) -> NDArray[np.float64]:
    r"""Tanimoto similarity matrix between all pairs of packed fps in arr"""
    matrix = np.ones((len(arr), len(arr)), dtype=np.float64)
    for i in range(len(arr)):
        # Set the similarities for each row
        matrix[i, i + 1 :] = jt_sim_packed(arr[i], arr[i + 1 :])
        # Set the similarities for each column (symmetric)
        matrix[i + 1 :, i] = matrix[i, i + 1 :]
    return matrix


def estimate_jt_std(
    fps: NDArray[np.uint8],
    n_samples: int | None = None,
    input_is_packed: bool = True,
    n_features: int | None = None,
) -> float:
    r"""Estimate the std of the Tanimoto similarity using a deterministic sample"""
    num_fps = len(fps)
    if n_samples is None:
        n_samples = max(num_fps // 1000, 50)
    sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)

    # Work with the sample from now on
    fps = fps[sample_idxs]
    num_fps = len(fps)
    pairs = np.empty(num_fps * (num_fps - 1) // 2, dtype=np.float64)
    # NOTE: Calc upper triangular part of pairwise matrix only, slightly more
    # efficient, but the difference is negligible in tests
    offset = 0
    for i in range(len(fps)):
        num = num_fps - i - 1
        pairs[offset : offset + num] = jt_sim_packed(fps[i], fps[i + 1 :])
        offset += num
    return np.std(pairs).item()


def jt_stratified_sampling(
    fps: NDArray[np.uint8],
    n_samples: int,
    input_is_packed: bool = True,
    n_features: int | None = None,
) -> NDArray[np.int64]:
    r"""Sample from a set of fingerprints according to their complementary similarity

    Given a group of fingerprints, calculate all complementary similarities, order
    them, and sample the first element of consecutive groups of length
    ``num_fps // n_samples + 1``.

    .. note::

        This is not true statistical stratified sampling, it is not random, and the
        strata are not homogeneous. It is meant as a reliable, deterministic method to
        obtain a representative sample from a set of fingerprints.
    """
    # Stratified sampling without replacement
    if n_samples == 0:
        return np.array([], dtype=np.int64)
    if n_samples > len(fps):
        raise ValueError("n_samples must be <= len(fps)")
    # Get the indices that would sort the complementary similarities
    sorted_indices = np.argsort(jt_compl_isim(fps, input_is_packed, n_features))
    # Split into n_samples strata
    strata = np.array_split(sorted_indices, n_samples)
    # Get the first index of each stratum
    return np.array([s[0] for s in strata])
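For orientation, a minimal usage sketch of the public entry points above (not part of the package; the random bit-vectors stand in for real molecular fingerprints, and shapes follow the docstrings):

    import numpy as np
    from bblean.similarity import jt_isim, jt_sim_packed, jt_sim_matrix_packed

    rng = np.random.default_rng(0)
    # 100 fingerprints of 2048 bits each, packed 8 bits per byte -> shape (100, 256)
    unpacked = rng.integers(0, 2, size=(100, 2048), dtype=np.uint8)
    packed = np.packbits(unpacked, axis=1)

    avg_sim = jt_isim(packed)                    # O(N) approx. of mean pairwise Tanimoto
    sims = jt_sim_packed(packed[0], packed[1:])  # one fp vs. the rest -> shape (99,)
    matrix = jt_sim_matrix_packed(packed)        # full (100, 100) similarity matrix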
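The iSIM quantity behind ``jt_isim_from_sum`` can be spelled out directly from the paper linked in the docstring: with ``n`` fingerprints and column sums ``k`` (the "linear sum"), it is the ratio of the summed pairwise intersections to the summed pairwise unions. A reference sketch, as an illustrative re-derivation rather than the package's actual C++/Python implementation:

    import numpy as np

    def jt_isim_from_sum_reference(k: np.ndarray, n: int) -> float:
        # k[j] = number of fingerprints with bit j set; n = number of fingerprints
        k = k.astype(np.float64)
        inter = (k * (k - 1) / 2).sum()  # over all pairs: bits set in both fps
        pairs = n * (n - 1) / 2
        union = (pairs - (n - k) * (n - k - 1) / 2).sum()  # pairs with any bit set
        return inter / union  # assumes not all fingerprints are empty

The diameter and radius helpers above are thin layers over this value: the diameter is ``1 - iSIM``, and the radius complement is obtained by augmenting the linear sum with the set's own centroid before re-evaluating iSIM.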
bblean/sklearn.py
ADDED
@@ -0,0 +1,203 @@
r"""BitBirch 'Lean' classes that fully respect the sklearn API contract.

Use these classes as a drop-in replacement for `sklearn.cluster.Birch` if you are
used to the `sklearn` way of doing things, with the caveat that global clustering is
not currently supported.
"""

import typing as tp
from numpy.typing import NDArray
import numpy as np
import typing_extensions as tpx

from sklearn.utils.validation import check_is_fitted, validate_data
from sklearn.metrics import pairwise_distances_argmin, pairwise_distances
from sklearn.base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    ClusterMixin,
    TransformerMixin,
    _fit_context,
)

from bblean.fingerprints import unpack_fingerprints
from bblean.bitbirch import BitBirch as _BitBirch
from bblean._merges import MergeAcceptFunction

__all__ = ["BitBirch", "UnpackedBitBirch"]

# Required functions for the sklearn API:
# - fit() *must be defined*
# - transform() *must be defined*
# - fit_predict() (ClusterMixin) default implementation is to fit and return labels
# - predict()  # overloaded to use JT instead of euclidean
# - fit_transform() (TransformerMixin, delegates to *fit* and *transform*)
# - set_output() (TransformerMixin via _SetOutputMixin)
#   set_output(transform="pandas") or transform="default" (numpy array) (or "polars",
#   if polars is installed)

# The following requires _n_features_out after fitting:
# - get_feature_names_out() ["bitbirch0", "bitbirch1", ...] (ClassNamePrefix...)

# - get_metadata_routing() New sklearn feature; unclear semantics, unnecessary here
# - partial_fit() Same as fit() for BitBirch

# These require that the parameters are specified in __init__, and are assigned
# to names (or attributes) with the convention self.<param>:
# - get_params() (BaseEstimator)
# - set_params() (BaseEstimator)


class BitBirch(
    ClassNamePrefixFeaturesOutMixin,
    ClusterMixin,
    TransformerMixin,
    BaseEstimator,
    _BitBirch,
):
    r"""Implements the BitBIRCH clustering algorithm, 'Lean' version.

    Inputs to this estimator are *packed* fingerprints by default. If you need a
    class that always accepts unpacked input, use `bblean.sklearn.UnpackedBitBirch`.

    See `bblean.bitbirch.BitBirch` for more details"""

    _parameter_constraints: dict[str, list[tp.Any]] = {}

    def __init__(
        self,
        *,
        threshold: float = 0.65,
        branching_factor: int = 50,
        merge_criterion: str | MergeAcceptFunction | None = None,
        tolerance: float | None = None,
        compute_labels: bool = True,
    ):
        super().__init__(
            threshold=threshold,
            branching_factor=branching_factor,
            merge_criterion=merge_criterion,
            tolerance=tolerance,
        )
        self.compute_labels = compute_labels

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(  # type: ignore
        self, X, y=None, input_is_packed: bool = True, n_features: int | None = None
    ) -> tpx.Self:
        super().fit(X, input_is_packed=input_is_packed, n_features=n_features)
        centroids = np.stack(
            [bf.unpacked_centroid for bf in self._get_leaf_bfs(sort=True)]
        )
        self.subcluster_centers_ = centroids
        self.subcluster_labels_ = np.arange(1, len(centroids) + 1)
        self._n_features_out = centroids.shape[0]
        if self.compute_labels:
            self.labels_ = self.get_assignments()
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(  # type: ignore
        self,
        X=None,
        y=None,
        input_is_packed: bool = True,
        n_features: int | None = None,
    ) -> tpx.Self:
        if X is None:
            raise ValueError("X is required, fitting without new data is unsupported")
        self.fit(X, input_is_packed=input_is_packed, n_features=n_features)
        if self.compute_labels:
            self.labels_ = self.get_assignments()
        return self

    # Overloaded since self.labels_ may not be set
    def fit_predict(  # type: ignore
        self, X, y=None, input_is_packed: bool = True, n_features: int | None = None
    ) -> NDArray[np.integer]:
        self.fit(X, input_is_packed=input_is_packed, n_features=n_features)
        if not self.compute_labels:
            self.labels_ = self.get_assignments()
        return self.labels_

    def predict(  # type: ignore
        self, X, input_is_packed: bool = True, n_features: int | None = None
    ) -> NDArray[np.integer]:
        """Predict data using the ``centroids`` of subclusters."""
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse="csr", reset=False)
        X = (
            (unpack_fingerprints(X, n_features=n_features) if input_is_packed else X)
            .astype(np.uint8, copy=False)
            .view(np.bool)
        )
        # TODO: Even when both inputs are bool, this function warns for some reason
        # I believe this may be a sklearn bug
        centers = self.subcluster_centers_.astype(np.uint8, copy=False).view(np.bool)
        argmin = pairwise_distances_argmin(X, centers, metric="jaccard")
        return self.subcluster_labels_[argmin]

    def transform(  # type: ignore
        self,
        X,
        input_is_packed: bool = True,
        n_features: int | None = None,
    ):
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse="csr", reset=False)
        X = (
            (unpack_fingerprints(X, n_features=n_features) if input_is_packed else X)
            .astype(np.uint8, copy=False)
            .view(np.bool)
        )
        centers = self.subcluster_centers_.astype(np.uint8, copy=False).view(np.bool)
        return pairwise_distances(X, centers, metric="jaccard")

    def __sklearn_tags__(self):  # type: ignore
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        return tags


class UnpackedBitBirch(BitBirch):
    r"""Implements the BitBIRCH clustering algorithm, 'Lean' version.

    Inputs to this estimator are always *unpacked* fingerprints.

    See `bblean.bitbirch.BitBirch` for more details"""

    def fit(  # type: ignore
        self, X, y=None, input_is_packed: bool = False, n_features: int | None = None
    ) -> tpx.Self:
        return super().fit(X, y, input_is_packed=input_is_packed, n_features=n_features)

    def partial_fit(  # type: ignore
        self, X, y=None, input_is_packed: bool = False, n_features: int | None = None
    ):
        return super().partial_fit(
            X, y, input_is_packed=input_is_packed, n_features=n_features
        )

    def fit_predict(  # type: ignore
        self, X, y=None, input_is_packed: bool = False, n_features: int | None = None
    ):
        return super().fit_predict(
            X, y, input_is_packed=input_is_packed, n_features=n_features
        )

    def predict(  # type: ignore
        self,
        X,
        input_is_packed: bool = False,
        n_features: int | None = None,
    ):
        return super().predict(
            X, input_is_packed=input_is_packed, n_features=n_features
        )

    def transform(  # type: ignore
        self, X, input_is_packed: bool = False, n_features: int | None = None
    ):
        return super().transform(
            X, input_is_packed=input_is_packed, n_features=n_features
        )
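Since these classes follow the sklearn estimator contract, the usual fit/predict/transform flow applies. A minimal sketch (random packed fingerprints stand in for real data; the parameter values shown are simply the defaults from ``__init__`` above):

    import numpy as np
    from bblean.sklearn import BitBirch

    rng = np.random.default_rng(0)
    fps = np.packbits(rng.integers(0, 2, size=(500, 2048), dtype=np.uint8), axis=1)

    model = BitBirch(threshold=0.65, branching_factor=50)
    labels = model.fit_predict(fps)  # one cluster label per fingerprint
    dists = model.transform(fps)     # Jaccard distance to each subcluster centroid
    assigned = model.predict(fps)    # label of the nearest subcluster centroid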
bblean/smiles.py
ADDED
@@ -0,0 +1,61 @@
r"""SMILES manipulation"""

import typing as tp
from numpy.typing import NDArray
import numpy as np
from pathlib import Path

from bblean.utils import batched

__all__ = [
    "load_smiles",
    "calc_num_smiles",
    "iter_smiles_from_paths",
]

SmilesPaths = tp.Iterable[Path | str] | Path | str


def load_smiles(smiles_paths: SmilesPaths, max_num: int = -1) -> NDArray[np.str_]:
    r"""Simple utility to load smiles from one or more ``*.smi`` files"""
    smiles = []
    for i, smi in enumerate(iter_smiles_from_paths(smiles_paths)):
        if i == max_num:
            break
        smiles.append(smi)
    return np.asarray(smiles)


def calc_num_smiles(smiles_paths: SmilesPaths) -> int:
    r"""Get the total number of smiles in a sequence of paths"""
    return sum(1 for _ in iter_smiles_from_paths(smiles_paths))


def iter_smiles_from_paths(
    smiles_paths: SmilesPaths,
) -> tp.Iterator[str]:
    r"""Iterate over smiles in a sequence of smiles paths"""
    if isinstance(smiles_paths, (Path, str)):
        smiles_paths = [smiles_paths]
    for smi_path in smiles_paths:
        with open(smi_path, mode="rt", encoding="utf-8") as f:
            for smi in f:
                yield smi


def _iter_ranges_and_smiles_batches(
    smiles_paths: SmilesPaths,
    num_per_batch: int,
) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
    start_idx = 0
    for batch in batched(iter_smiles_from_paths(smiles_paths), num_per_batch):
        size = len(batch)
        end_idx = start_idx + size
        yield (start_idx, end_idx), batch
        start_idx = end_idx


def _iter_idxs_and_smiles_batches(
    smiles_paths: SmilesPaths, num_per_batch: int
) -> tp.Iterable[tuple[int, tuple[str, ...]]]:
    yield from enumerate(batched(iter_smiles_from_paths(smiles_paths), num_per_batch))
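A short sketch of the loaders above (the ``*.smi`` file names are hypothetical; any newline-delimited SMILES files work):

    from bblean.smiles import load_smiles, calc_num_smiles, iter_smiles_from_paths

    total = calc_num_smiles(["part1.smi", "part2.smi"])    # count without loading all
    first_1000 = load_smiles("library.smi", max_num=1000)  # NumPy array of SMILES
    for smi in iter_smiles_from_paths("library.smi"):      # lazy, line-by-line
        ...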
bblean/utils.py
ADDED
@@ -0,0 +1,130 @@
r"""Misc. utility functions"""

import os
from pathlib import Path
import itertools
import typing as tp
import sys
import subprocess
import platform
import importlib

import psutil
import numpy as np

__all__ = [
    "batched",
    "min_safe_uint",
    "cpp_extensions_are_enabled",
    "cpp_extensions_are_installed",
]

_T = tp.TypeVar("_T")


def min_safe_uint(nmax: int) -> np.dtype:
    r"""Returns the min uint dtype that holds a (positive) py int, excluding "object".

    Input must be a positive python integer.
    """
    out = np.min_scalar_type(nmax)
    # Check if the dtype is a pointer to a python bigint
    if out.hasobject:
        raise ValueError(f"{nmax} is too large to hold in a uint64 array")
    return out


# Itertools recipe
def batched(iterable: tp.Iterable[_T], n: int) -> tp.Iterator[tuple[_T, ...]]:
    r"""Batch data into tuples of length n. The last batch may be shorter.

    This is equivalent to the ``batched`` recipe from `itertools`.
    """
    # batched('ABCDEFG', 3) --> ('A', 'B', 'C') ('D', 'E', 'F') ('G',)
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    while batch := tuple(itertools.islice(it, n)):
        yield batch


def _import_bitbirch_variant(
    variant: str = "lean",
) -> tuple[tp.Any, tp.Callable[..., None]]:
    if variant not in ("lean", "int64", "uint8"):
        raise ValueError(f"Unknown variant {variant}")
    if variant == "lean":
        # Most up-to-date bb variant
        module = importlib.import_module("bblean.bitbirch")
    elif variant == "uint8":
        # Legacy variant of bb that uses uint8 and supports packing, but no extra optim
        module = importlib.import_module("bblean._legacy.bb_uint8")
    elif variant == "int64":
        # Legacy variant of bb that uses int64 fps (dense only)
        module = importlib.import_module("bblean._legacy.bb_int64")

    Cls = getattr(module, "BitBirch")
    fn = getattr(module, "set_merge")
    return Cls, fn


def _num_avail_cpus() -> int:
    if sys.platform == "darwin":
        # macOS doesn't expose cpu affinity, so assume all cpus are available
        return os.cpu_count() or 1
    return len(psutil.Process().cpu_affinity())


def _cpu_name() -> str:
    if sys.platform == "darwin":
        try:
            return subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                capture_output=True,
                text=True,
                check=True,
            ).stdout.strip()
        except Exception:
            pass

    if sys.platform == "linux":
        with open("/proc/cpuinfo") as f:
            for line in f:
                if line.startswith("model name"):
                    return line.split(":", 1)[1].strip()

    # Fallback for windows and all cases where it could not be found
    return platform.processor()


def _has_files_or_valid_symlinks(path: Path) -> bool:
    has_files = False
    for p in path.iterdir():
        if p.is_symlink() and not p.exists():
            return False

        if p.is_file():
            has_files = True
    return has_files


def cpp_extensions_are_enabled() -> bool:
    r"""Query whether the C++ BitBirch extensions are currently enabled"""
    if os.getenv("BITBIRCH_NO_EXTENSIONS"):
        return False
    try:
        from bblean._cpp_similarity import jt_isim_from_sum  # noqa

        return True
    except ImportError:
        return False


def cpp_extensions_are_installed() -> bool:
    r"""Query whether the C++ BitBirch extensions are currently installed"""
    try:
        from bblean._cpp_similarity import jt_isim_from_sum  # noqa

        return True
    except ImportError:
        return False
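To round out the section, the public helpers of this module in action (the outputs in the comments follow directly from the code above):

    from bblean.utils import batched, min_safe_uint, cpp_extensions_are_enabled

    list(batched("ABCDEFG", 3))  # [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
    min_safe_uint(300)           # dtype('uint16')

    # BITBIRCH_NO_EXTENSIONS must be set before importing bblean.similarity to force
    # the pure-python fallback; this query function also respects it:
    cpp_extensions_are_enabled()  # False whenever BITBIRCH_NO_EXTENSIONS is set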