bblean-0.6.0b1-cp313-cp313-macosx_10_13_universal2.whl

bblean/similarity.py ADDED
@@ -0,0 +1,304 @@
+ """Optimized molecular similarity calculators"""
+
+ import os
+ import warnings
+
+ from numpy.typing import NDArray
+ import numpy as np
+
+ # NOTE: The most expensive calculation is *jt_sim_packed*, followed by _popcount_2d,
+ # centroid_from_sum, packing and unpacking
+ # TODO: Packing and unpacking *should be done in C++ using a lookup table*
+ __all__ = [
+     # JT sim between two (sets of) fingerprints, and average tanimoto (using iSIM)
+     "jt_isim_from_sum",
+     "jt_isim",
+     "jt_sim_packed",
+     "jt_most_dissimilar_packed",
+     # Radius and diameter from sum
+     "jt_isim_radius_from_sum",
+     "jt_isim_radius_compl_from_sum",
+     "jt_isim_diameter_from_sum",
+     # Radius and diameter from fps (packed and unpacked)
+     "jt_isim_radius",
+     "jt_isim_radius_compl",
+     "jt_isim_diameter",
+     # Centroid and medoid
+     "centroid_from_sum",
+     "centroid",
+     "jt_isim_medoid",
+     # Complementary similarity
+     "jt_compl_isim",
+     "jt_stratified_sampling",
+     "jt_sim_matrix_packed",
+ ]
+
+ from bblean._py_similarity import (
+     centroid_from_sum,
+     centroid,
+     jt_compl_isim,
+     jt_isim_medoid,
+ )
+
+ # jt_isim_packed and jt_isim_unpacked are not exposed; they are only used within
+ # functions, for speed
+
+ if os.getenv("BITBIRCH_NO_EXTENSIONS"):
+     # NOTE: unpack_fingerprints is otherwise provided by the C++ module; import the
+     # generic helper so the packed radius/diameter functions below keep working
+     from bblean.fingerprints import unpack_fingerprints
+     from bblean._py_similarity import (
+         jt_isim_from_sum,
+         jt_isim_unpacked,
+         jt_isim_packed,
+         _jt_sim_arr_vec_packed,
+         jt_most_dissimilar_packed,
+     )
+ else:
+     try:
+         from bblean._cpp_similarity import (  # type: ignore
+             jt_isim_from_sum,
+             _jt_sim_arr_vec_packed,
+             jt_isim_unpacked_u8,
+             jt_isim_packed_u8,
+             jt_most_dissimilar_packed,
+             unpack_fingerprints,
+         )
+
+         # Wrap these two so the uint64 dtype dispatch happens in Python
+         def jt_isim_unpacked(arr: NDArray[np.integer]) -> float:
+             # Wrapping like this is slightly faster than letting pybind11 autocast
+             if arr.dtype == np.uint64:
+                 return jt_isim_from_sum(
+                     np.sum(arr, axis=0, dtype=np.uint64), len(arr)  # type: ignore
+                 )
+             return jt_isim_unpacked_u8(arr)
+
+         # Probably a mypy bug
+         def jt_isim_packed(  # type: ignore
+             arr: NDArray[np.integer], n_features: int | None = None
+         ) -> float:
+             # Wrapping like this is slightly faster than letting pybind11 autocast
+             if arr.dtype == np.uint64:
+                 return jt_isim_from_sum(
+                     np.sum(
+                         unpack_fingerprints(arr, n_features),  # type: ignore
+                         axis=0,
+                         dtype=np.uint64,
+                     ),
+                     len(arr),
+                 )
+             return jt_isim_packed_u8(arr)
+
+     except ImportError:
+         from bblean.fingerprints import unpack_fingerprints  # type: ignore
+         from bblean._py_similarity import (  # type: ignore
+             jt_isim_from_sum,
+             jt_isim_unpacked,
+             jt_isim_packed,
+             _jt_sim_arr_vec_packed,
+             jt_most_dissimilar_packed,
+         )
+
+         warnings.warn(
+             "C++ optimized similarity calculations not available,"
+             " falling back to python implementation"
+         )
+
+
+ def jt_isim(
+     fps: NDArray[np.integer],
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+ ) -> float:
+     r"""Average Tanimoto, using iSIM
+
+     iSIM Tanimoto was first proposed in:
+     https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b
+
+     :math:`iSIM_{JT}(X)` is an excellent :math:`O(N)` approximation of the average
+     Tanimoto similarity of a set of fingerprints.
+
+     It is also equivalent to the complement of the Tanimoto diameter:
+     :math:`iSIM_{JT}(X) = 1 - D_{JT}(X)`.
+
+     Parameters
+     ----------
+     fps : np.ndarray
+         2D fingerprint array
+
+     input_is_packed : bool
+         Whether the input array has packed fingerprints
+
+     n_features : int | None
+         Number of features used when unpacking fingerprints. Only required if the
+         number of features is not a multiple of 8
+
+     Returns
+     -------
+     isim : float
+         iSIM Jaccard-Tanimoto value
+     """
+     if input_is_packed:
+         return jt_isim_packed(fps, n_features)
+     return jt_isim_unpacked(fps)
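
A minimal usage sketch of `jt_isim`: the packed and unpacked code paths should agree. The `np.packbits` layout (bits packed along the feature axis) is an assumption about the packed format; the package's own helper is `bblean.fingerprints.unpack_fingerprints`.

```python
# Sketch: average Tanimoto of random fingerprints, from both representations.
import numpy as np
from bblean.similarity import jt_isim

rng = np.random.default_rng(0)
fps = (rng.random((100, 2048)) < 0.1).astype(np.uint8)  # hypothetical fingerprints

isim_unpacked = jt_isim(fps, input_is_packed=False)
# Assumed packed layout: np.packbits along the feature axis
isim_packed = jt_isim(np.packbits(fps, axis=1), n_features=2048)
assert abs(isim_unpacked - isim_packed) < 1e-9
```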
+
+
+ def jt_isim_diameter(
+     arr: NDArray[np.integer],
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+ ) -> float:
+     r"""Calculate the Tanimoto diameter of a set of fingerprints"""
+     return jt_isim_diameter_from_sum(
+         np.sum(
+             unpack_fingerprints(arr, n_features) if input_is_packed else arr,
+             axis=0,
+             dtype=np.uint64,
+         ),  # type: ignore
+         len(arr),
+     )
+
+
+ def jt_isim_radius(
+     arr: NDArray[np.integer],
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+ ) -> float:
+     r"""Calculate the Tanimoto radius of a set of fingerprints"""
+     return jt_isim_radius_from_sum(
+         np.sum(
+             unpack_fingerprints(arr, n_features) if input_is_packed else arr,
+             axis=0,
+             dtype=np.uint64,
+         ),  # type: ignore
+         len(arr),
+     )
+
+
+ def jt_isim_radius_compl(
+     arr: NDArray[np.integer],
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+ ) -> float:
+     r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
+     return jt_isim_radius_compl_from_sum(
+         np.sum(
+             unpack_fingerprints(arr, n_features) if input_is_packed else arr,
+             axis=0,
+             dtype=np.uint64,
+         ),  # type: ignore
+         len(arr),
+     )
+
+
+ def jt_isim_radius_compl_from_sum(ls: NDArray[np.integer], n: int) -> float:
+     r"""Calculate the complement of the Tanimoto radius of a set of fingerprints"""
+     # Calculates 1 - R = Rc
+     # NOTE: Use uint64 sum since jt_isim_from_sum casts to uint64 internally.
+     # This prevents multiple casts
+     unpacked_centroid = centroid_from_sum(ls, n, pack=False)
+     new_ls = np.add(ls, unpacked_centroid, dtype=np.uint64)
+     new_n = n + 1
+     jt = jt_isim_from_sum(ls, n)
+     new_jt = jt_isim_from_sum(new_ls, new_n)
+     return (new_jt * new_n - jt * (n - 1)) / 2
+
+
+ def jt_isim_radius_from_sum(ls: NDArray[np.integer], n: int) -> float:
+     r"""Calculate the Tanimoto radius of a set of fingerprints"""
+     return 1 - jt_isim_radius_compl_from_sum(ls, n)
+
+
+ def jt_isim_diameter_from_sum(ls: NDArray[np.integer], n: int) -> float:
+     r"""Calculate the Tanimoto diameter of a set of fingerprints.
+
+     Equivalent to ``1 - jt_isim_from_sum(ls, n)``"""
+     return 1 - jt_isim_from_sum(ls, n)
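
The relations encoded above can be sanity-checked numerically. This sketch relies only on identities visible in the source (radius = 1 - complement; diameter = 1 - iSIM):

```python
import numpy as np
from bblean.similarity import (
    jt_isim_from_sum,
    jt_isim_radius_from_sum,
    jt_isim_radius_compl_from_sum,
    jt_isim_diameter_from_sum,
)

rng = np.random.default_rng(1)
fps = (rng.random((50, 256)) < 0.2).astype(np.uint8)
ls = np.sum(fps, axis=0, dtype=np.uint64)  # linear sum, as the *_from_sum API expects
n = len(fps)

assert np.isclose(
    jt_isim_radius_from_sum(ls, n) + jt_isim_radius_compl_from_sum(ls, n), 1.0
)
assert np.isclose(jt_isim_diameter_from_sum(ls, n), 1.0 - jt_isim_from_sum(ls, n))
```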
+
+
+ # General wrapper that works with both the C++ and Python backends
+ def jt_sim_packed(
+     x: NDArray[np.uint8],
+     y: NDArray[np.uint8],
+ ) -> NDArray[np.float64]:
+     r"""Tanimoto similarity between packed fingerprints
+
+     Either both inputs are vectors of shape (F,) (a NumPy scalar is returned), or
+     one is a vector of shape (F,) and the other an array of shape (N, F) (a NumPy
+     array of shape (N,) is returned).
+     """
+     if x.ndim == 1 and y.ndim == 1:
+         return _jt_sim_arr_vec_packed(x.reshape(1, -1), y)[0]
+     if x.ndim == 2:
+         return _jt_sim_arr_vec_packed(x, y)
+     if y.ndim == 2:
+         return _jt_sim_arr_vec_packed(y, x)
+     raise ValueError(
+         "Expected either two 1D vectors, or one 1D vector and one 2D array"
+     )
+
+
+ def jt_sim_matrix_packed(arr: NDArray[np.uint8]) -> NDArray[np.float64]:
+     r"""Tanimoto similarity matrix between all pairs of packed fps in arr"""
+     matrix = np.ones((len(arr), len(arr)), dtype=np.float64)
+     for i in range(len(arr)):
+         # Set the similarities for each row
+         matrix[i, i + 1 :] = jt_sim_packed(arr[i], arr[i + 1 :])
+         # Set the similarities for each column (symmetric)
+         matrix[i + 1 :, i] = matrix[i, i + 1 :]
+     return matrix
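
A sketch of the supported call shapes, under the same `np.packbits` assumption as above:

```python
import numpy as np
from bblean.similarity import jt_sim_packed, jt_sim_matrix_packed

rng = np.random.default_rng(2)
packed = np.packbits((rng.random((5, 64)) < 0.3).astype(np.uint8), axis=1)

pair = jt_sim_packed(packed[0], packed[1])   # two vectors -> scalar
row = jt_sim_packed(packed[0], packed[1:])   # vector + array -> shape (4,)
matrix = jt_sim_matrix_packed(packed)        # full (5, 5) similarity matrix
assert np.allclose(matrix, matrix.T) and np.allclose(np.diag(matrix), 1.0)
```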
+
+
+ def estimate_jt_std(
+     fps: NDArray[np.uint8],
+     n_samples: int | None = None,
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+ ) -> float:
+     r"""Estimate the std of the Tanimoto similarity using a deterministic sample"""
+     num_fps = len(fps)
+     if n_samples is None:
+         n_samples = max(num_fps // 1000, 50)
+     sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
+
+     # Work with the sample from now on
+     fps = fps[sample_idxs]
+     num_fps = len(fps)
+     pairs = np.empty(num_fps * (num_fps - 1) // 2, dtype=np.float64)
+     # NOTE: Calculate only the upper triangular part of the pairwise matrix. This is
+     # slightly more efficient, but the difference is negligible in tests
+     offset = 0
+     for i in range(len(fps)):
+         num = num_fps - i - 1
+         pairs[offset : offset + num] = jt_sim_packed(fps[i], fps[i + 1 :])
+         offset += num
+     return np.std(pairs).item()
+
+
+ def jt_stratified_sampling(
+     fps: NDArray[np.uint8],
+     n_samples: int,
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+ ) -> NDArray[np.int64]:
+     r"""Sample from a set of fingerprints according to their complementary similarity
+
+     Given a group of fingerprints, calculate all complementary similarities, sort
+     them, and take the first element of consecutive groups of length
+     ``num_fps // n_samples + 1``.
+
+     .. note ::
+
+         This is not true statistical stratified sampling: it is not random, and the
+         strata are not homogeneous. It is meant as a reliable, deterministic method to
+         obtain a representative sample from a set of fingerprints.
+     """
+     # Stratified sampling without replacement
+     if n_samples == 0:
+         return np.array([], dtype=np.int64)
+     if n_samples > len(fps):
+         raise ValueError("n_samples must be <= len(fps)")
+     # Get the indices that would sort the complementary similarities
+     sorted_indices = np.argsort(jt_compl_isim(fps, input_is_packed, n_features))
+     # Split into n_samples strata
+     strata = np.array_split(sorted_indices, n_samples)
+     # Take the first index of each stratum
+     return np.array([s[0] for s in strata])
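
A sketch combining the two sampling utilities, again assuming `np.packbits` packing:

```python
# Draw a deterministic, representative subsample and estimate the spread of
# pairwise Tanimoto values on purely illustrative random data.
import numpy as np
from bblean.similarity import jt_stratified_sampling, estimate_jt_std

rng = np.random.default_rng(3)
packed = np.packbits((rng.random((1000, 512)) < 0.15).astype(np.uint8), axis=1)

idxs = jt_stratified_sampling(packed, 10)  # 10 indices, one per stratum
assert len(np.unique(idxs)) == 10          # sampling without replacement
std = estimate_jt_std(packed)              # samples 50 fps internally here
```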
bblean/sklearn.py ADDED
@@ -0,0 +1,203 @@
+ r"""BitBirch 'Lean' classes that fully respect the sklearn API contract.
+
+ Use these classes as a drop-in replacement of `sklearn.cluster.Birch` if you are used to
+ the `sklearn` way of doing things, with the caveat that global clustering is not
+ currently supported.
+ """
+
+ import typing as tp
+ from numpy.typing import NDArray
+ import numpy as np
+ import typing_extensions as tpx
+
+ from sklearn.utils.validation import check_is_fitted, validate_data
+ from sklearn.metrics import pairwise_distances_argmin, pairwise_distances
+ from sklearn.base import (
+     BaseEstimator,
+     ClassNamePrefixFeaturesOutMixin,
+     ClusterMixin,
+     TransformerMixin,
+     _fit_context,
+ )
+
+ from bblean.fingerprints import unpack_fingerprints
+ from bblean.bitbirch import BitBirch as _BitBirch
+ from bblean._merges import MergeAcceptFunction
+
+ __all__ = ["BitBirch", "UnpackedBitBirch"]
+
+ # Required functions for sklearn API:
+ # - fit() *must be defined*
+ # - transform() *must be defined*
+ # - fit_predict() (ClusterMixin) default implementation is to fit and then return lbls
+ # - predict() overloaded to use jt instead of euclidean
+ # - fit_transform() (TransformerMixin, delegates to *fit* and *transform*)
+ # - set_output() (TransformerMixin via _SetOutputMixin)
+ #   set_output(transform="pandas") or transform="default" (numpy array) (or "polars",
+ #   if polars is installed)
+
+ # The following requires _n_features_out after fitting
+ # - get_feature_names_out() ["bitbirch0", "bitbirch1", ...] (ClassNamePrefix...)
+
+ # - get_metadata_routing() (new sklearn feature; not needed here)
+ # - partial_fit() (same as fit() for BitBirch)
+
+ # These require that the parameters are specified in __init__, and are assigned
+ # to names (or attributes) with the convention self.<param>.
+ # - get_params() (BaseEstimator)
+ # - set_params() (BaseEstimator)
+
+
+ class BitBirch(
+     ClassNamePrefixFeaturesOutMixin,
+     ClusterMixin,
+     TransformerMixin,
+     BaseEstimator,
+     _BitBirch,
+ ):
+     r"""Implements the BitBIRCH clustering algorithm, 'Lean' version.
+
+     Inputs to this estimator are *packed* fingerprints by default. If you need a
+     class that always accepts unpacked input, use `bblean.sklearn.UnpackedBitBirch`
+
+     See `bblean.bitbirch.BitBirch` for more details"""
+
+     _parameter_constraints: dict[str, list[tp.Any]] = {}
+
+     def __init__(
+         self,
+         *,
+         threshold: float = 0.65,
+         branching_factor: int = 50,
+         merge_criterion: str | MergeAcceptFunction | None = None,
+         tolerance: float | None = None,
+         compute_labels: bool = True,
+     ):
+         super().__init__(
+             threshold=threshold,
+             branching_factor=branching_factor,
+             merge_criterion=merge_criterion,
+             tolerance=tolerance,
+         )
+         self.compute_labels = compute_labels
+
+     @_fit_context(prefer_skip_nested_validation=True)
+     def fit(  # type: ignore
+         self, X, y=None, input_is_packed: bool = True, n_features: int | None = None
+     ) -> tpx.Self:
+         super().fit(X, input_is_packed=input_is_packed, n_features=n_features)
+         centroids = np.stack(
+             [bf.unpacked_centroid for bf in self._get_leaf_bfs(sort=True)]
+         )
+         self.subcluster_centers_ = centroids
+         self.subcluster_labels_ = np.arange(1, len(centroids) + 1)
+         self._n_features_out = centroids.shape[0]
+         if self.compute_labels:
+             self.labels_ = self.get_assignments()
+         return self
+
+     @_fit_context(prefer_skip_nested_validation=True)
+     def partial_fit(  # type: ignore
+         self,
+         X=None,
+         y=None,
+         input_is_packed: bool = True,
+         n_features: int | None = None,
+     ) -> tpx.Self:
+         if X is None:
+             raise ValueError("partial_fit requires X; fitting without data is not supported")
+         self.fit(X, input_is_packed=input_is_packed, n_features=n_features)
+         if self.compute_labels:
+             self.labels_ = self.get_assignments()
+         return self
+
+     # Overloaded since self.labels_ may not be set
+     def fit_predict(  # type: ignore
+         self, X, y=None, input_is_packed: bool = True, n_features: int | None = None
+     ) -> NDArray[np.integer]:
+         self.fit(X, input_is_packed=input_is_packed, n_features=n_features)
+         if not self.compute_labels:
+             self.labels_ = self.get_assignments()
+         return self.labels_
+
+     def predict(  # type: ignore
+         self, X, input_is_packed: bool = True, n_features: int | None = None
+     ) -> NDArray[np.integer]:
+         """Predict data using the ``centroids`` of subclusters."""
+         check_is_fitted(self)
+         X = validate_data(self, X, accept_sparse="csr", reset=False)
+         X = (
+             (unpack_fingerprints(X, n_features=n_features) if input_is_packed else X)
+             .astype(np.uint8, copy=False)
+             .view(np.bool)
+         )
+         # TODO: Even when both inputs are bool, this function warns for some reason.
+         # I believe this may be a sklearn bug
+         centers = self.subcluster_centers_.astype(np.uint8, copy=False).view(np.bool)
+         argmin = pairwise_distances_argmin(X, centers, metric="jaccard")
+         return self.subcluster_labels_[argmin]
+
+     def transform(  # type: ignore
+         self,
+         X,
+         input_is_packed: bool = True,
+         n_features: int | None = None,
+     ):
+         check_is_fitted(self)
+         X = validate_data(self, X, accept_sparse="csr", reset=False)
+         X = (
+             (unpack_fingerprints(X, n_features=n_features) if input_is_packed else X)
+             .astype(np.uint8, copy=False)
+             .view(np.bool)
+         )
+         centers = self.subcluster_centers_.astype(np.uint8, copy=False).view(np.bool)
+         return pairwise_distances(X, centers, metric="jaccard")
+
+     def __sklearn_tags__(self):  # type: ignore
+         tags = super().__sklearn_tags__()
+         tags.input_tags.sparse = True
+         return tags
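
A hedged sketch of the sklearn-style workflow described in the comments above; the input here is random and packed with `np.packbits` (an assumption about the packed layout):

```python
# Fit on packed fingerprints, read cluster labels, then use transform() to get
# Jaccard distances to the subcluster centers.
import numpy as np
from bblean.sklearn import BitBirch

rng = np.random.default_rng(4)
fps = np.packbits((rng.random((200, 2048)) < 0.1).astype(np.uint8), axis=1)

model = BitBirch(threshold=0.65, branching_factor=50)
labels = model.fit_predict(fps)  # one integer label per fingerprint
dists = model.transform(fps)     # shape (200, n_subclusters)
assert dists.shape[1] == len(model.subcluster_labels_)
```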
+
+
+ class UnpackedBitBirch(BitBirch):
+     r"""Implements the BitBIRCH clustering algorithm, 'Lean' version.
+
+     Inputs to this estimator are always *unpacked* fingerprints
+
+     See `bblean.bitbirch.BitBirch` for more details"""
+
+     def fit(  # type: ignore
+         self, X, y=None, input_is_packed: bool = False, n_features: int | None = None
+     ) -> tpx.Self:
+         return super().fit(X, y, input_is_packed=input_is_packed, n_features=n_features)
+
+     def partial_fit(  # type: ignore
+         self, X, y=None, input_is_packed: bool = False, n_features: int | None = None
+     ):
+         return super().partial_fit(
+             X, y, input_is_packed=input_is_packed, n_features=n_features
+         )
+
+     def fit_predict(  # type: ignore
+         self, X, y=None, input_is_packed: bool = False, n_features: int | None = None
+     ):
+         return super().fit_predict(
+             X, y, input_is_packed=input_is_packed, n_features=n_features
+         )
+
+     def predict(  # type: ignore
+         self,
+         X,
+         input_is_packed: bool = False,
+         n_features: int | None = None,
+     ):
+         return super().predict(
+             X, input_is_packed=input_is_packed, n_features=n_features
+         )
+
+     def transform(  # type: ignore
+         self, X, input_is_packed: bool = False, n_features: int | None = None
+     ):
+         return super().transform(
+             X, input_is_packed=input_is_packed, n_features=n_features
+         )
bblean/smiles.py ADDED
@@ -0,0 +1,61 @@
+ r"""SMILES manipulation"""
+
+ import typing as tp
+ from numpy.typing import NDArray
+ import numpy as np
+ from pathlib import Path
+
+ from bblean.utils import batched
+
+ __all__ = [
+     "load_smiles",
+     "calc_num_smiles",
+     "iter_smiles_from_paths",
+ ]
+
+ SmilesPaths = tp.Iterable[Path | str] | Path | str
+
+
+ def load_smiles(smiles_paths: SmilesPaths, max_num: int = -1) -> NDArray[np.str_]:
+     r"""Simple utility to load smiles from one or more ``*.smi`` files"""
+     smiles = []
+     for i, smi in enumerate(iter_smiles_from_paths(smiles_paths)):
+         if i == max_num:
+             break
+         smiles.append(smi)
+     return np.asarray(smiles)
+
+
+ def calc_num_smiles(smiles_paths: SmilesPaths) -> int:
+     r"""Get the total number of smiles in a sequence of paths"""
+     return sum(1 for _ in iter_smiles_from_paths(smiles_paths))
+
+
+ def iter_smiles_from_paths(
+     smiles_paths: SmilesPaths,
+ ) -> tp.Iterator[str]:
+     r"""Iterate over smiles in a sequence of smiles paths"""
+     if isinstance(smiles_paths, (Path, str)):
+         smiles_paths = [smiles_paths]
+     for smi_path in smiles_paths:
+         with open(smi_path, mode="rt", encoding="utf-8") as f:
+             for smi in f:
+                 # Strip the trailing newline so consumers get clean SMILES strings
+                 yield smi.strip()
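
A short sketch of the loading helpers; `molecules.smi` is a hypothetical input file with one SMILES string per line:

```python
from bblean.smiles import calc_num_smiles, load_smiles

total = calc_num_smiles("molecules.smi")                # full line count
first_100 = load_smiles("molecules.smi", max_num=100)   # NumPy array of strings
```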
+
+
+ def _iter_ranges_and_smiles_batches(
+     smiles_paths: SmilesPaths,
+     num_per_batch: int,
+ ) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
+     start_idx = 0
+     for batch in batched(iter_smiles_from_paths(smiles_paths), num_per_batch):
+         size = len(batch)
+         end_idx = start_idx + size
+         yield (start_idx, end_idx), batch
+         start_idx = end_idx
+
+
+ def _iter_idxs_and_smiles_batches(
+     smiles_paths: SmilesPaths, num_per_batch: int
+ ) -> tp.Iterable[tuple[int, tuple[str, ...]]]:
+     yield from enumerate(batched(iter_smiles_from_paths(smiles_paths), num_per_batch))
bblean/utils.py ADDED
@@ -0,0 +1,130 @@
+ r"""Misc. utility functions"""
+
+ import os
+ from pathlib import Path
+ import itertools
+ import typing as tp
+ import sys
+ import subprocess
+ import platform
+ import importlib
+
+ import psutil
+ import numpy as np
+
+ __all__ = [
+     "batched",
+     "min_safe_uint",
+     "cpp_extensions_are_enabled",
+     "cpp_extensions_are_installed",
+ ]
+
+ _T = tp.TypeVar("_T")
+
+
+ def min_safe_uint(nmax: int) -> np.dtype:
+     r"""Return the min uint dtype that holds a (positive) py int, excluding "object".
+
+     Input must be a positive python integer.
+     """
+     out = np.min_scalar_type(nmax)
+     # Check if the dtype is a pointer to a python bigint
+     if out.hasobject:
+         raise ValueError(f"Value {nmax} is too large to hold in a uint64 array")
+     return out
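
A quick sketch of the dtype selection (`np.min_scalar_type` under the hood):

```python
import numpy as np
from bblean.utils import min_safe_uint

assert min_safe_uint(255) == np.uint8      # fits in a single byte
assert min_safe_uint(2**32) == np.uint64   # just above the uint32 range
try:
    min_safe_uint(2**64)  # exceeds uint64, would need a Python bigint
except ValueError:
    pass
```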
+
+
+ # Itertools recipe
+ def batched(iterable: tp.Iterable[_T], n: int) -> tp.Iterator[tuple[_T, ...]]:
+     r"""Batch data into tuples of length n. The last batch may be shorter.
+
+     This is equivalent to the batched recipe from `itertools`.
+     """
+     # batched('ABCDEFG', 3) --> ('A', 'B', 'C') ('D', 'E', 'F') ('G',)
+     if n < 1:
+         raise ValueError("n must be at least one")
+     it = iter(iterable)
+     while batch := tuple(itertools.islice(it, n)):
+         yield batch
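
The behavior documented in the recipe comment above, verbatim:

```python
from bblean.utils import batched

assert list(batched("ABCDEFG", 3)) == [("A", "B", "C"), ("D", "E", "F"), ("G",)]
```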
+
+
+ def _import_bitbirch_variant(
+     variant: str = "lean",
+ ) -> tuple[tp.Any, tp.Callable[..., None]]:
+     if variant not in ("lean", "int64", "uint8"):
+         raise ValueError(f"Unknown variant {variant}")
+     if variant == "lean":
+         # Most up-to-date bb variant
+         module = importlib.import_module("bblean.bitbirch")
+     elif variant == "uint8":
+         # Legacy variant of bb that uses uint8 and supports packing, but no extra optim
+         module = importlib.import_module("bblean._legacy.bb_uint8")
+     elif variant == "int64":
+         # Legacy variant of bb that uses int64 fps (dense only)
+         module = importlib.import_module("bblean._legacy.bb_int64")
+
+     Cls = getattr(module, "BitBirch")
+     fn = getattr(module, "set_merge")
+     return Cls, fn
+
+
+ def _num_avail_cpus() -> int:
+     if sys.platform == "darwin":
+         # macOS doesn't expose cpu affinity, so assume all CPUs are available
+         return os.cpu_count() or 1
+     return len(psutil.Process().cpu_affinity())
+
+
+ def _cpu_name() -> str:
+     if sys.platform == "darwin":
+         try:
+             return subprocess.run(
+                 ["sysctl", "-n", "machdep.cpu.brand_string"],
+                 capture_output=True,
+                 text=True,
+                 check=True,
+             ).stdout.strip()
+         except Exception:
+             pass
+
+     if sys.platform == "linux":
+         with open("/proc/cpuinfo") as f:
+             for line in f:
+                 if line.startswith("model name"):
+                     return line.split(":", 1)[1].strip()
+
+     # Fallback for windows and all cases where it could not be found
+     return platform.processor()
+
+
+ def _has_files_or_valid_symlinks(path: Path) -> bool:
+     # A single broken symlink invalidates the whole directory
+     has_files = False
+     for p in path.iterdir():
+         if p.is_symlink() and not p.exists():
+             return False
+
+         if p.is_file():
+             has_files = True
+     return has_files
+
+
+ def cpp_extensions_are_enabled() -> bool:
+     r"""Query whether the C++ BitBIRCH extensions are currently enabled"""
+     if os.getenv("BITBIRCH_NO_EXTENSIONS"):
+         return False
+     try:
+         from bblean._cpp_similarity import jt_isim_from_sum  # noqa
+
+         return True
+     except ImportError:
+         return False
+
+
+ def cpp_extensions_are_installed() -> bool:
+     r"""Query whether the C++ BitBIRCH extensions are currently installed"""
+     try:
+         from bblean._cpp_similarity import jt_isim_from_sum  # noqa
+
+         return True
+     except ImportError:
+         return False
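
A sketch of forcing the pure-Python code path. Note that `bblean.similarity` reads `BITBIRCH_NO_EXTENSIONS` at import time, so the variable must be set before the first `bblean` import:

```python
import os

os.environ["BITBIRCH_NO_EXTENSIONS"] = "1"  # must happen before importing bblean

from bblean.utils import cpp_extensions_are_enabled, cpp_extensions_are_installed

print(cpp_extensions_are_installed())  # True if the compiled module is present
print(cpp_extensions_are_enabled())    # False while the env var is set
```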