bblean-0.6.0b1-cp312-cp312-macosx_10_13_universal2.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cpython-312-darwin.so +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1854 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b1.dist-info/METADATA +283 -0
- bblean-0.6.0b1.dist-info/RECORD +31 -0
- bblean-0.6.0b1.dist-info/WHEEL +6 -0
- bblean-0.6.0b1.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b1.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b1.dist-info/top_level.txt +1 -0
bblean/_timer.py
ADDED
@@ -0,0 +1,42 @@
+r"""General timing tools"""
+
+import json
+from pathlib import Path
+import time
+
+from rich.console import Console
+
+
+class Timer:
+    def __init__(self) -> None:
+        self._timings_s: dict[str, float] = {}
+
+    @property
+    def timings_s(self) -> dict[str, float]:
+        return self._timings_s.copy()
+
+    def init_timing(self, label: str = "total") -> None:
+        if label in self._timings_s:
+            raise ValueError(f"{label} has already been tracked")
+        self._timings_s[label] = time.perf_counter()
+
+    def end_timing(
+        self, label: str = "total", console: Console | None = None, indent: bool = True
+    ) -> None:
+        if label not in self._timings_s:
+            raise ValueError(f"{label} has not been initialized")
+        self._timings_s[label] = time.perf_counter() - self._timings_s[label]
+        t = self._timings_s[label]
+        if console is not None:
+            if indent:
+                indent_str = " "
+            else:
+                indent_str = ""
+            if label == "total":
+                console.print(f"{indent_str}- Total time elapsed: {t:.4f} s")
+            else:
+                console.print(f"{indent_str}- Time for {label}: {t:.4f} s")
+
+    def dump(self, path: Path) -> None:
+        with open(path, mode="wt", encoding="utf-8") as f:
+            json.dump(self._timings_s, f, indent=4)
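A minimal usage sketch of the Timer class above, assuming bblean is importable; the "clustering" label and the output path are illustrative:

from pathlib import Path
from rich.console import Console
from bblean._timer import Timer

console = Console()
timer = Timer()
timer.init_timing()              # default label is "total"
timer.init_timing("clustering")  # illustrative label for one phase
...                              # timed work goes here
timer.end_timing("clustering", console=console)  # prints "- Time for clustering: <t> s"
timer.end_timing(console=console)                # prints "- Total time elapsed: <t> s"
timer.dump(Path("timings.json"))                 # writes the elapsed seconds as JSON

Note that end_timing replaces the stored start time with the elapsed time in place, so each label can be timed only once per Timer instance.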
bblean/_version.py
ADDED
@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+
+__version__ = version = '0.6.0b1'
+__version_tuple__ = version_tuple = (0, 6, 0, 'b1')
+
+__commit_id__ = commit_id = None
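Since this generated module only assigns constants, version metadata can be read directly; a small sketch:

from bblean._version import __version__, __version_tuple__, __commit_id__

print(__version__)        # '0.6.0b1'
print(__version_tuple__)  # (0, 6, 0, 'b1')
print(__commit_id__)      # None in this build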
bblean/analysis.py
ADDED
@@ -0,0 +1,258 @@
+r"""Analysis of clustering results"""
+
+from pathlib import Path
+from collections import defaultdict
+import dataclasses
+import typing as tp
+from functools import cached_property
+
+import pandas as pd
+import numpy as np
+from numpy.typing import NDArray
+from rdkit.Chem.Scaffolds import MurckoScaffold
+
+from bblean._config import DEFAULTS
+from bblean.similarity import jt_isim
+from bblean.fingerprints import (
+    fps_from_smiles,
+    unpack_fingerprints,
+    pack_fingerprints,
+    _FingerprintFileSequence,
+)
+
+__all__ = [
+    "scaffold_analysis",
+    "cluster_analysis",
+    "ScaffoldAnalysis",
+    "ClusterAnalysis",
+]
+
+
+@dataclasses.dataclass
+class ScaffoldAnalysis:
+    r""":meta private:"""
+
+    unique_num: int
+    isim: float
+
+
+class ClusterAnalysis:
+    r""":meta private:"""
+
+    def __init__(
+        self,
+        selected_cluster_sizes: list[int],
+        all_cluster_sizes: list[int],
+        df: pd.DataFrame,
+        total_fps_num: int,
+        selected_fps: NDArray[np.uint8] | None = None,
+        fps_are_packed: bool = True,
+        n_features: int | None = None,
+        min_size: int | None = None,
+    ) -> None:
+        self.total_fps = total_fps_num
+        self.stats = pd.Series(all_cluster_sizes).describe()
+        self._all_cluster_sizes = all_cluster_sizes
+        self._selected_cluster_sizes = selected_cluster_sizes
+        self._fps = selected_fps
+        self._df = df
+        self.fps_are_packed = fps_are_packed
+        self.n_features = n_features
+        self.min_size = min_size
+
+    def all_clusters_num_with_size_above(self, size: int) -> int:
+        return sum(1 for c in self._all_cluster_sizes if c > size)
+
+    @cached_property
+    def all_singletons_num(self) -> int:
+        return sum(1 for c in self._all_cluster_sizes if c == 1)
+
+    def get_top_cluster_fps(self, packed: bool = True) -> list[NDArray[np.uint8]]:
+        if self._fps is None:
+            raise RuntimeError("Fingerprints not present")
+        fps = self.top_packed_fps if packed else self.top_unpacked_fps
+        out = []
+        offset = 0
+        for s in self._selected_cluster_sizes:
+            out.append(fps[offset : offset + s])
+            offset += s
+        return out
+
+    @property
+    def all_clusters_mean_size(self) -> float:
+        return float(self.stats["mean"])
+
+    @property
+    def all_clusters_median_size(self) -> int:
+        return int(self.stats["50%"])
+
+    @property
+    def all_clusters_q1(self) -> int:
+        return int(self.stats["25%"])
+
+    @property
+    def all_clusters_q3(self) -> int:
+        return int(self.stats["75%"])
+
+    @property
+    def all_clusters_min_size(self) -> int:
+        return int(self.stats["min"])
+
+    @property
+    def all_clusters_max_size(self) -> int:
+        return int(self.stats["max"])
+
+    @property
+    def all_clusters_num(self) -> int:
+        return int(self.stats["count"])
+
+    @property
+    def top_unpacked_fps(self) -> NDArray[np.uint8]:
+        if self._fps is None:
+            raise RuntimeError("Fingerprints not present")
+        if self.fps_are_packed:
+            return unpack_fingerprints(self._fps, self.n_features)
+        return self._fps
+
+    @property
+    def top_packed_fps(self) -> NDArray[np.uint8]:
+        if self._fps is None:
+            raise RuntimeError("Fingerprints not present")
+        if self.fps_are_packed:
+            return self._fps
+        return pack_fingerprints(self._fps)
+
+    @property
+    def has_scaffolds(self) -> bool:
+        return "unique_scaffolds_num" in self._df.columns
+
+    @property
+    def has_fps(self) -> bool:
+        return self._fps is not None
+
+    @property
+    def has_all_clusters(self) -> bool:
+        return self.clusters_num == self.all_clusters_num
+
+    @property
+    def clusters_num(self) -> int:
+        return len(self._df)
+
+    @property
+    def isims(self) -> pd.Series:
+        return self._df["isim"]
+
+    @property
+    def labels(self) -> pd.Series:
+        return self._df["labels"]
+
+    @property
+    def sizes(self) -> pd.Series:
+        return self._df["sizes"]
+
+    @property
+    def unique_scaffolds_num(self) -> pd.Series:
+        return self._df["unique_scaffolds_num"]
+
+    @property
+    def unique_scaffolds_isim(self) -> pd.Series:
+        return self._df["unique_scaffolds_isim"]
+
+    def dump_metrics(self, path: Path) -> None:
+        self._df.to_csv(path, index=False)
+
+
+# Get the number of unique scaffolds and the scaffold isim
+def scaffold_analysis(
+    smiles: tp.Iterable[str], fp_kind: str = DEFAULTS.fp_kind
+) -> ScaffoldAnalysis:
+    r"""Perform a scaffold analysis of a sequence of smiles
+
+    Note that the order of the input smiles is not relevant
+    """
+    if isinstance(smiles, str):
+        smiles = [smiles]
+    scaffolds = [MurckoScaffold.MurckoScaffoldSmilesFromSmiles(smi) for smi in smiles]
+    unique_scaffolds = set(scaffolds)
+    scaffolds_fps = fps_from_smiles(unique_scaffolds, kind=fp_kind, pack=False)
+    scaffolds_isim = jt_isim(scaffolds_fps, input_is_packed=False)
+    return ScaffoldAnalysis(len(unique_scaffolds), scaffolds_isim)
+
+
+def cluster_analysis(
+    clusters: list[list[int]],
+    fps: NDArray[np.integer] | Path | tp.Sequence[Path] | None = None,
+    smiles: tp.Iterable[str] = (),
+    n_features: int | None = None,
+    top: int | None = 20,
+    assume_sorted: bool = True,
+    scaffold_fp_kind: str = DEFAULTS.fp_kind,
+    input_is_packed: bool = True,
+    min_size: int = 0,
+) -> ClusterAnalysis:
+    r"""Perform a cluster analysis starting from clusters, smiles, and fingerprints"""
+    if isinstance(smiles, str):
+        smiles = [smiles]
+    smiles = np.asarray(smiles)
+
+    if not assume_sorted:
+        # Largest first
+        clusters = sorted(clusters, key=lambda x: len(x), reverse=True)
+    all_cluster_sizes = [len(c) for c in clusters]
+    total_fps = sum(all_cluster_sizes)
+    # Filter by min size
+    _clusters = []
+    for i, c in enumerate(clusters):
+        if all_cluster_sizes[i] < min_size:
+            break
+        if top is not None and i >= top:
+            break
+        _clusters.append(c)
+    clusters = _clusters
+
+    info: dict[str, list[tp.Any]] = defaultdict(list)
+    fps_provider: tp.Union[_FingerprintFileSequence, NDArray[np.uint8], None]
+    if fps is None:
+        fps_provider = None
+    elif isinstance(fps, Path):
+        fps_provider = np.load(fps, mmap_mode="r")
+    elif not isinstance(fps, np.ndarray):
+        fps_provider = _FingerprintFileSequence(fps)
+    else:
+        fps_provider = tp.cast(NDArray[np.uint8], fps.astype(np.uint8, copy=False))
+
+    if fps_provider is None:
+        selected = None
+    else:
+        selected = np.empty(
+            (sum(len(c) for c in clusters), fps_provider.shape[1]), dtype=np.uint8
+        )
+    start = 0
+    for i, c in enumerate(clusters, 1):
+        size = len(c)
+        # If a file sequence is passed, the cluster indices must be sorted.
+        # the cluster analysis is idx-order-independent, so this is fine
+        info["labels"].append(i)
+        info["sizes"].append(size)
+        if smiles.size:
+            analysis = scaffold_analysis(smiles[c], fp_kind=scaffold_fp_kind)
+            info["unique_scaffolds_num"].append(analysis.unique_num)
+            info["unique_scaffolds_isim"].append(analysis.isim)
+        if fps_provider is not None:
+            assert selected is not None
+            _fps = fps_provider[sorted(c)]
+            info["isim"].append(
+                jt_isim(_fps, input_is_packed=input_is_packed, n_features=n_features)
+            )
+            selected[start : start + size] = _fps
+            start += size
+    return ClusterAnalysis(
+        [len(c) for c in clusters],
+        all_cluster_sizes,
+        pd.DataFrame(info),
+        selected_fps=selected,
+        total_fps_num=total_fps,
+        fps_are_packed=input_is_packed,
+        n_features=n_features,
+        min_size=min_size,
+    )
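A sketch of how the public entry points above compose, assuming unpacked uint8 fingerprints held in memory; the fingerprint array and cluster index lists here are illustrative:

from pathlib import Path
import numpy as np
from bblean.analysis import cluster_analysis

# Illustrative data: 6 random 2048-bit fingerprints and two clusters of row
# indices into the array, sorted largest-first (required when assume_sorted=True).
fps = np.random.randint(0, 2, size=(6, 2048), dtype=np.uint8)
clusters = [[0, 1, 2, 3], [4, 5]]

result = cluster_analysis(clusters, fps=fps, input_is_packed=False, n_features=2048)
print(result.clusters_num)            # 2
print(result.all_clusters_mean_size)  # 3.0
print(result.isims)                   # per-cluster jt_isim values
result.dump_metrics(Path("cluster_metrics.csv"))  # labels, sizes, isim columns

Passing a SMILES sequence alongside the fingerprints additionally populates the unique_scaffolds_num and unique_scaffolds_isim columns via scaffold_analysis, with the SMILES indexed by the same cluster index lists.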