bblean-0.6.0b1-cp312-cp312-macosx_10_13_universal2.whl

This diff shows the content of publicly available package versions as released to a supported public registry, and is provided for informational purposes only.
bblean/_timer.py ADDED
@@ -0,0 +1,42 @@
+ r"""General timing tools"""
+
+ import json
+ from pathlib import Path
+ import time
+
+ from rich.console import Console
+
+
+ class Timer:
+     def __init__(self) -> None:
+         self._timings_s: dict[str, float] = {}
+
+     @property
+     def timings_s(self) -> dict[str, float]:
+         return self._timings_s.copy()
+
+     def init_timing(self, label: str = "total") -> None:
+         if label in self._timings_s:
+             raise ValueError(f"{label} has already been tracked")
+         self._timings_s[label] = time.perf_counter()
+
+     def end_timing(
+         self, label: str = "total", console: Console | None = None, indent: bool = True
+     ) -> None:
+         if label not in self._timings_s:
+             raise ValueError(f"{label} has not been initialized")
+         self._timings_s[label] = time.perf_counter() - self._timings_s[label]
+         t = self._timings_s[label]
+         if console is not None:
+             if indent:
+                 indent_str = "    "
+             else:
+                 indent_str = ""
+             if label == "total":
+                 console.print(f"{indent_str}- Total time elapsed: {t:.4f} s")
+             else:
+                 console.print(f"{indent_str}- Time for {label}: {t:.4f} s")
+
+     def dump(self, path: Path) -> None:
+         with open(path, mode="wt", encoding="utf-8") as f:
+             json.dump(self._timings_s, f, indent=4)
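For context, a minimal usage sketch of the Timer class added above (the "clustering" label and the timings.json path are illustrative, not part of the package):

import time
from pathlib import Path
from rich.console import Console
from bblean._timer import Timer

timer = Timer()
timer.init_timing("clustering")  # stores a perf_counter() start under this label
time.sleep(0.1)  # ... timed work goes here ...
timer.end_timing("clustering", console=Console())  # swaps the start for elapsed seconds and prints it
timer.dump(Path("timings.json"))  # writes all recorded timings as indented JSON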
bblean/_version.py ADDED
@@ -0,0 +1,34 @@
+ # file generated by setuptools-scm
+ # don't change, don't track in version control
+
+ __all__ = [
+     "__version__",
+     "__version_tuple__",
+     "version",
+     "version_tuple",
+     "__commit_id__",
+     "commit_id",
+ ]
+
+ TYPE_CHECKING = False
+ if TYPE_CHECKING:
+     from typing import Tuple
+     from typing import Union
+
+     VERSION_TUPLE = Tuple[Union[int, str], ...]
+     COMMIT_ID = Union[str, None]
+ else:
+     VERSION_TUPLE = object
+     COMMIT_ID = object
+
+ version: str
+ __version__: str
+ __version_tuple__: VERSION_TUPLE
+ version_tuple: VERSION_TUPLE
+ commit_id: COMMIT_ID
+ __commit_id__: COMMIT_ID
+
+ __version__ = version = '0.6.0b1'
+ __version_tuple__ = version_tuple = (0, 6, 0, 'b1')
+
+ __commit_id__ = commit_id = None
bblean/analysis.py ADDED
@@ -0,0 +1,258 @@
+ r"""Analysis of clustering results"""
+
+ from pathlib import Path
+ from collections import defaultdict
+ import dataclasses
+ import typing as tp
+ from functools import cached_property
+
+ import pandas as pd
+ import numpy as np
+ from numpy.typing import NDArray
+ from rdkit.Chem.Scaffolds import MurckoScaffold
+
+ from bblean._config import DEFAULTS
+ from bblean.similarity import jt_isim
+ from bblean.fingerprints import (
+     fps_from_smiles,
+     unpack_fingerprints,
+     pack_fingerprints,
+     _FingerprintFileSequence,
+ )
+
+ __all__ = [
+     "scaffold_analysis",
+     "cluster_analysis",
+     "ScaffoldAnalysis",
+     "ClusterAnalysis",
+ ]
+
+
+ @dataclasses.dataclass
+ class ScaffoldAnalysis:
+     r""":meta private:"""
+
+     unique_num: int
+     isim: float
+
+
+ class ClusterAnalysis:
+     r""":meta private:"""
+
+     def __init__(
+         self,
+         selected_cluster_sizes: list[int],
+         all_cluster_sizes: list[int],
+         df: pd.DataFrame,
+         total_fps_num: int,
+         selected_fps: NDArray[np.uint8] | None = None,
+         fps_are_packed: bool = True,
+         n_features: int | None = None,
+         min_size: int | None = None,
+     ) -> None:
+         self.total_fps = total_fps_num
+         self.stats = pd.Series(all_cluster_sizes).describe()
+         self._all_cluster_sizes = all_cluster_sizes
+         self._selected_cluster_sizes = selected_cluster_sizes
+         self._fps = selected_fps
+         self._df = df
+         self.fps_are_packed = fps_are_packed
+         self.n_features = n_features
+         self.min_size = min_size
+
+     def all_clusters_num_with_size_above(self, size: int) -> int:
+         return sum(1 for c in self._all_cluster_sizes if c > size)
+
+     @cached_property
+     def all_singletons_num(self) -> int:
+         return sum(1 for c in self._all_cluster_sizes if c == 1)
+
+     def get_top_cluster_fps(self, packed: bool = True) -> list[NDArray[np.uint8]]:
+         if self._fps is None:
+             raise RuntimeError("Fingerprints not present")
+         fps = self.top_packed_fps if packed else self.top_unpacked_fps
+         out = []
+         offset = 0
+         for s in self._selected_cluster_sizes:
+             out.append(fps[offset : offset + s])
+             offset += s
+         return out
+
+     @property
+     def all_clusters_mean_size(self) -> float:
+         return float(self.stats["mean"])
+
+     @property
+     def all_clusters_median_size(self) -> int:
+         return int(self.stats["50%"])
+
+     @property
+     def all_clusters_q1(self) -> int:
+         return int(self.stats["25%"])
+
+     @property
+     def all_clusters_q3(self) -> int:
+         return int(self.stats["75%"])
+
+     @property
+     def all_clusters_min_size(self) -> int:
+         return int(self.stats["min"])
+
+     @property
+     def all_clusters_max_size(self) -> int:
+         return int(self.stats["max"])
+
+     @property
+     def all_clusters_num(self) -> int:
+         return int(self.stats["count"])
+
+     @property
+     def top_unpacked_fps(self) -> NDArray[np.uint8]:
+         if self._fps is None:
+             raise RuntimeError("Fingerprints not present")
+         if self.fps_are_packed:
+             return unpack_fingerprints(self._fps, self.n_features)
+         return self._fps
+
+     @property
+     def top_packed_fps(self) -> NDArray[np.uint8]:
+         if self._fps is None:
+             raise RuntimeError("Fingerprints not present")
+         if self.fps_are_packed:
+             return self._fps
+         return pack_fingerprints(self._fps)
+
+     @property
+     def has_scaffolds(self) -> bool:
+         return "unique_scaffolds_num" in self._df.columns
+
+     @property
+     def has_fps(self) -> bool:
+         return self._fps is not None
+
+     @property
+     def has_all_clusters(self) -> bool:
+         return self.clusters_num == self.all_clusters_num
+
+     @property
+     def clusters_num(self) -> int:
+         return len(self._df)
+
+     @property
+     def isims(self) -> pd.Series:
+         return self._df["isim"]
+
+     @property
+     def labels(self) -> pd.Series:
+         return self._df["labels"]
+
+     @property
+     def sizes(self) -> pd.Series:
+         return self._df["sizes"]
+
+     @property
+     def unique_scaffolds_num(self) -> pd.Series:
+         return self._df["unique_scaffolds_num"]
+
+     @property
+     def unique_scaffolds_isim(self) -> pd.Series:
+         return self._df["unique_scaffolds_isim"]
+
+     def dump_metrics(self, path: Path) -> None:
+         self._df.to_csv(path, index=False)
+
+
+ # Get the number of unique scaffolds and the scaffold isim
+ def scaffold_analysis(
+     smiles: tp.Iterable[str], fp_kind: str = DEFAULTS.fp_kind
+ ) -> ScaffoldAnalysis:
+     r"""Perform a scaffold analysis of a sequence of smiles
+
+     Note that the order of the input smiles is not relevant
+     """
+     if isinstance(smiles, str):
+         smiles = [smiles]
+     scaffolds = [MurckoScaffold.MurckoScaffoldSmilesFromSmiles(smi) for smi in smiles]
+     unique_scaffolds = set(scaffolds)
+     scaffolds_fps = fps_from_smiles(unique_scaffolds, kind=fp_kind, pack=False)
+     scaffolds_isim = jt_isim(scaffolds_fps, input_is_packed=False)
+     return ScaffoldAnalysis(len(unique_scaffolds), scaffolds_isim)
+
+
+ def cluster_analysis(
+     clusters: list[list[int]],
+     fps: NDArray[np.integer] | Path | tp.Sequence[Path] | None = None,
+     smiles: tp.Iterable[str] = (),
+     n_features: int | None = None,
+     top: int | None = 20,
+     assume_sorted: bool = True,
+     scaffold_fp_kind: str = DEFAULTS.fp_kind,
+     input_is_packed: bool = True,
+     min_size: int = 0,
+ ) -> ClusterAnalysis:
+     r"""Perform a cluster analysis starting from clusters, smiles, and fingerprints"""
+     if isinstance(smiles, str):
+         smiles = [smiles]
+     smiles = np.asarray(list(smiles))  # list() so generator inputs are materialized, not wrapped as a 0-d object array
+
+     if not assume_sorted:
+         # Largest first
+         clusters = sorted(clusters, key=lambda x: len(x), reverse=True)
+     all_cluster_sizes = [len(c) for c in clusters]
+     total_fps = sum(all_cluster_sizes)
+     # Filter by min size
+     _clusters = []
+     for i, c in enumerate(clusters):
+         if all_cluster_sizes[i] < min_size:
+             break
+         if top is not None and i >= top:
+             break
+         _clusters.append(c)
+     clusters = _clusters
+
+     info: dict[str, list[tp.Any]] = defaultdict(list)
+     fps_provider: tp.Union[_FingerprintFileSequence, NDArray[np.uint8], None]
+     if fps is None:
+         fps_provider = None
+     elif isinstance(fps, Path):
+         fps_provider = np.load(fps, mmap_mode="r")
+     elif not isinstance(fps, np.ndarray):
+         fps_provider = _FingerprintFileSequence(fps)
+     else:
+         fps_provider = tp.cast(NDArray[np.uint8], fps.astype(np.uint8, copy=False))
+
+     if fps_provider is None:
+         selected = None
+     else:
+         selected = np.empty(
+             (sum(len(c) for c in clusters), fps_provider.shape[1]), dtype=np.uint8
+         )
+     start = 0
+     for i, c in enumerate(clusters, 1):
+         size = len(c)
+         # If a file sequence is passed, the cluster indices must be sorted.
+         # The cluster analysis is idx-order-independent, so this is fine
+         info["labels"].append(i)
+         info["sizes"].append(size)
+         if smiles.size:
+             analysis = scaffold_analysis(smiles[c], fp_kind=scaffold_fp_kind)
+             info["unique_scaffolds_num"].append(analysis.unique_num)
+             info["unique_scaffolds_isim"].append(analysis.isim)
+         if fps_provider is not None:
+             assert selected is not None
+             _fps = fps_provider[sorted(c)]
+             info["isim"].append(
+                 jt_isim(_fps, input_is_packed=input_is_packed, n_features=n_features)
+             )
+             selected[start : start + size] = _fps
+             start += size
+     return ClusterAnalysis(
+         [len(c) for c in clusters],
+         all_cluster_sizes,
+         pd.DataFrame(info),
+         selected_fps=selected,
+         total_fps_num=total_fps,
+         fps_are_packed=input_is_packed,
+         n_features=n_features,
+         min_size=min_size,
+     )
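To make the API above concrete, a rough usage sketch (the SMILES, cluster index lists, random fingerprint array, and 2048-bit width are illustrative placeholders; in practice fingerprints would come from bblean.fingerprints.fps_from_smiles, and fps may also be a packed .npy path or a sequence of paths):

import numpy as np
from bblean.analysis import cluster_analysis, scaffold_analysis

# Illustrative inputs: 4 molecules, two clusters of indices (largest first)
smiles = ["c1ccccc1O", "c1ccccc1N", "c1ccncc1", "Cc1ccncc1"]
clusters = [[0, 1], [2, 3]]
fps = np.random.randint(0, 2, size=(4, 2048), dtype=np.uint8)  # unpacked placeholder fps

analysis = cluster_analysis(clusters, fps=fps, smiles=smiles, input_is_packed=False)
print(analysis.clusters_num, analysis.all_clusters_mean_size)
print(analysis.unique_scaffolds_num)  # per-cluster unique Murcko scaffold counts
print(scaffold_analysis(smiles).unique_num)  # unique scaffolds over all inputs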