bblean-0.6.0b2-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cp313-win_amd64.pyd +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1850 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b2.dist-info/METADATA +288 -0
- bblean-0.6.0b2.dist-info/RECORD +31 -0
- bblean-0.6.0b2.dist-info/WHEEL +5 -0
- bblean-0.6.0b2.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b2.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b2.dist-info/top_level.txt +1 -0
bblean/_memory.py
ADDED
@@ -0,0 +1,198 @@
+r"""Monitor and collect memory stats"""
+
+import typing as tp
+import mmap
+import warnings
+from enum import Enum
+import ctypes
+import dataclasses
+from pathlib import Path
+import sys
+import time
+import os
+import multiprocessing as mp
+
+import typing_extensions as tpx
+import psutil
+import numpy as np
+from numpy.typing import NDArray
+from rich.console import Console
+
+_BYTES_TO_GIB = 1 / 1024**3
+
+
+class Madv(Enum):
+    WILLNEED = 3
+    SEQUENTIAL = 2
+    # PAGEOUT and DONTNEED reduce memory usage by around 40%
+    # TODO: Check exactly what DONTNEED does. I believe PAGEOUT *swaps out*,
+    # so DONTNEED may be preferred since it may have less perf. penalty
+    DONTNEED = 4
+    PAGEOUT = 21
+    FREE = 8  # *ONLY* works on anonymous pages (not file-backed like numpy arrays)
+    # COLD does *not* immediately release memory, it is just a soft hint that
+    # those pages won't be needed soon
+    COLD = 20
+
+
+# Get a handle to the system's libc
+def _get_libc() -> tp.Any:
+    if sys.platform == "linux":
+        return ctypes.CDLL("libc.so.6", use_errno=True)
+    elif sys.platform == "darwin":
+        return ctypes.CDLL("libc.dylib", use_errno=True)
+    # For now, do nothing on Windows
+    return None
+
+
+# This reduces memory usage by around 40%, since the kernel can release pages
+# once the array has been iterated over. The issue is that, after this has been
+# done, the array is out of RAM, so refinement is not possible.
+def _madvise_dontneed(page_start: int, size: int) -> None:
+    _madvise(page_start, size, Madv.DONTNEED)
+
+
+# Let the kernel know that access to this range of addrs will be sequential
+# (pages can be read ahead and discarded quickly after reading, if needed)
+def _madvise_sequential(page_start: int, size: int) -> None:
+    _madvise(page_start, size, Madv.SEQUENTIAL)
+
+
+def _madvise(page_start: int, size: int, opt: Madv) -> None:
+    libc = _get_libc()
+    if libc is None:
+        return
+    if libc.madvise(ctypes.c_void_p(page_start), size, opt.value) != 0:
+        errno = ctypes.get_errno()
+        warnings.warn(f"{opt} failed with error code {errno}")
+
+
+_Input = tp.Union[NDArray[np.integer], list[NDArray[np.integer]]]
+
+
+@dataclasses.dataclass
+class _ArrayMemPagesManager:
+    can_release: bool
+    _pagesizex: int
+    _iters_per_pagex: int
+    _curr_page_start_addr: int
+
+    @classmethod
+    def from_bb_input(cls, X: _Input, can_release: bool | None = None) -> tpx.Self:
+        pagesizex = mmap.PAGESIZE * 512
+        if (
+            isinstance(X, np.memmap)
+            and X.ndim == 2
+            and (pagesizex % X.shape[1] == 0)
+            and X.offset < X.shape[1]
+        ):
+            # In most cases pagesizex % n_features == 0 and offset < n_features.
+            # Every n_iters, release the prev page and add pagesizex to start_addr
+            iters_per_pagex = int(pagesizex / X.shape[1])  # ~ 8192 iterations
+            curr_page_start_addr = X.ctypes.data - X.offset
+            _can_release = True
+        else:
+            iters_per_pagex = 0
+            curr_page_start_addr = 0
+            _can_release = False
+        if can_release is not None:
+            _can_release = can_release
+        return cls(_can_release, pagesizex, iters_per_pagex, curr_page_start_addr)
+
+    def should_release_curr_page(self, row_idx: int) -> bool:
+        return row_idx % self._iters_per_pagex == 0
+
+    def release_curr_page_and_update_addr(self) -> None:
+        _madvise_dontneed(self._curr_page_start_addr, self._pagesizex)
+        self._curr_page_start_addr += self._pagesizex
+
+
+def _mmap_file_and_madvise_sequential(
+    path: Path, max_fps: int | None = None
+) -> NDArray[np.integer]:
+    arr = np.load(path, mmap_mode="r")[:max_fps]
+    # Numpy actually puts the *whole file* in mmap mode (arr + header).
+    # This means the array data starts at a nonzero offset into the backing
+    # buffer. If we want the address of the start of the file, we need to
+    # displace the addr of the array by the byte size of the header, which can
+    # be accessed through arr.offset.
+    #
+    # This is required since madvise needs a page-aligned address (the address
+    # must be a multiple of mmap.PAGESIZE (portable) == os.sysconf("SC_PAGE_SIZE")
+    # (mac|linux), typically 4096 B).
+    #
+    # TODO: In some cases, for some reason, this fails with errno 22. The
+    # failure is harmless, but could incur a slight perf penalty
+    _madvise_sequential(arr.ctypes.data - arr.offset, arr.nbytes)
+    return arr
+
+
+def system_mem_gib() -> tuple[float, float]:
+    mem = psutil.virtual_memory()
+    return mem.total * _BYTES_TO_GIB, mem.available * _BYTES_TO_GIB
+
+
+def get_peak_memory_gib(out_dir: Path) -> float | None:
+    file = out_dir / "max-rss.txt"
+    if not file.exists():
+        return None
+    with open(file, mode="r", encoding="utf-8") as f:
+        peak_mem_gib = float(f.read().strip())
+    return peak_mem_gib
+
+
+def monitor_rss_process(
+    file: Path | str, interval_s: float, start_time: float, parent_pid: int
+) -> None:
+    file = Path(file)
+    this_pid = os.getpid()
+    ps = psutil.Process(parent_pid)
+
+    def total_rss() -> float:
+        total_rss = ps.memory_info().rss
+        for proc in ps.children(recursive=True):
+            if proc.pid == this_pid:
+                continue
+            try:
+                total_rss += proc.memory_info().rss
+            except psutil.NoSuchProcess:
+                # Prevent a race condition, since a process may have finished
+                # before it can be polled
+                continue
+        return total_rss
+
+    with open(file, mode="w", encoding="utf-8") as f:
+        f.write("rss_gib,time_s\n")
+        f.flush()
+        os.fsync(f.fileno())
+
+    max_rss_gib = 0.0
+    while True:
+        total_rss_gib = total_rss() * _BYTES_TO_GIB
+        with open(file, mode="a", encoding="utf-8") as f:
+            f.write(f"{total_rss_gib},{time.perf_counter() - start_time}\n")
+            f.flush()
+            os.fsync(f.fileno())
+        if total_rss_gib > max_rss_gib:
+            max_rss_gib = total_rss_gib
+            with open(file.parent / "max-rss.txt", mode="w", encoding="utf-8") as f:
+                f.write(f"{max_rss_gib}\n")
+                f.flush()
+                os.fsync(f.fileno())
+        time.sleep(interval_s)
+
+
+def launch_monitor_rss_daemon(
+    out_file: Path, interval_s: float, console: Console | None = None
+) -> None:
+    if console is not None:
+        console.print("** Monitoring total RAM usage **\n")
+    mp.Process(
+        target=monitor_rss_process,
+        kwargs=dict(
+            file=out_file,
+            interval_s=interval_s,
+            start_time=time.perf_counter(),
+            parent_pid=os.getpid(),
+        ),
+        daemon=True,
+    ).start()
bblean/_merges.py
ADDED
@@ -0,0 +1,212 @@
+r"""Merging criteria for BitBIRCH clustering"""
+
+import numpy as np
+from numpy.typing import NDArray
+
+# NOTE: jt_isim_from_sum is equivalent to jt_isim_diameter_compl_from_sum
+from bblean.similarity import jt_isim_from_sum, jt_isim_radius_compl_from_sum
+
+BUILTIN_MERGES = [
+    "radius",
+    "diameter",
+    "tolerance-diameter",
+    "tolerance-radius",
+    "tolerance-legacy",
+    "never-merge",
+]
+
+
+class MergeAcceptFunction:
+    # For the merge functions, although the output of jt_isim_from_sum is f64,
+    # directly using f64 is *not* faster than starting with uint64
+    name: str = ""
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        raise NotImplementedError("Must be implemented by subclasses")
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}()"
+
+
+class RadiusMerge(MergeAcceptFunction):
+    name = "radius"
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        return jt_isim_radius_compl_from_sum(new_ls, new_n) >= threshold
+
+
+class DiameterMerge(MergeAcceptFunction):
+    name = "diameter"
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        return jt_isim_from_sum(new_ls, new_n) >= threshold
+
+
+class ToleranceDiameterMerge(MergeAcceptFunction):
+    name = "tolerance-diameter"
+    # NOTE: The reliability of the estimate of the cluster should be a function
+    # of the size of the old cluster, so in this metric the tolerance is larger
+    # for small clusters:
+    # tolerance = max{ alpha * (exp(-decay * N_old) - offset), 0 }
+
+    def __init__(
+        self,
+        tolerance: float = 0.05,
+        n_max: int = 1000,
+        decay: float = 1e-3,
+        adaptive: bool = True,
+    ) -> None:
+        self.tolerance = tolerance
+        self.decay = decay
+        self.offset = np.exp(-decay * n_max)
+        if not adaptive:
+            self.decay = 0.0
+            self.offset = 0.0
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        new_dc = jt_isim_from_sum(new_ls, new_n)
+        if new_dc < threshold:
+            return False
+        # If the old n is 1 then merge directly (infinite tolerance), since
+        # old_dc is undefined for a single fp
+        if old_n == 1:
+            return True
+        # Only merge if new_dc is greater than or equal to the old value, up to
+        # some tolerance, which decays with N
+        old_dc = jt_isim_from_sum(old_ls, old_n)
+        tol = max(self.tolerance * (np.exp(-self.decay * old_n) - self.offset), 0.0)
+        return new_dc >= old_dc - tol
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.tolerance})"
+
+
+class ToleranceRadiusMerge(ToleranceDiameterMerge):
+    name = "tolerance-radius"
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        new_rc = jt_isim_radius_compl_from_sum(new_ls, new_n)
+        if new_rc < threshold:
+            return False
+        if old_n == 1:
+            return True
+        old_rc = jt_isim_radius_compl_from_sum(old_ls, old_n)
+        tol = max(self.tolerance * (np.exp(-self.decay * old_n) - self.offset), 0.0)
+        return new_rc >= old_rc - tol
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.tolerance})"
+
+
+class NeverMerge(ToleranceDiameterMerge):
+    name = "never-merge"
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        return False
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}()"
+
+
+class ToleranceMerge(MergeAcceptFunction):
+    name = "tolerance-legacy"
+
+    def __init__(self, tolerance: float = 0.05) -> None:
+        self.tolerance = tolerance
+
+    def __call__(
+        self,
+        threshold: float,
+        new_ls: NDArray[np.integer],
+        new_n: int,
+        old_ls: NDArray[np.integer],
+        nom_ls: NDArray[np.integer],
+        old_n: int,
+        nom_n: int,
+    ) -> bool:
+        # The first two branches are equivalent to 'diameter'
+        new_dc = jt_isim_from_sum(new_ls, new_n)
+        if new_dc < threshold:
+            return False
+        if old_n == 1 or nom_n != 1:
+            return True
+        # 'new_dc >= threshold' and 'new_n == old_n + 1' are guaranteed here
+        old_dc = jt_isim_from_sum(old_ls, old_n)
+        return (new_dc * new_n - old_dc * (old_n - 1)) / 2 >= old_dc - self.tolerance
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({self.tolerance})"
+
+
+def get_merge_accept_fn(
+    merge_criterion: str, tolerance: float = 0.05
+) -> MergeAcceptFunction:
+    if merge_criterion == "radius":
+        return RadiusMerge()
+    elif merge_criterion == "diameter":
+        return DiameterMerge()
+    elif merge_criterion == "tolerance-legacy":
+        return ToleranceMerge(tolerance)
+    elif merge_criterion == "tolerance-diameter":
+        return ToleranceDiameterMerge(tolerance)
+    elif merge_criterion == "tolerance-radius":
+        return ToleranceRadiusMerge(tolerance)
+    elif merge_criterion == "never-merge":
+        return NeverMerge(tolerance)
+    raise ValueError(
+        f"Unknown merge criterion '{merge_criterion}'. Valid criteria are: "
+        "radius|diameter|tolerance-diameter|tolerance-radius|tolerance-legacy"
+        "|never-merge"
+    )
bblean/_py_similarity.py
ADDED
@@ -0,0 +1,278 @@
+r"""Fallback python implementation of molecular similarity calculators"""
+
+import warnings
+
+from numpy.typing import NDArray
+import numpy as np
+
+from bblean.utils import min_safe_uint
+from bblean.fingerprints import unpack_fingerprints, pack_fingerprints
+
+
+def centroid_from_sum(
+    linear_sum: NDArray[np.integer], n_samples: int, *, pack: bool = True
+) -> NDArray[np.uint8]:
+    r"""Calculates the majority-vote centroid from a sum of fingerprint values
+
+    The majority-vote centroid is a good approximation of the Tanimoto centroid.
+
+    Parameters
+    ----------
+    linear_sum : np.ndarray
+        Sum of the elements column-wise
+    n_samples : int
+        Number of samples
+    pack : bool
+        Whether to pack the resulting fingerprints
+
+    Returns
+    -------
+    centroid : np.ndarray[np.uint8]
+        Centroid fingerprint of the given set
+    """
+    # NOTE: Numpy guarantees bools are stored as 0x01 -> True and 0x00 -> False,
+    # so this view is fully safe
+    if n_samples <= 1:
+        centroid = linear_sum.astype(np.uint8, copy=False)
+    else:
+        centroid = (linear_sum >= n_samples * 0.5).view(np.uint8)
+    if pack:
+        return np.packbits(centroid, axis=-1)
+    return centroid
+
+
+def centroid(
+    fps: NDArray[np.uint8],
+    input_is_packed: bool = True,
+    n_features: int | None = None,
+    *,
+    pack: bool = True,
+) -> NDArray[np.uint8]:
+    r"""Calculates the majority-vote centroid from a set of fingerprints
+
+    The majority-vote centroid is a good approximation of the Tanimoto centroid.
+    """
+    if input_is_packed:
+        fps = unpack_fingerprints(fps, n_features)
+    return centroid_from_sum(
+        np.sum(fps, axis=0, dtype=np.uint64),  # type: ignore
+        len(fps),
+        pack=pack,
+    )
+
+
+def jt_compl_isim(
+    fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
+) -> NDArray[np.float64]:
+    """Get all complementary (Tanimoto) similarities of a set of fps, using iSIM"""
+    if input_is_packed:
+        fps = unpack_fingerprints(fps, n_features)
+    # Vectorized calculation of all complementary isim values.
+    # For each complementary isim, N is N_total - 1
+    n_objects = len(fps) - 1
+    if n_objects < 2:
+        msg = "Invalid fps. len(fps) must be >= 3"
+        warnings.warn(msg, RuntimeWarning, stacklevel=2)
+        return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
+    linear_sum = np.sum(fps, axis=0)
+    comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
+
+    return np.array(comp_sims, dtype=np.float64)
+
+
+def _jt_isim_medoid_index(
+    fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
+) -> int:
+    return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
+
+
+def jt_isim_medoid(
+    fps: NDArray[np.uint8],
+    input_is_packed: bool = True,
+    n_features: int | None = None,
+    pack: bool = True,
+) -> tuple[int, NDArray[np.uint8]]:
+    r"""Calculate the (Tanimoto) medoid of a set of fingerprints, using iSIM
+
+    Returns both the index of the medoid in the input array and the medoid itself
+
+    .. note::
+        Returns the first (or only) fingerprint for arrays of size 2 and 1,
+        respectively. Raises ValueError for arrays of size 0
+
+    """
+    if not fps.size:
+        raise ValueError("Size of fingerprints set must be > 0")
+    if input_is_packed:
+        fps = unpack_fingerprints(fps, n_features)
+    if len(fps) < 3:
+        idx = 0  # The medoid is undefined for sets of fewer than 3 fingerprints
+    else:
+        idx = _jt_isim_medoid_index(fps, input_is_packed=False)
+    m = fps[idx]
+    if pack:
+        return idx, pack_fingerprints(m)
+    return idx, m
+
+
+# Requires numpy >= 2.0
+def _popcount(a: NDArray[np.uint8]) -> NDArray[np.uint32]:
+    # 'a' is a packed uint8 array with last axis = bytes.
+    # Sum bit-counts across bytes to get per-object totals.
+
+    # If the number of columns is a multiple of 8, doing a bitwise count over
+    # the buffer reinterpreted as uint64 is slightly faster. This is zero cost
+    # if the exception is not triggered, and a column count that is not a
+    # multiple of 8 is a very unlikely scenario, since fps are typically 1024
+    # or 2048 bits
+    b: NDArray[np.integer]
+    try:
+        b = a.view(np.uint64)
+    except ValueError:
+        b = a
+    return np.bitwise_count(b).sum(axis=-1, dtype=np.uint32)
+
+
+# O(N) approximation to obtain the "most dissimilar fingerprints" within an array
+def jt_most_dissimilar_packed(
+    Y: NDArray[np.uint8], n_features: int | None = None
+) -> tuple[np.integer, np.integer, NDArray[np.float64], NDArray[np.float64]]:
+    """Finds two fps in a packed fp array that are the most Tanimoto-dissimilar
+
+    This is not guaranteed to find the most dissimilar fps; it is
+    a robust O(N) approximation that doesn't affect final cluster quality.
+    First find the centroid of Y, then find fp_1, the most dissimilar molecule
+    to the centroid. Finally, find fp_2, the most dissimilar molecule to fp_1
+
+    Returns
+    -------
+    fp_1 : int
+        Index of the first fingerprint
+    fp_2 : int
+        Index of the second fingerprint
+    sims_fp_1 : np.ndarray
+        Tanimoto similarities of Y to fp_1
+    sims_fp_2 : np.ndarray
+        Tanimoto similarities of Y to fp_2
+    """
+    # Get the centroid of the fps
+    n_samples = len(Y)
+    Y_unpacked = unpack_fingerprints(Y, n_features)
+    # np.sum() automatically promotes to uint64 unless forced to a smaller dtype
+    linear_sum = np.sum(Y_unpacked, axis=0, dtype=min_safe_uint(n_samples))
+    packed_centroid = centroid_from_sum(linear_sum, n_samples, pack=True)
+
+    cardinalities = _popcount(Y)
+
+    # Get the similarity of each fp to the centroid, and the least similar fp
+    # idx (fp_1)
+    sims_cent = _jt_sim_packed_precalc_cardinalities(Y, packed_centroid, cardinalities)
+    fp_1 = np.argmin(sims_cent)
+
+    # Get the similarity of each fp to fp_1, and the least similar fp idx (fp_2)
+    sims_fp_1 = _jt_sim_packed_precalc_cardinalities(Y, Y[fp_1], cardinalities)
+    fp_2 = np.argmin(sims_fp_1)
+
+    # Get the similarity of each fp to fp_2
+    sims_fp_2 = _jt_sim_packed_precalc_cardinalities(Y, Y[fp_2], cardinalities)
+    return fp_1, fp_2, sims_fp_1, sims_fp_2
+
+
+def _jt_sim_arr_vec_packed(
+    x: NDArray[np.uint8],
+    y: NDArray[np.uint8],
+) -> NDArray[np.float64]:
+    r"""Tanimoto similarity between packed fingerprints
+
+    Expects an array x of shape (N, F) and a vector y of shape (F,). A Numpy
+    array of shape (N,) is returned.
+    """
+    if x.ndim != 2 or y.ndim != 1:
+        raise ValueError("Expected a 2D array and a 1D vector as inputs")
+    return _jt_sim_packed_precalc_cardinalities(x, y, _popcount(x))
+
+
+def _jt_sim_packed_precalc_cardinalities(
+    x: NDArray[np.uint8],
+    y: NDArray[np.uint8],
+    cardinalities: NDArray[np.integer],
+) -> NDArray[np.float64]:
+    # 'cardinalities' must be the result of calling _popcount(x)
+
+    # The maximum value in the denominator sum is 2 * n_features (which
+    # typically fits in uint16, but we use uint32 for safety)
+    intersection = _popcount(np.bitwise_and(x, y))
+
+    # The return value requires an out-of-place operation, since it casts uints
+    # to f64.
+    #
+    # There may be NaNs in the similarity array if both the cardinality and the
+    # vector are all zeros, in which case the intersection is 0 -> 0 / 0.
+    #
+    # In these cases the fps are equal, so the similarity *should be 1*; to
+    # avoid dividing by zero we clamp the denominator, which is |A or B| (zero
+    # only if A & B is zero too).
+    return intersection / np.maximum(cardinalities + _popcount(y) - intersection, 1)
+
+
+def jt_isim_unpacked(arr: NDArray[np.integer]) -> float:
+    # A cast is slower than summing directly into uint64
+    return jt_isim_from_sum(
+        np.sum(arr, axis=0, dtype=np.uint64), len(arr)  # type: ignore
+    )
+
+
+def jt_isim_packed(fps: NDArray[np.integer], n_features: int | None = None) -> float:
+    # A cast is slower than summing directly into uint64
+    return jt_isim_from_sum(
+        np.sum(
+            unpack_fingerprints(fps, n_features),  # type: ignore
+            axis=0,
+            dtype=np.uint64,
+        ),
+        len(fps),
+    )
+
+
+def jt_isim_from_sum(linear_sum: NDArray[np.integer], n_objects: int) -> float:
+    r"""iSIM Tanimoto, from the sum of rows of a fingerprint array and the row count
+
+    iSIM Tanimoto was first proposed in:
+    https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b
+
+    :math:`iSIM_{JT}(X)` is an excellent :math:`O(N)` approximation of the average
+    Tanimoto similarity of a set of fingerprints.
+
+    It is also equivalent to the complement of the Tanimoto diameter:
+    :math:`iSIM_{JT}(X) = 1 - D_{JT}(X)`.
+
+    Parameters
+    ----------
+    linear_sum : np.ndarray
+        Sum of the elements of an array of fingerprints X, column-wise:
+        linear_sum = np.sum(X, axis=0)
+
+    n_objects : int
+        Number of elements:
+        n_objects = X.shape[0]
+
+    Returns
+    -------
+    isim : float
+        iSIM Jaccard-Tanimoto value
+    """
+    if n_objects < 2:
+        warnings.warn(
+            f"Invalid n_objects = {n_objects} in isim. Expected n_objects >= 2",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return np.nan
+
+    x = linear_sum.astype(np.uint64, copy=False)
+    sum_kq = np.sum(x)
+    # The isim of fingerprints that are all zeros should be 1 (they are all equal)
+    if sum_kq == 0:
+        return 1
+    sum_kqsq = np.dot(x, x)  # *dot* conserves dtype
+    a = (sum_kqsq - sum_kq) / 2  # 'a' is a scalar f64
+    return a / (a + n_objects * sum_kq - sum_kqsq)
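To make the closed form in `jt_isim_from_sum` concrete: with `k_c` the column sums of a 0/1 fingerprint array, `a = sum_c C(k_c, 2)` counts the per-column pairs of shared on-bits, and the denominator adds the per-column mismatch pairs, `n * sum(k) - sum(k^2)`. The self-contained check below (not part of the package, plain NumPy only; the sizes and seed are arbitrary) evaluates that formula directly and compares it against the brute-force O(N²) mean pairwise Jaccard-Tanimoto it approximates.

```python
import itertools

import numpy as np

rng = np.random.default_rng(0)
fps = rng.integers(0, 2, size=(50, 256), dtype=np.uint64)  # toy unpacked fps

# iSIM from the linear sum, following the same arithmetic as jt_isim_from_sum
x = fps.sum(axis=0)                 # k_c: per-column on-bit counts
sum_kq = x.sum()
sum_kqsq = np.dot(x, x)
a = (sum_kqsq - sum_kq) / 2         # sum_c C(k_c, 2), shared on-bit pairs
isim = a / (a + len(fps) * sum_kq - sum_kqsq)

# Brute-force O(N^2) mean pairwise Tanimoto for comparison
def jt(u, v):
    inter = np.minimum(u, v).sum()  # |A & B| for binary vectors
    return inter / (u.sum() + v.sum() - inter)

pairwise = np.mean([jt(u, v) for u, v in itertools.combinations(fps, 2)])
print(f"iSIM: {isim:.4f}  mean pairwise JT: {pairwise:.4f}")
```

The two numbers track each other closely on data like this, which is what lets this module use the linear sum alone (never the full pairwise matrix) in the merge criteria and medoid helpers above.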
|