bblean-0.6.0b2-cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bblean/_memory.py ADDED
@@ -0,0 +1,198 @@
+ r"""Monitor and collect memory stats"""
+
+ import typing as tp
+ import mmap
+ import warnings
+ from enum import Enum
+ import ctypes
+ import dataclasses
+ from pathlib import Path
+ import sys
+ import time
+ import os
+ import multiprocessing as mp
+
+ import typing_extensions as tpx
+ import psutil
+ import numpy as np
+ from numpy.typing import NDArray
+ from rich.console import Console
+
+ _BYTES_TO_GIB = 1 / 1024**3
+
+
+ class Madv(Enum):
+     WILLNEED = 3
+     SEQUENTIAL = 2
+     # PAGEOUT and DONTNEED reduce memory usage by around 40%
+     # TODO: Check exactly what DONTNEED does. I believe PAGEOUT *swaps out*,
+     # so DONTNEED may be preferred since it may have a smaller perf. penalty
+     DONTNEED = 4
+     PAGEOUT = 21
+     FREE = 8  # *ONLY* works on anonymous pages (not file-backed like numpy arrays)
+     # COLD does *not* immediately release memory, it is just a soft hint that
+     # those pages won't be needed soon
+     COLD = 20
+
+
+ # Get a handle to the system's libc
+ def _get_libc() -> tp.Any:
+     if sys.platform == "linux":
+         return ctypes.CDLL("libc.so.6", use_errno=True)
+     elif sys.platform == "darwin":
+         return ctypes.CDLL("libc.dylib", use_errno=True)
+     # For now, do nothing on Windows
+     return None
+
+
+ # This reduces memory usage by around 40%, since the kernel can release pages
+ # once the array has been iterated over. The caveat is that, once this has been
+ # done, the array is no longer resident in RAM, so refinement is not possible.
+ def _madvise_dontneed(page_start: int, size: int) -> None:
+     _madvise(page_start, size, Madv.DONTNEED)
+
+
+ # Let the kernel know that access to this range of addrs will be sequential
+ # (pages can be read ahead, and discarded quickly after being read if needed)
+ def _madvise_sequential(page_start: int, size: int) -> None:
+     _madvise(page_start, size, Madv.SEQUENTIAL)
+
+
+ def _madvise(page_start: int, size: int, opt: Madv) -> None:
+     libc = _get_libc()
+     if libc is None:
+         return
+     if libc.madvise(ctypes.c_void_p(page_start), size, opt.value) != 0:
+         errno = ctypes.get_errno()
+         warnings.warn(f"{opt} failed with error code {errno}")
+
+
+ _Input = tp.Union[NDArray[np.integer], list[NDArray[np.integer]]]
+
+
+ @dataclasses.dataclass
+ class _ArrayMemPagesManager:
+     can_release: bool
+     _pagesizex: int
+     _iters_per_pagex: int
+     _curr_page_start_addr: int
+
+     @classmethod
+     def from_bb_input(cls, X: _Input, can_release: bool | None = None) -> tpx.Self:
+         pagesizex = mmap.PAGESIZE * 512
+         if (
+             isinstance(X, np.memmap)
+             and X.ndim == 2
+             and (pagesizex % X.shape[1] == 0)
+             and X.offset < X.shape[1]
+         ):
+             # In most cases pagesizex % n_features == 0 and offset < n_features
+             # Every n_iters, release the prev page and add pagesizex to start_addr
+             iters_per_pagex = int(pagesizex / X.shape[1])  # ~ 8192 iterations
+             curr_page_start_addr = X.ctypes.data - X.offset
+             _can_release = True
+         else:
+             iters_per_pagex = 0
+             curr_page_start_addr = 0
+             _can_release = False
+         if can_release is not None:
+             _can_release = can_release
+         return cls(_can_release, pagesizex, iters_per_pagex, curr_page_start_addr)
+
+     def should_release_curr_page(self, row_idx: int) -> bool:
+         return row_idx % self._iters_per_pagex == 0
+
+     def release_curr_page_and_update_addr(self) -> None:
+         _madvise_dontneed(self._curr_page_start_addr, self._pagesizex)
+         self._curr_page_start_addr += self._pagesizex
+
+
+ def _mmap_file_and_madvise_sequential(
+     path: Path, max_fps: int | None = None
+ ) -> NDArray[np.integer]:
+     arr = np.load(path, mmap_mode="r")[:max_fps]
+     # Numpy actually puts the *whole file* in mmap mode (arr + header). This
+     # means the array data starts at a nonzero offset into the backing buffer.
+     # To get the address of the start of the file we need to displace the addr
+     # of the array by the byte size of the header, accessible through arr.offset
+     #
+     # This is required since madvise needs a page-aligned address (address must
+     # be a multiple of mmap.PAGESIZE (portable) == os.sysconf("SC_PAGE_SIZE")
+     # (mac|linux), typically 4096 B).
+     #
+     # TODO: In some cases, for some reason, this fails with errno 22. The
+     # failure is harmless, but could incur a slight perf penalty
+     _madvise_sequential(arr.ctypes.data - arr.offset, arr.nbytes)
+     return arr
+
+
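The two helpers above are meant to be driven together by a consumer that streams over a memmapped fingerprint file. A minimal sketch of that loop (the loop lives in the callers, not in this file; the file name and the guard against releasing on the first row are assumptions of this sketch):

    X = _mmap_file_and_madvise_sequential(Path("fps.npy"))  # illustrative path
    pages = _ArrayMemPagesManager.from_bb_input(X)
    for i, fp in enumerate(X):
        ...  # consume fp, e.g. insert it into the clustering tree
        if pages.can_release and i > 0 and pages.should_release_curr_page(i):
            # The previous 2 MiB chunk of pages (mmap.PAGESIZE * 512) has been
            # fully consumed, so tell the kernel it can drop it from memory
            pages.release_curr_page_and_update_addr()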
+ def system_mem_gib() -> tuple[float, float]:
+     mem = psutil.virtual_memory()
+     return mem.total * _BYTES_TO_GIB, mem.available * _BYTES_TO_GIB
+
+
+ def get_peak_memory_gib(out_dir: Path) -> float | None:
+     file = out_dir / "max-rss.txt"
+     if not file.exists():
+         return None
+     with open(file, mode="r", encoding="utf-8") as f:
+         peak_mem_gib = float(f.read().strip())
+     return peak_mem_gib
+
+
+ def monitor_rss_process(
+     file: Path | str, interval_s: float, start_time: float, parent_pid: int
+ ) -> None:
+     file = Path(file)
+     this_pid = os.getpid()
+     ps = psutil.Process(parent_pid)
+
+     def total_rss() -> int:
+         total_rss = ps.memory_info().rss
+         for proc in ps.children(recursive=True):
+             if proc.pid == this_pid:
+                 continue
+             try:
+                 total_rss += proc.memory_info().rss
+             except psutil.NoSuchProcess:
+                 # Prevent a race condition, since a process may have finished
+                 # before it can be polled
+                 continue
+         return total_rss
+
+     with open(file, mode="w", encoding="utf-8") as f:
+         f.write("rss_gib,time_s\n")
+         f.flush()
+         os.fsync(f.fileno())
+
+     max_rss_gib = 0.0
+     while True:
+         total_rss_gib = total_rss() * _BYTES_TO_GIB
+         with open(file, mode="a", encoding="utf-8") as f:
+             f.write(f"{total_rss_gib},{time.perf_counter() - start_time}\n")
+             f.flush()
+             os.fsync(f.fileno())
+         if total_rss_gib > max_rss_gib:
+             max_rss_gib = total_rss_gib
+             with open(file.parent / "max-rss.txt", mode="w", encoding="utf-8") as f:
+                 f.write(f"{max_rss_gib}\n")
+                 f.flush()
+                 os.fsync(f.fileno())
+         time.sleep(interval_s)
+
+
+ def launch_monitor_rss_daemon(
+     out_file: Path, interval_s: float, console: Console | None = None
+ ) -> None:
+     if console is not None:
+         console.print("** Monitoring total RAM usage **\n")
+     mp.Process(
+         target=monitor_rss_process,
+         kwargs=dict(
+             file=out_file,
+             interval_s=interval_s,
+             start_time=time.perf_counter(),
+             parent_pid=os.getpid(),
+         ),
+         daemon=True,
+     ).start()
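A minimal sketch of how the two entry points fit together (the output paths and the interval are illustrative):

    from pathlib import Path
    import time

    out_dir = Path("results")
    launch_monitor_rss_daemon(out_dir / "rss.csv", interval_s=0.5)
    ...  # run the memory-heavy workload in this (parent) process
    time.sleep(1.0)  # let the daemon write at least one sample
    print(get_peak_memory_gib(out_dir))  # peak RSS in GiB, from max-rss.txt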
bblean/_merges.py ADDED
@@ -0,0 +1,212 @@
+ r"""Merging criteria for BitBIRCH clustering"""
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ # NOTE: jt_isim_from_sum is equivalent to jt_isim_diameter_compl_from_sum
+ from bblean.similarity import jt_isim_from_sum, jt_isim_radius_compl_from_sum
+
+ BUILTIN_MERGES = [
+     "radius",
+     "diameter",
+     "tolerance-diameter",
+     "tolerance-radius",
+     "tolerance-legacy",
+     "never-merge",
+ ]
+
+
+ class MergeAcceptFunction:
+     # For the merge functions, although the outputs of jt_isim_from_sum are f64,
+     # directly using f64 is *not* faster than starting with uint64
+     name: str = ""
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         raise NotImplementedError("Must be implemented by subclasses")
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+
+ class RadiusMerge(MergeAcceptFunction):
+     name = "radius"
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         return jt_isim_radius_compl_from_sum(new_ls, new_n) >= threshold
+
+
+ class DiameterMerge(MergeAcceptFunction):
+     name = "diameter"
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         return jt_isim_from_sum(new_ls, new_n) >= threshold
+
+
+ class ToleranceDiameterMerge(MergeAcceptFunction):
+     name = "tolerance-diameter"
+     # NOTE: The reliability of the estimate of the cluster should be a function of
+     # the size of the old cluster, so in this metric tolerance is larger for small
+     # clusters: tol = max(tolerance * (exp(-decay * N_old) - offset), 0)
+
+     def __init__(
+         self,
+         tolerance: float = 0.05,
+         n_max: int = 1000,
+         decay: float = 1e-3,
+         adaptive: bool = True,
+     ) -> None:
+         self.tolerance = tolerance
+         self.decay = decay
+         self.offset = np.exp(-decay * n_max)
+         if not adaptive:
+             self.decay = 0.0
+             self.offset = 0.0
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         new_dc = jt_isim_from_sum(new_ls, new_n)
+         if new_dc < threshold:
+             return False
+         # If the old n is 1 then merge directly (infinite tolerance), since the
+         # old_dc is undefined for a single fp
+         if old_n == 1:
+             return True
+         # Only merge if the new_dc is greater than or equal to the old one, up to
+         # some tolerance, which decays with N
+         old_dc = jt_isim_from_sum(old_ls, old_n)
+         tol = max(self.tolerance * (np.exp(-self.decay * old_n) - self.offset), 0.0)
+         return new_dc >= old_dc - tol
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}({self.tolerance})"
+
+
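To make the adaptive tolerance concrete, a quick numerical check of the formula above with the default parameters (printed values are rounded):

    import numpy as np

    tolerance, decay, n_max = 0.05, 1e-3, 1000  # defaults from __init__
    offset = np.exp(-decay * n_max)  # ~0.368
    for old_n in (10, 100, 500, 1000, 5000):
        tol = max(tolerance * (np.exp(-decay * old_n) - offset), 0.0)
        print(old_n, round(tol, 4))
    # 10 -> 0.0311, 100 -> 0.0268, 500 -> 0.0119, 1000 and above -> 0.0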
+ class ToleranceRadiusMerge(ToleranceDiameterMerge):
+     name = "tolerance-radius"
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         new_rc = jt_isim_radius_compl_from_sum(new_ls, new_n)
+         if new_rc < threshold:
+             return False
+         if old_n == 1:
+             return True
+         old_rc = jt_isim_radius_compl_from_sum(old_ls, old_n)
+         tol = max(self.tolerance * (np.exp(-self.decay * old_n) - self.offset), 0.0)
+         return new_rc >= old_rc - tol
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}({self.tolerance})"
+
+
+ class NeverMerge(ToleranceDiameterMerge):
+     name = "never-merge"
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         return False
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}()"
+
+
+ class ToleranceMerge(MergeAcceptFunction):
+     name = "tolerance-legacy"
+
+     def __init__(self, tolerance: float = 0.05) -> None:
+         self.tolerance = tolerance
+
+     def __call__(
+         self,
+         threshold: float,
+         new_ls: NDArray[np.integer],
+         new_n: int,
+         old_ls: NDArray[np.integer],
+         nom_ls: NDArray[np.integer],
+         old_n: int,
+         nom_n: int,
+     ) -> bool:
+         # First two branches are equivalent to 'diameter'
+         new_dc = jt_isim_from_sum(new_ls, new_n)
+         if new_dc < threshold:
+             return False
+         if old_n == 1 or nom_n != 1:
+             return True
+         # 'new_dc >= threshold' and 'new_n == old_n + 1' are guaranteed here
+         old_dc = jt_isim_from_sum(old_ls, old_n)
+         return (new_dc * new_n - old_dc * (old_n - 1)) / 2 >= old_dc - self.tolerance
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}({self.tolerance})"
+
+
+ def get_merge_accept_fn(
+     merge_criterion: str, tolerance: float = 0.05
+ ) -> MergeAcceptFunction:
+     if merge_criterion == "radius":
+         return RadiusMerge()
+     elif merge_criterion == "diameter":
+         return DiameterMerge()
+     elif merge_criterion == "tolerance-legacy":
+         return ToleranceMerge(tolerance)
+     elif merge_criterion == "tolerance-diameter":
+         return ToleranceDiameterMerge(tolerance)
+     elif merge_criterion == "tolerance-radius":
+         return ToleranceRadiusMerge(tolerance)
+     elif merge_criterion == "never-merge":
+         return NeverMerge(tolerance)
+     raise ValueError(
+         f"Unknown merge criterion {merge_criterion}. "
+         f"Valid criteria are: {'|'.join(BUILTIN_MERGES)}"
+     )
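Selecting a criterion is a plain factory lookup, and new criteria can be defined by subclassing MergeAcceptFunction with the same __call__ signature. A sketch (the min-size criterion below is hypothetical, not part of the package):

    merge_fn = get_merge_accept_fn("tolerance-diameter", tolerance=0.05)

    class MinSizeDiameterMerge(MergeAcceptFunction):
        name = "min-size-diameter"  # hypothetical, for illustration only

        def __call__(
            self, threshold, new_ls, new_n, old_ls, nom_ls, old_n, nom_n
        ) -> bool:
            # Always accept very small clusters, otherwise behave like 'diameter'
            if new_n <= 2:
                return True
            return jt_isim_from_sum(new_ls, new_n) >= threshold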
@@ -0,0 +1,278 @@
+ r"""Fallback python implementation of molecular similarity calculators"""
+
+ import warnings
+
+ from numpy.typing import NDArray
+ import numpy as np
+
+ from bblean.utils import min_safe_uint
+ from bblean.fingerprints import unpack_fingerprints, pack_fingerprints
+
+
+ def centroid_from_sum(
+     linear_sum: NDArray[np.integer], n_samples: int, *, pack: bool = True
+ ) -> NDArray[np.uint8]:
+     r"""Calculates the majority vote centroid from a sum of fingerprint values
+
+     The majority vote centroid is a good approximation of the Tanimoto centroid.
+
+     Parameters
+     ----------
+     linear_sum : np.ndarray
+         Sum of the elements column-wise
+     n_samples : int
+         Number of samples
+     pack : bool
+         Whether to pack the resulting fingerprints
+
+     Returns
+     -------
+     centroid : np.ndarray[np.uint8]
+         Centroid fingerprints of the given set
+     """
+     # NOTE: Numpy guarantees bools are stored as 0x01 -> True and 0x00 -> False,
+     # so this view is fully safe
+     if n_samples <= 1:
+         centroid = linear_sum.astype(np.uint8, copy=False)
+     else:
+         centroid = (linear_sum >= n_samples * 0.5).view(np.uint8)
+     if pack:
+         return np.packbits(centroid, axis=-1)
+     return centroid
+
+
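A quick worked example of the majority vote (unpacked input, illustrative values): a centroid bit is set when at least half of the fingerprints have it set.

    import numpy as np

    fps = np.array([[1, 0, 1, 1],
                    [1, 1, 0, 1],
                    [0, 0, 1, 1]], dtype=np.uint8)
    ls = np.sum(fps, axis=0, dtype=np.uint64)    # [2, 1, 2, 3]
    centroid_from_sum(ls, len(fps), pack=False)  # -> [1, 0, 1, 1]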
+ def centroid(
+     fps: NDArray[np.uint8],
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+     *,
+     pack: bool = True,
+ ) -> NDArray[np.uint8]:
+     r"""Calculates the majority vote centroid from a set of fingerprints
+
+     The majority vote centroid is a good approximation of the Tanimoto centroid.
+     """
+     if input_is_packed:
+         fps = unpack_fingerprints(fps, n_features)
+     return centroid_from_sum(
+         np.sum(fps, axis=0, dtype=np.uint64),  # type: ignore
+         len(fps),
+         pack=pack,
+     )
+
+
+ def jt_compl_isim(
+     fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
+ ) -> NDArray[np.float64]:
+     """Get all complementary (Tanimoto) similarities of a set of fps, using iSIM"""
+     if input_is_packed:
+         fps = unpack_fingerprints(fps, n_features)
+     # Vectorized calculation of all compl isim
+     # For each compl isim, N is N_total - 1 (each one excludes a single fp)
+     n_objects = len(fps) - 1
+     if n_objects < 2:
+         msg = "Invalid fps. len(fps) must be >= 3"
+         warnings.warn(msg, RuntimeWarning, stacklevel=2)
+         return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
+     linear_sum = np.sum(fps, axis=0)
+     comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
+
+     return np.array(comp_sims, dtype=np.float64)
+
+
+ def _jt_isim_medoid_index(
+     fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
+ ) -> int:
+     return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
+
+
+ def jt_isim_medoid(
+     fps: NDArray[np.uint8],
+     input_is_packed: bool = True,
+     n_features: int | None = None,
+     pack: bool = True,
+ ) -> tuple[int, NDArray[np.uint8]]:
+     r"""Calculate the (Tanimoto) medoid of a set of fingerprints, using iSIM
+
+     Returns both the index of the medoid in the input array and the medoid itself
+
+     .. note::
+         Returns the first (or only) fingerprint for arrays of size 2 or 1,
+         respectively. Raises ValueError for arrays of size 0
+
+     """
+     if not fps.size:
+         raise ValueError("Size of fingerprints set must be > 0")
+     if input_is_packed:
+         fps = unpack_fingerprints(fps, n_features)
+     if len(fps) < 3:
+         idx = 0  # The medoid is undefined for sets of fewer than 3 fingerprints
+     else:
+         idx = _jt_isim_medoid_index(fps, input_is_packed=False)
+     m = fps[idx]
+     if pack:
+         return idx, pack_fingerprints(m)
+     return idx, m
+
+
+ # Requires numpy >= 2.0
+ def _popcount(a: NDArray[np.uint8]) -> NDArray[np.uint32]:
+     # a is a packed uint8 array with last axis = bytes
+     # Sum bit-counts across bytes to get per-object totals
+
+     # If the array has a number of columns that is a multiple of 8, doing a bitwise
+     # count over the buffer reinterpreted as uint64 is slightly faster.
+     # This is zero cost if the exception is not triggered, and a column count that
+     # is not a multiple of 8 is very unlikely, since fps are typically 1024 or 2048
+     b: NDArray[np.integer]
+     try:
+         b = a.view(np.uint64)
+     except ValueError:
+         b = a
+     return np.bitwise_count(b).sum(axis=-1, dtype=np.uint32)
+
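A small sanity check of the fast path (illustrative shapes; requires numpy >= 2.0 for np.bitwise_count): a single row of 8 packed bytes views cleanly as one uint64, and the count is the same either way.

    import numpy as np

    a = np.array([[0b1011, 0, 0, 0, 0, 0, 0, 0]], dtype=np.uint8)  # 1 row, 8 bytes
    _popcount(a)  # -> array([3], dtype=uint32), since 0b1011 has three bits set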
+
+ # O(N) approximation to obtain the "most dissimilar fingerprints" within an array
+ def jt_most_dissimilar_packed(
+     Y: NDArray[np.uint8], n_features: int | None = None
+ ) -> tuple[np.integer, np.integer, NDArray[np.float64], NDArray[np.float64]]:
+     """Finds two fps in a packed fp array that are the most Tanimoto-dissimilar
+
+     This is not guaranteed to find the most dissimilar fps; it is
+     a robust O(N) approximation that doesn't affect final cluster quality.
+     First find the centroid of Y, then find fp_1, the most dissimilar molecule
+     to the centroid. Finally find fp_2, the most dissimilar molecule to fp_1
+
+     Returns
+     -------
+     fp_1 : int
+         Index of the first fingerprint
+     fp_2 : int
+         Index of the second fingerprint
+     sims_fp_1 : np.ndarray
+         Tanimoto similarities of Y to fp_1
+     sims_fp_2 : np.ndarray
+         Tanimoto similarities of Y to fp_2
+     """
+     # Get the centroid of the fps
+     n_samples = len(Y)
+     Y_unpacked = unpack_fingerprints(Y, n_features)
+     # np.sum() automatically promotes to uint64 unless forced to a smaller dtype
+     linear_sum = np.sum(Y_unpacked, axis=0, dtype=min_safe_uint(n_samples))
+     packed_centroid = centroid_from_sum(linear_sum, n_samples, pack=True)
+
+     cardinalities = _popcount(Y)
+
+     # Get similarity of each fp to the centroid, and the least similar fp idx (fp_1)
+     sims_cent = _jt_sim_packed_precalc_cardinalities(Y, packed_centroid, cardinalities)
+     fp_1 = np.argmin(sims_cent)
+
+     # Get similarity of each fp to fp_1, and the least similar fp idx (fp_2)
+     sims_fp_1 = _jt_sim_packed_precalc_cardinalities(Y, Y[fp_1], cardinalities)
+     fp_2 = np.argmin(sims_fp_1)
+
+     # Get similarity of each fp to fp_2
+     sims_fp_2 = _jt_sim_packed_precalc_cardinalities(Y, Y[fp_2], cardinalities)
+     return fp_1, fp_2, sims_fp_1, sims_fp_2
+
+
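A sketch of how this is typically consumed when splitting a set in two (the assignment rule here is an assumption of the sketch, not part of this file): each fingerprint goes to whichever seed it is more similar to.

    fp_1, fp_2, sims_fp_1, sims_fp_2 = jt_most_dissimilar_packed(Y)
    goes_to_1 = sims_fp_1 >= sims_fp_2  # boolean mask over the rows of Y
    cluster_1, cluster_2 = Y[goes_to_1], Y[~goes_to_1]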
+ def _jt_sim_arr_vec_packed(
+     x: NDArray[np.uint8],
+     y: NDArray[np.uint8],
+ ) -> NDArray[np.float64]:
+     r"""Tanimoto similarity between packed fingerprints
+
+     x must be an array of shape (N, F) and y a single fingerprint of shape (F,).
+     A numpy array of shape (N,), holding the similarity of y to each row of x,
+     is returned.
+     """
+     if x.ndim != 2 or y.ndim != 1:
+         raise ValueError("Expected a 2D array and a 1D vector as inputs")
+     return _jt_sim_packed_precalc_cardinalities(x, y, _popcount(x))
+
+
+ def _jt_sim_packed_precalc_cardinalities(
+     x: NDArray[np.uint8],
+     y: NDArray[np.uint8],
+     cardinalities: NDArray[np.integer],
+ ) -> NDArray[np.float64]:
+     # cardinalities must be the result of calling _popcount(x)
+
+     # The maximum value in the denominator sum is 2 * n_features (which typically
+     # fits in uint16, but we use uint32 for safety)
+     intersection = _popcount(np.bitwise_and(x, y))
+
+     # The return value requires an out-of-place operation, since it casts uints to f64
+     #
+     # There may be NaNs in the similarity array if both a row and the vector are
+     # all zeros, in which case the intersection is also zero -> 0 / 0
+     #
+     # In that case the fps are equal, so ideally the similarity would be 1; clamping
+     # the denominator (A | B, zero only if A & B is zero too) avoids the NaN.
+     return intersection / np.maximum(cardinalities + _popcount(y) - intersection, 1)
+
+
+ def jt_isim_unpacked(arr: NDArray[np.integer]) -> float:
+     # cast is slower
+     return jt_isim_from_sum(
+         np.sum(arr, axis=0, dtype=np.uint64), len(arr)  # type: ignore
+     )
+
+
+ def jt_isim_packed(fps: NDArray[np.integer], n_features: int | None = None) -> float:
+     # cast is slower
+     return jt_isim_from_sum(
+         np.sum(
+             unpack_fingerprints(fps, n_features),  # type: ignore
+             axis=0,
+             dtype=np.uint64,
+         ),
+         len(fps),
+     )
+
+
+ def jt_isim_from_sum(linear_sum: NDArray[np.integer], n_objects: int) -> float:
+     r"""iSIM Tanimoto, from the sum of rows of a fingerprint array and the row count
+
+     iSIM Tanimoto was first proposed in:
+     https://pubs.rsc.org/en/content/articlelanding/2024/dd/d4dd00041b
+
+     :math:`iSIM_{JT}(X)` is an excellent :math:`O(N)` approximation of the average
+     Tanimoto similarity of a set of fingerprints.
+
+     Also equivalent to the complement of the Tanimoto diameter
+     :math:`iSIM_{JT}(X) = 1 - D_{JT}(X)`.
+
+     Parameters
+     ----------
+     linear_sum : np.ndarray
+         Sum of the elements from an array of fingerprints X, column-wise
+         linear_sum = np.sum(X, axis=0)
+
+     n_objects : int
+         Number of elements
+         n_objects = X.shape[0]
+
+     Returns
+     -------
+     isim : float
+         iSIM Jaccard-Tanimoto value
+     """
+     if n_objects < 2:
+         warnings.warn(
+             f"Invalid n_objects = {n_objects} in isim. Expected n_objects >= 2",
+             RuntimeWarning,
+             stacklevel=2,
+         )
+         return np.nan
+
+     x = linear_sum.astype(np.uint64, copy=False)
+     sum_kq = np.sum(x)
+     # isim of fingerprints that are all zeros should be 1 (they are all equal)
+     if sum_kq == 0:
+         return 1.0
+     sum_kqsq = np.dot(x, x)  # *dot* conserves dtype
+     a = (sum_kqsq - sum_kq) / 2  # 'a' is a scalar f64
+     return a / (a + n_objects * sum_kq - sum_kqsq)
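A brute-force cross-check of the approximation (illustrative, numpy only): for a small random set, iSIM should land close to the exact average pairwise Tanimoto.

    import itertools
    import numpy as np

    rng = np.random.default_rng(0)
    X = (rng.random((5, 64)) < 0.3).astype(np.uint8)  # 5 small unpacked fps

    def jt(a, b):
        union = np.sum(a | b)
        return 1.0 if union == 0 else np.sum(a & b) / union

    exact = np.mean([jt(a, b) for a, b in itertools.combinations(X, 2)])
    approx = jt_isim_from_sum(X.sum(axis=0, dtype=np.uint64), len(X))
    print(exact, approx)  # close, though not identical in general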