bblean 0.8.0__tar.gz → 0.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bblean-0.8.0 → bblean-0.8.2}/PKG-INFO +1 -1
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_merges.py +44 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_version.py +2 -2
- {bblean-0.8.0 → bblean-0.8.2}/bblean/bitbirch.py +60 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/cli.py +49 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/fingerprints.py +115 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/similarity.py +9 -5
- {bblean-0.8.0 → bblean-0.8.2}/bblean/smiles.py +6 -4
- {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/PKG-INFO +1 -1
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_simple.py +15 -1
- {bblean-0.8.0 → bblean-0.8.2}/.cruft.json +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.flake8 +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.github/CODEOWNERS +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.github/workflows/ci-cpp.yaml +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.github/workflows/ci.yaml +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.github/workflows/upload-to-pypi.yaml +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.gitignore +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/.pre-commit-config.yaml +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/LICENSE +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/LICENSES/BSD-3-Clause.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/LICENSES/GPL-3.0-only.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/README.md +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/__init__.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_config.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_console.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_legacy/__init__.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_legacy/bb_int64.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_legacy/bb_uint8.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_memory.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_py_similarity.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/_timer.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/analysis.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/csrc/README.md +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/csrc/similarity.cpp +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/metrics.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/multiround.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/plotting.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/sklearn.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean/utils.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean-demo-v2.gif +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean-demo.cast +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/SOURCES.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/dependency_links.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/entry_points.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/requires.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/bblean.egg-info/top_level.txt +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/api.svg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/installing.svg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/logo-dark-bw.svg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/logo-light-bw.svg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/publications.svg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/style.css +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_static/user-guide.svg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/_templates/module.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/api-reference.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/conf.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/index.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/installing.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/publications.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/linux_memory_setup.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/notebooks/bitbirch_quickstart.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide/parameters.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/docs/src/user-guide.rst +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/environment.yaml +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/best_practices_functions.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/best_practices_plots.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/bitbirch_best_practices.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/best_practices/bitbirch_parameter.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/biogen_logS.csv +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/bitbirch_best_practices.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/bitbirch_quickstart.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/chembl-33-natural-products-subset.smi +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/examples/dataset_splitting.ipynb +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/pyproject.toml +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/setup.cfg +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/setup.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/chembl-sample-3k.smi +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/chembl-sample-bad.smi +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/legacy_merges.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/legacy_metrics.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_bb_consistency.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_cli.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_fake_fps.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_fingerprints.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_global_clustering.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_import_bblean.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_merges.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_metrics.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_multiround.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_refine.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_regression.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_sampling.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_similarity.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_sklearn.py +0 -0
- {bblean-0.8.0 → bblean-0.8.2}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bblean
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.8.2
|
|
4
4
|
Summary: BitBirch-Lean Python package
|
|
5
5
|
Author: The Miranda-Quintana Lab and other BitBirch developers
|
|
6
6
|
Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
|
|
@@ -69,6 +69,48 @@ class DiameterMerge(MergeAcceptFunction):
|
|
|
69
69
|
return jt_isim_from_sum(new_ls, new_n) >= threshold
|
|
70
70
|
|
|
71
71
|
|
|
72
|
+
class FlexibleToleranceDiameterMerge(MergeAcceptFunction):
|
|
73
|
+
name = "flexible-tolerance-diameter"
|
|
74
|
+
# NOTE: Equivalent to tolerance-diameter but uses min(old_dc, threshold) as the
|
|
75
|
+
# criteria
|
|
76
|
+
|
|
77
|
+
def __init__(
|
|
78
|
+
self,
|
|
79
|
+
tolerance: float = 0.05,
|
|
80
|
+
n_max: int = 1000,
|
|
81
|
+
decay: float = 1e-3,
|
|
82
|
+
adaptive: bool = True,
|
|
83
|
+
) -> None:
|
|
84
|
+
self.tolerance = tolerance
|
|
85
|
+
self.decay = decay
|
|
86
|
+
self.offset = np.exp(-decay * n_max)
|
|
87
|
+
if not adaptive:
|
|
88
|
+
self.decay = 0.0
|
|
89
|
+
self.offset = 0.0
|
|
90
|
+
|
|
91
|
+
def __call__(
|
|
92
|
+
self,
|
|
93
|
+
threshold: float,
|
|
94
|
+
new_ls: NDArray[np.integer],
|
|
95
|
+
new_n: int,
|
|
96
|
+
old_ls: NDArray[np.integer],
|
|
97
|
+
nom_ls: NDArray[np.integer],
|
|
98
|
+
old_n: int,
|
|
99
|
+
nom_n: int,
|
|
100
|
+
) -> bool:
|
|
101
|
+
new_dc = jt_isim_from_sum(new_ls, new_n)
|
|
102
|
+
if new_dc < threshold:
|
|
103
|
+
return False
|
|
104
|
+
if old_n == 1:
|
|
105
|
+
return True
|
|
106
|
+
old_dc = jt_isim_from_sum(old_ls, old_n)
|
|
107
|
+
tol = max(self.tolerance * (np.exp(-self.decay * old_n) - self.offset), 0.0)
|
|
108
|
+
return new_dc >= min(old_dc, threshold) - tol
|
|
109
|
+
|
|
110
|
+
def __repr__(self) -> str:
|
|
111
|
+
return f"{self.__class__.__name__}({self.tolerance})"
|
|
112
|
+
|
|
113
|
+
|
|
72
114
|
class ToleranceDiameterMerge(MergeAcceptFunction):
|
|
73
115
|
name = "tolerance-diameter"
|
|
74
116
|
# NOTE: The reliability of the estimate of the cluster should be a function of the
|
|
@@ -202,6 +244,8 @@ def get_merge_accept_fn(
|
|
|
202
244
|
return ToleranceMerge(tolerance)
|
|
203
245
|
elif merge_criterion == "tolerance-diameter":
|
|
204
246
|
return ToleranceDiameterMerge(tolerance)
|
|
247
|
+
elif merge_criterion == "flexible-tolerance-diameter":
|
|
248
|
+
return FlexibleToleranceDiameterMerge(tolerance)
|
|
205
249
|
elif merge_criterion == "tolerance-radius":
|
|
206
250
|
return ToleranceRadiusMerge(tolerance)
|
|
207
251
|
elif merge_criterion == "never-merge":
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.8.0'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 8, 0)
|
|
31
|
+
__version__ = version = '0.8.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 8, 2)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -75,6 +75,8 @@ from bblean.similarity import (
|
|
|
75
75
|
jt_most_dissimilar_packed,
|
|
76
76
|
jt_isim_medoid,
|
|
77
77
|
centroid_from_sum,
|
|
78
|
+
estimate_jt_std,
|
|
79
|
+
jt_isim,
|
|
78
80
|
)
|
|
79
81
|
|
|
80
82
|
if os.getenv("BITBIRCH_NO_EXTENSIONS"):
|
|
@@ -90,6 +92,64 @@ else:
|
|
|
90
92
|
__all__ = ["BitBirch"]
|
|
91
93
|
|
|
92
94
|
|
|
95
|
+
@tp.overload
|
|
96
|
+
def guess_threshold(
|
|
97
|
+
fps: NDArray[np.uint8],
|
|
98
|
+
input_is_packed: bool = True,
|
|
99
|
+
n_features: int | None = None,
|
|
100
|
+
max_samples: int = 1_000_000,
|
|
101
|
+
factor: float = 3.0,
|
|
102
|
+
return_mean_std: tp.Literal[False] = False,
|
|
103
|
+
) -> float:
|
|
104
|
+
pass
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@tp.overload
|
|
108
|
+
def guess_threshold(
|
|
109
|
+
fps: NDArray[np.uint8],
|
|
110
|
+
input_is_packed: bool = True,
|
|
111
|
+
n_features: int | None = None,
|
|
112
|
+
max_samples: int = 1_000_000,
|
|
113
|
+
factor: float = 3.0,
|
|
114
|
+
return_mean_std: tp.Literal[True] = True,
|
|
115
|
+
) -> tuple[float, float, float]:
|
|
116
|
+
pass
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def guess_threshold(
|
|
120
|
+
fps: NDArray[np.uint8],
|
|
121
|
+
input_is_packed: bool = True,
|
|
122
|
+
n_features: int | None = None,
|
|
123
|
+
max_samples: int = 1_000_000,
|
|
124
|
+
factor: float = 3.0,
|
|
125
|
+
return_mean_std: bool = False,
|
|
126
|
+
) -> float | tuple[float, float, float]:
|
|
127
|
+
r""":meta private:
|
|
128
|
+
|
|
129
|
+
Guess the optimal bitbirch threshold
|
|
130
|
+
|
|
131
|
+
Uses the heuristic mean_tanimoto + 3.0 * std_tanimoto
|
|
132
|
+
"""
|
|
133
|
+
num_fps = len(fps)
|
|
134
|
+
if num_fps > max_samples:
|
|
135
|
+
rng = np.random.default_rng(42)
|
|
136
|
+
random_choices = rng.choice(num_fps, size=max_samples, replace=False)
|
|
137
|
+
fps = fps[random_choices]
|
|
138
|
+
num_fps = len(fps)
|
|
139
|
+
mean = jt_isim(fps, input_is_packed, n_features)
|
|
140
|
+
if num_fps <= 50:
|
|
141
|
+
n_samples = num_fps
|
|
142
|
+
else:
|
|
143
|
+
n_samples = max(5 * np.sqrt(num_fps), 50)
|
|
144
|
+
std = estimate_jt_std(
|
|
145
|
+
fps, input_is_packed=input_is_packed, n_features=n_features, n_samples=n_samples
|
|
146
|
+
)
|
|
147
|
+
thresh = mean + factor * std
|
|
148
|
+
if return_mean_std:
|
|
149
|
+
return thresh, mean, std
|
|
150
|
+
return thresh
|
|
151
|
+
|
|
152
|
+
|
|
93
153
|
# For backwards compatibility with the global "set_merge", keep weak references to all
|
|
94
154
|
# the BitBirch instances and update them when set_merge is called
|
|
95
155
|
_BITBIRCH_INSTANCES: WeakSet["BitBirch"] = WeakSet()
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
r"""Command line interface entrypoints"""
|
|
2
2
|
|
|
3
|
+
import numpy as np
|
|
3
4
|
import warnings
|
|
4
5
|
import random
|
|
5
6
|
import typing as tp
|
|
@@ -930,6 +931,54 @@ def _plot_summary(
|
|
|
930
931
|
)
|
|
931
932
|
|
|
932
933
|
|
|
934
|
+
@app.command("thresh")
|
|
935
|
+
def _guess_threshold(
|
|
936
|
+
ctx: Context,
|
|
937
|
+
input_: Annotated[
|
|
938
|
+
Path,
|
|
939
|
+
Argument(help="`*.npy` file with fingerprints"),
|
|
940
|
+
],
|
|
941
|
+
factor: Annotated[
|
|
942
|
+
float,
|
|
943
|
+
Option("-f", "--factor"),
|
|
944
|
+
] = 3.0,
|
|
945
|
+
n_features: Annotated[
|
|
946
|
+
int | None,
|
|
947
|
+
Option(
|
|
948
|
+
"--n-features",
|
|
949
|
+
help="Number of features in the fingerprints."
|
|
950
|
+
" It must be provided for packed inputs *if it is not a multiple of 8*."
|
|
951
|
+
" For typical fingerprint sizes (e.g. 2048, 1024), it is not required",
|
|
952
|
+
rich_help_panel="Advanced",
|
|
953
|
+
),
|
|
954
|
+
] = None,
|
|
955
|
+
input_is_packed: Annotated[
|
|
956
|
+
bool,
|
|
957
|
+
Option(
|
|
958
|
+
"--packed-input/--unpacked-input",
|
|
959
|
+
help="Toggle whether the input consists on packed or unpacked fingerprints",
|
|
960
|
+
rich_help_panel="Advanced",
|
|
961
|
+
),
|
|
962
|
+
] = True,
|
|
963
|
+
max_samples: Annotated[
|
|
964
|
+
int,
|
|
965
|
+
Option("-m", "--max-samples"),
|
|
966
|
+
] = 1_000_000,
|
|
967
|
+
) -> None:
|
|
968
|
+
r"""Estimate the optimal BitBirch threshold for a fingerprints file"""
|
|
969
|
+
from bblean.bitbirch import guess_threshold
|
|
970
|
+
from bblean._console import get_console
|
|
971
|
+
|
|
972
|
+
console = get_console()
|
|
973
|
+
fps = np.load(input_)
|
|
974
|
+
thresh, mean, std = guess_threshold(
|
|
975
|
+
fps, input_is_packed, n_features, max_samples, factor, return_mean_std=True
|
|
976
|
+
)
|
|
977
|
+
console.print(f"Estimated average similarity: {mean:.4f}")
|
|
978
|
+
console.print(f"Estimated similarity deviation: {std:.4f}")
|
|
979
|
+
console.print(f"Estimated optimal threshold: {thresh:.4f}")
|
|
980
|
+
|
|
981
|
+
|
|
933
982
|
@app.command("run")
|
|
934
983
|
def _run(
|
|
935
984
|
ctx: Context,
|
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
r"""Utilites for manipulating fingerprints and fingerprint files"""
|
|
2
2
|
|
|
3
|
+
import sys
|
|
4
|
+
import math
|
|
5
|
+
import weakref
|
|
3
6
|
import warnings
|
|
4
7
|
import dataclasses
|
|
5
8
|
from pathlib import Path
|
|
6
9
|
from numpy.typing import NDArray, DTypeLike
|
|
7
10
|
import numpy as np
|
|
8
11
|
import typing as tp
|
|
12
|
+
import multiprocessing as mp
|
|
9
13
|
import multiprocessing.shared_memory as shmem
|
|
10
14
|
|
|
11
15
|
from rich.console import Console
|
|
@@ -13,6 +17,8 @@ from rdkit.Chem import rdFingerprintGenerator, MolFromSmiles, SanitizeFlags, San
|
|
|
13
17
|
|
|
14
18
|
from bblean._config import DEFAULTS
|
|
15
19
|
from bblean._console import get_console
|
|
20
|
+
from bblean.smiles import _iter_ranges_and_smiles_batches
|
|
21
|
+
from bblean.utils import _num_avail_cpus
|
|
16
22
|
|
|
17
23
|
__all__ = [
|
|
18
24
|
"make_fake_fingerprints",
|
|
@@ -441,3 +447,112 @@ class _FingerprintArrayFiller:
|
|
|
441
447
|
fps[i, :] = fp
|
|
442
448
|
fps_shmem.close()
|
|
443
449
|
invalid_mask_shmem.close()
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
@tp.overload
|
|
453
|
+
def fps_from_smiles_parallel(
|
|
454
|
+
smiles: tp.Iterable[str],
|
|
455
|
+
kind: str = DEFAULTS.fp_kind,
|
|
456
|
+
n_features: int = DEFAULTS.n_features,
|
|
457
|
+
dtype: DTypeLike = np.uint8,
|
|
458
|
+
sanitize: str = "all",
|
|
459
|
+
skip_invalid: tp.Literal[False] = False,
|
|
460
|
+
pack: bool = True,
|
|
461
|
+
num_ps: int = 1,
|
|
462
|
+
replace_dummy_atoms: bool = False,
|
|
463
|
+
tab_separated: bool = False,
|
|
464
|
+
mp_context: tp.Any = None,
|
|
465
|
+
) -> NDArray[np.uint8]:
|
|
466
|
+
pass
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
@tp.overload
|
|
470
|
+
def fps_from_smiles_parallel(
|
|
471
|
+
smiles: tp.Iterable[str],
|
|
472
|
+
kind: str = DEFAULTS.fp_kind,
|
|
473
|
+
n_features: int = DEFAULTS.n_features,
|
|
474
|
+
dtype: DTypeLike = np.uint8,
|
|
475
|
+
sanitize: str = "all",
|
|
476
|
+
skip_invalid: tp.Literal[True] = True,
|
|
477
|
+
pack: bool = True,
|
|
478
|
+
num_ps: int = 1,
|
|
479
|
+
replace_dummy_atoms: bool = False,
|
|
480
|
+
tab_separated: bool = False,
|
|
481
|
+
mp_context: tp.Any = None,
|
|
482
|
+
) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
|
|
483
|
+
pass
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
# NOTE: This function is proof of concept and kinda dangerous since it registers
|
|
487
|
+
# a custom destructor for the numpy array
|
|
488
|
+
# It is also *only usable if called inside an if __name__ == "__main__" guard*
|
|
489
|
+
# For now lets hide it
|
|
490
|
+
def fps_from_smiles_parallel(
|
|
491
|
+
smiles: tp.Iterable[str],
|
|
492
|
+
kind: str = DEFAULTS.fp_kind,
|
|
493
|
+
n_features: int = DEFAULTS.n_features,
|
|
494
|
+
dtype: DTypeLike = np.uint8,
|
|
495
|
+
sanitize: str = "all",
|
|
496
|
+
skip_invalid: bool = False,
|
|
497
|
+
pack: bool = True,
|
|
498
|
+
num_ps: int | None = None,
|
|
499
|
+
replace_dummy_atoms: bool = False,
|
|
500
|
+
tab_separated: bool = False,
|
|
501
|
+
mp_context: tp.Any = None,
|
|
502
|
+
) -> tp.Union[NDArray[np.uint8], tuple[NDArray[np.uint8], NDArray[np.int64]]]:
|
|
503
|
+
r""":meta private:"""
|
|
504
|
+
if mp_context is None:
|
|
505
|
+
mp_context = mp.get_context("forkserver" if sys.platform == "linux" else None)
|
|
506
|
+
if isinstance(smiles, str):
|
|
507
|
+
smiles = [smiles]
|
|
508
|
+
smiles = list(smiles)
|
|
509
|
+
smiles_num = len(smiles)
|
|
510
|
+
if num_ps is None:
|
|
511
|
+
num_ps = _num_avail_cpus()
|
|
512
|
+
|
|
513
|
+
if pack:
|
|
514
|
+
out_dim = (n_features + 7) // 8
|
|
515
|
+
else:
|
|
516
|
+
out_dim = n_features
|
|
517
|
+
shmem_size = smiles_num * out_dim * np.dtype(dtype).itemsize
|
|
518
|
+
fps_shmem = shmem.SharedMemory(create=True, size=shmem_size)
|
|
519
|
+
invalid_mask_shmem = shmem.SharedMemory(create=True, size=smiles_num)
|
|
520
|
+
fps_array_filler = _FingerprintArrayFiller(
|
|
521
|
+
shmem_name=fps_shmem.name,
|
|
522
|
+
invalid_mask_shmem_name=invalid_mask_shmem.name,
|
|
523
|
+
kind=kind,
|
|
524
|
+
fp_size=n_features,
|
|
525
|
+
num_smiles=smiles_num,
|
|
526
|
+
dtype=np.dtype(dtype).name,
|
|
527
|
+
pack=pack,
|
|
528
|
+
sanitize=sanitize,
|
|
529
|
+
skip_invalid=skip_invalid,
|
|
530
|
+
)
|
|
531
|
+
num_per_batch = math.ceil(smiles_num / num_ps)
|
|
532
|
+
with mp_context.Pool(processes=num_ps) as pool:
|
|
533
|
+
pool.starmap(
|
|
534
|
+
fps_array_filler,
|
|
535
|
+
_iter_ranges_and_smiles_batches(
|
|
536
|
+
smiles,
|
|
537
|
+
num_per_batch,
|
|
538
|
+
tab_separated,
|
|
539
|
+
replace_dummy_atoms,
|
|
540
|
+
assume_paths=False,
|
|
541
|
+
),
|
|
542
|
+
)
|
|
543
|
+
fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
|
|
544
|
+
mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
|
|
545
|
+
if skip_invalid:
|
|
546
|
+
fps = np.delete(fps, mask, axis=0)
|
|
547
|
+
weakref.finalize(mask, invalid_mask_shmem.close)
|
|
548
|
+
weakref.finalize(mask, invalid_mask_shmem.unlink)
|
|
549
|
+
weakref.finalize(fps, fps_shmem.close)
|
|
550
|
+
weakref.finalize(fps, fps_shmem.unlink)
|
|
551
|
+
return fps, mask
|
|
552
|
+
|
|
553
|
+
del mask
|
|
554
|
+
invalid_mask_shmem.close()
|
|
555
|
+
invalid_mask_shmem.unlink()
|
|
556
|
+
weakref.finalize(fps, fps_shmem.close)
|
|
557
|
+
weakref.finalize(fps, fps_shmem.unlink)
|
|
558
|
+
return fps
|
|
@@ -293,7 +293,7 @@ def estimate_jt_std(
|
|
|
293
293
|
n_samples: int | None = None,
|
|
294
294
|
input_is_packed: bool = True,
|
|
295
295
|
n_features: int | None = None,
|
|
296
|
-
|
|
296
|
+
max_samples: int = 1_000_000,
|
|
297
297
|
) -> float:
|
|
298
298
|
r"""Estimate the std of all pairwise Tanimoto.
|
|
299
299
|
|
|
@@ -303,15 +303,19 @@ def estimate_jt_std(
|
|
|
303
303
|
The standard deviation of all pairwise Tanimoto among the sampled fingerprints.
|
|
304
304
|
"""
|
|
305
305
|
num_fps = len(fps)
|
|
306
|
-
if num_fps >
|
|
307
|
-
np.random.
|
|
308
|
-
random_choices =
|
|
306
|
+
if num_fps > max_samples:
|
|
307
|
+
rng = np.random.default_rng(42)
|
|
308
|
+
random_choices = rng.choice(num_fps, size=max_samples, replace=False)
|
|
309
309
|
fps = fps[random_choices]
|
|
310
310
|
num_fps = len(fps)
|
|
311
311
|
if n_samples is None:
|
|
312
312
|
# Heuristic: use at least 50 samples, or 1 per 10,000 fingerprints,
|
|
313
313
|
# to balance statistical representativeness and computational efficiency
|
|
314
|
-
|
|
314
|
+
# TODO: This heuristic is broken, too few samples until 500k
|
|
315
|
+
if num_fps <= 500_000:
|
|
316
|
+
n_samples = 50
|
|
317
|
+
else:
|
|
318
|
+
n_samples = num_fps // 10_000
|
|
315
319
|
sample_idxs = jt_stratified_sampling(fps, n_samples, input_is_packed, n_features)
|
|
316
320
|
|
|
317
321
|
# Work with only the sampled fingerprints
|
|
@@ -71,12 +71,14 @@ def _iter_ranges_and_smiles_batches(
|
|
|
71
71
|
num_per_batch: int,
|
|
72
72
|
tab_separated: bool = False,
|
|
73
73
|
replace_dummy_atoms: bool = False,
|
|
74
|
+
assume_paths: bool = True,
|
|
74
75
|
) -> tp.Iterable[tuple[tuple[int, int], tuple[str, ...]]]:
|
|
76
|
+
if assume_paths:
|
|
77
|
+
it = iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms)
|
|
78
|
+
else:
|
|
79
|
+
it = tp.cast(tp.Iterator[str], smiles_paths)
|
|
75
80
|
start_idx = 0
|
|
76
|
-
for batch in batched(
|
|
77
|
-
iter_smiles_from_paths(smiles_paths, tab_separated, replace_dummy_atoms),
|
|
78
|
-
num_per_batch,
|
|
79
|
-
):
|
|
81
|
+
for batch in batched(it, num_per_batch):
|
|
80
82
|
size = len(batch)
|
|
81
83
|
end_idx = start_idx + size
|
|
82
84
|
yield (start_idx, end_idx), batch
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bblean
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.8.2
|
|
4
4
|
Summary: BitBirch-Lean Python package
|
|
5
5
|
Author: The Miranda-Quintana Lab and other BitBirch developers
|
|
6
6
|
Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
|
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
import pytest
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
|
-
from bblean.bitbirch import BitBirch
|
|
5
|
+
from bblean.bitbirch import BitBirch, guess_threshold
|
|
6
6
|
from bblean.fingerprints import pack_fingerprints, make_fake_fingerprints
|
|
7
7
|
|
|
8
8
|
from inline_snapshot import snapshot
|
|
@@ -42,6 +42,20 @@ def test_bb_cluster_simple_repeated_fps() -> None:
|
|
|
42
42
|
assert ids == [list(range(repeats))]
|
|
43
43
|
|
|
44
44
|
|
|
45
|
+
def test_guess_threhsold() -> None:
|
|
46
|
+
fps = make_fake_fingerprints(
|
|
47
|
+
100, n_features=8, seed=12620509540149709235, pack=True
|
|
48
|
+
)
|
|
49
|
+
thresh = guess_threshold(fps, return_mean_std=False)
|
|
50
|
+
assert thresh > 0.9 and thresh < 1.0
|
|
51
|
+
|
|
52
|
+
fps = make_fake_fingerprints(
|
|
53
|
+
100, n_features=2048, seed=12620509540149709235, pack=True
|
|
54
|
+
)
|
|
55
|
+
thresh = guess_threshold(fps, return_mean_std=False)
|
|
56
|
+
assert thresh > 0.4 and thresh < 0.6
|
|
57
|
+
|
|
58
|
+
|
|
45
59
|
def test_bb_cluster_3_fps() -> None:
|
|
46
60
|
fps = make_fake_fingerprints(3, n_features=8, seed=12620509540149709235, pack=True)
|
|
47
61
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|