bblean 0.6.0b2__tar.gz → 0.7.2b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/upload-to-pypi.yaml +6 -4
- {bblean-0.6.0b2 → bblean-0.7.2b0}/PKG-INFO +3 -2
- {bblean-0.6.0b2 → bblean-0.7.2b0}/README.md +2 -1
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/bb_int64.py +2 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_py_similarity.py +1 -9
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_version.py +2 -2
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/bitbirch.py +42 -6
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/cli.py +68 -15
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/csrc/similarity.cpp +77 -26
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/fingerprints.py +5 -1
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/multiround.py +31 -16
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/plotting.py +7 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/similarity.py +70 -15
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/sklearn.py +1 -2
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/smiles.py +20 -5
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/PKG-INFO +3 -2
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/SOURCES.txt +5 -2
- {bblean-0.6.0b2/examples → bblean-0.7.2b0/docs/src/user-guide/notebooks}/bitbirch_quickstart.ipynb +1 -1
- bblean-0.7.2b0/examples/best_practices/best_practices_functions.py +188 -0
- bblean-0.7.2b0/examples/best_practices/best_practices_plots.py +465 -0
- bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices.ipynb +601 -0
- bblean-0.7.2b0/examples/best_practices/bitbirch_best_practices_RDKit.ipynb +571 -0
- bblean-0.7.2b0/examples/best_practices/bitbirch_parameter.ipynb +1755 -0
- {bblean-0.6.0b2/docs/src/user-guide/notebooks → bblean-0.7.2b0/examples}/bitbirch_quickstart.ipynb +1 -1
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_cli.py +60 -5
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_global_clustering.py +5 -25
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_similarity.py +12 -1
- bblean-0.6.0b2/docs/src/user-guide/notebooks/bitbirch_best_practices.ipynb +0 -526
- bblean-0.6.0b2/examples/bitbirch_best_practices.ipynb +0 -526
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.cruft.json +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.flake8 +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/CODEOWNERS +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/ci-cpp.yaml +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/ci.yaml +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.gitignore +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/.pre-commit-config.yaml +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/LICENSE +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/LICENSES/BSD-3-Clause.txt +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/LICENSES/GPL-3.0-only.txt +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/__init__.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_config.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_console.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/__init__.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/bb_uint8.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_memory.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_merges.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_timer.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/analysis.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/csrc/README.md +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/metrics.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/utils.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean-demo-v2.gif +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean-demo.cast +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/dependency_links.txt +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/entry_points.txt +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/requires.txt +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/bblean.egg-info/top_level.txt +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/api.svg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/installing.svg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/logo-dark-bw.svg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/logo-light-bw.svg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/publications.svg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/style.css +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_static/user-guide.svg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/_templates/module.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/api-reference.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/conf.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/index.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/installing.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/publications.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/user-guide/linux_memory_setup.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/user-guide/parameters.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/docs/src/user-guide.rst +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/environment.yaml +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/examples/biogen_logS.csv +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/examples/chembl-33-natural-products-subset.smi +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/examples/dataset_splitting.ipynb +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/pyproject.toml +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/setup.cfg +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/setup.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/chembl-sample-3k.smi +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/chembl-sample-bad.smi +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/legacy_merges.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/legacy_metrics.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_bb_consistency.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_fake_fps.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_fingerprints.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_import_bblean.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_merges.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_metrics.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_multiround.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_refine.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_regression.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_sampling.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_simple.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_sklearn.py +0 -0
- {bblean-0.6.0b2 → bblean-0.7.2b0}/tests/test_utils.py +0 -0
{bblean-0.6.0b2 → bblean-0.7.2b0}/.github/workflows/upload-to-pypi.yaml

@@ -15,10 +15,12 @@ on:
         required: false
         default: false
         type: boolean
+  release:
+    types: [published]
 
 env:
   PYTHON_VERSION: '3.11'
-  SETUPTOOLS_SCM_PRETEND_VERSION: ${{ github.event.inputs.version }}
+  SETUPTOOLS_SCM_PRETEND_VERSION: ${{ github.event_name == 'release' && github.event.release.tag_name || github.event.inputs.version }}
   # cibuildwheel configuration:
   # Skip py 3.14, 32 bit and musllinux (Alpine) wheels
   CIBW_SKIP: "cp314-* cp314t-* *-manylinux_i686 *-win32 *-musllinux_*"

@@ -29,7 +31,7 @@ env:
   # Build wheels that support both aarch64 and x86_64 on macOS
   CIBW_ARCHS_MACOS: "universal2"
   CIBW_BUILD_VERBOSITY: 3
-
+  PIP_ONLY_BINARY: "llvmlite,numba"
 jobs:
   make_sdist:
     name: make-source-distribution

@@ -93,7 +95,7 @@ jobs:
   publish_to_testpypi:
     needs: [build_wheels, make_sdist]
     runs-on: ubuntu-latest
-    if: ${{ github.event.inputs.upload-testpypi
+    if: ${{ github.event_name != 'release' && github.event.inputs.upload-testpypi }}
     environment:
       name: testpypi
       url: https://test.pypi.org/p/bblean

@@ -115,7 +117,7 @@ jobs:
   publish_to_pypi:
     needs: [build_wheels, make_sdist]
     runs-on: ubuntu-latest
-    if: ${{ github.event.inputs.upload-pypi
+    if: ${{ github.event_name == 'release' || github.event.inputs.upload-pypi }}
     environment:
       name: pypi
       url: https://pypi.org/p/bblean
{bblean-0.6.0b2 → bblean-0.7.2b0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bblean
-Version: 0.
+Version: 0.7.2b0
 Summary: BitBirch-Lean Python package
 Author: The Miranda-Quintana Lab and other BitBirch developers
 Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>

@@ -90,6 +90,7 @@ macOS via pip, which automatically includes C++ extensions:
 
 ```bash
 pip install bblean
+# Alternatively you can use 'uv pip install'
 bb --help
 ```
 

@@ -235,7 +236,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
 tree.fit(fps)
 
 # Refine the tree (if needed)
-tree.set_merge(
+tree.set_merge("tolerance-diameter", tolerance=0.0)
 tree.refine_inplace(fps)
 
 # Visualize the results
{bblean-0.6.0b2 → bblean-0.7.2b0}/README.md

@@ -47,6 +47,7 @@ macOS via pip, which automatically includes C++ extensions:
 
 ```bash
 pip install bblean
+# Alternatively you can use 'uv pip install'
 bb --help
 ```
 

@@ -192,7 +193,7 @@ tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="dia
 tree.fit(fps)
 
 # Refine the tree (if needed)
-tree.set_merge(
+tree.set_merge("tolerance-diameter", tolerance=0.0)
 tree.refine_inplace(fps)
 
 # Visualize the results
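The README and PKG-INFO snippets above now spell out the refinement call in full. A minimal sketch of that flow as it reads after this change, with synthetic fingerprints purely for illustration and assuming `fit` accepts packed input by default, as the rest of the package does:

```python
import numpy as np
import bblean

# Synthetic packed fingerprints (illustrative only, not from the package docs)
rng = np.random.default_rng(0)
fps = np.packbits(rng.integers(0, 2, size=(100, 2048), dtype=np.uint8), axis=1)

tree = bblean.BitBirch(branching_factor=50, threshold=0.65, merge_criterion="diameter")
tree.fit(fps)

# Refine the tree (if needed): the criterion and tolerance are now explicit
tree.set_merge("tolerance-diameter", tolerance=0.0)
tree.refine_inplace(fps)
```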
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_legacy/bb_int64.py

@@ -633,6 +633,7 @@ class BitBirch:
             X = X[:max_fps]
         threshold = self.threshold
         branching_factor = self.branching_factor
+
         n_features = _validate_n_features(X, input_is_packed, n_features)
         d_type = X.dtype
 

@@ -718,6 +719,7 @@ class BitBirch:
         """
         threshold = self.threshold
         branching_factor = self.branching_factor
+
         n_features = _validate_n_features(X, input_is_packed, n_features)
         d_type = X.dtype
 
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_py_similarity.py

@@ -76,18 +76,10 @@ def jt_compl_isim(
         warnings.warn(msg, RuntimeWarning, stacklevel=2)
         return np.full(len(fps), fill_value=np.nan, dtype=np.float64)
     linear_sum = np.sum(fps, axis=0)
-    n_objects = len(fps) - 1
     comp_sims = [jt_isim_from_sum(linear_sum - fp, n_objects) for fp in fps]
-
     return np.array(comp_sims, dtype=np.float64)
 
 
-def _jt_isim_medoid_index(
-    fps: NDArray[np.uint8], input_is_packed: bool = True, n_features: int | None = None
-) -> int:
-    return np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
-
-
 def jt_isim_medoid(
     fps: NDArray[np.uint8],
     input_is_packed: bool = True,

@@ -110,7 +102,7 @@ def jt_isim_medoid(
     if len(fps) < 3:
         idx = 0  # Medoid undefined for sets of 3 or more fingerprints
     else:
-        idx =
+        idx = np.argmin(jt_compl_isim(fps, input_is_packed, n_features)).item()
     m = fps[idx]
     if pack:
         return idx, pack_fingerprints(m)
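The private `_jt_isim_medoid_index` helper is removed and its body is inlined into `jt_isim_medoid` (second hunk). For orientation, here is a plain-NumPy sketch of the complementary-iSIM medoid rule; the `jt_isim_from_sum` stand-in below assumes the standard iSIM Tanimoto formula and is not bblean's own implementation:

```python
import numpy as np

def jt_isim_from_sum(linear_sum: np.ndarray, n_objects: int) -> float:
    # iSIM Tanimoto from a column-wise sum of binary fingerprints
    # (assumed formula; bblean ships its own jt_isim_from_sum)
    sum_kq = int(linear_sum.sum())
    sum_kqsq = int((linear_sum.astype(np.uint64) ** 2).sum())
    a = (sum_kqsq - sum_kq) / 2
    return a / (a + n_objects * sum_kq - sum_kqsq)

def medoid_index(fps: np.ndarray) -> int:
    # Complementary iSIM: drop each fingerprint in turn and score the rest;
    # the medoid is the fingerprint whose removal lowers the similarity most
    linear_sum = fps.sum(axis=0)
    comp = [jt_isim_from_sum(linear_sum - fp, len(fps) - 1) for fp in fps]
    return int(np.argmin(comp))
```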
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/_version.py

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.
-__version_tuple__ = version_tuple = (0,
+__version__ = version = '0.7.2.b0'
+__version_tuple__ = version_tuple = (0, 7, 2, 'b0')
 
 __commit_id__ = commit_id = None
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/bitbirch.py

@@ -47,6 +47,8 @@
 # ./LICENSES/GPL-3.0-only.txt. If not, see <http://www.gnu.org/licenses/gpl-3.0.html>.
 r"""BitBirch 'Lean' class for fast, memory-efficient O(N) clustering"""
 from __future__ import annotations  # Stringize type annotations for no runtime overhead
+import pickle
+import sys
 import typing_extensions as tpx
 import os
 import random

@@ -646,7 +648,7 @@ class BitBirch:
 
     @merge_criterion.setter
     def merge_criterion(self, value: str) -> None:
-        self.set_merge(
+        self.set_merge(merge_criterion=value)
 
     @property
     def tolerance(self) -> float | None:

@@ -671,7 +673,7 @@ class BitBirch:
 
     def set_merge(
         self,
-
+        merge_criterion: str | MergeAcceptFunction | None = None,
         *,
         tolerance: float | None = None,
         threshold: float | None = None,

@@ -687,10 +689,10 @@ class BitBirch:
                 "the global set_merge() function has *not* been used"
             )
         _tolerance = 0.05 if tolerance is None else tolerance
-        if isinstance(
-            self._merge_accept_fn =
-        elif isinstance(
-            self._merge_accept_fn = get_merge_accept_fn(
+        if isinstance(merge_criterion, MergeAcceptFunction):
+            self._merge_accept_fn = merge_criterion
+        elif isinstance(merge_criterion, str):
+            self._merge_accept_fn = get_merge_accept_fn(merge_criterion, _tolerance)
         if hasattr(self._merge_accept_fn, "tolerance"):
             self._merge_accept_fn.tolerance = _tolerance
         elif tolerance is not None:
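With `merge_criterion` now an explicit parameter of `set_merge`, the method dispatches on its type: a `MergeAcceptFunction` instance is used directly, a string goes through `get_merge_accept_fn`, and `tolerance`/`threshold` stay keyword-only. A hedged usage sketch, with criterion names taken from the README hunks above and `tree` being any `BitBirch` instance:

```python
# Criterion given by name; tolerance applies to criteria that use it
tree.set_merge("tolerance-diameter", tolerance=0.0)

# Equivalent keyword form, as used internally by the merge_criterion property setter
tree.set_merge(merge_criterion="diameter")

# A MergeAcceptFunction instance may also be passed directly (hypothetical object):
# tree.set_merge(my_accept_fn)
```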
@@ -1316,6 +1318,40 @@ class BitBirch:
             parts.append(f"tolerance={self.tolerance}")
         return f"{self.__class__.__name__}({', '.join(parts)})"
 
+    def save(self, path: Path | str) -> None:
+        r""":meta private:"""
+        # TODO: BitBIRCH is highly recursive. pickling may crash python,
+        # an alternative solution would be better
+        msg = (
+            "Saving large BitBIRCH trees may result in large memory peaks."
+            " An alternative serialization method may be implemented in the future"
+        )
+        warnings.warn(msg)
+        _old_limit = sys.getrecursionlimit()
+        sys.setrecursionlimit(1_000_000_000)
+        with open(path, mode="wb") as f:
+            pickle.dump(self, f)
+        sys.setrecursionlimit(_old_limit)
+
+    @classmethod
+    def load(cls, path: Path | str) -> tpx.Self:
+        r""":meta private:"""
+        # TODO: BitBIRCH is highly recursive. pickling may crash python,
+        # an alternative solution would be better
+        msg = (
+            "Loading large BitBIRCH trees may result in large memory peaks."
+            " An alternative serialization method may be implemented in the future"
+        )
+        warnings.warn(msg)
+        _old_limit = sys.getrecursionlimit()
+        sys.setrecursionlimit(1_000_000_000)
+        with open(path, mode="rb") as f:
+            tree = pickle.load(f)
+        sys.setrecursionlimit(_old_limit)
+        if not isinstance(tree, cls):
+            raise ValueError("Path does not contain a bitbirch object")
+        return tree
+
     def global_clustering(
         self,
         n_clusters: int,
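The new `save`/`load` pair wraps pickling of the whole tree, temporarily raising the recursion limit because the node structure is deeply recursive; both warn about memory peaks and `load` rejects pickles that are not `BitBirch` instances. A minimal round-trip sketch, reusing the fitted `tree` from the README-style example earlier:

```python
tree.save("bitbirch.pkl")                        # pickles the whole tree to disk
restored = bblean.BitBirch.load("bitbirch.pkl")  # ValueError if the file holds something else
print(restored.merge_criterion, restored.threshold)
```

Note that both methods are marked `:meta private:`, so they may not be part of the documented public API.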
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/cli.py

@@ -1096,26 +1096,29 @@ def _run(
 
     timer.end_timing("total", console, indent=False)
     console.print_peak_mem(out_dir, indent=False)
+    if save_tree:
+        if variant != "lean":
+            console.print("Can't save tree for non-lean variants", style="red")
+        else:
+            # TODO: Find alternative solution
+            tree.save(out_dir / "bitbirch.pkl")
     if variant == "lean":
-        if save_tree:
-            # TODO: BitBIRCH is highly recursive. pickling may crash python,
-            # an alternative solution would be better
-            _old_limit = sys.getrecursionlimit()
-            sys.setrecursionlimit(100_000)
-            with open(out_dir / "bitbirch.pkl", mode="wb") as f:
-                pickle.dump(tree, f)
-            sys.setrecursionlimit(_old_limit)
         tree.delete_internal_nodes()
-
-
+    # Dump outputs (peak memory, timings, config, cluster ids)
+    if save_centroids:
+        if variant != "lean":
+            console.print("Can't save centroids for non-lean variants", style="red")
+            with open(out_dir / "clusters.pkl", mode="wb") as f:
+                pickle.dump(tree.get_cluster_mol_ids(), f)
+        else:
             output = tree.get_centroids_mol_ids()
             with open(out_dir / "clusters.pkl", mode="wb") as f:
                 pickle.dump(output["mol_ids"], f)
             with open(out_dir / "cluster-centroids-packed.pkl", mode="wb") as f:
                 pickle.dump(output["centroids"], f)
-
-
-
+    else:
+        with open(out_dir / "clusters.pkl", mode="wb") as f:
+            pickle.dump(tree.get_cluster_mol_ids(), f)
 
     collect_system_specs_and_dump_config(ctx.params)
     timer.dump(out_dir / "timings.json")

@@ -1193,6 +1196,14 @@ def _multiround(
         bool,
         Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
     ] = True,
+    sort_fps: Annotated[
+        bool,
+        Option(
+            "--sort-fps/--no-sort-fps",
+            help="Sort the fingerprints by popcount before launching the initial round",
+            rich_help_panel="Advanced",
+        ),
+    ] = False,
     mid_merge_criterion: Annotated[
         str,
         Option(

@@ -1386,6 +1397,7 @@ def _multiround(
         midsection_threshold_change=mid_threshold_change,
         tolerance=tolerance,
         # Advanced
+        sort_fps=sort_fps,
         save_tree=save_tree,
         save_centroids=save_centroids,
         bin_size=bin_size,

@@ -1526,6 +1538,13 @@ def _fps_from_smiles(
             ),
         ),
     ] = False,
+    tab_separated: Annotated[
+        bool,
+        Option(
+            "--tab-sep/--no-tab-sep",
+            help="Whether the smiles file has the format <smiles><tab><field><tab>...",
+        ),
+    ] = False,
 ) -> None:
     r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
 

@@ -1631,7 +1650,9 @@ def _fps_from_smiles(
         with mp_context.Pool(processes=num_ps) as pool:
             pool.map(
                 create_fp_file,
-                _iter_idxs_and_smiles_batches(
+                _iter_idxs_and_smiles_batches(
+                    smiles_paths, num_per_batch, tab_separated
+                ),
             )
         timer.end_timing("total", console, indent=False)
         stem = out_name.split(".")[0]

@@ -1671,7 +1692,9 @@ def _fps_from_smiles(
         with mp_context.Pool(processes=num_ps) as pool:
             pool.starmap(
                 fps_array_filler,
-                _iter_ranges_and_smiles_batches(
+                _iter_ranges_and_smiles_batches(
+                    smiles_paths, num_per_batch, tab_separated
+                ),
             )
         fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
         mask = np.ndarray((smiles_num,), dtype=np.bool, buffer=invalid_mask_shmem.buf)
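Two new CLI knobs appear in the hunks above: `--sort-fps/--no-sort-fps` (popcount-sort the fingerprints before the initial multiround pass) and `--tab-sep/--no-tab-sep` for smiles files laid out as `<smiles><tab><field><tab>...`. A hedged illustration of what the tab-separated layout implies for a single input line; the actual parsing lives in `bblean/smiles.py`, whose diff is not shown in this excerpt:

```python
# One line of a tab-separated .smi file: the SMILES comes first, extra fields follow
line = "CCO\tCHEMBL545\tsome-other-field\n"
smiles = line.split("\t", 1)[0]   # -> "CCO"
```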
@@ -1848,3 +1871,33 @@ def _merge_fps(
             return
     np.save(out_dir / stem, np.concatenate(arrays))
     console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
+
+
+@app.command("fps-sort", rich_help_panel="Fingerprints")
+def _sort_fps(
+    in_file: Annotated[
+        Path,
+        Argument(help="`*.npy` file with packed fingerprints"),
+    ],
+    out_dir: Annotated[
+        Path | None,
+        Option("-o", "--out-dir", show_default=False),
+    ] = None,
+    seed: Annotated[
+        int | None,
+        Option("--seed", hidden=True, rich_help_panel="Debug"),
+    ] = None,
+) -> None:
+    import numpy as np
+    from bblean._py_similarity import _popcount
+
+    fps = np.load(in_file)
+    stem = in_file.stem
+    counts = _popcount(fps)
+    sort_idxs = np.argsort(counts)
+    fps = fps[sort_idxs]
+    if out_dir is None:
+        out_dir = Path.cwd()
+    out_dir.mkdir(exist_ok=True)
+    out_dir = out_dir.resolve()
+    np.save(out_dir / f"sorted-{stem}.npy", fps)
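The new `bb fps-sort` subcommand loads a `*.npy` file of packed fingerprints, sorts the rows by popcount and writes `sorted-<stem>.npy` into the chosen output directory. A NumPy-only sketch of the core of that command, using `np.unpackbits` as a stand-in for the package's private `_popcount`:

```python
import numpy as np

fps = np.load("fps.npy")                         # packed uint8 fingerprints, shape (n, n_bytes)
counts = np.unpackbits(fps, axis=1).sum(axis=1)  # popcount of each fingerprint
np.save("sorted-fps.npy", fps[np.argsort(counts)])
```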
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/csrc/similarity.cpp

@@ -300,6 +300,75 @@ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
   return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
 }
 
+// NOTE: This is only *slightly* faster for C++ than numpy, **only if the
+// array is uint8_t** if the array is uint64 already, it is slower
+template <typename T>
+py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
+  if (arr.ndim() != 2) {
+    throw std::runtime_error("Input array must be 2-dimensional");
+  }
+  auto arr_ptr = arr.data();
+  auto out = py::array_t<uint64_t>(arr.shape(1));
+  auto out_ptr = out.mutable_data();
+  std::memset(out_ptr, 0, out.nbytes());
+  py::ssize_t n_samples = arr.shape(0);
+  py::ssize_t n_features = arr.shape(1);
+  // Check GCC / CLang vectorize this
+  for (py::ssize_t i = 0; i < n_samples; ++i) {
+    const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
+    for (py::ssize_t j = 0; j < n_features; ++j) {
+      out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
+    }
+  }
+  return out;
+}
+py::array_t<double> _nochecks_jt_compl_isim_unpacked_u8(
+    const py::array_t<uint8_t, py::array::c_style>& fps) {
+  py::ssize_t n_objects = fps.shape(0);
+  py::ssize_t n_features = fps.shape(1);
+  auto out = py::array_t<double>(n_objects);
+  auto out_ptr = out.mutable_data();
+
+  if (n_objects < 3) {
+    PyErr_WarnEx(PyExc_RuntimeWarning,
+                 "Invalid num fps in compl_isim. Expected n_objects >= 3",
+                 1);
+    for (py::ssize_t i{0}; i != n_objects; ++i) {
+      out_ptr[i] = std::numeric_limits<double>::quiet_NaN();
+    }
+    return out;
+  }
+
+  auto linear_sum = add_rows<uint8_t>(fps);
+  auto ls_cptr = linear_sum.data();
+
+  py::array_t<uint64_t> shifted_linear_sum(n_features);
+  auto shifted_ls_ptr = shifted_linear_sum.mutable_data();
+
+  auto in_cptr = fps.data();
+  for (py::ssize_t i{0}; i != n_objects; ++i) {
+    for (py::ssize_t j{0}; j != n_features; ++j) {
+      shifted_ls_ptr[j] = ls_cptr[j] - in_cptr[i * n_features + j];
+    }
+    // For all compl isim N is n_objects - 1
+    out_ptr[i] = jt_isim_from_sum(shifted_linear_sum, n_objects - 1);
+  }
+  return out;
+}
+
+py::array_t<double> jt_compl_isim(
+    const CArrayForcecast<uint8_t>& fps, bool input_is_packed = true,
+    std::optional<py::ssize_t> n_features_opt = std::nullopt) {
+  if (fps.ndim() != 2) {
+    throw std::runtime_error("fps arr must be 2D");
+  }
+  if (input_is_packed) {
+    return _nochecks_jt_compl_isim_unpacked_u8(
+        _nochecks_unpack_fingerprints_2d(fps, n_features_opt));
+  }
+  return _nochecks_jt_compl_isim_unpacked_u8(fps);
+}
+
 // Contraint: T must be uint64_t or uint8_t
 template <typename T>
 void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,

@@ -372,33 +441,10 @@ py::array_t<double> jt_sim_packed_precalc_cardinalities(
 }
 
 py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
-
+                                           const py::array_t<uint8_t>& vec) {
   return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
 }
 
-// NOTE: This is only *slightly* faster for C++ than numpy, **only if the
-// array is uint8_t** if the array is uint64 already, it is slower
-template <typename T>
-py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
-  if (arr.ndim() != 2) {
-    throw std::runtime_error("Input array must be 2-dimensional");
-  }
-  auto arr_ptr = arr.data();
-  auto out = py::array_t<uint64_t>(arr.shape(1));
-  auto out_ptr = out.mutable_data();
-  std::memset(out_ptr, 0, out.nbytes());
-  py::ssize_t n_samples = arr.shape(0);
-  py::ssize_t n_features = arr.shape(1);
-  // Check GCC / CLang vectorize this
-  for (py::ssize_t i = 0; i < n_samples; ++i) {
-    const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
-    for (py::ssize_t j = 0; j < n_features; ++j) {
-      out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
-    }
-  }
-  return out;
-}
-
 double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
   return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
 }

@@ -406,8 +452,9 @@ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
 double jt_isim_packed_u8(
     const CArrayForcecast<uint8_t>& arr,
     std::optional<py::ssize_t> n_features_opt = std::nullopt) {
-  return jt_isim_from_sum(
-
+  return jt_isim_from_sum(
+      add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
+      arr.shape(0));
 }
 
 py::tuple jt_most_dissimilar_packed(

@@ -510,6 +557,10 @@ PYBIND11_MODULE(_cpp_similarity, m) {
   m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
         "iSIM Tanimoto calculation", py::arg("arr"));
 
+  m.def("jt_compl_isim", &jt_compl_isim, "Complementary iSIM tanimoto",
+        py::arg("fps"), py::arg("input_is_packed") = true,
+        py::arg("n_features") = std::nullopt);
+
   m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
         "Tanimoto similarity between a matrix of packed fps and a single "
         "packed fp",
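The complementary-iSIM kernel is now also implemented in C++ (the `add_rows` helper moves up so it can be reused) and is exported on the `_cpp_similarity` pybind11 module with the same `fps` / `input_is_packed` / `n_features` interface as the Python version. A hedged sketch of calling the binding directly; the import path is an assumption, and in practice the public entry point is presumably a wrapper in `bblean/similarity.py`, whose diff is not shown here:

```python
import numpy as np
from bblean._cpp_similarity import jt_compl_isim  # assumed location of the compiled extension

fps = np.load("fps.npy")                          # packed uint8 fingerprints
comp = jt_compl_isim(fps, input_is_packed=True, n_features=2048)
medoid_idx = int(np.argmin(comp))                 # same medoid rule as the Python helper
```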
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/fingerprints.py

@@ -115,7 +115,11 @@ def _get_generator(kind: str, n_features: int) -> tp.Any:
         return rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_features)
     elif kind == "ecfp6":
         return rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=n_features)
-
+    elif kind == "topological":
+        return rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_features)
+    elif kind == "ap":
+        return rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_features)
+    raise ValueError(f"Unknown kind {kind}. Use 'rdkit|ecfp4|ecfp6|topological|ap'")
 
 
 def _get_sanitize_flags(sanitize: str) -> tp.Any:
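The two new fingerprint kinds map directly onto existing RDKit generators. A sketch of the underlying RDKit calls; `GetFingerprintAsNumPy` is one way to materialize the bits in recent RDKit releases and is not necessarily how bblean converts them internally:

```python
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

mol = Chem.MolFromSmiles("c1ccccc1O")
tt_gen = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048)  # kind="topological"
ap_gen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)            # kind="ap"
tt_bits = tt_gen.GetFingerprintAsNumPy(mol)
ap_bits = ap_gen.GetFingerprintAsNumPy(mol)
```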
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/multiround.py

@@ -65,6 +65,7 @@ from bblean._config import DEFAULTS
 from bblean.utils import batched
 from bblean.bitbirch import BitBirch
 from bblean.fingerprints import _get_fps_file_num
+from bblean._py_similarity import _popcount
 
 __all__ = ["run_multiround_bitbirch"]
 

@@ -157,6 +158,7 @@ class _InitialRound:
         max_fps: int | None = None,
         merge_criterion: str = DEFAULTS.merge_criterion,
         input_is_packed: bool = True,
+        sort_fps: bool = False,
     ) -> None:
         self.n_features = n_features
         self.refinement_before_midsection = refinement_before_midsection

@@ -171,6 +173,7 @@ class _InitialRound:
         self.refine_merge_criterion = refine_merge_criterion
         self.input_is_packed = input_is_packed
         self.refine_threshold_change = refine_threshold_change
+        self._sort_fps = sort_fps
 
     def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
         file_label, fp_file, start_idx, end_idx = file_info

@@ -182,6 +185,14 @@ class _InitialRound:
             threshold=self.threshold,
             merge_criterion=self.merge_criterion,
         )
+        if self._sort_fps:
+            fp_input = np.load(fp_file)
+            counts = _popcount(fp_input)
+            sort_idxs = np.argsort(counts)
+            fp_input = fp_input[sort_idxs]
+        else:
+            fp_input = fp_file
+
         range_ = range(start_idx, end_idx)
         tree.fit(
             fp_file,

@@ -201,7 +212,7 @@ class _InitialRound:
         # Finish the first refinement step internally in this round
         tree.reset()
         tree.set_merge(
-            self.refine_merge_criterion,
+            merge_criterion=self.refine_merge_criterion,
             tolerance=self.tolerance,
             threshold=self.threshold + self.refine_threshold_change,
         )

@@ -225,7 +236,7 @@ class _TreeMergingRound:
         round_idx: int,
         out_dir: Path | str,
         split_largest_cluster: bool,
-
+        merge_criterion: str,
         all_fp_paths: tp.Sequence[Path] = (),
     ) -> None:
         self.all_fp_paths = list(all_fp_paths)

@@ -235,14 +246,14 @@ class _TreeMergingRound:
         self.round_idx = round_idx
         self.out_dir = Path(out_dir)
         self.split_largest_cluster = split_largest_cluster
-        self.
+        self.merge_criterion = merge_criterion
 
     def __call__(self, batch_info: tuple[str, tp.Sequence[tuple[Path, Path]]]) -> None:
         batch_label, batch_path_pairs = batch_info
         tree = BitBirch(
             branching_factor=self.branching_factor,
             threshold=self.threshold,
-            merge_criterion=self.
+            merge_criterion=self.merge_criterion,
             tolerance=self.tolerance,
         )
         # Rebuild a tree, inserting all BitFeatures from the corresponding batch

@@ -270,13 +281,20 @@ class _FinalTreeMergingRound(_TreeMergingRound):
         branching_factor: int,
         threshold: float,
         tolerance: float,
-
+        merge_criterion: str,
         out_dir: Path | str,
         save_tree: bool,
         save_centroids: bool,
     ) -> None:
         super().__init__(
-            branching_factor,
+            branching_factor,
+            threshold,
+            tolerance,
+            -1,
+            out_dir,
+            False,
+            merge_criterion,
+            (),
         )
         self.save_tree = save_tree
         self.save_centroids = save_centroids

@@ -286,7 +304,7 @@ class _FinalTreeMergingRound(_TreeMergingRound):
         tree = BitBirch(
             branching_factor=self.branching_factor,
             threshold=self.threshold,
-            merge_criterion=self.
+            merge_criterion=self.merge_criterion,
             tolerance=self.tolerance,
         )
         # Rebuild a tree, inserting all BitFeatures from the corresponding batch

@@ -298,13 +316,8 @@ class _FinalTreeMergingRound(_TreeMergingRound):
 
         # Save clusters and exit
         if self.save_tree:
-            # TODO:
-
-            _old_limit = sys.getrecursionlimit()
-            sys.setrecursionlimit(100_000)
-            with open(self.out_dir / "bitbirch.pkl", mode="wb") as f:
-                pickle.dump(tree, f)
-            sys.setrecursionlimit(_old_limit)
+            # TODO: Find alternative solution
+            tree.save(self.out_dir / "bitbirch.pkl")
         tree.delete_internal_nodes()
         if self.save_centroids:
             output = tree.get_centroids_mol_ids()

@@ -358,6 +371,7 @@ def run_multiround_bitbirch(
     mp_context: tp.Any = None,
     save_tree: bool = False,
     save_centroids: bool = True,
+    sort_fps: bool = False,
     # Debug
     max_fps: int | None = None,
     verbose: bool = False,

@@ -404,6 +418,7 @@ def run_multiround_bitbirch(
     console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
 
     initial_fn = _InitialRound(
+        sort_fps=sort_fps,
         n_features=n_features,
         refinement_before_midsection=refinement_before_midsection,
         max_fps=max_fps,

@@ -441,7 +456,7 @@ def run_multiround_bitbirch(
         round_idx=round_idx,
         all_fp_paths=input_files,
         split_largest_cluster=split_largest_after_each_midsection_round,
-
+        merge_criterion=midsection_merge_criterion,
         threshold=threshold + midsection_threshold_change,
         **common_kwargs,
     )

@@ -469,7 +484,7 @@ def run_multiround_bitbirch(
     final_fn = _FinalTreeMergingRound(
         save_tree=save_tree,
         save_centroids=save_centroids,
-
+        merge_criterion=final_merge_criterion,
         threshold=threshold + midsection_threshold_change,
         **common_kwargs,
     )
{bblean-0.6.0b2 → bblean-0.7.2b0}/bblean/plotting.py

@@ -399,13 +399,17 @@ def dump_mol_images(
     clusters: list[list[int]],
     cluster_idx: int = 0,
     batch_size: int = 30,
+    limit: int = -1,
 ) -> None:
     r"""Dump smiles associated with a specific cluster as ``*.png`` image files"""
     if isinstance(smiles, str):
         smiles = [smiles]
     smiles = np.asarray(smiles)
     idxs = clusters[cluster_idx]
+    num = 0
     for i, idx_seq in enumerate(batched(idxs, batch_size)):
+        if num + len(idx_seq) > limit:
+            idx_seq = idx_seq[: num + len(idx_seq) - limit]
         mols = []
         for smi in smiles[list(idx_seq)]:
             mol = Chem.MolFromSmiles(smi)

@@ -415,6 +419,9 @@ def dump_mol_images(
         img = Draw.MolsToGridImage(mols, molsPerRow=5)
         with open(f"cluster_{cluster_idx}_{i}.png", "wb") as f:
             f.write(img.data)
+        num += len(idx_seq)
+        if num >= limit:
+            break
 
 
 # For internal use, dispatches a visualization workflow and optionally saves
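
A small usage sketch of the new `limit` parameter, which is intended to bound how many molecules of a cluster get rendered; the import path is inferred from the file location and the toy inputs are illustrative only:

```python
from bblean.plotting import dump_mol_images  # assumed import path (bblean/plotting.py)

smiles = ["CCO", "CCN", "c1ccccc1O"]  # toy data
clusters = [[0, 1, 2]]                # a single cluster containing all three molecules
# Cap the rendering of cluster 0; images are written as cluster_0_<batch>.png grids
dump_mol_images(smiles, clusters, cluster_idx=0, limit=2)
```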
|