bblean 0.6.0b2__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cp313-win_amd64.pyd +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1850 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b2.dist-info/METADATA +288 -0
- bblean-0.6.0b2.dist-info/RECORD +31 -0
- bblean-0.6.0b2.dist-info/WHEEL +5 -0
- bblean-0.6.0b2.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b2.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b2.dist-info/top_level.txt +1 -0
bblean/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
r"""BitBIRCH-Lean, a high-throughput, memory-efficient implementation of BitBIRCH
|
|
2
|
+
|
|
3
|
+
BitBIRCH-Lean is designed for high-throughput clustering of huge molecular
|
|
4
|
+
libraries (of up to hundreds of millions of molecules).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from bblean.smiles import load_smiles
|
|
8
|
+
from bblean.fingerprints import fps_from_smiles
|
|
9
|
+
from bblean.bitbirch import BitBirch, set_merge
|
|
10
|
+
from bblean.fingerprints import pack_fingerprints, unpack_fingerprints
|
|
11
|
+
from bblean._version import __version__
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
# Global namespace for convenience
|
|
15
|
+
"BitBirch",
|
|
16
|
+
"set_merge",
|
|
17
|
+
"pack_fingerprints",
|
|
18
|
+
"unpack_fingerprints",
|
|
19
|
+
"load_smiles",
|
|
20
|
+
"fps_from_smiles",
|
|
21
|
+
"__version__",
|
|
22
|
+
]
|
bblean/_config.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
r"""Global defaults and related utilities"""
|
|
2
|
+
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import typing as tp
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
import dataclasses
|
|
9
|
+
import multiprocessing as mp
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
from bblean._memory import system_mem_gib
|
|
15
|
+
from bblean.utils import (
|
|
16
|
+
_cpu_name,
|
|
17
|
+
cpp_extensions_are_enabled,
|
|
18
|
+
cpp_extensions_are_installed,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclasses.dataclass(slots=True)
|
|
23
|
+
class BitBirchConfig:
|
|
24
|
+
threshold: float = 0.30
|
|
25
|
+
branching_factor: int = 254
|
|
26
|
+
merge_criterion: str = "diameter"
|
|
27
|
+
refine_merge_criterion: str = "tolerance-diameter"
|
|
28
|
+
refine_threshold_change: float = 0.0
|
|
29
|
+
tolerance: float = 0.05
|
|
30
|
+
n_features: int = 2048
|
|
31
|
+
fp_kind: str = "ecfp4"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
DEFAULTS = BitBirchConfig()
|
|
35
|
+
|
|
36
|
+
TSNE_SEED = 42
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def collect_system_specs_and_dump_config(
|
|
40
|
+
config: dict[str, tp.Any],
|
|
41
|
+
) -> None:
|
|
42
|
+
config = deepcopy(config)
|
|
43
|
+
config_path = Path(config["out_dir"]) / "config.json"
|
|
44
|
+
total_mem, avail_mem = system_mem_gib()
|
|
45
|
+
# System info
|
|
46
|
+
config["cpp_extensions_enabled"] = cpp_extensions_are_enabled()
|
|
47
|
+
config["cpp_extensions_installed"] = cpp_extensions_are_installed()
|
|
48
|
+
config["total_memory_gib"] = total_mem
|
|
49
|
+
config["initial_available_memory_gib"] = avail_mem
|
|
50
|
+
config["platform"] = sys.platform
|
|
51
|
+
config["cpu"] = _cpu_name()
|
|
52
|
+
config["numpy_version"] = np.__version__
|
|
53
|
+
config["python_version"] = sys.version.split()[0]
|
|
54
|
+
# Multiprocessing info
|
|
55
|
+
if config.get("num_processes", 1) > 1:
|
|
56
|
+
config["multiprocessing_start_method"] = mp.get_start_method()
|
|
57
|
+
config["visible_cpu_cores"] = os.cpu_count()
|
|
58
|
+
|
|
59
|
+
# Dump config after checking if the output dir has files
|
|
60
|
+
with open(config_path, mode="wt", encoding="utf-8") as f:
|
|
61
|
+
json.dump(config, f, indent=4)
|
bblean/_console.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
r"""Pretty printing"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import numpy as np
|
|
5
|
+
import typing as tp
|
|
6
|
+
import os
|
|
7
|
+
import multiprocessing as mp
|
|
8
|
+
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
from bblean._memory import get_peak_memory_gib
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BBConsole(Console):
|
|
15
|
+
def print_banner(self) -> None:
|
|
16
|
+
if os.environ.get("BITBIRCHNOBANNER", ""):
|
|
17
|
+
return
|
|
18
|
+
banner = r"""[bold]
|
|
19
|
+
______ _ _ ______ _ _
|
|
20
|
+
| ___ (_) | | ___ (_) | | [/bold][cyan] ______ [/cyan][bold]
|
|
21
|
+
| |_/ /_| |_| |_/ /_ _ __ ___| |__ [/bold][cyan] ___ / ___________ _______ [/cyan][bold]
|
|
22
|
+
| ___ \ | __| ___ \ | '__/ __| '_ \ [/bold][cyan] __ / _ _ \ __ `/_ __ \ [/cyan][bold]
|
|
23
|
+
| |_/ / | |_| |_/ / | | | (__| | | | [/bold][cyan] _ /___/ __/ /_/ /_ / / / [/cyan][bold]
|
|
24
|
+
\____/|_|\__\____/|_|_| \___|_| |_| [/bold][cyan] /_____/\___/\__,_/ /_/ /_/ [/cyan][bold]""" # noqa W291
|
|
25
|
+
self.print(banner, highlight=False)
|
|
26
|
+
self.print()
|
|
27
|
+
self.print()
|
|
28
|
+
self.print(
|
|
29
|
+
r"""BitBirch-Lean is developed by the [bold]Miranda-Quintana Lab[/bold] https://github.com/mqcomplab
|
|
30
|
+
If you find this software useful please cite the following articles:
|
|
31
|
+
[yellow]•[/yellow] [italic]BitBIRCH: efficient clustering of large molecular libraries[/italic]:
|
|
32
|
+
https://doi.org/10.1039/D5DD00030K
|
|
33
|
+
[yellow]•[/yellow] [italic]BitBIRCH Clustering Refinement Strategies[/italic]:
|
|
34
|
+
https://doi.org/10.1021/acs.jcim.5c00627
|
|
35
|
+
[yellow]•[/yellow] [italic]BitBIRCH-Lean[/italic]:
|
|
36
|
+
(preprint) https://www.biorxiv.org/content/10.1101/2025.10.22.684015v1""" # noqa
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
def print_peak_mem(self, out_dir: Path, indent: bool = True) -> None:
|
|
40
|
+
peak_mem_gib = get_peak_memory_gib(out_dir)
|
|
41
|
+
if peak_mem_gib is None:
|
|
42
|
+
return
|
|
43
|
+
indent_str = " " * 4 if indent else ""
|
|
44
|
+
self.print(f"{indent_str}- Peak RAM use: {peak_mem_gib:.4f} GiB")
|
|
45
|
+
|
|
46
|
+
def print_config(self, config: dict[str, tp.Any]) -> None:
    r"""Print a summary of the settings of a single-round, serial clustering run

    Keys always read from ``config``: ``num_fps_loaded``, ``branching_factor``,
    ``merge_criterion``, ``threshold``, ``input_files``, ``out_dir`` and
    ``refine_num``.

    Keys read conditionally: ``tolerance`` (only if a printed merge criterion name
    contains ``"tolerance"``), ``refine_merge_criterion`` and
    ``refine_threshold_change`` (only if ``refine_num > 0``).

    Optional debug keys: ``bitbirch_variant`` (defaults to ``"lean"``),
    ``max_files`` and ``max_fps`` (both default to ``None``, in which case
    nothing is printed for them).
    """
    num_fps_loaded = np.array(config["num_fps_loaded"])
    total_fps_num = num_fps_loaded.sum()
    # Format the per-file counts with thousands separators, and drop the
    # enclosing brackets of the array's string representation
    with np.printoptions(formatter={"int": "{:,}".format}, threshold=10):
        num_fps_str = str(num_fps_loaded)[1:-1]
    self.print(
        f"Running [bold]single-round, serial (1 process)[/bold] clustering\n\n"
        f"- Branching factor: {config['branching_factor']:,}\n"
        f"- Merge criterion: [yellow]{config['merge_criterion']}[/yellow]\n"
        f"- Threshold: {config['threshold']}\n"
        f"- Num. files loaded: {len(config['input_files']):,}\n"
        f"- Num. fingerprints loaded for each file: {num_fps_str}\n"
        f"- Total num. fingerprints: {total_fps_num:,}\n"
        f"- Output directory: {config['out_dir']}\n",
        end="",
    )
    bb_variant = config.get("bitbirch_variant", "lean")
    max_files = config.get("max_files", None)
    max_fps = config.get("max_fps", None)
    if "tolerance" in config["merge_criterion"]:
        self.print(f"- Tolerance: {config['tolerance']}\n", end="")
    if config["refine_num"] > 0:
        self.print(
            f"- Will refine largest {config['refine_num']} clusters\n", end=""
        )
        self.print(f"- Num. clusters to refine: {config['refine_num']}\n", end="")
        self.print(
            "- Refine criterion: "
            f"[yellow]{config['refine_merge_criterion']}[/yellow]\n",
            end="",
        )
        if "tolerance" in config["refine_merge_criterion"]:
            self.print(f"- Refine tolerance: {config['tolerance']}\n", end="")
        self.print(
            f"- Refine threshold change: {config['refine_threshold_change']}\n",
            end="",
        )
    if bb_variant != "lean":
        # This must be an f-string that interpolates 'bb_variant', otherwise the
        # literal text "{variant}" is printed instead of the variant name
        self.print(
            f"- [bold]DEBUG:[/bold] Using bitbirch version: {bb_variant}\n", end=""
        )
    if max_files is not None:
        self.print(
            f"- [bold]DEBUG:[/bold] Max files to load: {max_files:,}\n", end=""
        )
    if max_fps is not None:
        self.print(
            f"- [bold]DEBUG:[/bold] Max fps to load per file: {max_fps:,}\n", end=""
        )
    self.print()
|
|
96
|
+
|
|
97
|
+
def print_multiround_config(
    self, config: dict[str, tp.Any], mp_context: tp.Any = None
) -> None:
    r"""Print a summary of the settings of a multi-round clustering run

    Keys always read from ``config``: ``num_fps_loaded``, ``branching_factor``,
    ``initial_merge_criterion``, ``threshold``, ``tolerance``, ``input_files``
    and ``out_dir``.

    Optional keys: ``num_initial_processes`` (defaults to 1),
    ``full_refinement_before_midsection`` (defaults to ``False``), ``bin_size``,
    and the debug keys ``bitbirch_variant`` (defaults to ``"lean"``),
    ``max_files`` and ``max_fps``.

    ``mp_context`` is the multiprocessing context whose start method is reported
    when more than one process is used. If ``None``, the context returned by
    ``multiprocessing.get_context()`` is used.
    """
    if mp_context is None:
        mp_context = mp.get_context()
    num_processes = config.get("num_initial_processes", 1)
    extra_desc = (
        f"parallel (max {num_processes:,} processes)"
        if num_processes > 1
        else "serial (1 process)"
    )
    desc = f"multi-round, {extra_desc}"
    num_fps_loaded = np.array(config["num_fps_loaded"])
    total_fps_num = num_fps_loaded.sum()
    # Format the per-file counts with thousands separators, and drop the
    # enclosing brackets of the array's string representation
    with np.printoptions(formatter={"int": "{:,}".format}, threshold=10):
        num_fps_str = str(num_fps_loaded)[1:-1]
    self.print(
        f"Running [bold]{desc}[/bold] clustering\n\n"
        f"- Branching factor: {config['branching_factor']:,}\n"
        f"- Initial round merge criterion: [yellow]{config['initial_merge_criterion']}[/yellow]\n"  # noqa:E501
        f"- Threshold: {config['threshold']}\n"
        f"- Tolerance: {config['tolerance']}\n"
        f"- Num. files loaded: {len(config['input_files']):,}\n"
        f"- Num. fingerprints loaded for each file: {num_fps_str}\n"
        f"- Total num. fingerprints: {total_fps_num:,}\n"
        f"- Output directory: {config['out_dir']}\n",
        end="",
    )
    full_refinement_before_midsection = config.get(
        "full_refinement_before_midsection", False
    )
    bb_variant = config.get("bitbirch_variant", "lean")
    max_files = config.get("max_files", None)
    bin_size = config.get("bin_size", None)
    max_fps = config.get("max_fps", None)
    if bin_size is not None:
        self.print(f"- Bin size for second round: {bin_size:,}\n", end="")
    if num_processes > 1:
        # Use the public 'get_start_method()' instead of the private '_name'
        # attribute, which is only defined on concrete context classes
        self.print(
            f"- Multiprocessing method: [yellow]{mp_context.get_start_method()}[/yellow]\n",  # noqa:E501
            end="",
        )
    # The flag is only reported when it is falsy
    # NOTE(review): confirm that printing only in the disabled case is intended
    if not full_refinement_before_midsection:
        self.print(
            f"- Full refinement before midsection: {full_refinement_before_midsection}\n",  # noqa:E501
            end="",
        )
    if bb_variant != "lean":
        # This must be an f-string that interpolates 'bb_variant', otherwise the
        # literal text "{variant}" is printed instead of the variant name
        self.print(
            f"- [bold]DEBUG:[/bold] Using bitbirch version: {bb_variant}\n", end=""
        )
    if max_files is not None:
        self.print(
            f"- [bold]DEBUG:[/bold] Max files to load: {max_files:,}\n", end=""
        )
    if max_fps is not None:
        self.print(
            f"- [bold]DEBUG:[/bold] Max fps to load per file: {max_fps:,}\n", end=""
        )
    self.print()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class SilentConsole(BBConsole):
|
|
160
|
+
def print(self, *args: tp.Any, **kwargs: tp.Any) -> None:
|
|
161
|
+
pass
|
|
162
|
+
|
|
163
|
+
def print_peak_mem(self, out_dir: Path, indent: bool = True) -> None:
|
|
164
|
+
pass
|
|
165
|
+
|
|
166
|
+
def print_banner(self) -> None:
|
|
167
|
+
pass
|
|
168
|
+
|
|
169
|
+
def status(self, *args: tp.Any, **kwargs: tp.Any) -> tp.Any:
|
|
170
|
+
class DummyStatus:
|
|
171
|
+
def __enter__(self) -> tp.Any:
|
|
172
|
+
return self
|
|
173
|
+
|
|
174
|
+
def __exit__(self, *args: tp.Any, **kwargs: tp.Any) -> None:
|
|
175
|
+
pass
|
|
176
|
+
|
|
177
|
+
return DummyStatus()
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
_console = BBConsole()
|
|
181
|
+
_silent_console = SilentConsole()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def get_console(silent: bool = False) -> BBConsole:
|
|
185
|
+
if silent:
|
|
186
|
+
return _silent_console
|
|
187
|
+
return _console
|
|
Binary file
|
|
File without changes
|