bblean 0.6.0b1__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bblean/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ r"""BitBIRCH-Lean, a high-throughput, memory-efficient implementation of BitBIRCH
2
+
3
+ BitBIRCH-Lean is designed for high-throughput clustering of huge molecular
4
+ libraries (of up to hundreds of millions of molecules).
5
+ """
6
+
7
+ from bblean.smiles import load_smiles
8
+ from bblean.fingerprints import fps_from_smiles
9
+ from bblean.bitbirch import BitBirch, set_merge
10
+ from bblean.fingerprints import pack_fingerprints, unpack_fingerprints
11
+ from bblean._version import __version__
12
+
13
# Public names of the package: everything listed here is imported at the top
# of this module, so `from bblean import *` exposes exactly these names.
__all__ = [
    # Global namespace for convenience
    "BitBirch",
    "set_merge",
    "pack_fingerprints",
    "unpack_fingerprints",
    "load_smiles",
    "fps_from_smiles",
    "__version__",
]
bblean/_config.py ADDED
@@ -0,0 +1,61 @@
1
+ r"""Global defaults and related utilities"""
2
+
3
+ from copy import deepcopy
4
+ from pathlib import Path
5
+ import typing as tp
6
+ import json
7
+ import sys
8
+ import dataclasses
9
+ import multiprocessing as mp
10
+ import os
11
+
12
+ import numpy as np
13
+
14
+ from bblean._memory import system_mem_gib
15
+ from bblean.utils import (
16
+ _cpu_name,
17
+ cpp_extensions_are_enabled,
18
+ cpp_extensions_are_installed,
19
+ )
20
+
21
+
22
+ @dataclasses.dataclass(slots=True)
23
+ class BitBirchConfig:
24
+ threshold: float = 0.30
25
+ branching_factor: int = 254
26
+ merge_criterion: str = "diameter"
27
+ refine_merge_criterion: str = "tolerance-diameter"
28
+ refine_threshold_change: float = 0.0
29
+ tolerance: float = 0.05
30
+ n_features: int = 2048
31
+ fp_kind: str = "ecfp4"
32
+
33
+
34
+ DEFAULTS = BitBirchConfig()
35
+
36
+ TSNE_SEED = 42
37
+
38
+
39
def collect_system_specs_and_dump_config(
    config: dict[str, tp.Any],
) -> None:
    r"""Write ``config``, extended with system details, to ``config.json``.

    The caller's dict is left untouched: a deep copy is annotated with
    extension, memory, platform, CPU and version information and then
    serialized as indented JSON to ``<config["out_dir"]>/config.json``.
    When ``config["num_processes"]`` is greater than 1, the multiprocessing
    start method and the visible CPU count are recorded as well.
    """
    snapshot = deepcopy(config)
    target = Path(snapshot["out_dir"]) / "config.json"
    mem_total, mem_avail = system_mem_gib()

    # System info
    snapshot.update(
        {
            "cpp_extensions_enabled": cpp_extensions_are_enabled(),
            "cpp_extensions_installed": cpp_extensions_are_installed(),
            "total_memory_gib": mem_total,
            "initial_available_memory_gib": mem_avail,
            "platform": sys.platform,
            "cpu": _cpu_name(),
            "numpy_version": np.__version__,
            "python_version": sys.version.split()[0],
        }
    )

    # Multiprocessing info, only relevant for parallel runs
    if snapshot.get("num_processes", 1) > 1:
        snapshot.update(
            {
                "multiprocessing_start_method": mp.get_start_method(),
                "visible_cpu_cores": os.cpu_count(),
            }
        )

    with target.open(mode="wt", encoding="utf-8") as stream:
        json.dump(snapshot, stream, indent=4)
bblean/_console.py ADDED
@@ -0,0 +1,187 @@
1
+ r"""Pretty printing"""
2
+
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import typing as tp
6
+ import os
7
+ import multiprocessing as mp
8
+
9
+ from rich.console import Console
10
+
11
+ from bblean._memory import get_peak_memory_gib
12
+
13
+
14
class BBConsole(Console):
    r"""Console subclass with helpers that print BitBirch-Lean run reports."""

    def print_banner(self) -> None:
        r"""Print the ASCII-art banner followed by the citation notice.

        Nothing is printed when the ``BITBIRCHNOBANNER`` environment variable
        is set to a non-empty value.
        """
        if os.environ.get("BITBIRCHNOBANNER", ""):
            return
        banner = r"""[bold]
______ _ _ ______ _ _
| ___ (_) | | ___ (_) | | [/bold][cyan] ______ [/cyan][bold]
| |_/ /_| |_| |_/ /_ _ __ ___| |__ [/bold][cyan] ___ / ___________ _______ [/cyan][bold]
| ___ \ | __| ___ \ | '__/ __| '_ \ [/bold][cyan] __ / _ _ \ __ `/_ __ \ [/cyan][bold]
| |_/ / | |_| |_/ / | | | (__| | | | [/bold][cyan] _ /___/ __/ /_/ /_ / / / [/cyan][bold]
\____/|_|\__\____/|_|_| \___|_| |_| [/bold][cyan] /_____/\___/\__,_/ /_/ /_/ [/cyan][bold]"""  # noqa W291
        # highlight=False keeps the art from being auto-highlighted by the console
        self.print(banner, highlight=False)
        self.print()
        self.print()
        self.print(
            r"""BitBirch-Lean is developed by the [bold]Miranda-Quintana Lab[/bold] https://github.com/mqcomplab
If you find this software useful please cite the following articles:
[yellow]•[/yellow] [italic]BitBIRCH: efficient clustering of large molecular libraries[/italic]:
https://doi.org/10.1039/D5DD00030K
[yellow]•[/yellow] [italic]BitBIRCH Clustering Refinement Strategies[/italic]:
https://doi.org/10.1021/acs.jcim.5c00627
[yellow]•[/yellow] [italic]BitBIRCH-Lean[/italic]:
(preprint) https://www.biorxiv.org/content/10.1101/2025.10.22.684015v1"""  # noqa
        )

    def print_peak_mem(self, out_dir: Path, indent: bool = True) -> None:
        r"""Print the peak RAM usage obtained from ``get_peak_memory_gib(out_dir)``.

        Prints nothing when that helper returns ``None``. With ``indent`` set,
        the line is prefixed with four spaces.
        """
        peak_mem_gib = get_peak_memory_gib(out_dir)
        if peak_mem_gib is None:
            return
        indent_str = " " * 4 if indent else ""
        self.print(f"{indent_str}- Peak RAM use: {peak_mem_gib:.4f} GiB")

    @staticmethod
    def _bb_fps_summary(config: dict[str, tp.Any]) -> tuple[str, tp.Any]:
        r"""Return the formatted per-file fingerprint counts and their total."""
        num_fps_loaded = np.array(config["num_fps_loaded"])
        # Thousands separators for each count; long arrays are abbreviated
        with np.printoptions(formatter={"int": "{:,}".format}, threshold=10):
            # Strip the surrounding brackets of the array repr
            num_fps_str = str(num_fps_loaded)[1:-1]
        return num_fps_str, num_fps_loaded.sum()

    def _bb_print_debug_options(self, config: dict[str, tp.Any]) -> None:
        r"""Print the DEBUG lines for the optional, non-default settings."""
        bb_variant = config.get("bitbirch_variant", "lean")
        max_files = config.get("max_files", None)
        max_fps = config.get("max_fps", None)
        if bb_variant != "lean":
            # Must be an f-string: a plain string printed a literal placeholder
            self.print(
                f"- [bold]DEBUG:[/bold] Using bitbirch version: {bb_variant}\n",
                end="",
            )
        if max_files is not None:
            self.print(
                f"- [bold]DEBUG:[/bold] Max files to load: {max_files:,}\n", end=""
            )
        if max_fps is not None:
            self.print(
                f"- [bold]DEBUG:[/bold] Max fps to load per file: {max_fps:,}\n", end=""
            )

    def print_config(self, config: dict[str, tp.Any]) -> None:
        r"""Print the settings of a single-round, serial clustering run.

        Required keys of ``config``: ``num_fps_loaded``, ``branching_factor``,
        ``merge_criterion``, ``threshold``, ``input_files``, ``out_dir`` and
        ``refine_num``. ``tolerance``, ``refine_merge_criterion`` and
        ``refine_threshold_change`` are read only when the corresponding
        section is printed. Optional keys: ``bitbirch_variant``, ``max_files``
        and ``max_fps``.
        """
        num_fps_str, total_fps_num = self._bb_fps_summary(config)
        self.print(
            f"Running [bold]single-round, serial (1 process)[/bold] clustering\n\n"
            f"- Branching factor: {config['branching_factor']:,}\n"
            f"- Merge criterion: [yellow]{config['merge_criterion']}[/yellow]\n"
            f"- Threshold: {config['threshold']}\n"
            f"- Num. files loaded: {len(config['input_files']):,}\n"
            f"- Num. fingerprints loaded for each file: {num_fps_str}\n"
            f"- Total num. fingerprints: {total_fps_num:,}\n"
            f"- Output directory: {config['out_dir']}\n",
            end="",
        )
        if "tolerance" in config["merge_criterion"]:
            self.print(f"- Tolerance: {config['tolerance']}\n", end="")
        # NOTE(review): the extent of this branch was inferred from a flattened
        # listing; all refine-related lines are assumed to belong to it -- confirm
        if config["refine_num"] > 0:
            self.print(
                f"- Will refine largest {config['refine_num']} clusters\n", end=""
            )
            self.print(f"- Num. clusters to refine: {config['refine_num']}\n", end="")
            self.print(
                "- Refine criterion: "
                f"[yellow]{config['refine_merge_criterion']}[/yellow]\n",
                end="",
            )
            if "tolerance" in config["refine_merge_criterion"]:
                self.print(f"- Refine tolerance: {config['tolerance']}\n", end="")
            self.print(
                f"- Refine threshold change: {config['refine_threshold_change']}\n",
                end="",
            )
        self._bb_print_debug_options(config)
        self.print()

    def print_multiround_config(
        self, config: dict[str, tp.Any], mp_context: tp.Any = None
    ) -> None:
        r"""Print the settings of a multi-round clustering run.

        Required keys of ``config``: ``num_fps_loaded``, ``branching_factor``,
        ``initial_merge_criterion``, ``threshold``, ``tolerance``,
        ``input_files`` and ``out_dir``. Optional keys:
        ``num_initial_processes`` (default 1), ``bin_size``,
        ``full_refinement_before_midsection``, ``bitbirch_variant``,
        ``max_files`` and ``max_fps``.

        ``mp_context`` is the multiprocessing context whose start method is
        reported for parallel runs; it defaults to ``mp.get_context()``.
        """
        if mp_context is None:
            mp_context = mp.get_context()
        num_processes = config.get("num_initial_processes", 1)
        extra_desc = (
            f"parallel (max {num_processes:,} processes)"
            if num_processes > 1
            else "serial (1 process)"
        )
        desc = f"multi-round, {extra_desc}"
        num_fps_str, total_fps_num = self._bb_fps_summary(config)
        self.print(
            f"Running [bold]{desc}[/bold] clustering\n\n"
            f"- Branching factor: {config['branching_factor']:,}\n"
            f"- Initial round merge criterion: [yellow]{config['initial_merge_criterion']}[/yellow]\n"  # noqa:E501
            f"- Threshold: {config['threshold']}\n"
            f"- Tolerance: {config['tolerance']}\n"
            f"- Num. files loaded: {len(config['input_files']):,}\n"
            f"- Num. fingerprints loaded for each file: {num_fps_str}\n"
            f"- Total num. fingerprints: {total_fps_num:,}\n"
            f"- Output directory: {config['out_dir']}\n",
            end="",
        )
        full_refinement_before_midsection = config.get(
            "full_refinement_before_midsection", False
        )
        bin_size = config.get("bin_size", None)
        if bin_size is not None:
            self.print(f"- Bin size for second round: {bin_size:,}\n", end="")
        if num_processes > 1:
            # Public accessor instead of the private ``_name`` attribute
            self.print(
                "- Multiprocessing method: "
                f"[yellow]{mp_context.get_start_method()}[/yellow]\n",
                end="",
            )
        # NOTE(review): this line is printed only when the option is falsy --
        # confirm that reporting just the disabled state is intended
        if not full_refinement_before_midsection:
            self.print(
                f"- Full refinement before midsection: {full_refinement_before_midsection}\n",  # noqa:E501
                end="",
            )
        self._bb_print_debug_options(config)
        self.print()
157
+
158
+
159
class SilentConsole(BBConsole):
    r"""Console variant whose output methods are all no-ops."""

    def print(self, *args: tp.Any, **kwargs: tp.Any) -> None:
        r"""Accept any arguments and print nothing."""
        return None

    def print_banner(self) -> None:
        r"""Skip the banner."""
        return None

    def print_peak_mem(self, out_dir: Path, indent: bool = True) -> None:
        r"""Skip the peak-memory report."""
        return None

    def status(self, *args: tp.Any, **kwargs: tp.Any) -> tp.Any:
        r"""Return a context manager that does nothing on enter and exit."""

        class DummyStatus:
            r"""Inert stand-in usable in a ``with`` statement."""

            def __enter__(self) -> tp.Any:
                return self

            def __exit__(self, *args: tp.Any, **kwargs: tp.Any) -> None:
                return None

        return DummyStatus()
178
+
179
+
180
# Module-level instances handed out by get_console
_console = BBConsole()
_silent_console = SilentConsole()


def get_console(silent: bool = False) -> BBConsole:
    r"""Return the shared console, or the silent one when ``silent`` is true."""
    return _silent_console if silent else _console
File without changes