bblean 0.6.0b1__cp312-cp312-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/__init__.py +22 -0
- bblean/_config.py +61 -0
- bblean/_console.py +187 -0
- bblean/_cpp_similarity.cpython-312-darwin.so +0 -0
- bblean/_legacy/__init__.py +0 -0
- bblean/_legacy/bb_int64.py +1252 -0
- bblean/_legacy/bb_uint8.py +1144 -0
- bblean/_memory.py +198 -0
- bblean/_merges.py +212 -0
- bblean/_py_similarity.py +278 -0
- bblean/_timer.py +42 -0
- bblean/_version.py +34 -0
- bblean/analysis.py +258 -0
- bblean/bitbirch.py +1437 -0
- bblean/cli.py +1854 -0
- bblean/csrc/README.md +1 -0
- bblean/csrc/similarity.cpp +521 -0
- bblean/fingerprints.py +424 -0
- bblean/metrics.py +199 -0
- bblean/multiround.py +489 -0
- bblean/plotting.py +479 -0
- bblean/similarity.py +304 -0
- bblean/sklearn.py +203 -0
- bblean/smiles.py +61 -0
- bblean/utils.py +130 -0
- bblean-0.6.0b1.dist-info/METADATA +283 -0
- bblean-0.6.0b1.dist-info/RECORD +31 -0
- bblean-0.6.0b1.dist-info/WHEEL +6 -0
- bblean-0.6.0b1.dist-info/entry_points.txt +2 -0
- bblean-0.6.0b1.dist-info/licenses/LICENSE +48 -0
- bblean-0.6.0b1.dist-info/top_level.txt +1 -0
bblean/cli.py
ADDED
|
@@ -0,0 +1,1854 @@
|
|
|
1
|
+
r"""Command line interface entrypoints"""
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
import random
|
|
5
|
+
import typing as tp
|
|
6
|
+
import math
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
import pickle
|
|
10
|
+
import multiprocessing as mp
|
|
11
|
+
import multiprocessing.shared_memory as shmem
|
|
12
|
+
from typing import Annotated
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from typer import Typer, Argument, Option, Abort, Context, Exit
|
|
16
|
+
|
|
17
|
+
from bblean._memory import launch_monitor_rss_daemon
|
|
18
|
+
from bblean._timer import Timer
|
|
19
|
+
from bblean._config import DEFAULTS, collect_system_specs_and_dump_config, TSNE_SEED
|
|
20
|
+
from bblean.utils import _import_bitbirch_variant, batched
|
|
21
|
+
|
|
22
|
+
# Single Typer application object for the whole CLI; every subcommand below is
# registered on it via `@app.command(...)`. Markdown markup is enabled so the
# per-option help strings may use *emphasis* etc.; shell completion is disabled.
app = Typer(
    rich_markup_mode="markdown",
    add_completion=False,
    help=r"""CLI tool for serial or parallel fast clustering of molecular fingerprints
using the memory-efficient and compute-efficient *O(N)* BitBIRCH algorithm ('Lean'
version). For more info about the subcommands run `bb <subcommand> --help `.""",
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _print_help_banner(ctx: Context, value: bool) -> None:
    """Eager ``--help`` callback: print the app banner, then the standard help
    text, and terminate the CLI.

    ``value`` is the parsed flag value supplied by Typer; when the flag was not
    passed this is a no-op so normal command processing continues.
    """
    if not value:
        return
    # Imported lazily so the console machinery is only paid for when needed
    from bblean._console import get_console

    term = get_console()
    term.print_banner()
    term.print(ctx.get_help())
    raise Exit()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _validate_output_dir(out_dir: Path, overwrite: bool = False) -> None:
|
|
42
|
+
if out_dir.exists():
|
|
43
|
+
if not out_dir.is_dir():
|
|
44
|
+
raise RuntimeError("Output dir should be a dir")
|
|
45
|
+
if any(out_dir.iterdir()):
|
|
46
|
+
if overwrite:
|
|
47
|
+
shutil.rmtree(out_dir)
|
|
48
|
+
else:
|
|
49
|
+
raise RuntimeError(f"Output dir {out_dir} has files")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Validate that the naming convention for the input files is correct
|
|
53
|
+
def _validate_input_dir(in_dir: Path | str) -> None:
|
|
54
|
+
in_dir = Path(in_dir)
|
|
55
|
+
if not in_dir.is_dir():
|
|
56
|
+
raise RuntimeError(f"Input dir {in_dir} should be a dir")
|
|
57
|
+
if not any(in_dir.glob("*.npy")):
|
|
58
|
+
raise RuntimeError(f"Input dir {in_dir} should have *.npy fingerprint files")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# App-level callback. It exists only to replace Typer's built-in --help with an
# eager flag whose callback (_print_help_banner) prints the banner before the
# help text; the function body itself intentionally does nothing.
@app.callback()
def _main(
    ctx: Context,
    help_: Annotated[
        bool,
        Option(
            "--help/ ",
            "-h",
            is_eager=True,
            help="Show this message and exit.",
            callback=_print_help_banner,
        ),
    ] = False,
) -> None:
    pass
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@app.command("plot-pops", rich_help_panel="Analysis")
|
|
79
|
+
def _plot_pops(
|
|
80
|
+
clusters_path: Annotated[
|
|
81
|
+
Path,
|
|
82
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
83
|
+
],
|
|
84
|
+
fps_path: Annotated[
|
|
85
|
+
Path | None,
|
|
86
|
+
Option(
|
|
87
|
+
"-f",
|
|
88
|
+
"--fps-path",
|
|
89
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
90
|
+
show_default=False,
|
|
91
|
+
),
|
|
92
|
+
] = None,
|
|
93
|
+
title: Annotated[
|
|
94
|
+
str | None,
|
|
95
|
+
Option("--title", help="Plot title"),
|
|
96
|
+
] = None,
|
|
97
|
+
top: Annotated[
|
|
98
|
+
int | None,
|
|
99
|
+
Option("--top"),
|
|
100
|
+
] = None,
|
|
101
|
+
input_is_packed: Annotated[
|
|
102
|
+
bool,
|
|
103
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
104
|
+
] = True,
|
|
105
|
+
min_size: Annotated[
|
|
106
|
+
int,
|
|
107
|
+
Option("--min-size"),
|
|
108
|
+
] = 0,
|
|
109
|
+
n_features: Annotated[
|
|
110
|
+
int | None,
|
|
111
|
+
Option(
|
|
112
|
+
"--n-features",
|
|
113
|
+
help="Number of features in the fingerprints."
|
|
114
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
115
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
116
|
+
rich_help_panel="Advanced",
|
|
117
|
+
),
|
|
118
|
+
] = None,
|
|
119
|
+
save: Annotated[
|
|
120
|
+
bool,
|
|
121
|
+
Option("--save/--no-save"),
|
|
122
|
+
] = True,
|
|
123
|
+
filename: Annotated[
|
|
124
|
+
str | None,
|
|
125
|
+
Option("--filename"),
|
|
126
|
+
] = None,
|
|
127
|
+
verbose: Annotated[
|
|
128
|
+
bool,
|
|
129
|
+
Option("-v/-V", "--verbose/--no-verbose"),
|
|
130
|
+
] = True,
|
|
131
|
+
show: Annotated[
|
|
132
|
+
bool,
|
|
133
|
+
Option("--show/--no-show", hidden=True),
|
|
134
|
+
] = True,
|
|
135
|
+
) -> None:
|
|
136
|
+
r"""Population plot of the clustering results"""
|
|
137
|
+
from bblean._console import get_console
|
|
138
|
+
|
|
139
|
+
console = get_console(silent=not verbose)
|
|
140
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
141
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
142
|
+
from bblean.plotting import _dispatch_visualization, pops_plot
|
|
143
|
+
|
|
144
|
+
_dispatch_visualization(
|
|
145
|
+
clusters_path,
|
|
146
|
+
"pops",
|
|
147
|
+
pops_plot,
|
|
148
|
+
{},
|
|
149
|
+
min_size=min_size,
|
|
150
|
+
top=top,
|
|
151
|
+
n_features=n_features,
|
|
152
|
+
input_is_packed=input_is_packed,
|
|
153
|
+
fps_path=fps_path,
|
|
154
|
+
title=title,
|
|
155
|
+
filename=filename,
|
|
156
|
+
verbose=verbose,
|
|
157
|
+
save=save,
|
|
158
|
+
show=show,
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@app.command("plot-umap", rich_help_panel="Analysis")
|
|
163
|
+
def _plot_umap(
|
|
164
|
+
clusters_path: Annotated[
|
|
165
|
+
Path,
|
|
166
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
167
|
+
],
|
|
168
|
+
fps_path: Annotated[
|
|
169
|
+
Path | None,
|
|
170
|
+
Option(
|
|
171
|
+
"-f",
|
|
172
|
+
"--fps-path",
|
|
173
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
174
|
+
show_default=False,
|
|
175
|
+
),
|
|
176
|
+
] = None,
|
|
177
|
+
title: Annotated[
|
|
178
|
+
str | None,
|
|
179
|
+
Option("--title", help="Plot title"),
|
|
180
|
+
] = None,
|
|
181
|
+
save: Annotated[
|
|
182
|
+
bool,
|
|
183
|
+
Option("--save/--no-save"),
|
|
184
|
+
] = True,
|
|
185
|
+
top: Annotated[
|
|
186
|
+
int,
|
|
187
|
+
Option("--top"),
|
|
188
|
+
] = 20,
|
|
189
|
+
input_is_packed: Annotated[
|
|
190
|
+
bool,
|
|
191
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
192
|
+
] = True,
|
|
193
|
+
scaling: Annotated[
|
|
194
|
+
str,
|
|
195
|
+
Option("--scaling", rich_help_panel="Advanced"),
|
|
196
|
+
] = "normalize",
|
|
197
|
+
min_size: Annotated[
|
|
198
|
+
int,
|
|
199
|
+
Option("--min-size"),
|
|
200
|
+
] = 0,
|
|
201
|
+
n_features: Annotated[
|
|
202
|
+
int | None,
|
|
203
|
+
Option(
|
|
204
|
+
"--n-features",
|
|
205
|
+
help="Number of features in the fingerprints."
|
|
206
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
207
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
208
|
+
rich_help_panel="Advanced",
|
|
209
|
+
),
|
|
210
|
+
] = None,
|
|
211
|
+
filename: Annotated[
|
|
212
|
+
str | None,
|
|
213
|
+
Option("--filename"),
|
|
214
|
+
] = None,
|
|
215
|
+
verbose: Annotated[
|
|
216
|
+
bool,
|
|
217
|
+
Option("-v/-V", "--verbose/--no-verbose"),
|
|
218
|
+
] = True,
|
|
219
|
+
show: Annotated[
|
|
220
|
+
bool,
|
|
221
|
+
Option("--show/--no-show", hidden=True),
|
|
222
|
+
] = True,
|
|
223
|
+
deterministic: Annotated[
|
|
224
|
+
bool,
|
|
225
|
+
Option("--deterministic/--no-deterministic"),
|
|
226
|
+
] = False,
|
|
227
|
+
n_neighbors: Annotated[
|
|
228
|
+
int,
|
|
229
|
+
Option("-n", "--neighbors"),
|
|
230
|
+
] = 15,
|
|
231
|
+
min_dist: Annotated[
|
|
232
|
+
float,
|
|
233
|
+
Option("-d", "--min-dist"),
|
|
234
|
+
] = 0.5,
|
|
235
|
+
metric: Annotated[
|
|
236
|
+
str,
|
|
237
|
+
Option("--metric"),
|
|
238
|
+
] = "euclidean",
|
|
239
|
+
densmap: Annotated[
|
|
240
|
+
bool,
|
|
241
|
+
Option("--densmap/--no-densmap"),
|
|
242
|
+
] = False,
|
|
243
|
+
workers: Annotated[
|
|
244
|
+
int | None,
|
|
245
|
+
Option(
|
|
246
|
+
"-w",
|
|
247
|
+
"--workers",
|
|
248
|
+
help="Num. cores to use for parallel processing",
|
|
249
|
+
rich_help_panel="Advanced",
|
|
250
|
+
),
|
|
251
|
+
] = None,
|
|
252
|
+
) -> None:
|
|
253
|
+
r"""UMAP visualization of the clustering results"""
|
|
254
|
+
from bblean._console import get_console
|
|
255
|
+
|
|
256
|
+
console = get_console(silent=not verbose)
|
|
257
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
258
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
259
|
+
from bblean.plotting import _dispatch_visualization, umap_plot
|
|
260
|
+
|
|
261
|
+
kwargs = dict(
|
|
262
|
+
metric=metric,
|
|
263
|
+
densmap=densmap,
|
|
264
|
+
deterministic=deterministic,
|
|
265
|
+
n_neighbors=n_neighbors,
|
|
266
|
+
workers=workers,
|
|
267
|
+
min_dist=min_dist,
|
|
268
|
+
)
|
|
269
|
+
_dispatch_visualization(
|
|
270
|
+
clusters_path,
|
|
271
|
+
"umap",
|
|
272
|
+
umap_plot,
|
|
273
|
+
kwargs,
|
|
274
|
+
min_size=min_size,
|
|
275
|
+
top=top,
|
|
276
|
+
n_features=n_features,
|
|
277
|
+
input_is_packed=input_is_packed,
|
|
278
|
+
fps_path=fps_path,
|
|
279
|
+
title=title,
|
|
280
|
+
filename=filename,
|
|
281
|
+
verbose=verbose,
|
|
282
|
+
save=save,
|
|
283
|
+
show=show,
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
@app.command("plot-pca", rich_help_panel="Analysis")
|
|
288
|
+
def _plot_pca(
|
|
289
|
+
clusters_path: Annotated[
|
|
290
|
+
Path,
|
|
291
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
292
|
+
],
|
|
293
|
+
fps_path: Annotated[
|
|
294
|
+
Path | None,
|
|
295
|
+
Option(
|
|
296
|
+
"-f",
|
|
297
|
+
"--fps-path",
|
|
298
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
299
|
+
show_default=False,
|
|
300
|
+
),
|
|
301
|
+
] = None,
|
|
302
|
+
title: Annotated[
|
|
303
|
+
str | None,
|
|
304
|
+
Option("--title", help="Plot title"),
|
|
305
|
+
] = None,
|
|
306
|
+
top: Annotated[
|
|
307
|
+
int,
|
|
308
|
+
Option("--top"),
|
|
309
|
+
] = 20,
|
|
310
|
+
min_size: Annotated[
|
|
311
|
+
int,
|
|
312
|
+
Option("--min-size"),
|
|
313
|
+
] = 0,
|
|
314
|
+
input_is_packed: Annotated[
|
|
315
|
+
bool,
|
|
316
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
317
|
+
] = True,
|
|
318
|
+
scaling: Annotated[
|
|
319
|
+
str,
|
|
320
|
+
Option("--scaling", rich_help_panel="Advanced"),
|
|
321
|
+
] = "normalize",
|
|
322
|
+
n_features: Annotated[
|
|
323
|
+
int | None,
|
|
324
|
+
Option(
|
|
325
|
+
"--n-features",
|
|
326
|
+
help="Number of features in the fingerprints."
|
|
327
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
328
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
329
|
+
rich_help_panel="Advanced",
|
|
330
|
+
),
|
|
331
|
+
] = None,
|
|
332
|
+
verbose: Annotated[
|
|
333
|
+
bool,
|
|
334
|
+
Option("-v/-V", "--verbose/--no-verbose"),
|
|
335
|
+
] = True,
|
|
336
|
+
show: Annotated[
|
|
337
|
+
bool,
|
|
338
|
+
Option("--show/--no-show", hidden=True),
|
|
339
|
+
] = True,
|
|
340
|
+
whiten: Annotated[
|
|
341
|
+
bool,
|
|
342
|
+
Option("--whiten/--no-whiten"),
|
|
343
|
+
] = False,
|
|
344
|
+
save: Annotated[
|
|
345
|
+
bool,
|
|
346
|
+
Option("--save/--no-save"),
|
|
347
|
+
] = True,
|
|
348
|
+
filename: Annotated[
|
|
349
|
+
str | None,
|
|
350
|
+
Option("--filename"),
|
|
351
|
+
] = None,
|
|
352
|
+
) -> None:
|
|
353
|
+
r"""PCA visualization of the clustering results"""
|
|
354
|
+
from bblean._console import get_console
|
|
355
|
+
|
|
356
|
+
console = get_console(silent=not verbose)
|
|
357
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
358
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
359
|
+
from bblean.plotting import _dispatch_visualization, pca_plot
|
|
360
|
+
|
|
361
|
+
_dispatch_visualization(
|
|
362
|
+
clusters_path,
|
|
363
|
+
"pca",
|
|
364
|
+
pca_plot,
|
|
365
|
+
{"whiten": whiten},
|
|
366
|
+
min_size=min_size,
|
|
367
|
+
top=top,
|
|
368
|
+
n_features=n_features,
|
|
369
|
+
input_is_packed=input_is_packed,
|
|
370
|
+
fps_path=fps_path,
|
|
371
|
+
title=title,
|
|
372
|
+
filename=filename,
|
|
373
|
+
verbose=verbose,
|
|
374
|
+
save=save,
|
|
375
|
+
show=show,
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
@app.command("plot-tsne", rich_help_panel="Analysis")
|
|
380
|
+
def _plot_tsne(
|
|
381
|
+
clusters_path: Annotated[
|
|
382
|
+
Path,
|
|
383
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
384
|
+
],
|
|
385
|
+
fps_path: Annotated[
|
|
386
|
+
Path | None,
|
|
387
|
+
Option(
|
|
388
|
+
"-f",
|
|
389
|
+
"--fps-path",
|
|
390
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
391
|
+
show_default=False,
|
|
392
|
+
),
|
|
393
|
+
] = None,
|
|
394
|
+
title: Annotated[
|
|
395
|
+
str | None,
|
|
396
|
+
Option("--title", help="Plot title"),
|
|
397
|
+
] = None,
|
|
398
|
+
save: Annotated[
|
|
399
|
+
bool,
|
|
400
|
+
Option("--save/--no-save"),
|
|
401
|
+
] = True,
|
|
402
|
+
min_size: Annotated[
|
|
403
|
+
int,
|
|
404
|
+
Option("--min-size"),
|
|
405
|
+
] = 0,
|
|
406
|
+
filename: Annotated[
|
|
407
|
+
str | None,
|
|
408
|
+
Option("--filename"),
|
|
409
|
+
] = None,
|
|
410
|
+
exaggeration: Annotated[
|
|
411
|
+
float | None,
|
|
412
|
+
Option("-e", "--exaggeration", rich_help_panel="Advanced"),
|
|
413
|
+
] = None,
|
|
414
|
+
seed: Annotated[
|
|
415
|
+
int | None,
|
|
416
|
+
Option(
|
|
417
|
+
"-s",
|
|
418
|
+
"--seed",
|
|
419
|
+
help=(
|
|
420
|
+
"Seed for the rng, fixed value by default, for reproducibility."
|
|
421
|
+
" Pass -1 to randomize"
|
|
422
|
+
),
|
|
423
|
+
show_default=False,
|
|
424
|
+
rich_help_panel="Advanced",
|
|
425
|
+
),
|
|
426
|
+
] = TSNE_SEED,
|
|
427
|
+
top: Annotated[
|
|
428
|
+
int,
|
|
429
|
+
Option("--top"),
|
|
430
|
+
] = 20,
|
|
431
|
+
metric: Annotated[
|
|
432
|
+
str,
|
|
433
|
+
Option("--metric", help="Metric to use in the t-SNE source space"),
|
|
434
|
+
] = "euclidean",
|
|
435
|
+
dof: Annotated[
|
|
436
|
+
float,
|
|
437
|
+
Option("-d", "--dof", rich_help_panel="Advanced"),
|
|
438
|
+
] = 1.0,
|
|
439
|
+
perplexity: Annotated[
|
|
440
|
+
int,
|
|
441
|
+
Option(help="t-SNE perplexity", rich_help_panel="Advanced"),
|
|
442
|
+
] = 30,
|
|
443
|
+
input_is_packed: Annotated[
|
|
444
|
+
bool,
|
|
445
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
446
|
+
] = True,
|
|
447
|
+
n_features: Annotated[
|
|
448
|
+
int | None,
|
|
449
|
+
Option(
|
|
450
|
+
"--n-features",
|
|
451
|
+
help="Number of features in the fingerprints."
|
|
452
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
453
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
454
|
+
rich_help_panel="Advanced",
|
|
455
|
+
),
|
|
456
|
+
] = None,
|
|
457
|
+
scaling: Annotated[
|
|
458
|
+
str,
|
|
459
|
+
Option("--scaling", rich_help_panel="Advanced"),
|
|
460
|
+
] = "normalize",
|
|
461
|
+
do_pca_init: Annotated[
|
|
462
|
+
bool,
|
|
463
|
+
Option(
|
|
464
|
+
"--pca-init/--no-pca-init",
|
|
465
|
+
rich_help_panel="Advanced",
|
|
466
|
+
help="Use PCA for initialization",
|
|
467
|
+
),
|
|
468
|
+
] = True,
|
|
469
|
+
pca_reduce: Annotated[
|
|
470
|
+
int | None,
|
|
471
|
+
Option(
|
|
472
|
+
"-p",
|
|
473
|
+
"--pca-reduce",
|
|
474
|
+
rich_help_panel="Advanced",
|
|
475
|
+
help=(
|
|
476
|
+
"Reduce fingerprint dimensionality to N components using PCA."
|
|
477
|
+
" A value of 50 or more maintains cluster structure in general"
|
|
478
|
+
),
|
|
479
|
+
),
|
|
480
|
+
] = None,
|
|
481
|
+
workers: Annotated[
|
|
482
|
+
int | None,
|
|
483
|
+
Option(
|
|
484
|
+
"-w",
|
|
485
|
+
"--workers",
|
|
486
|
+
help="Num. cores to use for parallel processing",
|
|
487
|
+
rich_help_panel="Advanced",
|
|
488
|
+
),
|
|
489
|
+
] = None,
|
|
490
|
+
multiscale: Annotated[
|
|
491
|
+
bool,
|
|
492
|
+
Option(
|
|
493
|
+
"-m/-M",
|
|
494
|
+
"--multiscale/--no-multiscale",
|
|
495
|
+
rich_help_panel="Advanced",
|
|
496
|
+
help="Use multiscale perplexities (WARNING: Can be very slow!)",
|
|
497
|
+
),
|
|
498
|
+
] = False,
|
|
499
|
+
verbose: Annotated[
|
|
500
|
+
bool,
|
|
501
|
+
Option("-v/-V", "--verbose/--no-verbose"),
|
|
502
|
+
] = True,
|
|
503
|
+
show: Annotated[
|
|
504
|
+
bool,
|
|
505
|
+
Option("--show/--no-show", hidden=True),
|
|
506
|
+
] = True,
|
|
507
|
+
) -> None:
|
|
508
|
+
r"""t-SNE visualization of the clustering results"""
|
|
509
|
+
from bblean._console import get_console
|
|
510
|
+
|
|
511
|
+
console = get_console(silent=not verbose)
|
|
512
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
513
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
514
|
+
from bblean.plotting import _dispatch_visualization, tsne_plot
|
|
515
|
+
|
|
516
|
+
kwargs = dict(
|
|
517
|
+
metric=metric,
|
|
518
|
+
seed=seed,
|
|
519
|
+
perplexity=perplexity,
|
|
520
|
+
exaggeration=exaggeration,
|
|
521
|
+
dof=dof,
|
|
522
|
+
workers=workers,
|
|
523
|
+
scaling=scaling,
|
|
524
|
+
do_pca_init=do_pca_init,
|
|
525
|
+
multiscale=multiscale,
|
|
526
|
+
pca_reduce=pca_reduce,
|
|
527
|
+
)
|
|
528
|
+
_dispatch_visualization(
|
|
529
|
+
clusters_path,
|
|
530
|
+
"tsne",
|
|
531
|
+
tsne_plot,
|
|
532
|
+
kwargs,
|
|
533
|
+
min_size=min_size,
|
|
534
|
+
top=top,
|
|
535
|
+
n_features=n_features,
|
|
536
|
+
input_is_packed=input_is_packed,
|
|
537
|
+
fps_path=fps_path,
|
|
538
|
+
title=title,
|
|
539
|
+
filename=filename,
|
|
540
|
+
verbose=verbose,
|
|
541
|
+
save=save,
|
|
542
|
+
show=show,
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
@app.command("summary", rich_help_panel="Analysis")
|
|
547
|
+
def _table_summary(
|
|
548
|
+
clusters_path: Annotated[
|
|
549
|
+
Path,
|
|
550
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
551
|
+
],
|
|
552
|
+
fps_path: Annotated[
|
|
553
|
+
Path | None,
|
|
554
|
+
Option(
|
|
555
|
+
"-f",
|
|
556
|
+
"--fps-path",
|
|
557
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
558
|
+
show_default=False,
|
|
559
|
+
),
|
|
560
|
+
] = None,
|
|
561
|
+
min_size: Annotated[
|
|
562
|
+
int,
|
|
563
|
+
Option("--min-size"),
|
|
564
|
+
] = 0,
|
|
565
|
+
smiles_path: Annotated[
|
|
566
|
+
Path | None,
|
|
567
|
+
Option(
|
|
568
|
+
"-s",
|
|
569
|
+
"--smiles-path",
|
|
570
|
+
show_default=False,
|
|
571
|
+
help="Optional smiles path, if passed a scaffold analysis is performed",
|
|
572
|
+
),
|
|
573
|
+
] = None,
|
|
574
|
+
top: Annotated[
|
|
575
|
+
int,
|
|
576
|
+
Option("--top"),
|
|
577
|
+
] = 20,
|
|
578
|
+
input_is_packed: Annotated[
|
|
579
|
+
bool,
|
|
580
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
581
|
+
] = True,
|
|
582
|
+
scaffold_fp_kind: Annotated[
|
|
583
|
+
str,
|
|
584
|
+
Option("--scaffold-fp-kind"),
|
|
585
|
+
] = DEFAULTS.fp_kind,
|
|
586
|
+
n_features: Annotated[
|
|
587
|
+
int | None,
|
|
588
|
+
Option(
|
|
589
|
+
"--n-features",
|
|
590
|
+
help="Number of features in the fingerprints."
|
|
591
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
592
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
593
|
+
rich_help_panel="Advanced",
|
|
594
|
+
),
|
|
595
|
+
] = None,
|
|
596
|
+
metrics: Annotated[
|
|
597
|
+
bool,
|
|
598
|
+
Option(
|
|
599
|
+
"--metrics/--no-metrics",
|
|
600
|
+
help="Calculate clustering indices (Dunn, DBI, CHI)",
|
|
601
|
+
),
|
|
602
|
+
] = False,
|
|
603
|
+
chosen_metrics: Annotated[
|
|
604
|
+
str,
|
|
605
|
+
Option(
|
|
606
|
+
"-m",
|
|
607
|
+
"--metrics-choice",
|
|
608
|
+
help=(
|
|
609
|
+
"Chosen metrics. "
|
|
610
|
+
" Comma-separated list including dunn (slow), dbi or chi"
|
|
611
|
+
),
|
|
612
|
+
),
|
|
613
|
+
] = "dunn,dbi,chi",
|
|
614
|
+
metrics_top: Annotated[
|
|
615
|
+
int | None,
|
|
616
|
+
Option("--metrics-top", rich_help_panel="Advanced"),
|
|
617
|
+
] = 100,
|
|
618
|
+
metrics_min_size: Annotated[
|
|
619
|
+
int,
|
|
620
|
+
Option("--metrics-min-size", hidden=True),
|
|
621
|
+
] = 1,
|
|
622
|
+
verbose: Annotated[
|
|
623
|
+
bool,
|
|
624
|
+
Option("--verbose/--no-verbose", hidden=True),
|
|
625
|
+
] = True,
|
|
626
|
+
) -> None:
|
|
627
|
+
r"""Summary table of clustering results, together with cluster metrics"""
|
|
628
|
+
from bblean._console import get_console
|
|
629
|
+
from bblean.smiles import load_smiles
|
|
630
|
+
from bblean.analysis import cluster_analysis
|
|
631
|
+
from bblean.utils import _has_files_or_valid_symlinks
|
|
632
|
+
from bblean.metrics import jt_dbi, jt_isim_chi, jt_isim_dunn, _calc_centrals
|
|
633
|
+
from rich.table import Table
|
|
634
|
+
|
|
635
|
+
console = get_console(silent=not verbose)
|
|
636
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
637
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
638
|
+
if clusters_path.is_dir():
|
|
639
|
+
clusters_path = clusters_path / "clusters.pkl"
|
|
640
|
+
with open(clusters_path, mode="rb") as f:
|
|
641
|
+
clusters = pickle.load(f)
|
|
642
|
+
if fps_path is None:
|
|
643
|
+
input_fps_path = clusters_path.parent / "input-fps"
|
|
644
|
+
if input_fps_path.is_dir() and _has_files_or_valid_symlinks(input_fps_path):
|
|
645
|
+
fps_path = input_fps_path
|
|
646
|
+
else:
|
|
647
|
+
msg = (
|
|
648
|
+
"Could not find input fingerprints. Please use --fps-path."
|
|
649
|
+
" Summary plot without fingerprints doesn't include isim values"
|
|
650
|
+
)
|
|
651
|
+
warnings.warn(msg)
|
|
652
|
+
if fps_path is None:
|
|
653
|
+
fps_paths = None
|
|
654
|
+
elif fps_path.is_dir():
|
|
655
|
+
fps_paths = sorted(fps_path.glob("*.npy"))
|
|
656
|
+
else:
|
|
657
|
+
fps_paths = [fps_path]
|
|
658
|
+
ca = cluster_analysis(
|
|
659
|
+
clusters,
|
|
660
|
+
fps_paths,
|
|
661
|
+
smiles=load_smiles(smiles_path) if smiles_path is not None else (),
|
|
662
|
+
top=top,
|
|
663
|
+
n_features=n_features,
|
|
664
|
+
input_is_packed=input_is_packed,
|
|
665
|
+
min_size=min_size,
|
|
666
|
+
)
|
|
667
|
+
table = Table(title=(f"Top {top} clusters" if top is not None else "Clusters"))
|
|
668
|
+
table.add_column("Size", justify="center")
|
|
669
|
+
table.add_column("% fps", justify="center")
|
|
670
|
+
table.add_column("iSIM", justify="center")
|
|
671
|
+
if smiles_path is not None:
|
|
672
|
+
table.add_column("Size/Scaff.", justify="center")
|
|
673
|
+
table.add_column("Num. Scaff.", justify="center")
|
|
674
|
+
table.add_column("Scaff. iSIM", justify="center")
|
|
675
|
+
sizes = ca.sizes
|
|
676
|
+
isims = ca.isims
|
|
677
|
+
total_fps = ca.total_fps
|
|
678
|
+
for i in range(ca.clusters_num):
|
|
679
|
+
size = sizes[i]
|
|
680
|
+
percent = size / total_fps * 100
|
|
681
|
+
table.add_row(f"{size:,}", f"{percent:.2f}", f"{isims[i]:.3f}")
|
|
682
|
+
console.print(table)
|
|
683
|
+
console.print()
|
|
684
|
+
console.print(f"Total num. fps: {total_fps:,}")
|
|
685
|
+
console.print(f"Total num. clusters: {ca.all_clusters_num:,}")
|
|
686
|
+
singles = ca.all_singletons_num
|
|
687
|
+
singles_percent = singles * 100 / ca.all_clusters_num
|
|
688
|
+
console.print(f"Total num. singletons: {singles:,} ({singles_percent:.2f} %)")
|
|
689
|
+
gt10 = ca.all_clusters_num_with_size_above(10)
|
|
690
|
+
gt10_percent = gt10 * 100 / ca.all_clusters_num
|
|
691
|
+
console.print(
|
|
692
|
+
f"Total num. clusters, size > 10: {gt10:,} ({gt10_percent:.2f} %)"
|
|
693
|
+
)
|
|
694
|
+
gt100 = ca.all_clusters_num_with_size_above(100)
|
|
695
|
+
gt100_percent = gt100 * 100 / ca.all_clusters_num
|
|
696
|
+
console.print(
|
|
697
|
+
f"Total num. clusters, size > 100: {gt100:,} ({gt100_percent:.2f} %)"
|
|
698
|
+
)
|
|
699
|
+
console.print(
|
|
700
|
+
f"num-clusters/num-fps ratio: {ca.all_clusters_num / total_fps:.2f}"
|
|
701
|
+
)
|
|
702
|
+
console.print(f"Mean size: {ca.all_clusters_mean_size:.2f}")
|
|
703
|
+
console.print(f"Max. size: {ca.all_clusters_max_size:,}")
|
|
704
|
+
console.print(f"Q3 (75%) size: {ca.all_clusters_q3:,}")
|
|
705
|
+
console.print(f"Median size: {ca.all_clusters_median_size:,}")
|
|
706
|
+
console.print(f"Q1 (25%) size: {ca.all_clusters_q1:,}")
|
|
707
|
+
console.print(f"Min. size: {ca.all_clusters_min_size:,}")
|
|
708
|
+
if metrics:
|
|
709
|
+
chosen = set(s.lower() for s in chosen_metrics.split(","))
|
|
710
|
+
assert all(s in ["dunn", "chi", "dbi"] for s in chosen)
|
|
711
|
+
# Redo cluster analysis with more *top* clusters
|
|
712
|
+
console.print()
|
|
713
|
+
if metrics_top is None:
|
|
714
|
+
console.print("Clustering metrics:")
|
|
715
|
+
else:
|
|
716
|
+
console.print(f"Clustering metrics considering top {metrics_top} clusters:")
|
|
717
|
+
with console.status("[italic]Reanalyzing clusters...[/italic]", spinner="dots"):
|
|
718
|
+
ca = cluster_analysis(
|
|
719
|
+
clusters,
|
|
720
|
+
fps_paths,
|
|
721
|
+
smiles=(),
|
|
722
|
+
top=metrics_top,
|
|
723
|
+
n_features=n_features,
|
|
724
|
+
input_is_packed=input_is_packed,
|
|
725
|
+
min_size=metrics_min_size,
|
|
726
|
+
)
|
|
727
|
+
clusters = ca.get_top_cluster_fps()
|
|
728
|
+
with console.status("[italic]Calculating centrals...[/italic]", spinner="dots"):
|
|
729
|
+
centrals = _calc_centrals(clusters, kind="centroid")
|
|
730
|
+
if "chi" in chosen:
|
|
731
|
+
chi = jt_isim_chi(clusters, centrals=centrals, verbose=verbose)
|
|
732
|
+
console.print(f" - CHI index: {chi:.4f} (Higher is better)")
|
|
733
|
+
if "dbi" in chosen:
|
|
734
|
+
dbi = jt_dbi(clusters, centrals=centrals, verbose=verbose)
|
|
735
|
+
console.print(f" - DBI index: {dbi:.4e} (Lower is better)")
|
|
736
|
+
if "dunn" in chosen:
|
|
737
|
+
dunn = jt_isim_dunn(clusters, verbose=verbose)
|
|
738
|
+
console.print(f" - Dunn index: {dunn:.4f} (Higher is better)")
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
@app.command("plot-summary", rich_help_panel="Analysis")
|
|
742
|
+
def _plot_summary(
|
|
743
|
+
clusters_path: Annotated[
|
|
744
|
+
Path,
|
|
745
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
746
|
+
],
|
|
747
|
+
fps_path: Annotated[
|
|
748
|
+
Path | None,
|
|
749
|
+
Option(
|
|
750
|
+
"-f",
|
|
751
|
+
"--fps-path",
|
|
752
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
753
|
+
show_default=False,
|
|
754
|
+
),
|
|
755
|
+
] = None,
|
|
756
|
+
save: Annotated[
|
|
757
|
+
bool,
|
|
758
|
+
Option("--save/--no-save"),
|
|
759
|
+
] = True,
|
|
760
|
+
ylim: Annotated[
|
|
761
|
+
int | None,
|
|
762
|
+
Option("--ylim"),
|
|
763
|
+
] = None,
|
|
764
|
+
min_size: Annotated[
|
|
765
|
+
int,
|
|
766
|
+
Option("--min-size"),
|
|
767
|
+
] = 0,
|
|
768
|
+
smiles_path: Annotated[
|
|
769
|
+
Path | None,
|
|
770
|
+
Option(
|
|
771
|
+
"-s",
|
|
772
|
+
"--smiles-path",
|
|
773
|
+
show_default=False,
|
|
774
|
+
help="Optional smiles path, if passed a scaffold analysis is performed",
|
|
775
|
+
),
|
|
776
|
+
] = None,
|
|
777
|
+
title: Annotated[
|
|
778
|
+
str | None,
|
|
779
|
+
Option("--title"),
|
|
780
|
+
] = None,
|
|
781
|
+
filename: Annotated[
|
|
782
|
+
str | None,
|
|
783
|
+
Option("--filename"),
|
|
784
|
+
] = None,
|
|
785
|
+
top: Annotated[
|
|
786
|
+
int,
|
|
787
|
+
Option("--top"),
|
|
788
|
+
] = 20,
|
|
789
|
+
input_is_packed: Annotated[
|
|
790
|
+
bool,
|
|
791
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
792
|
+
] = True,
|
|
793
|
+
scaffold_fp_kind: Annotated[
|
|
794
|
+
str,
|
|
795
|
+
Option("--scaffold-fp-kind"),
|
|
796
|
+
] = DEFAULTS.fp_kind,
|
|
797
|
+
n_features: Annotated[
|
|
798
|
+
int | None,
|
|
799
|
+
Option(
|
|
800
|
+
"--n-features",
|
|
801
|
+
help="Number of features in the fingerprints."
|
|
802
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
803
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
804
|
+
rich_help_panel="Advanced",
|
|
805
|
+
),
|
|
806
|
+
] = None,
|
|
807
|
+
annotate: Annotated[
|
|
808
|
+
bool,
|
|
809
|
+
Option(
|
|
810
|
+
"--annotate/--no-annotate",
|
|
811
|
+
help="Display scaffold and fingerprint number in each cluster",
|
|
812
|
+
),
|
|
813
|
+
] = True,
|
|
814
|
+
verbose: Annotated[
|
|
815
|
+
bool,
|
|
816
|
+
Option("-v/-V", "--verbose/--no-verbose"),
|
|
817
|
+
] = True,
|
|
818
|
+
show: Annotated[
|
|
819
|
+
bool,
|
|
820
|
+
Option("--show/--no-show", hidden=True),
|
|
821
|
+
] = True,
|
|
822
|
+
) -> None:
|
|
823
|
+
r"""Summary plot of the clustering results"""
|
|
824
|
+
from bblean._console import get_console
|
|
825
|
+
|
|
826
|
+
console = get_console(silent=not verbose)
|
|
827
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
828
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
829
|
+
from bblean.plotting import _dispatch_visualization, summary_plot
|
|
830
|
+
from bblean.smiles import load_smiles
|
|
831
|
+
|
|
832
|
+
_dispatch_visualization(
|
|
833
|
+
clusters_path,
|
|
834
|
+
"summary",
|
|
835
|
+
summary_plot,
|
|
836
|
+
{"annotate": annotate, "counts_ylim": ylim},
|
|
837
|
+
smiles=load_smiles(smiles_path) if smiles_path is not None else (),
|
|
838
|
+
min_size=min_size,
|
|
839
|
+
top=top,
|
|
840
|
+
n_features=n_features,
|
|
841
|
+
input_is_packed=input_is_packed,
|
|
842
|
+
fps_path=fps_path,
|
|
843
|
+
title=title,
|
|
844
|
+
filename=filename,
|
|
845
|
+
verbose=verbose,
|
|
846
|
+
save=save,
|
|
847
|
+
show=show,
|
|
848
|
+
)
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
@app.command("run")
def _run(
    ctx: Context,
    input_: Annotated[
        Path | None,
        Argument(help="`*.npy` file with packed fingerprints, or dir `*.npy` files"),
    ] = None,
    out_dir: Annotated[
        Path | None,
        Option(
            "-o",
            "--out-dir",
            help="Dir to dump the output files",
        ),
    ] = None,
    overwrite: Annotated[bool, Option(help="Allow overwriting output files")] = False,
    branching_factor: Annotated[
        int,
        Option(
            "--branching",
            "-b",
            help="BitBIRCH branching factor (all rounds). Usually 254 is"
            " optimal. Set above 254 for slightly less RAM (at the cost of some perf.)",
        ),
    ] = DEFAULTS.branching_factor,
    threshold: Annotated[
        float,
        Option("--threshold", "-t", help="Threshold for merge criterion"),
    ] = DEFAULTS.threshold,
    refine_threshold_change: Annotated[
        float,
        Option(
            "--refine-threshold-change",
            help="Modify threshold for refinement criterion, can be negative",
        ),
    ] = DEFAULTS.refine_threshold_change,
    save_tree: Annotated[
        bool,
        Option("--save-tree/--no-save-tree", rich_help_panel="Advanced"),
    ] = False,
    save_centroids: Annotated[
        bool,
        Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
    ] = True,
    merge_criterion: Annotated[
        str,
        # NOTE: fixed help typo "clustsering" -> "clustering"
        Option("--set-merge", "-m", help="Merge criterion for initial clustering"),
    ] = DEFAULTS.merge_criterion,
    refine_merge_criterion: Annotated[
        str,
        Option("--set-refine-merge", help="Merge criterion for refinement clustering"),
    ] = DEFAULTS.refine_merge_criterion,
    tolerance: Annotated[
        float,
        Option(help="BitBIRCH tolerance. For refinement and --set-merge 'tolerance'"),
    ] = DEFAULTS.tolerance,
    refine_num: Annotated[
        int,
        Option(
            "--refine-num",
            help=(
                "Num. of largest clusters to refine."
                " 1 for standard refinement, 0 is the default (no refinement)"
            ),
        ),
    ] = 0,
    refine_rounds: Annotated[
        int | None,
        Option(
            "--refine-rounds",
            help=("Num. of refinement rounds. "),
            hidden=True,
        ),
    ] = None,
    recluster_rounds: Annotated[
        int,
        Option(
            "--recluster-rounds",
            help=("Num. of reclustering rounds. "),
            hidden=True,
        ),
    ] = 0,
    recluster_shuffle: Annotated[
        bool,
        Option("--recluster-shuffle/--no-recluster-shuffle", hidden=True),
    ] = True,
    n_features: Annotated[
        int | None,
        Option(
            "--n-features",
            help="Number of features in the fingerprints."
            " It must be provided for packed inputs *if it is not a multiple of 8*."
            " For typical fingerprint sizes (e.g. 2048, 1024), it is not required",
            rich_help_panel="Advanced",
        ),
    ] = None,
    input_is_packed: Annotated[
        bool,
        Option(
            "--packed-input/--unpacked-input",
            help="Toggle whether the input consists of packed or unpacked fingerprints",
            rich_help_panel="Advanced",
        ),
    ] = True,
    # Debug options
    monitor_rss: Annotated[
        bool,
        Option(
            "--monitor-mem/--no-monitor-mem",
            "--monitor-rss/--no-monitor-rss",
            help="Monitor RAM used by all processes",
            rich_help_panel="Advanced",
        ),
    ] = True,
    monitor_rss_interval_s: Annotated[
        float,
        Option(
            "--monitor-mem-seconds",
            "--monitor-rss-seconds",
            help="Interval in seconds for RAM monitoring",
            rich_help_panel="Debug",
            hidden=True,
        ),
    ] = 1.0,
    max_fps: Annotated[
        int | None,
        Option(
            help="Max. num of fingerprints to read from each file",
            rich_help_panel="Debug",
            hidden=True,
        ),
    ] = None,
    variant: Annotated[
        str,
        Option(
            "--bb-variant",
            help="Use different bitbirch variants, *only for debugging*.",
            hidden=True,
        ),
    ] = "lean",
    copy_inputs: Annotated[
        bool,
        Option(
            "--copy/--no-copy",
            rich_help_panel="Advanced",
            help="Copy the input files instead of symlink",
        ),
    ] = False,
    verbose: Annotated[
        bool,
        Option("-v/-V", "--verbose/--no-verbose"),
    ] = True,
) -> None:
    r"""Run standard, serial BitBIRCH clustering over `*.npy` fingerprint files

    Fits a single BitBirch tree over all input files, optionally runs refinement
    and reclustering rounds, and dumps clusters, centroids, timings, config and
    (optionally) the pickled tree into ``out_dir``.
    """
    # TODO: Remove code duplication with multiround
    from bblean._console import get_console
    from bblean.fingerprints import _get_fps_file_num

    console = get_console(silent=not verbose)
    if variant == "int64" and input_is_packed:
        raise ValueError("Packed inputs are not supported for the int64 variant")
    # A requested refinement count implies at least one refinement round,
    # and vice versa (rounds without a count default to splitting 1 cluster)
    if refine_rounds is None:
        refine_rounds = 1 if refine_num > 0 else 0
    if refine_rounds > 0 and refine_num == 0:
        refine_num = 1
    ctx.params["refine_rounds"] = refine_rounds
    ctx.params["refine_num"] = refine_num

    BitBirch, set_merge = _import_bitbirch_variant(variant)

    # NOTE: Files are sorted according to name
    if input_ is None:
        input_ = Path.cwd() / "bb_inputs"
        input_.mkdir(exist_ok=True)
        input_files = sorted(input_.glob("*.npy"))
        _validate_input_dir(input_)
    elif input_.is_dir():
        input_files = sorted(input_.glob("*.npy"))
        _validate_input_dir(input_)
    else:
        input_files = [input_]
    # Record resolved inputs and fingerprint counts in the config that gets dumped
    ctx.params.pop("input_")
    ctx.params["input_files"] = [str(p.resolve()) for p in input_files]
    ctx.params["num_fps_present"] = [_get_fps_file_num(p) for p in input_files]
    if max_fps is not None:
        ctx.params["num_fps_loaded"] = [
            min(n, max_fps) for n in ctx.params["num_fps_present"]
        ]
    else:
        ctx.params["num_fps_loaded"] = ctx.params["num_fps_present"]
    unique_id = format(random.getrandbits(32), "08x")
    if out_dir is None:
        out_dir = Path.cwd() / "bb_run_outputs" / unique_id
    out_dir.mkdir(exist_ok=True, parents=True)
    _validate_output_dir(out_dir, overwrite)
    ctx.params["out_dir"] = str(out_dir.resolve())

    console.print_banner()
    console.print()
    console.print_config(ctx.params)

    # Optionally start a separate process that tracks RAM usage
    if monitor_rss:
        launch_monitor_rss_daemon(out_dir / "monitor-rss.csv", monitor_rss_interval_s)

    timer = Timer()
    timer.init_timing("total")
    # Legacy variants configure the merge criterion globally; the lean variant
    # takes it as a constructor argument
    if "lean" not in variant:
        set_merge(merge_criterion, tolerance=tolerance)
        tree = BitBirch(branching_factor=branching_factor, threshold=threshold)
    else:
        tree = BitBirch(
            branching_factor=branching_factor,
            threshold=threshold,
            merge_criterion=merge_criterion,
            tolerance=tolerance,
        )
    with console.status("[italic]BitBirching...[/italic]", spinner="dots"):
        for file in input_files:
            # Fitting a file uses mmap internally, and releases memory in a smart way
            tree.fit(
                file,
                n_features=n_features,
                input_is_packed=input_is_packed,
                max_fps=max_fps,
            )
    # Refinement/reclustering use a (possibly shifted) threshold and their own
    # merge criterion
    if recluster_rounds != 0 or refine_rounds != 0:
        tree.set_merge(
            refine_merge_criterion,
            tolerance=tolerance,
            threshold=threshold + refine_threshold_change,
        )
    for r in range(refine_rounds):
        msg = (
            f"[italic]Refinement, round {r + 1}"
            f" (will split {refine_num} largest clusters)...[/italic]"
        )
        with console.status(msg, spinner="dots"):
            tree.refine_inplace(
                input_files,
                input_is_packed=input_is_packed,
                n_largest=refine_num,
            )
    for r in range(recluster_rounds):
        msg = f"[italic]Reclustering, round {r + 1}...[/italic]"
        with console.status(msg, spinner="dots"):
            tree.recluster_inplace(shuffle=recluster_shuffle)

    timer.end_timing("total", console, indent=False)
    console.print_peak_mem(out_dir, indent=False)
    if variant == "lean":
        if save_tree:
            # TODO: BitBIRCH is highly recursive. pickling may crash python,
            # an alternative solution would be better
            _old_limit = sys.getrecursionlimit()
            sys.setrecursionlimit(100_000)
            with open(out_dir / "bitbirch.pkl", mode="wb") as f:
                pickle.dump(tree, f)
            sys.setrecursionlimit(_old_limit)
        tree.delete_internal_nodes()
    # Dump outputs (peak memory, timings, config, cluster ids)
    if save_centroids:
        output = tree.get_centroids_mol_ids()
        with open(out_dir / "clusters.pkl", mode="wb") as f:
            pickle.dump(output["mol_ids"], f)
        with open(out_dir / "cluster-centroids-packed.pkl", mode="wb") as f:
            pickle.dump(output["centroids"], f)
    else:
        with open(out_dir / "clusters.pkl", mode="wb") as f:
            pickle.dump(tree.get_cluster_mol_ids(), f)

    collect_system_specs_and_dump_config(ctx.params)
    timer.dump(out_dir / "timings.json")

    # Symlink or copy fingerprint files
    input_fps_dir = (out_dir / "input-fps").resolve()
    input_fps_dir.mkdir()
    if copy_inputs:
        for file in input_files:
            shutil.copy(file, input_fps_dir / file.name)
    else:
        for file in input_files:
            (input_fps_dir / file.name).symlink_to(file.resolve())
|
|
1134
|
+
|
|
1135
|
+
|
|
1136
|
+
# TODO: Currently sometimes after a round is triggered *more* files are output, since
# the files are divided *both* by uint8/uint16 and the batch idx. I believe this is not
# ideal
@app.command("multiround")
def _multiround(
    ctx: Context,
    in_dir: Annotated[
        Path | None,
        Argument(help="Directory with input `*.npy` files with packed fingerprints"),
    ] = None,
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", help="Dir for output files"),
    ] = None,
    overwrite: Annotated[bool, Option(help="Allow overwriting output files")] = False,
    num_initial_processes: Annotated[
        int, Option("--ps", "--processes", help="Num. processes for first round")
    ] = 10,
    num_midsection_processes: Annotated[
        int | None,
        Option(
            "--mid-ps",
            "--mid-processes",
            # NOTE: fixed help typo "These are be" -> "These can be"
            help="Num. processes for middle section rounds."
            " These can be memory intensive,"
            " you may want to use 50%-30% of --ps."
            " Default is same as --ps",
        ),
    ] = None,
    branching_factor: Annotated[
        int,
        Option(
            "--branching",
            "-b",
            help="BitBIRCH branching factor (all rounds). Usually 254 is"
            " optimal. Set above 254 for slightly less RAM (at the cost of some perf.)",
        ),
    ] = DEFAULTS.branching_factor,
    threshold: Annotated[
        float,
        Option("--threshold", "-t", help="Thresh for merge criterion (initial step)"),
    ] = DEFAULTS.threshold,
    mid_threshold_change: Annotated[
        float,
        Option("--mid-threshold-change", help="Modify threshold for refinement"),
    ] = DEFAULTS.refine_threshold_change,
    initial_merge_criterion: Annotated[
        str,
        Option(
            "--set-merge",
            "-m",
            help="Initial merge criterion for round 1. ('diameter' recommended)",
        ),
    ] = DEFAULTS.merge_criterion,
    save_tree: Annotated[
        bool,
        Option("--save-tree/--no-save-tree", rich_help_panel="Advanced"),
    ] = False,
    save_centroids: Annotated[
        bool,
        Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
    ] = True,
    mid_merge_criterion: Annotated[
        str,
        Option(
            "--set-mid-merge",
            help="Merge criterion for midsection rounds ('diameter' recommended)",
        ),
    ] = DEFAULTS.refine_merge_criterion,
    tolerance: Annotated[
        float,
        Option(
            help="Tolerance value for all steps that use the 'tolerance' criterion"
            " (by default all except initial round)",
        ),
    ] = DEFAULTS.tolerance,
    n_features: Annotated[
        int | None,
        Option(
            "--n-features",
            help="Number of features in the fingerprints."
            " Only for packed inputs *if it is not a multiple of 8*."
            " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
            rich_help_panel="Advanced",
        ),
    ] = None,
    input_is_packed: Annotated[
        bool,
        Option(
            "--packed-input/--unpacked-input",
            help="Toggle whether the input consists of packed or unpacked fingerprints",
            rich_help_panel="Advanced",
        ),
    ] = True,
    # Advanced options
    num_midsection_rounds: Annotated[
        int,
        Option(
            "--num-mid-rounds",
            help="Number of midsection rounds to perform",
            rich_help_panel="Advanced",
        ),
    ] = 1,
    split_largest_after_midsection: Annotated[
        bool,
        Option(
            "--split-after-mid/--no-split-after-mid",
            help=(
                "Split largest cluster after each midsection round"
                " (to be refined by the next round)"
            ),
            rich_help_panel="Advanced",
        ),
    ] = False,
    refinement_before_midsection: Annotated[
        str,
        Option(
            "--initial-refine",
            help=(
                "Run a *full* refinement step after the initial clustering round,"
                " only *split* largest cluster, or do *none*."
            ),
            rich_help_panel="Advanced",
        ),
    ] = "full",
    max_tasks_per_process: Annotated[
        int, Option(help="Max tasks per process", rich_help_panel="Advanced")
    ] = 1,
    fork: Annotated[
        bool,
        Option(
            help="In linux, force the 'fork' multiprocessing start method",
            rich_help_panel="Advanced",
        ),
    ] = False,
    bin_size: Annotated[
        int,
        Option(help="Bin size for chunking during Round 2", rich_help_panel="Advanced"),
    ] = 10,
    # Debug options
    variant: Annotated[
        str,
        Option(
            "--bb-variant",
            help="Use different bitbirch variants, *only for debugging*.",
            hidden=True,
        ),
    ] = "lean",
    monitor_rss: Annotated[
        bool,
        Option(
            "--monitor-mem",
            "--monitor-rss",
            help="Monitor RAM used by all processes",
            rich_help_panel="Advanced",
        ),
    ] = True,
    monitor_rss_interval_s: Annotated[
        float,
        Option(
            "--monitor-mem-seconds",
            "--monitor-rss-seconds",
            help="Interval in seconds for RAM monitoring",
            rich_help_panel="Debug",
            hidden=True,
        ),
    ] = 1.0,
    max_fps: Annotated[
        int | None,
        Option(
            help="Max num. of fps to load from each input file",
            rich_help_panel="Debug",
            hidden=True,
        ),
    ] = None,
    max_files: Annotated[
        int | None,
        Option(help="Max num. files to read", rich_help_panel="Debug", hidden=True),
    ] = None,
    copy_inputs: Annotated[
        bool,
        Option(
            "--copy/--no-copy",
            rich_help_panel="Advanced",
            help="Copy the input files instead of symlink",
        ),
    ] = False,
    verbose: Annotated[
        bool,
        Option("-v/-V", "--verbose/--no-verbose"),
    ] = True,
    cleanup: Annotated[
        bool,
        Option("--cleanup/--no-cleanup", hidden=True),
    ] = True,
) -> None:
    r"""Run multi-round BitBIRCH clustering, optionally parallelize over `*.npy` files

    Thin CLI wrapper around :func:`bblean.multiround.run_multiround_bitbirch`:
    collects inputs, sets the multiprocessing start method, dumps the run
    config/timings, and symlinks (or copies) the inputs into the output dir.
    """  # noqa:E501
    from bblean._console import get_console
    from bblean.multiround import run_multiround_bitbirch
    from bblean.fingerprints import _get_fps_file_num

    console = get_console(silent=not verbose)

    # Set multiprocessing start method ('fork' is unsafe outside Linux;
    # 'forkserver' is the Linux default here)
    if fork and not sys.platform == "linux":
        console.print("'fork' is only available on Linux", style="red")
        raise Abort()
    if sys.platform == "linux":
        mp_context = mp.get_context("fork" if fork else "forkserver")
    else:
        mp_context = mp.get_context()

    # Collect inputs:
    # If not passed, input dir is bb_inputs/
    if in_dir is None:
        in_dir = Path.cwd() / "bb_inputs"
    _validate_input_dir(in_dir)
    # All files in the input dir with *.npy suffix are considered input files
    input_files = sorted(in_dir.glob("*.npy"))[:max_files]
    ctx.params["input_files"] = [str(p.resolve()) for p in input_files]
    ctx.params["num_fps"] = [_get_fps_file_num(p) for p in input_files]
    if max_fps is not None:
        ctx.params["num_fps_loaded"] = [min(n, max_fps) for n in ctx.params["num_fps"]]
    else:
        ctx.params["num_fps_loaded"] = ctx.params["num_fps"]

    # Set up outputs:
    # If not passed, output dir is constructed as bb_multiround_outputs/<unique-id>/
    unique_id = format(random.getrandbits(32), "08x")
    if out_dir is None:
        out_dir = Path.cwd() / "bb_multiround_outputs" / unique_id
    out_dir.mkdir(exist_ok=True, parents=True)
    _validate_output_dir(out_dir, overwrite)
    ctx.params["out_dir"] = str(out_dir.resolve())

    console.print_banner()
    console.print()
    console.print_multiround_config(ctx.params, mp_context)

    # Optionally start a separate process that tracks RAM usage
    if monitor_rss:
        launch_monitor_rss_daemon(out_dir / "monitor-rss.csv", monitor_rss_interval_s)

    timer = run_multiround_bitbirch(
        input_files=input_files,
        n_features=n_features,
        input_is_packed=input_is_packed,
        out_dir=out_dir,
        initial_merge_criterion=initial_merge_criterion,
        midsection_merge_criterion=mid_merge_criterion,
        num_initial_processes=num_initial_processes,
        num_midsection_processes=num_midsection_processes,
        branching_factor=branching_factor,
        threshold=threshold,
        midsection_threshold_change=mid_threshold_change,
        tolerance=tolerance,
        # Advanced
        save_tree=save_tree,
        save_centroids=save_centroids,
        bin_size=bin_size,
        max_tasks_per_process=max_tasks_per_process,
        refinement_before_midsection=refinement_before_midsection,
        num_midsection_rounds=num_midsection_rounds,
        split_largest_after_each_midsection_round=split_largest_after_midsection,
        # Debug
        max_fps=max_fps,
        verbose=verbose,
        mp_context=mp_context,
        cleanup=cleanup,
    )
    timer.dump(out_dir / "timings.json")
    # TODO: Also dump peak-rss.json
    collect_system_specs_and_dump_config(ctx.params)

    # Symlink or copy fingerprint files
    input_fps_dir = (out_dir / "input-fps").resolve()
    input_fps_dir.mkdir()
    if copy_inputs:
        for file in input_files:
            shutil.copy(file, input_fps_dir / file.name)
    else:
        for file in input_files:
            (input_fps_dir / file.name).symlink_to(file.resolve())
|
|
1419
|
+
|
|
1420
|
+
|
|
1421
|
+
@app.command("fps-info", rich_help_panel="Fingerprints")
def _fps_info(
    fp_paths: Annotated[
        list[Path] | None,
        # NOTE: fixed copy-pasted help text (it said "*.smi files with smiles")
        Argument(show_default=False, help="Paths to `*.npy` fingerprint files or dirs"),
    ] = None,
) -> None:
    """Show info about a `*.npy` fingerprint file, or a dir with `*.npy` files"""
    from bblean._console import get_console
    from bblean.fingerprints import _print_fps_file_info

    console = get_console()
    # Default to inspecting the current working directory
    if fp_paths is None:
        fp_paths = [Path.cwd()]

    for path in fp_paths:
        if path.is_dir():
            # Print info for every *.npy file directly inside the directory
            for file in path.glob("*.npy"):
                _print_fps_file_info(file, console)
        elif path.suffix == ".npy":
            # BUGFIX: previously called _print_fps_file_info(file, console),
            # referencing the directory-branch loop variable `file` — a NameError
            # when the first path is a file, or info for the wrong (stale) file
            _print_fps_file_info(path, console)
|
|
1442
|
+
|
|
1443
|
+
|
|
1444
|
+
@app.command("fps-from-smiles", rich_help_panel="Fingerprints")
def _fps_from_smiles(
    smiles_paths: Annotated[
        list[Path] | None,
        Argument(show_default=False, help="Paths to *.smi files with smiles"),
    ] = None,
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", show_default=False),
    ] = None,
    out_name: Annotated[
        str | None,
        Option("--name", help="Base name of output file"),
    ] = None,
    kind: Annotated[
        str,
        Option("-k", "--kind"),
    ] = DEFAULTS.fp_kind,
    fp_size: Annotated[
        int,
        Option("--n-features", help="Num. features of the generated fingerprints"),
    ] = DEFAULTS.n_features,
    parts: Annotated[
        int | None,
        Option(
            "-n", "--num-parts", help="Split the created file into this number of parts"
        ),
    ] = None,
    max_fps_per_file: Annotated[
        int | None,
        Option(
            "-m",
            "--max-fps-per-file",
            help="Max. number of fps per file. Mutually exclusive with --num-parts",
            show_default=False,
        ),
    ] = None,
    pack: Annotated[
        bool,
        Option(
            "-p/-P",
            "--pack/--no-pack",
            help="Pack bits in last dimension of fingerprints",
            rich_help_panel="Advanced",
        ),
    ] = True,
    dtype: Annotated[
        str,
        Option(
            "-d",
            "--dtype",
            help="NumPy dtype for the generated fingerprints",
            rich_help_panel="Advanced",
        ),
    ] = "uint8",
    verbose: Annotated[
        bool,
        Option("-v/-V", "--verbose/--no-verbose"),
    ] = True,
    num_ps: Annotated[
        int | None,
        Option(
            "--ps",
            "--processes",
            # NOTE: fixed help typo "multprocess" -> "multiprocess"
            help=(
                "Num. processes for multiprocess generation."
                " One process per file is used for multi-file generation"
            ),
        ),
    ] = None,
    sanitize: Annotated[
        str,
        Option(
            "--sanitize",
            help="RDKit sanitization operations to perform ('all' or 'minimal')",
        ),
    ] = "all",
    skip_invalid: Annotated[
        bool,
        Option(
            "--skip-invalid/--no-skip-invalid",
            # NOTE: fixed help typo "this is be" -> "this can be"
            help=(
                "Skip invalid smiles."
                " If False, an error is raised on invalid smiles. If True they are"
                " silently skipped (this can be more memory intensive, especially for"
                " parallel processing)"
            ),
        ),
    ] = False,
) -> None:
    r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files

    By default this function runs in parallel and uses all available CPUs. In order to
    use the memory efficient BitBIRCH u8 algorithm you should keep the defaults:
    --dtype=uint8 and --pack
    """
    import numpy as np

    from bblean._console import get_console
    from bblean.utils import _num_avail_cpus
    from bblean.fingerprints import _FingerprintFileCreator, _FingerprintArrayFiller
    from bblean.smiles import (
        calc_num_smiles,
        _iter_ranges_and_smiles_batches,
        _iter_idxs_and_smiles_batches,
    )

    # Force forkserver since rdkit may use threads, and fork is unsafe with threads
    mp_context = mp.get_context("forkserver" if sys.platform == "linux" else None)

    console = get_console(silent=not verbose)

    if smiles_paths is None:
        smiles_paths = list(Path.cwd().glob("*.smi"))
        if not smiles_paths:
            console.print("No *.smi files found", style="red")
            raise Abort()

    smiles_num = calc_num_smiles(smiles_paths)

    def parse_num_per_batch(
        smiles_num: int, parts: int | None, max_fps_per_file: int | None
    ) -> tuple[int, int, int | None]:
        # Resolve (parts, batch size, zero-pad digits for output filenames) from
        # the mutually-exclusive --num-parts / --max-fps-per-file options
        digits: int | None
        if parts is not None and max_fps_per_file is None:
            num_per_batch = math.ceil(smiles_num / parts)
            digits = len(str(parts))
        elif parts is None and max_fps_per_file is not None:
            num_per_batch = max_fps_per_file
            parts = math.ceil(smiles_num / max_fps_per_file)
            digits = len(str(parts))
        elif parts is None and max_fps_per_file is None:
            parts = 1
            num_per_batch = math.ceil(smiles_num / parts)
            digits = None
        else:
            raise ValueError("parts and max_fps_per_file are mutually exclusive")
        return parts, num_per_batch, digits

    try:
        parts, num_per_batch, digits = parse_num_per_batch(
            smiles_num, parts, max_fps_per_file
        )
    except ValueError:
        console.print(
            "'--max-fps-per-file' and '--num-parts' are mutually exclusive",
            style="red",
        )
        raise Abort() from None
    if out_dir is None:
        out_dir = Path.cwd()
    out_dir.mkdir(exist_ok=True)
    out_dir = out_dir.resolve()

    # Pass 2: build the molecules
    unique_id = format(random.getrandbits(32), "08x")
    if out_name is None:
        # Save the fingerprints as a NumPy array
        out_name = f"{'packed-' if pack else ''}fps-{dtype}-{kind}-{unique_id}"
    else:
        # Strip suffix
        if out_name.endswith(".npy"):
            out_name = out_name[:-4]

    if num_ps is None:
        # Get the number of cores *available for use for this process*
        # bound by the number of parts to avoid spawning useless processes
        if parts == 1:
            num_ps = _num_avail_cpus()
        else:
            num_ps = min(_num_avail_cpus(), parts)
    create_fp_file = _FingerprintFileCreator(
        dtype,
        out_dir,
        out_name,
        digits,
        pack,
        kind,
        fp_size,
        sanitize=sanitize,
        skip_invalid=skip_invalid,
        verbose=verbose,
    )
    timer = Timer()
    timer.init_timing("total")
    if parts > 1 and num_ps is not None and num_ps > 1:
        # Multiprocessing version, 1 process per file
        with console.status(
            f"[italic]Generating fingerprints ({parts} files, parallel, {num_ps} procs.) ...[/italic]",  # noqa:E501
            spinner="dots",
        ):
            with mp_context.Pool(processes=num_ps) as pool:
                pool.map(
                    create_fp_file,
                    _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
                )
        timer.end_timing("total", console, indent=False)
        stem = out_name.split(".")[0]
        console.print(f"Finished. Outputs written to {str(out_dir / stem)}.<idx>.npy")
        return

    # Parallel or serial, single file version
    msg = "parallel" if num_ps > 1 else "serial"
    with console.status(
        f"[italic]Generating fingerprints ({parts} files, {msg}, {num_ps} procs.) ...[/italic]",  # noqa:E501
        spinner="dots",
    ):
        # Packed fingerprints hold 8 bits per byte (round up)
        if pack:
            out_dim = (fp_size + 7) // 8
        else:
            out_dim = fp_size
        # Workers fill a shared-memory array (fingerprints) and a byte mask of
        # invalid smiles, so no per-worker copies are made
        shmem_size = smiles_num * out_dim * np.dtype(dtype).itemsize
        fps_shmem = shmem.SharedMemory(create=True, size=shmem_size)
        invalid_mask_shmem = shmem.SharedMemory(create=True, size=smiles_num)
        fps_array_filler = _FingerprintArrayFiller(
            shmem_name=fps_shmem.name,
            invalid_mask_shmem_name=invalid_mask_shmem.name,
            kind=kind,
            fp_size=fp_size,
            num_smiles=smiles_num,
            dtype=dtype,
            pack=pack,
            sanitize=sanitize,
            skip_invalid=skip_invalid,
        )
        if num_ps > 1 and parts == 1:
            # Split into batches anyways if we have a single batch but multiple
            # processes
            _, num_per_batch, _ = parse_num_per_batch(
                smiles_num, num_ps, max_fps_per_file
            )
        with mp_context.Pool(processes=num_ps) as pool:
            pool.starmap(
                fps_array_filler,
                _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
            )
        fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
        # BUGFIX: use np.bool_ instead of the bare np.bool alias, which is absent
        # in NumPy 1.24-1.26 (removed in 1.24, re-added only in NumPy 2.0)
        mask = np.ndarray((smiles_num,), dtype=np.bool_, buffer=invalid_mask_shmem.buf)
        if skip_invalid:
            prev_num = len(fps)
            fps = np.delete(fps, mask, axis=0)
            new_num = len(fps)
            console.print(f"Generated {new_num} fingerprints")
            console.print(f"Skipped {prev_num - new_num} invalid smiles")
            invalid_name = f"invalid-{unique_id}.npy"
            console.print(
                f"Invalid smiles idxs written to {str(out_dir / invalid_name)}"
            )
            # Reuse invalid_name instead of rebuilding the same f-string
            np.save(out_dir / invalid_name, mask.nonzero()[0].reshape(-1))

        np.save(
            out_dir / out_name,
            fps,
        )
        # Drop views into the shared buffers before unlinking them
        del mask
        del fps
        # Cleanup
        fps_shmem.unlink()
        invalid_mask_shmem.unlink()
    timer.end_timing("total", console, indent=False)
    console.print(f"Finished. Outputs written to {str(out_dir / out_name)}.npy")
|
|
1705
|
+
|
|
1706
|
+
|
|
1707
|
+
@app.command("fps-split", rich_help_panel="Fingerprints")
def _split_fps(
    input_: Annotated[
        Path,
        Argument(help="`*.npy` file with fingerprints"),
    ],
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", show_default=False),
    ] = None,
    parts: Annotated[
        int | None,
        Option(
            "-n",
            "--num-parts",
            help="Num. of parts to split file into. Mutually exclusive with --max-fps",
            show_default=False,
        ),
    ] = None,
    max_fps_per_file: Annotated[
        int | None,
        Option(
            "-m",
            "--max-fps",
            help="Max. number of fps per file. Mutually exclusive with --num-parts",
            show_default=False,
        ),
    ] = None,
) -> None:
    r"""Split a `*.npy` fingerprint file into multiple `*.npy` files

    Usage to split into multiple files with a max number of fps each (e.g. 10k) is `bb
    split-fps --max-fps 10_000 ./fps.npy --out-dir ./split`. To split into a pre-defined
    number of parts (e.g. 10) `bb split-fps --num-parts 10 ./fps.npy --out-dir ./split`.
    """
    # Deferred imports keep CLI startup fast for unrelated subcommands
    from bblean._console import get_console
    import numpy as np

    console = get_console()
    if parts is not None and parts < 2:
        console.print("Num must be >= 2", style="red")
        raise Abort()
    # mmap_mode="r" avoids loading the whole fingerprint array into RAM;
    # each batch is materialized only when sliced below
    fps = np.load(input_, mmap_mode="r")
    # Exactly one of --num-parts / --max-fps must be given; derive the batch
    # size and the zero-pad width for the output index from whichever it is
    if parts is not None and max_fps_per_file is None:
        num_per_batch = math.ceil(fps.shape[0] / parts)
        digits = len(str(parts))
    elif parts is None and max_fps_per_file is not None:
        num_per_batch = max_fps_per_file
        digits = len(str(math.ceil(fps.shape[0] / max_fps_per_file)))
    else:
        console.print(
            "One and only one of '--max-fps' and '--num-parts' required", style="red"
        )
        raise Abort()

    # First dotted component of the file name, e.g. "fps" from "fps.foo.npy"
    stem = input_.name.split(".")[0]
    with console.status("[italic]Splitting fingerprints...[/italic]", spinner="dots"):
        # Sentinel: stays -1 if the input yields no batches (empty array)
        i = -1
        # `batched` presumably yields consecutive chunks of `num_per_batch`
        # rows — defined elsewhere in this module; TODO confirm
        for i, batch in enumerate(batched(fps, num_per_batch)):
            suffixes = input_.suffixes
            # Output name: <stem><middle-suffixes>.<zero-padded idx>.npy
            name = f"{stem}{''.join(suffixes[:-1])}.{str(i).zfill(digits)}.npy"

            # Generate out dir when first fp file is being saved
            if out_dir is None:
                out_dir = Path.cwd() / stem
            # NOTE(review): mkdir/resolve appear to run every iteration
            # (harmless, exist_ok=True and resolve are idempotent)
            out_dir.mkdir(exist_ok=True)
            out_dir = out_dir.resolve()

            np.save(out_dir / name, batch)

    if i == -1:
        console.print("Warning: No fingerprints written", style="yellow")
        return
    console.print(
        f"Finished. Outputs written to {str(tp.cast(Path, out_dir) / stem)}.<idx>.npy"
    )
|
|
1783
|
+
|
|
1784
|
+
|
|
1785
|
+
@app.command("fps-shuffle", rich_help_panel="Fingerprints")
def _shuffle_fps(
    in_file: Annotated[
        Path,
        Argument(help="`*.npy` file with packed fingerprints"),
    ],
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", show_default=False),
    ] = None,
    seed: Annotated[
        int | None,
        Option("--seed", hidden=True, rich_help_panel="Debug"),
    ] = None,
) -> None:
    """Shuffle a fingerprints file

    This function is not optimized and as such may have high RAM usage. It is
    meant for testing purposes only"""
    import numpy as np

    # The whole array is loaded into memory, hence the high-RAM caveat above
    array = np.load(in_file)
    # Shuffle rows in place; a fixed seed makes the permutation reproducible
    np.random.default_rng(seed).shuffle(array, axis=0)

    target = Path.cwd() if out_dir is None else out_dir
    target.mkdir(exist_ok=True)
    target = target.resolve()
    np.save(target / f"shuffled-{in_file.stem}.npy", array)
|
|
1815
|
+
|
|
1816
|
+
|
|
1817
|
+
@app.command("fps-merge", rich_help_panel="Fingerprints")
def _merge_fps(
    in_dir: Annotated[
        Path,
        Argument(help="Directory with input `*.npy` files with packed fingerprints"),
    ],
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", show_default=False),
    ] = None,
) -> None:
    r"""Merge a dir with multiple `*.npy` fingerprint file into a single `*.npy` file"""
    from bblean._console import get_console
    import numpy as np

    console = get_console()

    # Default to the current working directory; ensure it exists and is absolute
    target_dir = Path.cwd() if out_dir is None else out_dir
    target_dir.mkdir(exist_ok=True)
    target_dir = target_dir.resolve()

    chunks = []
    stem = None
    with console.status("[italic]Merging fingerprints...[/italic]", spinner="dots"):
        # Files must follow <name>.<idx>.npy; sorting restores the index order
        for path in sorted(in_dir.glob("*.npy")):
            prefix = path.name.split(".")[0]
            if stem is None:
                stem = prefix
            elif prefix != stem:
                raise ValueError(
                    "Name convention must be <name>.<idx>.npy"
                    " with all files having the same <name>"
                )
            chunks.append(np.load(path))
    if stem is None:
        # The glob matched nothing — nothing to merge
        console.print("No *.npy files found")
        return
    np.save(target_dir / stem, np.concatenate(chunks))
    console.print(f"Finished. Outputs written to {str(target_dir / stem)}.npy")
|