bblean-0.6.0b1-cp311-cp311-macosx_10_9_universal2.whl

bblean/cli.py ADDED
@@ -0,0 +1,1854 @@
+ r"""Command line interface entrypoints"""
+
+ import warnings
+ import random
+ import typing as tp
+ import math
+ import shutil
+ import sys
+ import pickle
+ import multiprocessing as mp
+ import multiprocessing.shared_memory as shmem
+ from typing import Annotated
+ from pathlib import Path
+
+ from typer import Typer, Argument, Option, Abort, Context, Exit
+
+ from bblean._memory import launch_monitor_rss_daemon
+ from bblean._timer import Timer
+ from bblean._config import DEFAULTS, collect_system_specs_and_dump_config, TSNE_SEED
+ from bblean.utils import _import_bitbirch_variant, batched
+
+ app = Typer(
+     rich_markup_mode="markdown",
+     add_completion=False,
+     help=r"""CLI tool for serial or parallel fast clustering of molecular fingerprints
+ using the memory-efficient and compute-efficient *O(N)* BitBIRCH algorithm ('Lean'
+ version). For more info about the subcommands run `bb <subcommand> --help`.""",
+ )
+
+
+ def _print_help_banner(ctx: Context, value: bool) -> None:
+     if value:
+         from bblean._console import get_console
+
+         console = get_console()
+         console.print_banner()
+         console.print(ctx.get_help())
+         raise Exit()
+
+
+ def _validate_output_dir(out_dir: Path, overwrite: bool = False) -> None:
+     if out_dir.exists():
+         if not out_dir.is_dir():
+             raise RuntimeError(f"Output path {out_dir} exists but is not a directory")
+         if any(out_dir.iterdir()):
+             if overwrite:
+                 shutil.rmtree(out_dir)
+             else:
+                 raise RuntimeError(f"Output dir {out_dir} is not empty; pass --overwrite")
+
+
+ # Validate that the naming convention for the input files is correct
+ def _validate_input_dir(in_dir: Path | str) -> None:
+     in_dir = Path(in_dir)
+     if not in_dir.is_dir():
+         raise RuntimeError(f"Input dir {in_dir} should be a dir")
+     if not any(in_dir.glob("*.npy")):
+         raise RuntimeError(f"Input dir {in_dir} should have *.npy fingerprint files")
+
+
+ @app.callback()
+ def _main(
+     ctx: Context,
+     help_: Annotated[
+         bool,
+         Option(
+             "--help/ ",
+             "-h",
+             is_eager=True,
+             help="Show this message and exit.",
+             callback=_print_help_banner,
+         ),
+     ] = False,
+ ) -> None:
+     pass
+
+
+ @app.command("plot-pops", rich_help_panel="Analysis")
+ def _plot_pops(
+     clusters_path: Annotated[
+         Path,
+         Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
+     ],
+     fps_path: Annotated[
+         Path | None,
+         Option(
+             "-f",
+             "--fps-path",
+             help="Path to fingerprint file, or directory with fingerprint files",
+             show_default=False,
+         ),
+     ] = None,
+     title: Annotated[
+         str | None,
+         Option("--title", help="Plot title"),
+     ] = None,
+     top: Annotated[
+         int | None,
+         Option("--top"),
+     ] = None,
+     input_is_packed: Annotated[
+         bool,
+         Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
+     ] = True,
+     min_size: Annotated[
+         int,
+         Option("--min-size"),
+     ] = 0,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     save: Annotated[
+         bool,
+         Option("--save/--no-save"),
+     ] = True,
+     filename: Annotated[
+         str | None,
+         Option("--filename"),
+     ] = None,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     show: Annotated[
+         bool,
+         Option("--show/--no-show", hidden=True),
+     ] = True,
+ ) -> None:
+     r"""Population plot of the clustering results"""
+     from bblean._console import get_console
+
+     console = get_console(silent=not verbose)
+     # Imports may take a bit of time since sklearn is slow, so start the spinner here
+     with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
+         from bblean.plotting import _dispatch_visualization, pops_plot
+
+         _dispatch_visualization(
+             clusters_path,
+             "pops",
+             pops_plot,
+             {},
+             min_size=min_size,
+             top=top,
+             n_features=n_features,
+             input_is_packed=input_is_packed,
+             fps_path=fps_path,
+             title=title,
+             filename=filename,
+             verbose=verbose,
+             save=save,
+             show=show,
+         )
+
+
+ @app.command("plot-umap", rich_help_panel="Analysis")
+ def _plot_umap(
+     clusters_path: Annotated[
+         Path,
+         Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
+     ],
+     fps_path: Annotated[
+         Path | None,
+         Option(
+             "-f",
+             "--fps-path",
+             help="Path to fingerprint file, or directory with fingerprint files",
+             show_default=False,
+         ),
+     ] = None,
+     title: Annotated[
+         str | None,
+         Option("--title", help="Plot title"),
+     ] = None,
+     save: Annotated[
+         bool,
+         Option("--save/--no-save"),
+     ] = True,
+     top: Annotated[
+         int,
+         Option("--top"),
+     ] = 20,
+     input_is_packed: Annotated[
+         bool,
+         Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
+     ] = True,
+     scaling: Annotated[
+         str,
+         Option("--scaling", rich_help_panel="Advanced"),
+     ] = "normalize",
+     min_size: Annotated[
+         int,
+         Option("--min-size"),
+     ] = 0,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     filename: Annotated[
+         str | None,
+         Option("--filename"),
+     ] = None,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     show: Annotated[
+         bool,
+         Option("--show/--no-show", hidden=True),
+     ] = True,
+     deterministic: Annotated[
+         bool,
+         Option("--deterministic/--no-deterministic"),
+     ] = False,
+     n_neighbors: Annotated[
+         int,
+         Option("-n", "--neighbors"),
+     ] = 15,
+     min_dist: Annotated[
+         float,
+         Option("-d", "--min-dist"),
+     ] = 0.5,
+     metric: Annotated[
+         str,
+         Option("--metric"),
+     ] = "euclidean",
+     densmap: Annotated[
+         bool,
+         Option("--densmap/--no-densmap"),
+     ] = False,
+     workers: Annotated[
+         int | None,
+         Option(
+             "-w",
+             "--workers",
+             help="Num. cores to use for parallel processing",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+ ) -> None:
+     r"""UMAP visualization of the clustering results"""
+     from bblean._console import get_console
+
+     console = get_console(silent=not verbose)
+     # Imports may take a bit of time since sklearn is slow, so start the spinner here
+     with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
+         from bblean.plotting import _dispatch_visualization, umap_plot
+
+         kwargs = dict(
+             metric=metric,
+             densmap=densmap,
+             deterministic=deterministic,
+             n_neighbors=n_neighbors,
+             workers=workers,
+             min_dist=min_dist,
+         )
+         _dispatch_visualization(
+             clusters_path,
+             "umap",
+             umap_plot,
+             kwargs,
+             min_size=min_size,
+             top=top,
+             n_features=n_features,
+             input_is_packed=input_is_packed,
+             fps_path=fps_path,
+             title=title,
+             filename=filename,
+             verbose=verbose,
+             save=save,
+             show=show,
+         )
+
+
+ @app.command("plot-pca", rich_help_panel="Analysis")
+ def _plot_pca(
+     clusters_path: Annotated[
+         Path,
+         Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
+     ],
+     fps_path: Annotated[
+         Path | None,
+         Option(
+             "-f",
+             "--fps-path",
+             help="Path to fingerprint file, or directory with fingerprint files",
+             show_default=False,
+         ),
+     ] = None,
+     title: Annotated[
+         str | None,
+         Option("--title", help="Plot title"),
+     ] = None,
+     top: Annotated[
+         int,
+         Option("--top"),
+     ] = 20,
+     min_size: Annotated[
+         int,
+         Option("--min-size"),
+     ] = 0,
+     input_is_packed: Annotated[
+         bool,
+         Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
+     ] = True,
+     scaling: Annotated[
+         str,
+         Option("--scaling", rich_help_panel="Advanced"),
+     ] = "normalize",
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     show: Annotated[
+         bool,
+         Option("--show/--no-show", hidden=True),
+     ] = True,
+     whiten: Annotated[
+         bool,
+         Option("--whiten/--no-whiten"),
+     ] = False,
+     save: Annotated[
+         bool,
+         Option("--save/--no-save"),
+     ] = True,
+     filename: Annotated[
+         str | None,
+         Option("--filename"),
+     ] = None,
+ ) -> None:
+     r"""PCA visualization of the clustering results"""
+     from bblean._console import get_console
+
+     console = get_console(silent=not verbose)
+     # Imports may take a bit of time since sklearn is slow, so start the spinner here
+     with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
+         from bblean.plotting import _dispatch_visualization, pca_plot
+
+         _dispatch_visualization(
+             clusters_path,
+             "pca",
+             pca_plot,
+             {"whiten": whiten},
+             min_size=min_size,
+             top=top,
+             n_features=n_features,
+             input_is_packed=input_is_packed,
+             fps_path=fps_path,
+             title=title,
+             filename=filename,
+             verbose=verbose,
+             save=save,
+             show=show,
+         )
+
+
+ @app.command("plot-tsne", rich_help_panel="Analysis")
+ def _plot_tsne(
+     clusters_path: Annotated[
+         Path,
+         Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
+     ],
+     fps_path: Annotated[
+         Path | None,
+         Option(
+             "-f",
+             "--fps-path",
+             help="Path to fingerprint file, or directory with fingerprint files",
+             show_default=False,
+         ),
+     ] = None,
+     title: Annotated[
+         str | None,
+         Option("--title", help="Plot title"),
+     ] = None,
+     save: Annotated[
+         bool,
+         Option("--save/--no-save"),
+     ] = True,
+     min_size: Annotated[
+         int,
+         Option("--min-size"),
+     ] = 0,
+     filename: Annotated[
+         str | None,
+         Option("--filename"),
+     ] = None,
+     exaggeration: Annotated[
+         float | None,
+         Option("-e", "--exaggeration", rich_help_panel="Advanced"),
+     ] = None,
+     seed: Annotated[
+         int | None,
+         Option(
+             "-s",
+             "--seed",
+             help=(
+                 "Seed for the rng, fixed value by default, for reproducibility."
+                 " Pass -1 to randomize"
+             ),
+             show_default=False,
+             rich_help_panel="Advanced",
+         ),
+     ] = TSNE_SEED,
+     top: Annotated[
+         int,
+         Option("--top"),
+     ] = 20,
+     metric: Annotated[
+         str,
+         Option("--metric", help="Metric to use in the t-SNE source space"),
+     ] = "euclidean",
+     dof: Annotated[
+         float,
+         Option("-d", "--dof", rich_help_panel="Advanced"),
+     ] = 1.0,
+     perplexity: Annotated[
+         int,
+         Option(help="t-SNE perplexity", rich_help_panel="Advanced"),
+     ] = 30,
+     input_is_packed: Annotated[
+         bool,
+         Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
+     ] = True,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     scaling: Annotated[
+         str,
+         Option("--scaling", rich_help_panel="Advanced"),
+     ] = "normalize",
+     do_pca_init: Annotated[
+         bool,
+         Option(
+             "--pca-init/--no-pca-init",
+             rich_help_panel="Advanced",
+             help="Use PCA for initialization",
+         ),
+     ] = True,
+     pca_reduce: Annotated[
+         int | None,
+         Option(
+             "-p",
+             "--pca-reduce",
+             rich_help_panel="Advanced",
+             help=(
+                 "Reduce fingerprint dimensionality to N components using PCA."
+                 " A value of 50 or more maintains cluster structure in general"
+             ),
+         ),
+     ] = None,
+     workers: Annotated[
+         int | None,
+         Option(
+             "-w",
+             "--workers",
+             help="Num. cores to use for parallel processing",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     multiscale: Annotated[
+         bool,
+         Option(
+             "-m/-M",
+             "--multiscale/--no-multiscale",
+             rich_help_panel="Advanced",
+             help="Use multiscale perplexities (WARNING: Can be very slow!)",
+         ),
+     ] = False,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     show: Annotated[
+         bool,
+         Option("--show/--no-show", hidden=True),
+     ] = True,
+ ) -> None:
+     r"""t-SNE visualization of the clustering results"""
+     from bblean._console import get_console
+
+     console = get_console(silent=not verbose)
+     # Imports may take a bit of time since sklearn is slow, so start the spinner here
+     with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
+         from bblean.plotting import _dispatch_visualization, tsne_plot
+
+         kwargs = dict(
+             metric=metric,
+             seed=seed,
+             perplexity=perplexity,
+             exaggeration=exaggeration,
+             dof=dof,
+             workers=workers,
+             scaling=scaling,
+             do_pca_init=do_pca_init,
+             multiscale=multiscale,
+             pca_reduce=pca_reduce,
+         )
+         _dispatch_visualization(
+             clusters_path,
+             "tsne",
+             tsne_plot,
+             kwargs,
+             min_size=min_size,
+             top=top,
+             n_features=n_features,
+             input_is_packed=input_is_packed,
+             fps_path=fps_path,
+             title=title,
+             filename=filename,
+             verbose=verbose,
+             save=save,
+             show=show,
+         )
+
+
+ @app.command("summary", rich_help_panel="Analysis")
+ def _table_summary(
+     clusters_path: Annotated[
+         Path,
+         Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
+     ],
+     fps_path: Annotated[
+         Path | None,
+         Option(
+             "-f",
+             "--fps-path",
+             help="Path to fingerprint file, or directory with fingerprint files",
+             show_default=False,
+         ),
+     ] = None,
+     min_size: Annotated[
+         int,
+         Option("--min-size"),
+     ] = 0,
+     smiles_path: Annotated[
+         Path | None,
+         Option(
+             "-s",
+             "--smiles-path",
+             show_default=False,
+             help="Optional smiles path, if passed a scaffold analysis is performed",
+         ),
+     ] = None,
+     top: Annotated[
+         int,
+         Option("--top"),
+     ] = 20,
+     input_is_packed: Annotated[
+         bool,
+         Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
+     ] = True,
+     scaffold_fp_kind: Annotated[
+         str,
+         Option("--scaffold-fp-kind"),
+     ] = DEFAULTS.fp_kind,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     metrics: Annotated[
+         bool,
+         Option(
+             "--metrics/--no-metrics",
+             help="Calculate clustering indices (Dunn, DBI, CHI)",
+         ),
+     ] = False,
+     chosen_metrics: Annotated[
+         str,
+         Option(
+             "-m",
+             "--metrics-choice",
+             help=(
+                 "Chosen metrics."
+                 " Comma-separated list including dunn (slow), dbi or chi"
+             ),
+         ),
+     ] = "dunn,dbi,chi",
+     metrics_top: Annotated[
+         int | None,
+         Option("--metrics-top", rich_help_panel="Advanced"),
+     ] = 100,
+     metrics_min_size: Annotated[
+         int,
+         Option("--metrics-min-size", hidden=True),
+     ] = 1,
+     verbose: Annotated[
+         bool,
+         Option("--verbose/--no-verbose", hidden=True),
+     ] = True,
+ ) -> None:
+     r"""Summary table of clustering results, together with cluster metrics"""
+     from bblean._console import get_console
+     from bblean.smiles import load_smiles
+     from bblean.analysis import cluster_analysis
+     from bblean.utils import _has_files_or_valid_symlinks
+     from bblean.metrics import jt_dbi, jt_isim_chi, jt_isim_dunn, _calc_centrals
+     from rich.table import Table
+
+     console = get_console(silent=not verbose)
+     # Imports may take a bit of time since sklearn is slow, so start the spinner here
+     with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
+         if clusters_path.is_dir():
+             clusters_path = clusters_path / "clusters.pkl"
+         with open(clusters_path, mode="rb") as f:
+             clusters = pickle.load(f)
+         if fps_path is None:
+             input_fps_path = clusters_path.parent / "input-fps"
+             if input_fps_path.is_dir() and _has_files_or_valid_symlinks(input_fps_path):
+                 fps_path = input_fps_path
+             else:
+                 msg = (
+                     "Could not find input fingerprints. Please use --fps-path."
+                     " Summary without fingerprints doesn't include iSIM values"
+                 )
+                 warnings.warn(msg)
+         if fps_path is None:
+             fps_paths = None
+         elif fps_path.is_dir():
+             fps_paths = sorted(fps_path.glob("*.npy"))
+         else:
+             fps_paths = [fps_path]
+         ca = cluster_analysis(
+             clusters,
+             fps_paths,
+             smiles=load_smiles(smiles_path) if smiles_path is not None else (),
+             top=top,
+             n_features=n_features,
+             input_is_packed=input_is_packed,
+             min_size=min_size,
+         )
+     table = Table(title=(f"Top {top} clusters" if top is not None else "Clusters"))
+     table.add_column("Size", justify="center")
+     table.add_column("% fps", justify="center")
+     table.add_column("iSIM", justify="center")
+     if smiles_path is not None:
+         table.add_column("Size/Scaff.", justify="center")
+         table.add_column("Num. Scaff.", justify="center")
+         table.add_column("Scaff. iSIM", justify="center")
+     sizes = ca.sizes
+     isims = ca.isims
+     total_fps = ca.total_fps
+     for i in range(ca.clusters_num):
+         size = sizes[i]
+         percent = size / total_fps * 100
+         table.add_row(f"{size:,}", f"{percent:.2f}", f"{isims[i]:.3f}")
+     console.print(table)
+     console.print()
+     console.print(f"Total num. fps: {total_fps:,}")
+     console.print(f"Total num. clusters: {ca.all_clusters_num:,}")
+     singles = ca.all_singletons_num
+     singles_percent = singles * 100 / ca.all_clusters_num
+     console.print(f"Total num. singletons: {singles:,} ({singles_percent:.2f} %)")
+     gt10 = ca.all_clusters_num_with_size_above(10)
+     gt10_percent = gt10 * 100 / ca.all_clusters_num
+     console.print(
+         f"Total num. clusters, size > 10: {gt10:,} ({gt10_percent:.2f} %)"
+     )
+     gt100 = ca.all_clusters_num_with_size_above(100)
+     gt100_percent = gt100 * 100 / ca.all_clusters_num
+     console.print(
+         f"Total num. clusters, size > 100: {gt100:,} ({gt100_percent:.2f} %)"
+     )
+     console.print(
+         f"num-clusters/num-fps ratio: {ca.all_clusters_num / total_fps:.2f}"
+     )
+     console.print(f"Mean size: {ca.all_clusters_mean_size:.2f}")
+     console.print(f"Max. size: {ca.all_clusters_max_size:,}")
+     console.print(f"Q3 (75%) size: {ca.all_clusters_q3:,}")
+     console.print(f"Median size: {ca.all_clusters_median_size:,}")
+     console.print(f"Q1 (25%) size: {ca.all_clusters_q1:,}")
+     console.print(f"Min. size: {ca.all_clusters_min_size:,}")
+     if metrics:
+         chosen = set(s.lower() for s in chosen_metrics.split(","))
+         assert all(s in ["dunn", "chi", "dbi"] for s in chosen)
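+         # NOTE: e.g. "-m dunn,dbi" parses to {"dunn", "dbi"}; whitespace is not
+         # stripped, so "dunn, dbi" would fail the assert above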
+         # Redo cluster analysis with more *top* clusters
+         console.print()
+         if metrics_top is None:
+             console.print("Clustering metrics:")
+         else:
+             console.print(f"Clustering metrics considering top {metrics_top} clusters:")
+         with console.status("[italic]Reanalyzing clusters...[/italic]", spinner="dots"):
+             ca = cluster_analysis(
+                 clusters,
+                 fps_paths,
+                 smiles=(),
+                 top=metrics_top,
+                 n_features=n_features,
+                 input_is_packed=input_is_packed,
+                 min_size=metrics_min_size,
+             )
+             clusters = ca.get_top_cluster_fps()
+         with console.status("[italic]Calculating centrals...[/italic]", spinner="dots"):
+             centrals = _calc_centrals(clusters, kind="centroid")
+         if "chi" in chosen:
+             chi = jt_isim_chi(clusters, centrals=centrals, verbose=verbose)
+             console.print(f" - CHI index: {chi:.4f} (Higher is better)")
+         if "dbi" in chosen:
+             dbi = jt_dbi(clusters, centrals=centrals, verbose=verbose)
+             console.print(f" - DBI index: {dbi:.4e} (Lower is better)")
+         if "dunn" in chosen:
+             dunn = jt_isim_dunn(clusters, verbose=verbose)
+             console.print(f" - Dunn index: {dunn:.4f} (Higher is better)")
+
+
+ @app.command("plot-summary", rich_help_panel="Analysis")
+ def _plot_summary(
+     clusters_path: Annotated[
+         Path,
+         Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
+     ],
+     fps_path: Annotated[
+         Path | None,
+         Option(
+             "-f",
+             "--fps-path",
+             help="Path to fingerprint file, or directory with fingerprint files",
+             show_default=False,
+         ),
+     ] = None,
+     save: Annotated[
+         bool,
+         Option("--save/--no-save"),
+     ] = True,
+     ylim: Annotated[
+         int | None,
+         Option("--ylim"),
+     ] = None,
+     min_size: Annotated[
+         int,
+         Option("--min-size"),
+     ] = 0,
+     smiles_path: Annotated[
+         Path | None,
+         Option(
+             "-s",
+             "--smiles-path",
+             show_default=False,
+             help="Optional smiles path, if passed a scaffold analysis is performed",
+         ),
+     ] = None,
+     title: Annotated[
+         str | None,
+         Option("--title"),
+     ] = None,
+     filename: Annotated[
+         str | None,
+         Option("--filename"),
+     ] = None,
+     top: Annotated[
+         int,
+         Option("--top"),
+     ] = 20,
+     input_is_packed: Annotated[
+         bool,
+         Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
+     ] = True,
+     scaffold_fp_kind: Annotated[
+         str,
+         Option("--scaffold-fp-kind"),
+     ] = DEFAULTS.fp_kind,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     annotate: Annotated[
+         bool,
+         Option(
+             "--annotate/--no-annotate",
+             help="Display scaffold and fingerprint number in each cluster",
+         ),
+     ] = True,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     show: Annotated[
+         bool,
+         Option("--show/--no-show", hidden=True),
+     ] = True,
+ ) -> None:
+     r"""Summary plot of the clustering results"""
+     from bblean._console import get_console
+
+     console = get_console(silent=not verbose)
+     # Imports may take a bit of time since sklearn is slow, so start the spinner here
+     with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
+         from bblean.plotting import _dispatch_visualization, summary_plot
+         from bblean.smiles import load_smiles
+
+         _dispatch_visualization(
+             clusters_path,
+             "summary",
+             summary_plot,
+             {"annotate": annotate, "counts_ylim": ylim},
+             smiles=load_smiles(smiles_path) if smiles_path is not None else (),
+             min_size=min_size,
+             top=top,
+             n_features=n_features,
+             input_is_packed=input_is_packed,
+             fps_path=fps_path,
+             title=title,
+             filename=filename,
+             verbose=verbose,
+             save=save,
+             show=show,
+         )
+
+
+ @app.command("run")
+ def _run(
+     ctx: Context,
+     input_: Annotated[
+         Path | None,
+         Argument(help="`*.npy` file with packed fingerprints, or dir with `*.npy` files"),
+     ] = None,
+     out_dir: Annotated[
+         Path | None,
+         Option(
+             "-o",
+             "--out-dir",
+             help="Dir to dump the output files",
+         ),
+     ] = None,
+     overwrite: Annotated[bool, Option(help="Allow overwriting output files")] = False,
+     branching_factor: Annotated[
+         int,
+         Option(
+             "--branching",
+             "-b",
+             help="BitBIRCH branching factor (all rounds). Usually 254 is"
+             " optimal. Set above 254 for slightly less RAM (at the cost of some perf.)",
+         ),
+     ] = DEFAULTS.branching_factor,
+     threshold: Annotated[
+         float,
+         Option("--threshold", "-t", help="Threshold for merge criterion"),
+     ] = DEFAULTS.threshold,
+     refine_threshold_change: Annotated[
+         float,
+         Option(
+             "--refine-threshold-change",
+             help="Modify threshold for refinement criterion, can be negative",
+         ),
+     ] = DEFAULTS.refine_threshold_change,
+     save_tree: Annotated[
+         bool,
+         Option("--save-tree/--no-save-tree", rich_help_panel="Advanced"),
+     ] = False,
+     save_centroids: Annotated[
+         bool,
+         Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
+     ] = True,
+     merge_criterion: Annotated[
+         str,
+         Option("--set-merge", "-m", help="Merge criterion for initial clustering"),
+     ] = DEFAULTS.merge_criterion,
+     refine_merge_criterion: Annotated[
+         str,
+         Option("--set-refine-merge", help="Merge criterion for refinement clustering"),
+     ] = DEFAULTS.refine_merge_criterion,
+     tolerance: Annotated[
+         float,
+         Option(help="BitBIRCH tolerance. For refinement and --set-merge 'tolerance'"),
+     ] = DEFAULTS.tolerance,
+     refine_num: Annotated[
+         int,
+         Option(
+             "--refine-num",
+             help=(
+                 "Num. of largest clusters to refine."
+                 " 1 for standard refinement, 0 is the default (no refinement)"
+             ),
+         ),
+     ] = 0,
+     refine_rounds: Annotated[
+         int | None,
+         Option(
+             "--refine-rounds",
+             help="Num. of refinement rounds",
+             hidden=True,
+         ),
+     ] = None,
+     recluster_rounds: Annotated[
+         int,
+         Option(
+             "--recluster-rounds",
+             help="Num. of reclustering rounds",
+             hidden=True,
+         ),
+     ] = 0,
+     recluster_shuffle: Annotated[
+         bool,
+         Option("--recluster-shuffle/--no-recluster-shuffle", hidden=True),
+     ] = True,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " It must be provided for packed inputs *if it is not a multiple of 8*."
+             " For typical fingerprint sizes (e.g. 2048, 1024), it is not required",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     input_is_packed: Annotated[
+         bool,
+         Option(
+             "--packed-input/--unpacked-input",
+             help="Toggle whether the input consists of packed or unpacked fingerprints",
+             rich_help_panel="Advanced",
+         ),
+     ] = True,
+     # Debug options
+     monitor_rss: Annotated[
+         bool,
+         Option(
+             "--monitor-mem/--no-monitor-mem",
+             "--monitor-rss/--no-monitor-rss",
+             help="Monitor RAM used by all processes",
+             rich_help_panel="Advanced",
+         ),
+     ] = True,
+     monitor_rss_interval_s: Annotated[
+         float,
+         Option(
+             "--monitor-mem-seconds",
+             "--monitor-rss-seconds",
+             help="Interval in seconds for RAM monitoring",
+             rich_help_panel="Debug",
+             hidden=True,
+         ),
+     ] = 1.0,
+     max_fps: Annotated[
+         int | None,
+         Option(
+             help="Max. num of fingerprints to read from each file",
+             rich_help_panel="Debug",
+             hidden=True,
+         ),
+     ] = None,
+     variant: Annotated[
+         str,
+         Option(
+             "--bb-variant",
+             help="Use different bitbirch variants, *only for debugging*.",
+             hidden=True,
+         ),
+     ] = "lean",
+     copy_inputs: Annotated[
+         bool,
+         Option(
+             "--copy/--no-copy",
+             rich_help_panel="Advanced",
+             help="Copy the input files instead of symlink",
+         ),
+     ] = False,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+ ) -> None:
+     r"""Run standard, serial BitBIRCH clustering over `*.npy` fingerprint files"""
+     # TODO: Remove code duplication with multiround
+     from bblean._console import get_console
+     from bblean.fingerprints import _get_fps_file_num
+
+     console = get_console(silent=not verbose)
+     if variant == "int64" and input_is_packed:
+         raise ValueError("Packed inputs are not supported for the int64 variant")
+     if refine_rounds is None:
+         refine_rounds = 1 if refine_num > 0 else 0
+     if refine_rounds > 0 and refine_num == 0:
+         refine_num = 1
+     ctx.params["refine_rounds"] = refine_rounds
+     ctx.params["refine_num"] = refine_num
+
+     BitBirch, set_merge = _import_bitbirch_variant(variant)
+
+     # NOTE: Files are sorted according to name
+     if input_ is None:
+         input_ = Path.cwd() / "bb_inputs"
+         input_.mkdir(exist_ok=True)
+         input_files = sorted(input_.glob("*.npy"))
+         _validate_input_dir(input_)
+     elif input_.is_dir():
+         input_files = sorted(input_.glob("*.npy"))
+         _validate_input_dir(input_)
+     else:
+         input_files = [input_]
+     ctx.params.pop("input_")
+     ctx.params["input_files"] = [str(p.resolve()) for p in input_files]
+     ctx.params["num_fps_present"] = [_get_fps_file_num(p) for p in input_files]
+     if max_fps is not None:
+         ctx.params["num_fps_loaded"] = [
+             min(n, max_fps) for n in ctx.params["num_fps_present"]
+         ]
+     else:
+         ctx.params["num_fps_loaded"] = ctx.params["num_fps_present"]
+     unique_id = format(random.getrandbits(32), "08x")
+     if out_dir is None:
+         out_dir = Path.cwd() / "bb_run_outputs" / unique_id
+     out_dir.mkdir(exist_ok=True, parents=True)
+     _validate_output_dir(out_dir, overwrite)
+     ctx.params["out_dir"] = str(out_dir.resolve())
+
+     console.print_banner()
+     console.print()
+     console.print_config(ctx.params)
+
+     # Optionally start a separate process that tracks RAM usage
+     if monitor_rss:
+         launch_monitor_rss_daemon(out_dir / "monitor-rss.csv", monitor_rss_interval_s)
+
+     timer = Timer()
+     timer.init_timing("total")
+     if "lean" not in variant:
+         set_merge(merge_criterion, tolerance=tolerance)
+         tree = BitBirch(branching_factor=branching_factor, threshold=threshold)
+     else:
+         tree = BitBirch(
+             branching_factor=branching_factor,
+             threshold=threshold,
+             merge_criterion=merge_criterion,
+             tolerance=tolerance,
+         )
+     with console.status("[italic]BitBirching...[/italic]", spinner="dots"):
+         for file in input_files:
+             # Fitting a file uses mmap internally, and releases memory in a smart way
+             tree.fit(
+                 file,
+                 n_features=n_features,
+                 input_is_packed=input_is_packed,
+                 max_fps=max_fps,
+             )
+     if recluster_rounds != 0 or refine_rounds != 0:
+         tree.set_merge(
+             refine_merge_criterion,
+             tolerance=tolerance,
+             threshold=threshold + refine_threshold_change,
+         )
+     for r in range(refine_rounds):
+         msg = (
+             f"[italic]Refinement, round {r + 1}"
+             f" (will split {refine_num} largest clusters)...[/italic]"
+         )
+         with console.status(msg, spinner="dots"):
+             tree.refine_inplace(
+                 input_files,
+                 input_is_packed=input_is_packed,
+                 n_largest=refine_num,
+             )
+     for r in range(recluster_rounds):
+         msg = f"[italic]Reclustering, round {r + 1}...[/italic]"
+         with console.status(msg, spinner="dots"):
+             tree.recluster_inplace(shuffle=recluster_shuffle)
+
+     timer.end_timing("total", console, indent=False)
+     console.print_peak_mem(out_dir, indent=False)
+     if variant == "lean":
+         if save_tree:
+             # TODO: BitBIRCH is highly recursive. Pickling may crash Python;
+             # an alternative solution would be better
+             _old_limit = sys.getrecursionlimit()
+             sys.setrecursionlimit(100_000)
+             with open(out_dir / "bitbirch.pkl", mode="wb") as f:
+                 pickle.dump(tree, f)
+             sys.setrecursionlimit(_old_limit)
+         tree.delete_internal_nodes()
+     # Dump outputs (peak memory, timings, config, cluster ids)
+     if save_centroids:
+         output = tree.get_centroids_mol_ids()
+         with open(out_dir / "clusters.pkl", mode="wb") as f:
+             pickle.dump(output["mol_ids"], f)
+         with open(out_dir / "cluster-centroids-packed.pkl", mode="wb") as f:
+             pickle.dump(output["centroids"], f)
+     else:
+         with open(out_dir / "clusters.pkl", mode="wb") as f:
+             pickle.dump(tree.get_cluster_mol_ids(), f)
+
+     collect_system_specs_and_dump_config(ctx.params)
+     timer.dump(out_dir / "timings.json")
+
+     # Symlink or copy fingerprint files
+     input_fps_dir = (out_dir / "input-fps").resolve()
+     input_fps_dir.mkdir()
+     if copy_inputs:
+         for file in input_files:
+             shutil.copy(file, input_fps_dir / file.name)
+     else:
+         for file in input_files:
+             (input_fps_dir / file.name).symlink_to(file.resolve())
+
+
+ # TODO: Currently sometimes after a round is triggered *more* files are output, since
+ # the files are divided *both* by uint8/uint16 and the batch idx. I believe this is not
+ # ideal
+ @app.command("multiround")
+ def _multiround(
+     ctx: Context,
+     in_dir: Annotated[
+         Path | None,
+         Argument(help="Directory with input `*.npy` files with packed fingerprints"),
+     ] = None,
+     out_dir: Annotated[
+         Path | None,
+         Option("-o", "--out-dir", help="Dir for output files"),
+     ] = None,
+     overwrite: Annotated[bool, Option(help="Allow overwriting output files")] = False,
+     num_initial_processes: Annotated[
+         int, Option("--ps", "--processes", help="Num. processes for first round")
+     ] = 10,
+     num_midsection_processes: Annotated[
+         int | None,
+         Option(
+             "--mid-ps",
+             "--mid-processes",
+             help="Num. processes for middle section rounds."
+             " These can be memory intensive;"
+             " consider using 30%-50% of --ps."
+             " Default is same as --ps",
+         ),
+     ] = None,
+     branching_factor: Annotated[
+         int,
+         Option(
+             "--branching",
+             "-b",
+             help="BitBIRCH branching factor (all rounds). Usually 254 is"
+             " optimal. Set above 254 for slightly less RAM (at the cost of some perf.)",
+         ),
+     ] = DEFAULTS.branching_factor,
+     threshold: Annotated[
+         float,
+         Option("--threshold", "-t", help="Thresh for merge criterion (initial step)"),
+     ] = DEFAULTS.threshold,
+     mid_threshold_change: Annotated[
+         float,
+         Option("--mid-threshold-change", help="Modify threshold for refinement"),
+     ] = DEFAULTS.refine_threshold_change,
+     initial_merge_criterion: Annotated[
+         str,
+         Option(
+             "--set-merge",
+             "-m",
+             help="Initial merge criterion for round 1. ('diameter' recommended)",
+         ),
+     ] = DEFAULTS.merge_criterion,
+     save_tree: Annotated[
+         bool,
+         Option("--save-tree/--no-save-tree", rich_help_panel="Advanced"),
+     ] = False,
+     save_centroids: Annotated[
+         bool,
+         Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
+     ] = True,
+     mid_merge_criterion: Annotated[
+         str,
+         Option(
+             "--set-mid-merge",
+             help="Merge criterion for midsection rounds ('diameter' recommended)",
+         ),
+     ] = DEFAULTS.refine_merge_criterion,
+     tolerance: Annotated[
+         float,
+         Option(
+             help="Tolerance value for all steps that use the 'tolerance' criterion"
+             " (by default all except initial round)",
+         ),
+     ] = DEFAULTS.tolerance,
+     n_features: Annotated[
+         int | None,
+         Option(
+             "--n-features",
+             help="Number of features in the fingerprints."
+             " Only for packed inputs *if it is not a multiple of 8*."
+             " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
+             rich_help_panel="Advanced",
+         ),
+     ] = None,
+     input_is_packed: Annotated[
+         bool,
+         Option(
+             "--packed-input/--unpacked-input",
+             help="Toggle whether the input consists of packed or unpacked fingerprints",
+             rich_help_panel="Advanced",
+         ),
+     ] = True,
+     # Advanced options
+     num_midsection_rounds: Annotated[
+         int,
+         Option(
+             "--num-mid-rounds",
+             help="Number of midsection rounds to perform",
+             rich_help_panel="Advanced",
+         ),
+     ] = 1,
+     split_largest_after_midsection: Annotated[
+         bool,
+         Option(
+             "--split-after-mid/--no-split-after-mid",
+             help=(
+                 "Split largest cluster after each midsection round"
+                 " (to be refined by the next round)"
+             ),
+             rich_help_panel="Advanced",
+         ),
+     ] = False,
+     refinement_before_midsection: Annotated[
+         str,
+         Option(
+             "--initial-refine",
+             help=(
+                 "Run a *full* refinement step after the initial clustering round,"
+                 " only *split* the largest cluster, or do *none*."
+             ),
+             rich_help_panel="Advanced",
+         ),
+     ] = "full",
+     max_tasks_per_process: Annotated[
+         int, Option(help="Max tasks per process", rich_help_panel="Advanced")
+     ] = 1,
+     fork: Annotated[
+         bool,
+         Option(
+             help="In linux, force the 'fork' multiprocessing start method",
+             rich_help_panel="Advanced",
+         ),
+     ] = False,
+     bin_size: Annotated[
+         int,
+         Option(help="Bin size for chunking during Round 2", rich_help_panel="Advanced"),
+     ] = 10,
+     # Debug options
+     variant: Annotated[
+         str,
+         Option(
+             "--bb-variant",
+             help="Use different bitbirch variants, *only for debugging*.",
+             hidden=True,
+         ),
+     ] = "lean",
+     monitor_rss: Annotated[
+         bool,
+         Option(
+             "--monitor-mem",
+             "--monitor-rss",
+             help="Monitor RAM used by all processes",
+             rich_help_panel="Advanced",
+         ),
+     ] = True,
+     monitor_rss_interval_s: Annotated[
+         float,
+         Option(
+             "--monitor-mem-seconds",
+             "--monitor-rss-seconds",
+             help="Interval in seconds for RAM monitoring",
+             rich_help_panel="Debug",
+             hidden=True,
+         ),
+     ] = 1.0,
+     max_fps: Annotated[
+         int | None,
+         Option(
+             help="Max num. of fps to load from each input file",
+             rich_help_panel="Debug",
+             hidden=True,
+         ),
+     ] = None,
+     max_files: Annotated[
+         int | None,
+         Option(help="Max num. files to read", rich_help_panel="Debug", hidden=True),
+     ] = None,
+     copy_inputs: Annotated[
+         bool,
+         Option(
+             "--copy/--no-copy",
+             rich_help_panel="Advanced",
+             help="Copy the input files instead of symlink",
+         ),
+     ] = False,
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     cleanup: Annotated[
+         bool,
+         Option("--cleanup/--no-cleanup", hidden=True),
+     ] = True,
+ ) -> None:
+     r"""Run multi-round BitBIRCH clustering, optionally parallelized over `*.npy` files"""  # noqa:E501
+     from bblean._console import get_console
+     from bblean.multiround import run_multiround_bitbirch
+     from bblean.fingerprints import _get_fps_file_num
+
+     console = get_console(silent=not verbose)
+
+     # Set multiprocessing start method
+     if fork and not sys.platform == "linux":
+         console.print("'fork' is only available on Linux", style="red")
+         raise Abort()
+     if sys.platform == "linux":
+         mp_context = mp.get_context("fork" if fork else "forkserver")
+     else:
+         mp_context = mp.get_context()
+
+     # Collect inputs:
+     # If not passed, input dir is bb_inputs/
+     if in_dir is None:
+         in_dir = Path.cwd() / "bb_inputs"
+     _validate_input_dir(in_dir)
+     # All files in the input dir with *.npy suffix are considered input files
+     input_files = sorted(in_dir.glob("*.npy"))[:max_files]
+     ctx.params["input_files"] = [str(p.resolve()) for p in input_files]
+     ctx.params["num_fps"] = [_get_fps_file_num(p) for p in input_files]
+     if max_fps is not None:
+         ctx.params["num_fps_loaded"] = [min(n, max_fps) for n in ctx.params["num_fps"]]
+     else:
+         ctx.params["num_fps_loaded"] = ctx.params["num_fps"]
+
+     # Set up outputs:
+     # If not passed, output dir is constructed as bb_multiround_outputs/<unique-id>/
+     unique_id = format(random.getrandbits(32), "08x")
+     if out_dir is None:
+         out_dir = Path.cwd() / "bb_multiround_outputs" / unique_id
+     out_dir.mkdir(exist_ok=True, parents=True)
+     _validate_output_dir(out_dir, overwrite)
+     ctx.params["out_dir"] = str(out_dir.resolve())
+
+     console.print_banner()
+     console.print()
+     console.print_multiround_config(ctx.params, mp_context)
+
+     # Optionally start a separate process that tracks RAM usage
+     if monitor_rss:
+         launch_monitor_rss_daemon(out_dir / "monitor-rss.csv", monitor_rss_interval_s)
+
+     timer = run_multiround_bitbirch(
+         input_files=input_files,
+         n_features=n_features,
+         input_is_packed=input_is_packed,
+         out_dir=out_dir,
+         initial_merge_criterion=initial_merge_criterion,
+         midsection_merge_criterion=mid_merge_criterion,
+         num_initial_processes=num_initial_processes,
+         num_midsection_processes=num_midsection_processes,
+         branching_factor=branching_factor,
+         threshold=threshold,
+         midsection_threshold_change=mid_threshold_change,
+         tolerance=tolerance,
+         # Advanced
+         save_tree=save_tree,
+         save_centroids=save_centroids,
+         bin_size=bin_size,
+         max_tasks_per_process=max_tasks_per_process,
+         refinement_before_midsection=refinement_before_midsection,
+         num_midsection_rounds=num_midsection_rounds,
+         split_largest_after_each_midsection_round=split_largest_after_midsection,
+         # Debug
+         max_fps=max_fps,
+         verbose=verbose,
+         mp_context=mp_context,
+         cleanup=cleanup,
+     )
+     timer.dump(out_dir / "timings.json")
+     # TODO: Also dump peak-rss.json
+     collect_system_specs_and_dump_config(ctx.params)
+
+     # Symlink or copy fingerprint files
+     input_fps_dir = (out_dir / "input-fps").resolve()
+     input_fps_dir.mkdir()
+     if copy_inputs:
+         for file in input_files:
+             shutil.copy(file, input_fps_dir / file.name)
+     else:
+         for file in input_files:
+             (input_fps_dir / file.name).symlink_to(file.resolve())
+
+
+ @app.command("fps-info", rich_help_panel="Fingerprints")
+ def _fps_info(
+     fp_paths: Annotated[
+         list[Path] | None,
+         Argument(show_default=False, help="Paths to `*.npy` fingerprint files or dirs"),
+     ] = None,
+ ) -> None:
+     """Show info about a `*.npy` fingerprint file, or a dir with `*.npy` files"""
+     from bblean._console import get_console
+     from bblean.fingerprints import _print_fps_file_info
+
+     console = get_console()
+     if fp_paths is None:
+         fp_paths = [Path.cwd()]
+
+     for path in fp_paths:
+         if path.is_dir():
+             for file in path.glob("*.npy"):
+                 _print_fps_file_info(file, console)
+         elif path.suffix == ".npy":
+             _print_fps_file_info(path, console)
+
+
+ @app.command("fps-from-smiles", rich_help_panel="Fingerprints")
+ def _fps_from_smiles(
+     smiles_paths: Annotated[
+         list[Path] | None,
+         Argument(show_default=False, help="Paths to *.smi files with smiles"),
+     ] = None,
+     out_dir: Annotated[
+         Path | None,
+         Option("-o", "--out-dir", show_default=False),
+     ] = None,
+     out_name: Annotated[
+         str | None,
+         Option("--name", help="Base name of output file"),
+     ] = None,
+     kind: Annotated[
+         str,
+         Option("-k", "--kind"),
+     ] = DEFAULTS.fp_kind,
+     fp_size: Annotated[
+         int,
+         Option("--n-features", help="Num. features of the generated fingerprints"),
+     ] = DEFAULTS.n_features,
+     parts: Annotated[
+         int | None,
+         Option(
+             "-n", "--num-parts", help="Split the created file into this number of parts"
+         ),
+     ] = None,
+     max_fps_per_file: Annotated[
+         int | None,
+         Option(
+             "-m",
+             "--max-fps-per-file",
+             help="Max. number of fps per file. Mutually exclusive with --num-parts",
+             show_default=False,
+         ),
+     ] = None,
+     pack: Annotated[
+         bool,
+         Option(
+             "-p/-P",
+             "--pack/--no-pack",
+             help="Pack bits in last dimension of fingerprints",
+             rich_help_panel="Advanced",
+         ),
+     ] = True,
+     dtype: Annotated[
+         str,
+         Option(
+             "-d",
+             "--dtype",
+             help="NumPy dtype for the generated fingerprints",
+             rich_help_panel="Advanced",
+         ),
+     ] = "uint8",
+     verbose: Annotated[
+         bool,
+         Option("-v/-V", "--verbose/--no-verbose"),
+     ] = True,
+     num_ps: Annotated[
+         int | None,
+         Option(
+             "--ps",
+             "--processes",
+             help=(
+                 "Num. processes for multiprocess generation."
+                 " One process per file is used for multi-file generation"
+             ),
+         ),
+     ] = None,
+     sanitize: Annotated[
+         str,
+         Option(
+             "--sanitize",
+             help="RDKit sanitization operations to perform ('all' or 'minimal')",
+         ),
+     ] = "all",
+     skip_invalid: Annotated[
+         bool,
+         Option(
+             "--skip-invalid/--no-skip-invalid",
+             help=(
+                 "Skip invalid smiles."
+                 " If False, an error is raised on invalid smiles. If True they are"
+                 " silently skipped (this can be more memory intensive, especially for"
+                 " parallel processing)"
+             ),
+         ),
+     ] = False,
+ ) -> None:
+     r"""Generate a `*.npy` fingerprints file from one or more `*.smi` smiles files
+
+     By default this function runs in parallel and uses all available CPUs. In order to
+     use the memory efficient BitBIRCH u8 algorithm you should keep the defaults:
+     --dtype=uint8 and --pack
+     """
+     import numpy as np
+
+     from bblean._console import get_console
+     from bblean.utils import _num_avail_cpus
+     from bblean.fingerprints import _FingerprintFileCreator, _FingerprintArrayFiller
+     from bblean.smiles import (
+         calc_num_smiles,
+         _iter_ranges_and_smiles_batches,
+         _iter_idxs_and_smiles_batches,
+     )
+
+     # Force forkserver since rdkit may use threads, and fork is unsafe with threads
+     mp_context = mp.get_context("forkserver" if sys.platform == "linux" else None)
+
+     console = get_console(silent=not verbose)
+
+     if smiles_paths is None:
+         smiles_paths = list(Path.cwd().glob("*.smi"))
+         if not smiles_paths:
+             console.print("No *.smi files found", style="red")
+             raise Abort()
+
+     smiles_num = calc_num_smiles(smiles_paths)
+
+     def parse_num_per_batch(
+         smiles_num: int, parts: int | None, max_fps_per_file: int | None
+     ) -> tuple[int, int, int | None]:
+         digits: int | None
+         if parts is not None and max_fps_per_file is None:
+             num_per_batch = math.ceil(smiles_num / parts)
+             digits = len(str(parts))
+         elif parts is None and max_fps_per_file is not None:
+             num_per_batch = max_fps_per_file
+             parts = math.ceil(smiles_num / max_fps_per_file)
+             digits = len(str(parts))
+         elif parts is None and max_fps_per_file is None:
+             parts = 1
+             num_per_batch = math.ceil(smiles_num / parts)
+             digits = None
+         else:
+             raise ValueError("parts and max_fps_per_file are mutually exclusive")
+         return parts, num_per_batch, digits
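+     # Worked example: smiles_num=10_500 with --num-parts 4 gives
+     # num_per_batch=ceil(10_500 / 4)=2625 and digits=len("4")=1; with
+     # --max-fps-per-file 4000 instead, parts=ceil(10_500 / 4000)=3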
+
+     try:
+         parts, num_per_batch, digits = parse_num_per_batch(
+             smiles_num, parts, max_fps_per_file
+         )
+     except ValueError:
+         console.print(
+             "'--max-fps-per-file' and '--num-parts' are mutually exclusive",
+             style="red",
+         )
+         raise Abort() from None
+     if out_dir is None:
+         out_dir = Path.cwd()
+     out_dir.mkdir(exist_ok=True)
+     out_dir = out_dir.resolve()
+
+     # Pass 2: build the molecules (pass 1 above counted the smiles)
+     unique_id = format(random.getrandbits(32), "08x")
+     if out_name is None:
+         # Save the fingerprints as a NumPy array
+         out_name = f"{'packed-' if pack else ''}fps-{dtype}-{kind}-{unique_id}"
+     else:
+         # Strip suffix
+         if out_name.endswith(".npy"):
+             out_name = out_name[:-4]
+
+     if num_ps is None:
+         # Get the number of cores *available for use for this process*
+         # bound by the number of parts to avoid spawning useless processes
+         if parts == 1:
+             num_ps = _num_avail_cpus()
+         else:
+             num_ps = min(_num_avail_cpus(), parts)
+     create_fp_file = _FingerprintFileCreator(
+         dtype,
+         out_dir,
+         out_name,
+         digits,
+         pack,
+         kind,
+         fp_size,
+         sanitize=sanitize,
+         skip_invalid=skip_invalid,
+         verbose=verbose,
+     )
+     timer = Timer()
+     timer.init_timing("total")
+     if parts > 1 and num_ps is not None and num_ps > 1:
+         # Multiprocessing version, 1 process per file
+         with console.status(
+             f"[italic]Generating fingerprints ({parts} files, parallel, {num_ps} procs.) ...[/italic]",  # noqa:E501
+             spinner="dots",
+         ):
+             with mp_context.Pool(processes=num_ps) as pool:
+                 pool.map(
+                     create_fp_file,
+                     _iter_idxs_and_smiles_batches(smiles_paths, num_per_batch),
+                 )
+         timer.end_timing("total", console, indent=False)
+         stem = out_name.split(".")[0]
+         console.print(f"Finished. Outputs written to {str(out_dir / stem)}.<idx>.npy")
+         return
+
+     # Parallel or serial, single file version
+     msg = "parallel" if num_ps > 1 else "serial"
+     with console.status(
+         f"[italic]Generating fingerprints ({parts} files, {msg}, {num_ps} procs.) ...[/italic]",  # noqa:E501
+         spinner="dots",
+     ):
+         if pack:
+             out_dim = (fp_size + 7) // 8
+         else:
+             out_dim = fp_size
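+         # Packed rows hold 8 bits per byte: e.g. fp_size=2048 -> 256 uint8
+         # columns, so the shared block below is smiles_num * out_dim * itemsize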
+         shmem_size = smiles_num * out_dim * np.dtype(dtype).itemsize
+         fps_shmem = shmem.SharedMemory(create=True, size=shmem_size)
+         invalid_mask_shmem = shmem.SharedMemory(create=True, size=smiles_num)
+         fps_array_filler = _FingerprintArrayFiller(
+             shmem_name=fps_shmem.name,
+             invalid_mask_shmem_name=invalid_mask_shmem.name,
+             kind=kind,
+             fp_size=fp_size,
+             num_smiles=smiles_num,
+             dtype=dtype,
+             pack=pack,
+             sanitize=sanitize,
+             skip_invalid=skip_invalid,
+         )
+         if num_ps > 1 and parts == 1:
+             # Split into batches anyways if we have a single batch but multiple
+             # processes
+             _, num_per_batch, _ = parse_num_per_batch(
+                 smiles_num, num_ps, max_fps_per_file
+             )
+         with mp_context.Pool(processes=num_ps) as pool:
+             pool.starmap(
+                 fps_array_filler,
+                 _iter_ranges_and_smiles_batches(smiles_paths, num_per_batch),
+             )
+         fps = np.ndarray((smiles_num, out_dim), dtype=dtype, buffer=fps_shmem.buf)
+         mask = np.ndarray((smiles_num,), dtype=np.bool_, buffer=invalid_mask_shmem.buf)
+         if skip_invalid:
+             prev_num = len(fps)
+             fps = np.delete(fps, mask, axis=0)
+             new_num = len(fps)
+             console.print(f"Generated {new_num} fingerprints")
+             console.print(f"Skipped {prev_num - new_num} invalid smiles")
+             invalid_name = f"invalid-{unique_id}.npy"
+             console.print(
+                 f"Invalid smiles idxs written to {str(out_dir / invalid_name)}"
+             )
+             np.save(out_dir / f"invalid-{unique_id}.npy", mask.nonzero()[0].reshape(-1))
+
+         np.save(
+             out_dir / out_name,
+             fps,
+         )
+         del mask
+         del fps
+         # Cleanup
+         fps_shmem.unlink()
+         invalid_mask_shmem.unlink()
+     timer.end_timing("total", console, indent=False)
+     console.print(f"Finished. Outputs written to {str(out_dir / out_name)}.npy")
+
+
+ @app.command("fps-split", rich_help_panel="Fingerprints")
+ def _split_fps(
+     input_: Annotated[
+         Path,
+         Argument(help="`*.npy` file with fingerprints"),
+     ],
+     out_dir: Annotated[
+         Path | None,
+         Option("-o", "--out-dir", show_default=False),
+     ] = None,
+     parts: Annotated[
+         int | None,
+         Option(
+             "-n",
+             "--num-parts",
+             help="Num. of parts to split file into. Mutually exclusive with --max-fps",
+             show_default=False,
+         ),
+     ] = None,
+     max_fps_per_file: Annotated[
+         int | None,
+         Option(
+             "-m",
+             "--max-fps",
+             help="Max. number of fps per file. Mutually exclusive with --num-parts",
+             show_default=False,
+         ),
+     ] = None,
+ ) -> None:
+     r"""Split a `*.npy` fingerprint file into multiple `*.npy` files
+
+     Usage to split into multiple files with a max number of fps each (e.g. 10k) is `bb
+     fps-split --max-fps 10_000 ./fps.npy --out-dir ./split`. To split into a pre-defined
+     number of parts (e.g. 10) `bb fps-split --num-parts 10 ./fps.npy --out-dir ./split`.
+     """
+     from bblean._console import get_console
+     import numpy as np
+
+     console = get_console()
+     if parts is not None and parts < 2:
+         console.print("--num-parts must be >= 2", style="red")
+         raise Abort()
+     fps = np.load(input_, mmap_mode="r")
+     if parts is not None and max_fps_per_file is None:
+         num_per_batch = math.ceil(fps.shape[0] / parts)
+         digits = len(str(parts))
+     elif parts is None and max_fps_per_file is not None:
+         num_per_batch = max_fps_per_file
+         digits = len(str(math.ceil(fps.shape[0] / max_fps_per_file)))
+     else:
+         console.print(
+             "One and only one of '--max-fps' and '--num-parts' required", style="red"
+         )
+         raise Abort()
+
+     stem = input_.name.split(".")[0]
+     with console.status("[italic]Splitting fingerprints...[/italic]", spinner="dots"):
+         i = -1
+         for i, batch in enumerate(batched(fps, num_per_batch)):
+             suffixes = input_.suffixes
+             name = f"{stem}{''.join(suffixes[:-1])}.{str(i).zfill(digits)}.npy"
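+             # e.g. fps.npy with --num-parts 3 -> fps.0.npy, fps.1.npy, fps.2.npy;
+             # `digits` zero-pads the index so 10+ parts sort lexicographically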
+
+             # Generate out dir when first fp file is being saved
+             if out_dir is None:
+                 out_dir = Path.cwd() / stem
+             out_dir.mkdir(exist_ok=True)
+             out_dir = out_dir.resolve()
+
+             np.save(out_dir / name, batch)
+
+     if i == -1:
+         console.print("Warning: No fingerprints written", style="yellow")
+         return
+     console.print(
+         f"Finished. Outputs written to {str(tp.cast(Path, out_dir) / stem)}.<idx>.npy"
+     )
+
+
+ @app.command("fps-shuffle", rich_help_panel="Fingerprints")
+ def _shuffle_fps(
+     in_file: Annotated[
+         Path,
+         Argument(help="`*.npy` file with packed fingerprints"),
+     ],
+     out_dir: Annotated[
+         Path | None,
+         Option("-o", "--out-dir", show_default=False),
+     ] = None,
+     seed: Annotated[
+         int | None,
+         Option("--seed", hidden=True, rich_help_panel="Debug"),
+     ] = None,
+ ) -> None:
+     """Shuffle a fingerprints file
+
+     This function is not optimized and as such may have high RAM usage. It is
+     meant for testing purposes only"""
+     import numpy as np
+
+     fps = np.load(in_file)
+     stem = in_file.stem
+     rng = np.random.default_rng(seed)
+     rng.shuffle(fps, axis=0)
+     if out_dir is None:
+         out_dir = Path.cwd()
+     out_dir.mkdir(exist_ok=True)
+     out_dir = out_dir.resolve()
+     np.save(out_dir / f"shuffled-{stem}.npy", fps)
+
+
+ @app.command("fps-merge", rich_help_panel="Fingerprints")
+ def _merge_fps(
+     in_dir: Annotated[
+         Path,
+         Argument(help="Directory with input `*.npy` files with packed fingerprints"),
+     ],
+     out_dir: Annotated[
+         Path | None,
+         Option("-o", "--out-dir", show_default=False),
+     ] = None,
+ ) -> None:
+     r"""Merge a dir with multiple `*.npy` fingerprint files into a single `*.npy` file"""
+     from bblean._console import get_console
+     import numpy as np
+
+     console = get_console()
+
+     if out_dir is None:
+         out_dir = Path.cwd()
+     out_dir.mkdir(exist_ok=True)
+     out_dir = out_dir.resolve()
+     arrays = []
+     with console.status("[italic]Merging fingerprints...[/italic]", spinner="dots"):
+         stem = None
+         for f in sorted(in_dir.glob("*.npy")):
+             if stem is None:
+                 stem = f.name.split(".")[0]
+             elif stem != f.name.split(".")[0]:
+                 raise ValueError(
+                     "Name convention must be <name>.<idx>.npy"
+                     " with all files having the same <name>"
+                 )
+             arrays.append(np.load(f))
+     if stem is None:
+         console.print("No *.npy files found")
+         return
+     np.save(out_dir / stem, np.concatenate(arrays))
+     console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")