bblean 0.7.3__cp313-cp313-win_amd64.whl → 0.7.4b0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bblean/_cpp_similarity.cp313-win_amd64.pyd +0 -0
- bblean/_version.py +3 -3
- bblean/cli.py +423 -232
- bblean/fingerprints.py +16 -1
- bblean/multiround.py +0 -12
- bblean/plotting.py +3 -1
- {bblean-0.7.3.dist-info → bblean-0.7.4b0.dist-info}/METADATA +1 -1
- {bblean-0.7.3.dist-info → bblean-0.7.4b0.dist-info}/RECORD +12 -12
- {bblean-0.7.3.dist-info → bblean-0.7.4b0.dist-info}/WHEEL +0 -0
- {bblean-0.7.3.dist-info → bblean-0.7.4b0.dist-info}/entry_points.txt +0 -0
- {bblean-0.7.3.dist-info → bblean-0.7.4b0.dist-info}/licenses/LICENSE +0 -0
- {bblean-0.7.3.dist-info → bblean-0.7.4b0.dist-info}/top_level.txt +0 -0
|
Binary file
|
bblean/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.7.3'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 7, 3)
|
|
31
|
+
__version__ = version = '0.7.4b0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 7, 4, 'b0')
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id =
|
|
34
|
+
__commit_id__ = commit_id = 'g200eab9e3'
|
bblean/cli.py
CHANGED
|
@@ -54,7 +54,8 @@ def _validate_input_dir(in_dir: Path | str) -> None:
|
|
|
54
54
|
in_dir = Path(in_dir)
|
|
55
55
|
if not in_dir.is_dir():
|
|
56
56
|
raise RuntimeError(f"Input dir {in_dir} should be a dir")
|
|
57
|
-
|
|
57
|
+
fp_files = (f for f in in_dir.glob("*.npy") if not f.stem.endswith(".indices"))
|
|
58
|
+
if not any(fp_files):
|
|
58
59
|
raise RuntimeError(f"Input dir {in_dir} should have *.npy fingerprint files")
|
|
59
60
|
|
|
60
61
|
|
|
@@ -75,6 +76,203 @@ def _main(
|
|
|
75
76
|
pass
|
|
76
77
|
|
|
77
78
|
|
|
79
|
+
@app.command("summary", rich_help_panel="Analysis")
|
|
80
|
+
def _table_summary(
|
|
81
|
+
clusters_path: Annotated[
|
|
82
|
+
Path,
|
|
83
|
+
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
84
|
+
],
|
|
85
|
+
fps_path: Annotated[
|
|
86
|
+
Path | None,
|
|
87
|
+
Option(
|
|
88
|
+
"-f",
|
|
89
|
+
"--fps-path",
|
|
90
|
+
help="Path to fingerprint file, or directory with fingerprint files",
|
|
91
|
+
show_default=False,
|
|
92
|
+
),
|
|
93
|
+
] = None,
|
|
94
|
+
min_size: Annotated[
|
|
95
|
+
int,
|
|
96
|
+
Option("--min-size"),
|
|
97
|
+
] = 0,
|
|
98
|
+
smiles_path: Annotated[
|
|
99
|
+
Path | None,
|
|
100
|
+
Option(
|
|
101
|
+
"-s",
|
|
102
|
+
"--smiles-path",
|
|
103
|
+
show_default=False,
|
|
104
|
+
help="Optional smiles path, if passed a scaffold analysis is performed",
|
|
105
|
+
),
|
|
106
|
+
] = None,
|
|
107
|
+
top: Annotated[
|
|
108
|
+
int,
|
|
109
|
+
Option("--top"),
|
|
110
|
+
] = 20,
|
|
111
|
+
input_is_packed: Annotated[
|
|
112
|
+
bool,
|
|
113
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
114
|
+
] = True,
|
|
115
|
+
scaffold_fp_kind: Annotated[
|
|
116
|
+
str,
|
|
117
|
+
Option("--scaffold-fp-kind"),
|
|
118
|
+
] = DEFAULTS.fp_kind,
|
|
119
|
+
n_features: Annotated[
|
|
120
|
+
int | None,
|
|
121
|
+
Option(
|
|
122
|
+
"--n-features",
|
|
123
|
+
help="Number of features in the fingerprints."
|
|
124
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
125
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
126
|
+
rich_help_panel="Advanced",
|
|
127
|
+
),
|
|
128
|
+
] = None,
|
|
129
|
+
metrics: Annotated[
|
|
130
|
+
bool,
|
|
131
|
+
Option(
|
|
132
|
+
"--metrics/--no-metrics",
|
|
133
|
+
help="Calculate clustering indices (Dunn, DBI, CHI)",
|
|
134
|
+
),
|
|
135
|
+
] = False,
|
|
136
|
+
chosen_metrics: Annotated[
|
|
137
|
+
str,
|
|
138
|
+
Option(
|
|
139
|
+
"-m",
|
|
140
|
+
"--metrics-choice",
|
|
141
|
+
help=(
|
|
142
|
+
"Chosen metrics. "
|
|
143
|
+
" Comma-separated list including dunn (slow), dbi or chi"
|
|
144
|
+
),
|
|
145
|
+
),
|
|
146
|
+
] = "dunn,dbi,chi",
|
|
147
|
+
metrics_top: Annotated[
|
|
148
|
+
int | None,
|
|
149
|
+
Option("--metrics-top", rich_help_panel="Advanced"),
|
|
150
|
+
] = 100,
|
|
151
|
+
metrics_min_size: Annotated[
|
|
152
|
+
int,
|
|
153
|
+
Option("--metrics-min-size", hidden=True),
|
|
154
|
+
] = 1,
|
|
155
|
+
verbose: Annotated[
|
|
156
|
+
bool,
|
|
157
|
+
Option("--verbose/--no-verbose", hidden=True),
|
|
158
|
+
] = True,
|
|
159
|
+
) -> None:
|
|
160
|
+
r"""Summary table of clustering results, together with cluster metrics"""
|
|
161
|
+
from bblean._console import get_console
|
|
162
|
+
from bblean.smiles import load_smiles
|
|
163
|
+
from bblean.analysis import cluster_analysis
|
|
164
|
+
from bblean.utils import _has_files_or_valid_symlinks
|
|
165
|
+
from bblean.metrics import jt_dbi, jt_isim_chi, jt_isim_dunn, _calc_centrals
|
|
166
|
+
from rich.table import Table
|
|
167
|
+
|
|
168
|
+
console = get_console(silent=not verbose)
|
|
169
|
+
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
170
|
+
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
171
|
+
if clusters_path.is_dir():
|
|
172
|
+
clusters_path = clusters_path / "clusters.pkl"
|
|
173
|
+
with open(clusters_path, mode="rb") as f:
|
|
174
|
+
clusters = pickle.load(f)
|
|
175
|
+
if fps_path is None:
|
|
176
|
+
input_fps_path = clusters_path.parent / "input-fps"
|
|
177
|
+
if input_fps_path.is_dir() and _has_files_or_valid_symlinks(input_fps_path):
|
|
178
|
+
fps_path = input_fps_path
|
|
179
|
+
else:
|
|
180
|
+
msg = (
|
|
181
|
+
"Could not find input fingerprints. Please use --fps-path."
|
|
182
|
+
" Summary plot without fingerprints doesn't include isim values"
|
|
183
|
+
)
|
|
184
|
+
warnings.warn(msg)
|
|
185
|
+
if fps_path is None:
|
|
186
|
+
fps_paths = None
|
|
187
|
+
elif fps_path.is_dir():
|
|
188
|
+
fps_paths = sorted(
|
|
189
|
+
f for f in fps_path.glob("*.npy") if not f.stem.endswith(".indices")
|
|
190
|
+
)
|
|
191
|
+
else:
|
|
192
|
+
fps_paths = [fps_path]
|
|
193
|
+
ca = cluster_analysis(
|
|
194
|
+
clusters,
|
|
195
|
+
fps_paths,
|
|
196
|
+
smiles=load_smiles(smiles_path) if smiles_path is not None else (),
|
|
197
|
+
top=top,
|
|
198
|
+
n_features=n_features,
|
|
199
|
+
input_is_packed=input_is_packed,
|
|
200
|
+
min_size=min_size,
|
|
201
|
+
)
|
|
202
|
+
table = Table(title=(f"Top {top} clusters" if top is not None else "Clusters"))
|
|
203
|
+
table.add_column("Size", justify="center")
|
|
204
|
+
table.add_column("% fps", justify="center")
|
|
205
|
+
table.add_column("iSIM", justify="center")
|
|
206
|
+
if smiles_path is not None:
|
|
207
|
+
table.add_column("Size/Scaff.", justify="center")
|
|
208
|
+
table.add_column("Num. Scaff.", justify="center")
|
|
209
|
+
table.add_column("Scaff. iSIM", justify="center")
|
|
210
|
+
sizes = ca.sizes
|
|
211
|
+
isims = ca.isims
|
|
212
|
+
total_fps = ca.total_fps
|
|
213
|
+
for i in range(ca.clusters_num):
|
|
214
|
+
size = sizes[i]
|
|
215
|
+
percent = size / total_fps * 100
|
|
216
|
+
table.add_row(f"{size:,}", f"{percent:.2f}", f"{isims[i]:.3f}")
|
|
217
|
+
console.print(table)
|
|
218
|
+
console.print()
|
|
219
|
+
console.print(f"Total num. fps: {total_fps:,}")
|
|
220
|
+
console.print(f"Total num. clusters: {ca.all_clusters_num:,}")
|
|
221
|
+
singles = ca.all_singletons_num
|
|
222
|
+
singles_percent = singles * 100 / ca.all_clusters_num
|
|
223
|
+
console.print(f"Total num. singletons: {singles:,} ({singles_percent:.2f} %)")
|
|
224
|
+
gt10 = ca.all_clusters_num_with_size_above(10)
|
|
225
|
+
gt10_percent = gt10 * 100 / ca.all_clusters_num
|
|
226
|
+
console.print(
|
|
227
|
+
f"Total num. clusters, size > 10: {gt10:,} ({gt10_percent:.2f} %)"
|
|
228
|
+
)
|
|
229
|
+
gt100 = ca.all_clusters_num_with_size_above(100)
|
|
230
|
+
gt100_percent = gt100 * 100 / ca.all_clusters_num
|
|
231
|
+
console.print(
|
|
232
|
+
f"Total num. clusters, size > 100: {gt100:,} ({gt100_percent:.2f} %)"
|
|
233
|
+
)
|
|
234
|
+
console.print(
|
|
235
|
+
f"num-clusters/num-fps ratio: {ca.all_clusters_num / total_fps:.2f}"
|
|
236
|
+
)
|
|
237
|
+
console.print(f"Mean size: {ca.all_clusters_mean_size:.2f}")
|
|
238
|
+
console.print(f"Max. size: {ca.all_clusters_max_size:,}")
|
|
239
|
+
console.print(f"Q3 (75%) size: {ca.all_clusters_q3:,}")
|
|
240
|
+
console.print(f"Median size: {ca.all_clusters_median_size:,}")
|
|
241
|
+
console.print(f"Q1 (25%) size: {ca.all_clusters_q1:,}")
|
|
242
|
+
console.print(f"Min. size: {ca.all_clusters_min_size:,}")
|
|
243
|
+
if metrics:
|
|
244
|
+
chosen = set(s.lower() for s in chosen_metrics.split(","))
|
|
245
|
+
assert all(s in ["dunn", "chi", "dbi"] for s in chosen)
|
|
246
|
+
# Redo cluster analysis with more *top* clusters
|
|
247
|
+
console.print()
|
|
248
|
+
if metrics_top is None:
|
|
249
|
+
console.print("Clustering metrics:")
|
|
250
|
+
else:
|
|
251
|
+
console.print(f"Clustering metrics considering top {metrics_top} clusters:")
|
|
252
|
+
with console.status("[italic]Reanalyzing clusters...[/italic]", spinner="dots"):
|
|
253
|
+
ca = cluster_analysis(
|
|
254
|
+
clusters,
|
|
255
|
+
fps_paths,
|
|
256
|
+
smiles=(),
|
|
257
|
+
top=metrics_top,
|
|
258
|
+
n_features=n_features,
|
|
259
|
+
input_is_packed=input_is_packed,
|
|
260
|
+
min_size=metrics_min_size,
|
|
261
|
+
)
|
|
262
|
+
clusters = ca.get_top_cluster_fps()
|
|
263
|
+
with console.status("[italic]Calculating centrals...[/italic]", spinner="dots"):
|
|
264
|
+
centrals = _calc_centrals(clusters, kind="centroid")
|
|
265
|
+
if "chi" in chosen:
|
|
266
|
+
chi = jt_isim_chi(clusters, centrals=centrals, verbose=verbose)
|
|
267
|
+
console.print(f" - CHI index: {chi:.4f} (Higher is better)")
|
|
268
|
+
if "dbi" in chosen:
|
|
269
|
+
dbi = jt_dbi(clusters, centrals=centrals, verbose=verbose)
|
|
270
|
+
console.print(f" - DBI index: {dbi:.4e} (Lower is better)")
|
|
271
|
+
if "dunn" in chosen:
|
|
272
|
+
dunn = jt_isim_dunn(clusters, verbose=verbose)
|
|
273
|
+
console.print(f" - Dunn index: {dunn:.4f} (Higher is better)")
|
|
274
|
+
|
|
275
|
+
|
|
78
276
|
@app.command("plot-pops", rich_help_panel="Analysis")
|
|
79
277
|
def _plot_pops(
|
|
80
278
|
clusters_path: Annotated[
|
|
@@ -543,201 +741,6 @@ def _plot_tsne(
|
|
|
543
741
|
)
|
|
544
742
|
|
|
545
743
|
|
|
546
|
-
@app.command("summary", rich_help_panel="Analysis")
|
|
547
|
-
def _table_summary(
|
|
548
|
-
clusters_path: Annotated[
|
|
549
|
-
Path,
|
|
550
|
-
Argument(help="Path to the clusters file, or a dir with a clusters.pkl file"),
|
|
551
|
-
],
|
|
552
|
-
fps_path: Annotated[
|
|
553
|
-
Path | None,
|
|
554
|
-
Option(
|
|
555
|
-
"-f",
|
|
556
|
-
"--fps-path",
|
|
557
|
-
help="Path to fingerprint file, or directory with fingerprint files",
|
|
558
|
-
show_default=False,
|
|
559
|
-
),
|
|
560
|
-
] = None,
|
|
561
|
-
min_size: Annotated[
|
|
562
|
-
int,
|
|
563
|
-
Option("--min-size"),
|
|
564
|
-
] = 0,
|
|
565
|
-
smiles_path: Annotated[
|
|
566
|
-
Path | None,
|
|
567
|
-
Option(
|
|
568
|
-
"-s",
|
|
569
|
-
"--smiles-path",
|
|
570
|
-
show_default=False,
|
|
571
|
-
help="Optional smiles path, if passed a scaffold analysis is performed",
|
|
572
|
-
),
|
|
573
|
-
] = None,
|
|
574
|
-
top: Annotated[
|
|
575
|
-
int,
|
|
576
|
-
Option("--top"),
|
|
577
|
-
] = 20,
|
|
578
|
-
input_is_packed: Annotated[
|
|
579
|
-
bool,
|
|
580
|
-
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
581
|
-
] = True,
|
|
582
|
-
scaffold_fp_kind: Annotated[
|
|
583
|
-
str,
|
|
584
|
-
Option("--scaffold-fp-kind"),
|
|
585
|
-
] = DEFAULTS.fp_kind,
|
|
586
|
-
n_features: Annotated[
|
|
587
|
-
int | None,
|
|
588
|
-
Option(
|
|
589
|
-
"--n-features",
|
|
590
|
-
help="Number of features in the fingerprints."
|
|
591
|
-
" Only for packed inputs *if it is not a multiple of 8*."
|
|
592
|
-
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
593
|
-
rich_help_panel="Advanced",
|
|
594
|
-
),
|
|
595
|
-
] = None,
|
|
596
|
-
metrics: Annotated[
|
|
597
|
-
bool,
|
|
598
|
-
Option(
|
|
599
|
-
"--metrics/--no-metrics",
|
|
600
|
-
help="Calculate clustering indices (Dunn, DBI, CHI)",
|
|
601
|
-
),
|
|
602
|
-
] = False,
|
|
603
|
-
chosen_metrics: Annotated[
|
|
604
|
-
str,
|
|
605
|
-
Option(
|
|
606
|
-
"-m",
|
|
607
|
-
"--metrics-choice",
|
|
608
|
-
help=(
|
|
609
|
-
"Chosen metrics. "
|
|
610
|
-
" Comma-separated list including dunn (slow), dbi or chi"
|
|
611
|
-
),
|
|
612
|
-
),
|
|
613
|
-
] = "dunn,dbi,chi",
|
|
614
|
-
metrics_top: Annotated[
|
|
615
|
-
int | None,
|
|
616
|
-
Option("--metrics-top", rich_help_panel="Advanced"),
|
|
617
|
-
] = 100,
|
|
618
|
-
metrics_min_size: Annotated[
|
|
619
|
-
int,
|
|
620
|
-
Option("--metrics-min-size", hidden=True),
|
|
621
|
-
] = 1,
|
|
622
|
-
verbose: Annotated[
|
|
623
|
-
bool,
|
|
624
|
-
Option("--verbose/--no-verbose", hidden=True),
|
|
625
|
-
] = True,
|
|
626
|
-
) -> None:
|
|
627
|
-
r"""Summary table of clustering results, together with cluster metrics"""
|
|
628
|
-
from bblean._console import get_console
|
|
629
|
-
from bblean.smiles import load_smiles
|
|
630
|
-
from bblean.analysis import cluster_analysis
|
|
631
|
-
from bblean.utils import _has_files_or_valid_symlinks
|
|
632
|
-
from bblean.metrics import jt_dbi, jt_isim_chi, jt_isim_dunn, _calc_centrals
|
|
633
|
-
from rich.table import Table
|
|
634
|
-
|
|
635
|
-
console = get_console(silent=not verbose)
|
|
636
|
-
# Imports may take a bit of time since sklearn is slow, so start the spinner here
|
|
637
|
-
with console.status("[italic]Analyzing clusters...[/italic]", spinner="dots"):
|
|
638
|
-
if clusters_path.is_dir():
|
|
639
|
-
clusters_path = clusters_path / "clusters.pkl"
|
|
640
|
-
with open(clusters_path, mode="rb") as f:
|
|
641
|
-
clusters = pickle.load(f)
|
|
642
|
-
if fps_path is None:
|
|
643
|
-
input_fps_path = clusters_path.parent / "input-fps"
|
|
644
|
-
if input_fps_path.is_dir() and _has_files_or_valid_symlinks(input_fps_path):
|
|
645
|
-
fps_path = input_fps_path
|
|
646
|
-
else:
|
|
647
|
-
msg = (
|
|
648
|
-
"Could not find input fingerprints. Please use --fps-path."
|
|
649
|
-
" Summary plot without fingerprints doesn't include isim values"
|
|
650
|
-
)
|
|
651
|
-
warnings.warn(msg)
|
|
652
|
-
if fps_path is None:
|
|
653
|
-
fps_paths = None
|
|
654
|
-
elif fps_path.is_dir():
|
|
655
|
-
fps_paths = sorted(fps_path.glob("*.npy"))
|
|
656
|
-
else:
|
|
657
|
-
fps_paths = [fps_path]
|
|
658
|
-
ca = cluster_analysis(
|
|
659
|
-
clusters,
|
|
660
|
-
fps_paths,
|
|
661
|
-
smiles=load_smiles(smiles_path) if smiles_path is not None else (),
|
|
662
|
-
top=top,
|
|
663
|
-
n_features=n_features,
|
|
664
|
-
input_is_packed=input_is_packed,
|
|
665
|
-
min_size=min_size,
|
|
666
|
-
)
|
|
667
|
-
table = Table(title=(f"Top {top} clusters" if top is not None else "Clusters"))
|
|
668
|
-
table.add_column("Size", justify="center")
|
|
669
|
-
table.add_column("% fps", justify="center")
|
|
670
|
-
table.add_column("iSIM", justify="center")
|
|
671
|
-
if smiles_path is not None:
|
|
672
|
-
table.add_column("Size/Scaff.", justify="center")
|
|
673
|
-
table.add_column("Num. Scaff.", justify="center")
|
|
674
|
-
table.add_column("Scaff. iSIM", justify="center")
|
|
675
|
-
sizes = ca.sizes
|
|
676
|
-
isims = ca.isims
|
|
677
|
-
total_fps = ca.total_fps
|
|
678
|
-
for i in range(ca.clusters_num):
|
|
679
|
-
size = sizes[i]
|
|
680
|
-
percent = size / total_fps * 100
|
|
681
|
-
table.add_row(f"{size:,}", f"{percent:.2f}", f"{isims[i]:.3f}")
|
|
682
|
-
console.print(table)
|
|
683
|
-
console.print()
|
|
684
|
-
console.print(f"Total num. fps: {total_fps:,}")
|
|
685
|
-
console.print(f"Total num. clusters: {ca.all_clusters_num:,}")
|
|
686
|
-
singles = ca.all_singletons_num
|
|
687
|
-
singles_percent = singles * 100 / ca.all_clusters_num
|
|
688
|
-
console.print(f"Total num. singletons: {singles:,} ({singles_percent:.2f} %)")
|
|
689
|
-
gt10 = ca.all_clusters_num_with_size_above(10)
|
|
690
|
-
gt10_percent = gt10 * 100 / ca.all_clusters_num
|
|
691
|
-
console.print(
|
|
692
|
-
f"Total num. clusters, size > 10: {gt10:,} ({gt10_percent:.2f} %)"
|
|
693
|
-
)
|
|
694
|
-
gt100 = ca.all_clusters_num_with_size_above(100)
|
|
695
|
-
gt100_percent = gt100 * 100 / ca.all_clusters_num
|
|
696
|
-
console.print(
|
|
697
|
-
f"Total num. clusters, size > 100: {gt100:,} ({gt100_percent:.2f} %)"
|
|
698
|
-
)
|
|
699
|
-
console.print(
|
|
700
|
-
f"num-clusters/num-fps ratio: {ca.all_clusters_num / total_fps:.2f}"
|
|
701
|
-
)
|
|
702
|
-
console.print(f"Mean size: {ca.all_clusters_mean_size:.2f}")
|
|
703
|
-
console.print(f"Max. size: {ca.all_clusters_max_size:,}")
|
|
704
|
-
console.print(f"Q3 (75%) size: {ca.all_clusters_q3:,}")
|
|
705
|
-
console.print(f"Median size: {ca.all_clusters_median_size:,}")
|
|
706
|
-
console.print(f"Q1 (25%) size: {ca.all_clusters_q1:,}")
|
|
707
|
-
console.print(f"Min. size: {ca.all_clusters_min_size:,}")
|
|
708
|
-
if metrics:
|
|
709
|
-
chosen = set(s.lower() for s in chosen_metrics.split(","))
|
|
710
|
-
assert all(s in ["dunn", "chi", "dbi"] for s in chosen)
|
|
711
|
-
# Redo cluster analysis with more *top* clusters
|
|
712
|
-
console.print()
|
|
713
|
-
if metrics_top is None:
|
|
714
|
-
console.print("Clustering metrics:")
|
|
715
|
-
else:
|
|
716
|
-
console.print(f"Clustering metrics considering top {metrics_top} clusters:")
|
|
717
|
-
with console.status("[italic]Reanalyzing clusters...[/italic]", spinner="dots"):
|
|
718
|
-
ca = cluster_analysis(
|
|
719
|
-
clusters,
|
|
720
|
-
fps_paths,
|
|
721
|
-
smiles=(),
|
|
722
|
-
top=metrics_top,
|
|
723
|
-
n_features=n_features,
|
|
724
|
-
input_is_packed=input_is_packed,
|
|
725
|
-
min_size=metrics_min_size,
|
|
726
|
-
)
|
|
727
|
-
clusters = ca.get_top_cluster_fps()
|
|
728
|
-
with console.status("[italic]Calculating centrals...[/italic]", spinner="dots"):
|
|
729
|
-
centrals = _calc_centrals(clusters, kind="centroid")
|
|
730
|
-
if "chi" in chosen:
|
|
731
|
-
chi = jt_isim_chi(clusters, centrals=centrals, verbose=verbose)
|
|
732
|
-
console.print(f" - CHI index: {chi:.4f} (Higher is better)")
|
|
733
|
-
if "dbi" in chosen:
|
|
734
|
-
dbi = jt_dbi(clusters, centrals=centrals, verbose=verbose)
|
|
735
|
-
console.print(f" - DBI index: {dbi:.4e} (Lower is better)")
|
|
736
|
-
if "dunn" in chosen:
|
|
737
|
-
dunn = jt_isim_dunn(clusters, verbose=verbose)
|
|
738
|
-
console.print(f" - Dunn index: {dunn:.4f} (Higher is better)")
|
|
739
|
-
|
|
740
|
-
|
|
741
744
|
@app.command("plot-summary", rich_help_panel="Analysis")
|
|
742
745
|
def _plot_summary(
|
|
743
746
|
clusters_path: Annotated[
|
|
@@ -853,7 +856,7 @@ def _run(
|
|
|
853
856
|
ctx: Context,
|
|
854
857
|
input_: Annotated[
|
|
855
858
|
Path | None,
|
|
856
|
-
Argument(help="`*.npy` file with
|
|
859
|
+
Argument(help="`*.npy` file with fingerprints, or dir with `*.npy` files"),
|
|
857
860
|
] = None,
|
|
858
861
|
out_dir: Annotated[
|
|
859
862
|
Path | None,
|
|
@@ -882,6 +885,7 @@ def _run(
|
|
|
882
885
|
Option(
|
|
883
886
|
"--refine-threshold-change",
|
|
884
887
|
help="Modify threshold for refinement criterion, can be negative",
|
|
888
|
+
hidden=True,
|
|
885
889
|
),
|
|
886
890
|
] = DEFAULTS.refine_threshold_change,
|
|
887
891
|
save_tree: Annotated[
|
|
@@ -912,6 +916,7 @@ def _run(
|
|
|
912
916
|
"Num. of largest clusters to refine."
|
|
913
917
|
" 1 for standard refinement, 0 is the default (no refinement)"
|
|
914
918
|
),
|
|
919
|
+
hidden=True,
|
|
915
920
|
),
|
|
916
921
|
] = 0,
|
|
917
922
|
refine_rounds: Annotated[
|
|
@@ -919,7 +924,6 @@ def _run(
|
|
|
919
924
|
Option(
|
|
920
925
|
"--refine-rounds",
|
|
921
926
|
help=("Num. of refinement rounds. "),
|
|
922
|
-
hidden=True,
|
|
923
927
|
),
|
|
924
928
|
] = None,
|
|
925
929
|
recluster_rounds: Annotated[
|
|
@@ -927,13 +931,12 @@ def _run(
|
|
|
927
931
|
Option(
|
|
928
932
|
"--recluster-rounds",
|
|
929
933
|
help=("Num. of reclustering rounds. "),
|
|
930
|
-
hidden=True,
|
|
931
934
|
),
|
|
932
935
|
] = 0,
|
|
933
936
|
recluster_shuffle: Annotated[
|
|
934
937
|
bool,
|
|
935
938
|
Option("--recluster-shuffle/--no-recluster-shuffle", hidden=True),
|
|
936
|
-
] =
|
|
939
|
+
] = False,
|
|
937
940
|
n_features: Annotated[
|
|
938
941
|
int | None,
|
|
939
942
|
Option(
|
|
@@ -1020,10 +1023,14 @@ def _run(
|
|
|
1020
1023
|
if input_ is None:
|
|
1021
1024
|
input_ = Path.cwd() / "bb_inputs"
|
|
1022
1025
|
input_.mkdir(exist_ok=True)
|
|
1023
|
-
input_files = sorted(
|
|
1026
|
+
input_files = sorted(
|
|
1027
|
+
f for f in input_.glob("*.npy") if not f.stem.endswith(".indices")
|
|
1028
|
+
)
|
|
1024
1029
|
_validate_input_dir(input_)
|
|
1025
1030
|
elif input_.is_dir():
|
|
1026
|
-
input_files = sorted(
|
|
1031
|
+
input_files = sorted(
|
|
1032
|
+
f for f in input_.glob("*.npy") if not f.stem.endswith(".indices")
|
|
1033
|
+
)
|
|
1027
1034
|
_validate_input_dir(input_)
|
|
1028
1035
|
else:
|
|
1029
1036
|
input_files = [input_]
|
|
@@ -1158,7 +1165,7 @@ def _multiround(
|
|
|
1158
1165
|
"--mid-ps",
|
|
1159
1166
|
"--mid-processes",
|
|
1160
1167
|
help="Num. processes for middle section rounds."
|
|
1161
|
-
" These are
|
|
1168
|
+
" These are memory intensive,"
|
|
1162
1169
|
" you may want to use 50%-30% of --ps."
|
|
1163
1170
|
" Default is same as --ps",
|
|
1164
1171
|
),
|
|
@@ -1176,10 +1183,6 @@ def _multiround(
|
|
|
1176
1183
|
float,
|
|
1177
1184
|
Option("--threshold", "-t", help="Thresh for merge criterion (initial step)"),
|
|
1178
1185
|
] = DEFAULTS.threshold,
|
|
1179
|
-
mid_threshold_change: Annotated[
|
|
1180
|
-
float,
|
|
1181
|
-
Option("--mid-threshold-change", help="Modify threshold for refinement"),
|
|
1182
|
-
] = DEFAULTS.refine_threshold_change,
|
|
1183
1186
|
initial_merge_criterion: Annotated[
|
|
1184
1187
|
str,
|
|
1185
1188
|
Option(
|
|
@@ -1196,19 +1199,19 @@ def _multiround(
|
|
|
1196
1199
|
bool,
|
|
1197
1200
|
Option("--save-centroids/--no-save-centroids", rich_help_panel="Advanced"),
|
|
1198
1201
|
] = True,
|
|
1199
|
-
|
|
1200
|
-
|
|
1202
|
+
mid_threshold_change: Annotated[
|
|
1203
|
+
float,
|
|
1201
1204
|
Option(
|
|
1202
|
-
"--
|
|
1203
|
-
help="
|
|
1205
|
+
"--mid-threshold-change",
|
|
1206
|
+
help="Modify threshold for refinement",
|
|
1204
1207
|
rich_help_panel="Advanced",
|
|
1205
1208
|
),
|
|
1206
|
-
] =
|
|
1209
|
+
] = DEFAULTS.refine_threshold_change,
|
|
1207
1210
|
mid_merge_criterion: Annotated[
|
|
1208
1211
|
str,
|
|
1209
1212
|
Option(
|
|
1210
1213
|
"--set-mid-merge",
|
|
1211
|
-
help="Merge criterion for
|
|
1214
|
+
help="Merge criterion for mid rounds ('tolerance-diameter' recommended)",
|
|
1212
1215
|
),
|
|
1213
1216
|
] = DEFAULTS.refine_merge_criterion,
|
|
1214
1217
|
tolerance: Annotated[
|
|
@@ -1242,7 +1245,6 @@ def _multiround(
|
|
|
1242
1245
|
Option(
|
|
1243
1246
|
"--num-mid-rounds",
|
|
1244
1247
|
help="Number of midsection rounds to perform",
|
|
1245
|
-
rich_help_panel="Advanced",
|
|
1246
1248
|
),
|
|
1247
1249
|
] = 1,
|
|
1248
1250
|
split_largest_after_midsection: Annotated[
|
|
@@ -1358,7 +1360,9 @@ def _multiround(
|
|
|
1358
1360
|
in_dir = Path.cwd() / "bb_inputs"
|
|
1359
1361
|
_validate_input_dir(in_dir)
|
|
1360
1362
|
# All files in the input dir with *.npy suffix are considered input files
|
|
1361
|
-
input_files = sorted(
|
|
1363
|
+
input_files = sorted(
|
|
1364
|
+
f for f in in_dir.glob("*.npy") if not f.stem.endswith(".indices")
|
|
1365
|
+
)[:max_files]
|
|
1362
1366
|
ctx.params["input_files"] = [str(p.resolve()) for p in input_files]
|
|
1363
1367
|
ctx.params["num_fps"] = [_get_fps_file_num(p) for p in input_files]
|
|
1364
1368
|
if max_fps is not None:
|
|
@@ -1397,7 +1401,6 @@ def _multiround(
|
|
|
1397
1401
|
midsection_threshold_change=mid_threshold_change,
|
|
1398
1402
|
tolerance=tolerance,
|
|
1399
1403
|
# Advanced
|
|
1400
|
-
sort_fps=sort_fps,
|
|
1401
1404
|
save_tree=save_tree,
|
|
1402
1405
|
save_centroids=save_centroids,
|
|
1403
1406
|
bin_size=bin_size,
|
|
@@ -1444,9 +1447,11 @@ def _fps_info(
|
|
|
1444
1447
|
for path in fp_paths:
|
|
1445
1448
|
if path.is_dir():
|
|
1446
1449
|
for file in path.glob("*.npy"):
|
|
1450
|
+
if file.stem.endswith(".indices"):
|
|
1451
|
+
continue
|
|
1447
1452
|
_print_fps_file_info(file, console)
|
|
1448
1453
|
elif path.suffix == ".npy":
|
|
1449
|
-
_print_fps_file_info(
|
|
1454
|
+
_print_fps_file_info(path, console)
|
|
1450
1455
|
|
|
1451
1456
|
|
|
1452
1457
|
@app.command("fps-from-smiles", rich_help_panel="Fingerprints")
|
|
@@ -1803,9 +1808,9 @@ def _split_fps(
|
|
|
1803
1808
|
|
|
1804
1809
|
@app.command("fps-shuffle", rich_help_panel="Fingerprints")
|
|
1805
1810
|
def _shuffle_fps(
|
|
1806
|
-
|
|
1811
|
+
in_path: Annotated[
|
|
1807
1812
|
Path,
|
|
1808
|
-
Argument(help="`*.npy` file with
|
|
1813
|
+
Argument(help="`*.npy` file with fingerprints, or dir with `*.npy` files"),
|
|
1809
1814
|
],
|
|
1810
1815
|
out_dir: Annotated[
|
|
1811
1816
|
Path | None,
|
|
@@ -1815,22 +1820,50 @@ def _shuffle_fps(
|
|
|
1815
1820
|
int | None,
|
|
1816
1821
|
Option("--seed", hidden=True, rich_help_panel="Debug"),
|
|
1817
1822
|
] = None,
|
|
1823
|
+
save_shuffle_idxs: Annotated[
|
|
1824
|
+
bool,
|
|
1825
|
+
Option("--save-shuffle-idxs/--no-save-shuffle-idxs"),
|
|
1826
|
+
] = True,
|
|
1818
1827
|
) -> None:
|
|
1819
1828
|
"""Shuffle a fingerprints file
|
|
1820
1829
|
|
|
1821
1830
|
This function is not optimized and as such may have high RAM usage. It is
|
|
1822
1831
|
meant for testing purposes only"""
|
|
1823
1832
|
import numpy as np
|
|
1833
|
+
from bblean._console import get_console
|
|
1824
1834
|
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1835
|
+
console = get_console()
|
|
1836
|
+
|
|
1837
|
+
console = get_console()
|
|
1838
|
+
if in_path.is_dir():
|
|
1839
|
+
files = sorted(
|
|
1840
|
+
f for f in in_path.glob("*.npy") if not f.stem.endswith(".indices")
|
|
1841
|
+
)
|
|
1842
|
+
else:
|
|
1843
|
+
files = [in_path]
|
|
1829
1844
|
if out_dir is None:
|
|
1830
1845
|
out_dir = Path.cwd()
|
|
1831
1846
|
out_dir.mkdir(exist_ok=True)
|
|
1832
1847
|
out_dir = out_dir.resolve()
|
|
1833
|
-
|
|
1848
|
+
for f in files:
|
|
1849
|
+
with console.status(
|
|
1850
|
+
"[italic]Shuffling fingerprints...[/italic]", spinner="dots"
|
|
1851
|
+
):
|
|
1852
|
+
fps = np.load(f)
|
|
1853
|
+
stem = f.stem
|
|
1854
|
+
rng = np.random.default_rng(seed)
|
|
1855
|
+
shuffle_idxs = rng.permutation(fps.shape[0])
|
|
1856
|
+
fps = fps[shuffle_idxs]
|
|
1857
|
+
stem = f"shuffled-{stem}"
|
|
1858
|
+
np.save(out_dir / f"{stem}.npy", fps)
|
|
1859
|
+
if save_shuffle_idxs:
|
|
1860
|
+
np.save(out_dir / f"{stem}.indices.npy", shuffle_idxs)
|
|
1861
|
+
if save_shuffle_idxs:
|
|
1862
|
+
console.print(
|
|
1863
|
+
f"Finished. Outputs written to {str(out_dir / stem)}.npy and {str(out_dir / stem)}.indices.npy" # noqa
|
|
1864
|
+
)
|
|
1865
|
+
else:
|
|
1866
|
+
console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
|
|
1834
1867
|
|
|
1835
1868
|
|
|
1836
1869
|
@app.command("fps-merge", rich_help_panel="Fingerprints")
|
|
@@ -1858,6 +1891,8 @@ def _merge_fps(
|
|
|
1858
1891
|
with console.status("[italic]Merging fingerprints...[/italic]", spinner="dots"):
|
|
1859
1892
|
stem = None
|
|
1860
1893
|
for f in sorted(in_dir.glob("*.npy")):
|
|
1894
|
+
if f.stem.endswith(".indices"):
|
|
1895
|
+
continue
|
|
1861
1896
|
if stem is None:
|
|
1862
1897
|
stem = f.name.split(".")[0]
|
|
1863
1898
|
elif stem != f.name.split(".")[0]:
|
|
@@ -1875,9 +1910,9 @@ def _merge_fps(
|
|
|
1875
1910
|
|
|
1876
1911
|
@app.command("fps-sort", rich_help_panel="Fingerprints")
|
|
1877
1912
|
def _sort_fps(
|
|
1878
|
-
|
|
1913
|
+
in_path: Annotated[
|
|
1879
1914
|
Path,
|
|
1880
|
-
Argument(help="`*.npy` file with
|
|
1915
|
+
Argument(help="`*.npy` file with fingerprints, or dir with `*.npy` files"),
|
|
1881
1916
|
],
|
|
1882
1917
|
out_dir: Annotated[
|
|
1883
1918
|
Path | None,
|
|
@@ -1887,17 +1922,173 @@ def _sort_fps(
|
|
|
1887
1922
|
int | None,
|
|
1888
1923
|
Option("--seed", hidden=True, rich_help_panel="Debug"),
|
|
1889
1924
|
] = None,
|
|
1925
|
+
input_is_packed: Annotated[
|
|
1926
|
+
bool,
|
|
1927
|
+
Option("--packed-input/--unpacked-input", rich_help_panel="Advanced"),
|
|
1928
|
+
] = True,
|
|
1929
|
+
n_features: Annotated[
|
|
1930
|
+
int | None,
|
|
1931
|
+
Option(
|
|
1932
|
+
"--n-features",
|
|
1933
|
+
help="Number of features in the fingerprints."
|
|
1934
|
+
" Only for packed inputs *if it is not a multiple of 8*."
|
|
1935
|
+
" Not required for typical fingerprint sizes (e.g. 2048, 1024)",
|
|
1936
|
+
rich_help_panel="Advanced",
|
|
1937
|
+
),
|
|
1938
|
+
] = None,
|
|
1939
|
+
save_sort_idxs: Annotated[
|
|
1940
|
+
bool,
|
|
1941
|
+
Option("--save-sort-idxs/--no-save-sort-idxs"),
|
|
1942
|
+
] = True,
|
|
1890
1943
|
) -> None:
|
|
1944
|
+
r"""Sort a fingerprints file by popcount"""
|
|
1891
1945
|
import numpy as np
|
|
1892
1946
|
from bblean._py_similarity import _popcount
|
|
1947
|
+
from bblean._console import get_console
|
|
1948
|
+
from bblean.fingerprints import pack_fingerprints
|
|
1949
|
+
|
|
1950
|
+
# Note that n_features is not used here even if input_is_packed is True,
|
|
1951
|
+
# it is added for API homogeneity
|
|
1952
|
+
|
|
1953
|
+
console = get_console()
|
|
1954
|
+
if in_path.is_dir():
|
|
1955
|
+
files = sorted(
|
|
1956
|
+
f for f in in_path.glob("*.npy") if not f.stem.endswith(".indices")
|
|
1957
|
+
)
|
|
1958
|
+
else:
|
|
1959
|
+
files = [in_path]
|
|
1960
|
+
if out_dir is None:
|
|
1961
|
+
out_dir = Path.cwd()
|
|
1962
|
+
out_dir.mkdir(exist_ok=True)
|
|
1963
|
+
out_dir = out_dir.resolve()
|
|
1964
|
+
for f in files:
|
|
1965
|
+
with console.status(
|
|
1966
|
+
"[italic]Sorting fingerprints by popcount...[/italic]", spinner="dots"
|
|
1967
|
+
):
|
|
1968
|
+
fps = np.load(f)
|
|
1969
|
+
stem = f.stem
|
|
1970
|
+
if not input_is_packed:
|
|
1971
|
+
packed_fps = pack_fingerprints(fps)
|
|
1972
|
+
else:
|
|
1973
|
+
packed_fps = fps
|
|
1974
|
+
counts = _popcount(packed_fps)
|
|
1975
|
+
sort_idxs = np.argsort(counts)
|
|
1976
|
+
fps = fps[sort_idxs]
|
|
1977
|
+
stem = f"sorted-{stem}"
|
|
1978
|
+
np.save(out_dir / f"{stem}.npy", fps)
|
|
1979
|
+
if save_sort_idxs:
|
|
1980
|
+
np.save(out_dir / f"{stem}.indices.npy", sort_idxs)
|
|
1981
|
+
|
|
1982
|
+
if save_sort_idxs:
|
|
1983
|
+
console.print(
|
|
1984
|
+
f"Finished. Outputs written to {str(out_dir / stem)}.npy and {str(out_dir / stem)}.indices.npy" # noqa
|
|
1985
|
+
)
|
|
1986
|
+
else:
|
|
1987
|
+
console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
|
|
1988
|
+
|
|
1989
|
+
|
|
1990
|
+
@app.command("fps-unpack", rich_help_panel="Fingerprints")
def _unpack_fps(
    in_path: Annotated[
        Path,
        Argument(help="`*.npy` file with fingerprints, or dir with `*.npy` files"),
    ],
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", show_default=False),
    ] = None,
    n_features: Annotated[
        int | None,
        Option(
            "--n-features",
            help="Number of features in the fingerprints."
            " Only for packed inputs *if it is not a multiple of 8*."
            " Not required for typical fingerprint sizes (e.g. 2048, 1024)",
            rich_help_panel="Advanced",
        ),
    ] = None,
) -> None:
    r"""Unpack a fingerprints file, or all fingerprint files in a dir"""
    # NOTE: the docstring above doubles as the CLI help text, so it is kept to
    # a single line; the details live in these comments instead.
    #
    # in_path:    a single `*.npy` file, or a directory whose `*.npy` files are
    #             all processed (files whose stem ends in ".indices" are skipped).
    # out_dir:    where the outputs are written; defaults to the current
    #             working directory.
    # n_features: forwarded unchanged to `unpack_fingerprints`.
    #
    # Heavy imports are deferred to call time, like the other commands here.
    import numpy as np
    from bblean.fingerprints import unpack_fingerprints
    from bblean._console import get_console

    console = get_console()

    if in_path.is_dir():
        # Sorted for a deterministic processing order; "*.indices.npy" files
        # are skipped because they are not fingerprint arrays.
        files = sorted(
            f for f in in_path.glob("*.npy") if not f.stem.endswith(".indices")
        )
    else:
        files = [in_path]

    if out_dir is None:
        out_dir = Path.cwd()
    out_dir.mkdir(exist_ok=True)
    out_dir = out_dir.resolve()

    for f in files:
        with console.status(
            "[italic]Unpacking fingerprints...[/italic]", spinner="dots"
        ):
            fps = np.load(f)
            stem = f.stem
            # Derive the output name from the input name. "unpacked" must be
            # tested first because it contains "packed" as a substring.
            if "unpacked" in stem:
                warnings.warn(
                    "The fingerprints file name contains 'unpacked',"
                    " make sure the file contains packed fps"
                )
                stem = f"unpacked-{stem}"
            elif "packed" in stem:
                stem = stem.replace("packed", "unpacked")
            else:
                stem = f"unpacked-{stem}"
            unpacked_fps = unpack_fingerprints(fps, n_features)
            np.save(out_dir / f"{stem}.npy", unpacked_fps)
        console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
|
|
2047
|
+
|
|
2048
|
+
|
|
2049
|
+
@app.command("fps-pack", rich_help_panel="Fingerprints")
def _pack_fps(
    in_path: Annotated[
        Path,
        Argument(help="`*.npy` file with fingerprints, or dir with `*.npy` files"),
    ],
    out_dir: Annotated[
        Path | None,
        Option("-o", "--out-dir", show_default=False),
    ] = None,
) -> None:
    r"""Pack a fingerprints file, or all fingerprint files in a dir"""
    # NOTE: the docstring above doubles as the CLI help text, so it is kept to
    # a single line; the details live in these comments instead.
    #
    # in_path: a single `*.npy` file, or a directory whose `*.npy` files are
    #          all processed (files whose stem ends in ".indices" are skipped).
    # out_dir: where the outputs are written; defaults to the current working
    #          directory.
    #
    # Heavy imports are deferred to call time, like the other commands here.
    import numpy as np
    from bblean.fingerprints import pack_fingerprints
    from bblean._console import get_console

    console = get_console()

    if in_path.is_dir():
        # Sorted for a deterministic processing order; "*.indices.npy" files
        # are skipped because they are not fingerprint arrays.
        files = sorted(
            f for f in in_path.glob("*.npy") if not f.stem.endswith(".indices")
        )
    else:
        files = [in_path]

    if out_dir is None:
        out_dir = Path.cwd()
    out_dir.mkdir(exist_ok=True)
    out_dir = out_dir.resolve()

    for f in files:
        with console.status("[italic]Packing fingerprints...[/italic]", spinner="dots"):
            fps = np.load(f)
            stem = f.stem
            # "unpacked" contains "packed" as a substring, so the first test
            # must explicitly exclude it. The check has to run against the
            # `stem` variable: testing the literal "stem" is always true and
            # would make the rename branch below unreachable.
            if "packed" in stem and "unpacked" not in stem:
                msg = (
                    "The fingerprints file name contains 'packed',"
                    " make sure the file contains unpacked fps"
                )
                warnings.warn(msg)
                stem = f"packed-{stem}"
            elif "unpacked" in stem:
                stem = stem.replace("unpacked", "packed")
            else:
                stem = f"packed-{stem}"
            packed_fps = pack_fingerprints(fps)
            np.save(out_dir / f"{stem}.npy", packed_fps)
        console.print(f"Finished. Outputs written to {str(out_dir / stem)}.npy")
|
bblean/fingerprints.py
CHANGED
|
@@ -242,13 +242,28 @@ def _print_fps_file_info(path: Path, console: Console | None = None) -> None:
|
|
|
242
242
|
shape, dtype, shape_is_valid, dtype_is_valid = _get_fps_file_shape_and_dtype(path)
|
|
243
243
|
|
|
244
244
|
console.print(f"File: {path.resolve()}")
|
|
245
|
+
has_nonzero = None
|
|
245
246
|
if shape_is_valid and dtype_is_valid:
|
|
246
247
|
console.print(" - [green]Valid fingerprint file[/green]")
|
|
248
|
+
if shape[0] > 0:
|
|
249
|
+
first_fp = np.load(path, mmap_mode="r")[0]
|
|
250
|
+
has_nonzero = (first_fp > 1).any()
|
|
251
|
+
if has_nonzero:
|
|
252
|
+
console.print(" - Guessed format: [cyan]Packed[/cyan]")
|
|
253
|
+
else:
|
|
254
|
+
console.print(" - Guessed format: [magenta]Unpacked[/magenta]")
|
|
255
|
+
else:
|
|
256
|
+
console.print(" - Guessed format: [red]Unknown[/red]")
|
|
247
257
|
else:
|
|
248
258
|
console.print(" - [red]Invalid fingerprint file[/red]")
|
|
249
259
|
if shape_is_valid:
|
|
250
260
|
console.print(f" - Num. fingerprints: {shape[0]:,}")
|
|
251
|
-
|
|
261
|
+
if has_nonzero:
|
|
262
|
+
console.print(
|
|
263
|
+
f" - Num. features: {shape[1]:,} (guessed unpacked: {shape[1] * 8:,})" # noqa
|
|
264
|
+
)
|
|
265
|
+
else:
|
|
266
|
+
console.print(f" - Num. features: {shape[1]:,}")
|
|
252
267
|
else:
|
|
253
268
|
console.print(f" - Shape: {shape}")
|
|
254
269
|
console.print(f" - DType: [yellow]{dtype.name}[/yellow]")
|
bblean/multiround.py
CHANGED
|
@@ -65,7 +65,6 @@ from bblean._config import DEFAULTS
|
|
|
65
65
|
from bblean.utils import batched
|
|
66
66
|
from bblean.bitbirch import BitBirch
|
|
67
67
|
from bblean.fingerprints import _get_fps_file_num
|
|
68
|
-
from bblean._py_similarity import _popcount
|
|
69
68
|
|
|
70
69
|
__all__ = ["run_multiround_bitbirch"]
|
|
71
70
|
|
|
@@ -158,7 +157,6 @@ class _InitialRound:
|
|
|
158
157
|
max_fps: int | None = None,
|
|
159
158
|
merge_criterion: str = DEFAULTS.merge_criterion,
|
|
160
159
|
input_is_packed: bool = True,
|
|
161
|
-
sort_fps: bool = False,
|
|
162
160
|
) -> None:
|
|
163
161
|
self.n_features = n_features
|
|
164
162
|
self.refinement_before_midsection = refinement_before_midsection
|
|
@@ -173,7 +171,6 @@ class _InitialRound:
|
|
|
173
171
|
self.refine_merge_criterion = refine_merge_criterion
|
|
174
172
|
self.input_is_packed = input_is_packed
|
|
175
173
|
self.refine_threshold_change = refine_threshold_change
|
|
176
|
-
self._sort_fps = sort_fps
|
|
177
174
|
|
|
178
175
|
def __call__(self, file_info: tuple[str, Path, int, int]) -> None:
|
|
179
176
|
file_label, fp_file, start_idx, end_idx = file_info
|
|
@@ -185,13 +182,6 @@ class _InitialRound:
|
|
|
185
182
|
threshold=self.threshold,
|
|
186
183
|
merge_criterion=self.merge_criterion,
|
|
187
184
|
)
|
|
188
|
-
if self._sort_fps:
|
|
189
|
-
fp_input = np.load(fp_file)
|
|
190
|
-
counts = _popcount(fp_input)
|
|
191
|
-
sort_idxs = np.argsort(counts)
|
|
192
|
-
fp_input = fp_input[sort_idxs]
|
|
193
|
-
else:
|
|
194
|
-
fp_input = fp_file
|
|
195
185
|
|
|
196
186
|
range_ = range(start_idx, end_idx)
|
|
197
187
|
tree.fit(
|
|
@@ -371,7 +361,6 @@ def run_multiround_bitbirch(
|
|
|
371
361
|
mp_context: tp.Any = None,
|
|
372
362
|
save_tree: bool = False,
|
|
373
363
|
save_centroids: bool = True,
|
|
374
|
-
sort_fps: bool = False,
|
|
375
364
|
# Debug
|
|
376
365
|
max_fps: int | None = None,
|
|
377
366
|
verbose: bool = False,
|
|
@@ -418,7 +407,6 @@ def run_multiround_bitbirch(
|
|
|
418
407
|
console.print(f"(Initial) Round {round_idx}: Cluster initial batch of fingerprints")
|
|
419
408
|
|
|
420
409
|
initial_fn = _InitialRound(
|
|
421
|
-
sort_fps=sort_fps,
|
|
422
410
|
n_features=n_features,
|
|
423
411
|
refinement_before_midsection=refinement_before_midsection,
|
|
424
412
|
max_fps=max_fps,
|
bblean/plotting.py
CHANGED
|
@@ -464,7 +464,9 @@ def _dispatch_visualization(
|
|
|
464
464
|
if fps_path is None:
|
|
465
465
|
fps_paths = None
|
|
466
466
|
elif fps_path.is_dir():
|
|
467
|
-
fps_paths = sorted(
|
|
467
|
+
fps_paths = sorted(
|
|
468
|
+
f for f in fps_path.glob("*.npy") if not f.stem.endswith(".indices")
|
|
469
|
+
)
|
|
468
470
|
else:
|
|
469
471
|
fps_paths = [fps_path]
|
|
470
472
|
ca = cluster_analysis(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bblean
|
|
3
|
-
Version: 0.7.
|
|
3
|
+
Version: 0.7.4b0
|
|
4
4
|
Summary: BitBirch-Lean Python package
|
|
5
5
|
Author: The Miranda-Quintana Lab and other BitBirch developers
|
|
6
6
|
Author-email: Ramon Alain Miranda Quintana <quintana@chem.ufl.edu>, Krisztina Zsigmond <kzsigmond@ufl.edu>, Ignacio Pickering <ipickering@ufl.edu>, Kenneth Lopez Perez <klopezperez@chem.ufl.edu>, Miroslav Lzicar <miroslav.lzicar@deepmedchem.com>
|
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
bblean/__init__.py,sha256=9cudBHEt0H5p0jKEvgrhLZIHPSzwNAx0uJRp-_iM32I,686
|
|
2
2
|
bblean/_config.py,sha256=WaONZilOWCLFdZulqWLKRqNM-ZLhY0YCXfwk-84FYmQ,1813
|
|
3
3
|
bblean/_console.py,sha256=Mk1hi1NdPw2HDmjWj1LLbCuV3vCxL5l6u2gXaEeOFBM,8021
|
|
4
|
-
bblean/_cpp_similarity.cp313-win_amd64.pyd,sha256=
|
|
4
|
+
bblean/_cpp_similarity.cp313-win_amd64.pyd,sha256=j6WG-AVy52N0pI7vgTleE00oepptFYNKakdTBvt_Z_g,182272
|
|
5
5
|
bblean/_memory.py,sha256=eycXzXV_O_VEyIKpAv3QpbxtpB5WkBLChzm_e2Dqaw0,6892
|
|
6
6
|
bblean/_merges.py,sha256=xwFMJUPJ9VMujf2nSROx0NhsPoQ_R84KIxBF81x2hks,6432
|
|
7
7
|
bblean/_py_similarity.py,sha256=VYWu7gVCEDjNaRLgxiCxCGjCfmTity86UPC0dfT83Ok,9633
|
|
8
8
|
bblean/_timer.py,sha256=D1-_tTQFJqIQgzl4HSE__-P3Scw72EIVlNDaChJT8Qs,1402
|
|
9
|
-
bblean/_version.py,sha256=
|
|
9
|
+
bblean/_version.py,sha256=jIms8K656rZcsogwrmXt4K8zNWYu-ByI9RQMgcCJcTM,754
|
|
10
10
|
bblean/analysis.py,sha256=apD5OgSoNGbIuBLSJFFzlUkVjZHBtb3fVEeEUJGbyqc,8118
|
|
11
11
|
bblean/bitbirch.py,sha256=OjK0IhdXT83dMdtsEcpQQLbAq6yEBb7z-7QojAkgelA,60279
|
|
12
|
-
bblean/cli.py,sha256=
|
|
13
|
-
bblean/fingerprints.py,sha256=
|
|
12
|
+
bblean/cli.py,sha256=1vbUEbuCTpl_tczHp-DiufoL6Ev6cNoCzLe55nHhkrE,70994
|
|
13
|
+
bblean/fingerprints.py,sha256=nbdTjBXVvaBJ9ralIva4vl5qtpPFxtHtZ0bp_R7R_jI,16176
|
|
14
14
|
bblean/metrics.py,sha256=4KB-PIQJtFMsNg7lG2uM1HEId_eR5vhqcdLpCVLuI5Y,7280
|
|
15
|
-
bblean/multiround.py,sha256=
|
|
16
|
-
bblean/plotting.py,sha256=
|
|
15
|
+
bblean/multiround.py,sha256=5VAACXTQfLxgl6UefVpF2tQo0ifFG3ehq1_ELjoMt5k,19862
|
|
16
|
+
bblean/plotting.py,sha256=B2Kpw_HuKx1KxuKXI83IIWPQVsd-uJyDSu47a6mhzwE,15956
|
|
17
17
|
bblean/similarity.py,sha256=O2OTW5Dw64go177jwzF5skvDSJEzDS7UImyIQ2nShig,12192
|
|
18
18
|
bblean/sklearn.py,sha256=KK7rbF3gENjlv5-9uOvH-Q0LEW1RUY__xClcnLznuE0,7450
|
|
19
19
|
bblean/smiles.py,sha256=zyLWXzTLebeFmltDMuJcneJqaLLgGOYw0118889nn7A,2356
|
|
@@ -23,9 +23,9 @@ bblean/_legacy/bb_int64.py,sha256=Otqxu8NBLrfOMpJoMrLgWtDP_9Hn4joQXZVkU1hjges,45
|
|
|
23
23
|
bblean/_legacy/bb_uint8.py,sha256=8kbeVAq7MxiR8hS_6lKhSDhVWc6acjLmLzNFCR466iA,41573
|
|
24
24
|
bblean/csrc/README.md,sha256=qOPPK6sTqkYgnlPWtcNu9P3PwuLH8cCNJ1FwJeewsrk,59
|
|
25
25
|
bblean/csrc/similarity.cpp,sha256=q6oMg9Vd0REPmqze8xToTmeXZiEuHTmOfL6QsTRFkDE,23122
|
|
26
|
-
bblean-0.7.
|
|
27
|
-
bblean-0.7.
|
|
28
|
-
bblean-0.7.
|
|
29
|
-
bblean-0.7.
|
|
30
|
-
bblean-0.7.
|
|
31
|
-
bblean-0.7.
|
|
26
|
+
bblean-0.7.4b0.dist-info/licenses/LICENSE,sha256=Dq9t2XHr5wSrykVuVo8etKsAS35ENnDobU1h7t3H_-k,2598
|
|
27
|
+
bblean-0.7.4b0.dist-info/METADATA,sha256=1g5LcbhEzHD1UkVNygvPu4ZoSVvdOR07Poq8X5H95B0,13053
|
|
28
|
+
bblean-0.7.4b0.dist-info/WHEEL,sha256=qV0EIPljj1XC_vuSatRWjn02nZIz3N1t8jsZz7HBr2U,101
|
|
29
|
+
bblean-0.7.4b0.dist-info/entry_points.txt,sha256=a0jb2L5JFKioMD6CqbvJiI2unaArGzi-AMZsyY-uyGg,38
|
|
30
|
+
bblean-0.7.4b0.dist-info/top_level.txt,sha256=ybxTonvTC9zR25yR5B27aEDLl6CiwID093ZyS_--Cq4,7
|
|
31
|
+
bblean-0.7.4b0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|