dayhoff-tools 1.14.13__py3-none-any.whl → 1.14.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/cli/batch/commands/finalize.py +25 -12
- {dayhoff_tools-1.14.13.dist-info → dayhoff_tools-1.14.14.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.14.13.dist-info → dayhoff_tools-1.14.14.dist-info}/RECORD +5 -5
- {dayhoff_tools-1.14.13.dist-info → dayhoff_tools-1.14.14.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.14.13.dist-info → dayhoff_tools-1.14.14.dist-info}/entry_points.txt +0 -0
|
@@ -34,8 +34,13 @@ from ..manifest import (
|
|
|
34
34
|
is_flag=True,
|
|
35
35
|
help="For Boltz: copy entire output directory (default: only essential files)",
|
|
36
36
|
)
|
|
37
|
+
@click.option(
|
|
38
|
+
"--skip-dedup",
|
|
39
|
+
is_flag=True,
|
|
40
|
+
help="Skip deduplication step (use if input has no duplicates)",
|
|
41
|
+
)
|
|
37
42
|
@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
|
|
38
|
-
def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
|
|
43
|
+
def finalize(job_id, output, force, keep_intermediates, full_output, skip_dedup, base_path):
|
|
39
44
|
"""Combine results and clean up job intermediates.
|
|
40
45
|
|
|
41
46
|
For embedding jobs, combines H5 files into a single output file.
|
|
@@ -46,6 +51,9 @@ def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
|
|
|
46
51
|
# Embedding job - combine H5 files
|
|
47
52
|
dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5
|
|
48
53
|
|
|
54
|
+
# Skip deduplication (faster if input has no duplicates)
|
|
55
|
+
dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5 --skip-dedup
|
|
56
|
+
|
|
49
57
|
# Boltz job - extract essential files only (default)
|
|
50
58
|
dh batch finalize dma-boltz-20260113-190a --output /primordial/structures/
|
|
51
59
|
|
|
@@ -92,7 +100,7 @@ def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
|
|
|
92
100
|
# Finalize based on pipeline type
|
|
93
101
|
click.echo()
|
|
94
102
|
if manifest.pipeline in ("embed-t5", "embed"):
|
|
95
|
-
_finalize_embeddings(output_dir, output_path)
|
|
103
|
+
_finalize_embeddings(output_dir, output_path, skip_dedup=skip_dedup)
|
|
96
104
|
elif manifest.pipeline == "boltz":
|
|
97
105
|
_finalize_boltz(output_dir, output_path, full_output=full_output)
|
|
98
106
|
else:
|
|
@@ -182,7 +190,7 @@ def _check_completion(job_id: str, base_path: str) -> list[int]:
|
|
|
182
190
|
return incomplete
|
|
183
191
|
|
|
184
192
|
|
|
185
|
-
def _finalize_embeddings(output_dir: Path, output_path: Path):
|
|
193
|
+
def _finalize_embeddings(output_dir: Path, output_path: Path, skip_dedup: bool = False):
|
|
186
194
|
"""Combine H5 embedding files into a single output."""
|
|
187
195
|
h5_files = sorted(output_dir.glob("embed_*.h5"))
|
|
188
196
|
|
|
@@ -191,6 +199,8 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
|
|
|
191
199
|
raise SystemExit(1)
|
|
192
200
|
|
|
193
201
|
click.echo(f"Found {len(h5_files)} H5 files to combine")
|
|
202
|
+
if skip_dedup:
|
|
203
|
+
click.echo("Skipping deduplication (--skip-dedup)")
|
|
194
204
|
|
|
195
205
|
# Check if output already exists
|
|
196
206
|
if output_path.exists():
|
|
@@ -213,10 +223,9 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
|
|
|
213
223
|
click.echo("Single chunk - copying directly...")
|
|
214
224
|
shutil.copy2(h5_files[0], output_path)
|
|
215
225
|
else:
|
|
216
|
-
# Multiple files - combine
|
|
226
|
+
# Multiple files - combine and optionally deduplicate
|
|
217
227
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
218
228
|
combined_path = Path(tmpdir) / "combined.h5"
|
|
219
|
-
deduped_path = Path(tmpdir) / "deduped.h5"
|
|
220
229
|
|
|
221
230
|
# Combine H5 files
|
|
222
231
|
click.echo("Combining H5 files...")
|
|
@@ -226,13 +235,17 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
|
|
|
226
235
|
output_file=str(combined_path),
|
|
227
236
|
)
|
|
228
237
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
238
|
+
if skip_dedup:
|
|
239
|
+
# Skip dedup - optimize directly from combined
|
|
240
|
+
click.echo("Optimizing chunks...")
|
|
241
|
+
optimize_protein_embedding_chunks(str(combined_path), str(output_path))
|
|
242
|
+
else:
|
|
243
|
+
# Full pipeline: combine -> dedup -> optimize
|
|
244
|
+
deduped_path = Path(tmpdir) / "deduped.h5"
|
|
245
|
+
click.echo("Deduplicating...")
|
|
246
|
+
deduplicate_h5_file(str(combined_path), str(deduped_path))
|
|
247
|
+
click.echo("Optimizing chunks...")
|
|
248
|
+
optimize_protein_embedding_chunks(str(deduped_path), str(output_path))
|
|
236
249
|
|
|
237
250
|
click.echo(click.style("✓ H5 files combined successfully", fg="green"))
|
|
238
251
|
|
|
@@ -14,7 +14,7 @@ dayhoff_tools/cli/batch/commands/boltz.py,sha256=N0LksmtOpkvnEsR0SAUHxtksKPAsQjR
|
|
|
14
14
|
dayhoff_tools/cli/batch/commands/cancel.py,sha256=kjvmCcFaMShyHfQjvR4WlII4njg4Fm4uffpWcY1qRWg,5299
|
|
15
15
|
dayhoff_tools/cli/batch/commands/clean.py,sha256=nWOKbVM2nDuLMpyC038Q9aylOQxk2bq4N0JF65qJg-s,4570
|
|
16
16
|
dayhoff_tools/cli/batch/commands/embed_t5.py,sha256=QXFydAw0wndevdzXF1cxikxMmvn1BuQ5p9lwutQFajU,11453
|
|
17
|
-
dayhoff_tools/cli/batch/commands/finalize.py,sha256=
|
|
17
|
+
dayhoff_tools/cli/batch/commands/finalize.py,sha256=OQUF9RiO8S55SCeQcFqExLKjYd-leL0Z_FOV0xMg7Dw,13497
|
|
18
18
|
dayhoff_tools/cli/batch/commands/list_jobs.py,sha256=COfxZddDVUAHeTayNAB3ruYNhgrE3osgFxY2qzf33cg,4284
|
|
19
19
|
dayhoff_tools/cli/batch/commands/local.py,sha256=dZeKhNakaM1jS-EoByAwg1nWspRRoOmYzcwzjEKBaIA,3226
|
|
20
20
|
dayhoff_tools/cli/batch/commands/logs.py,sha256=ctgJksdzFmqBdD18ePPsZe2BpuJYtHz2xAaMPnUplmQ,5293
|
|
@@ -71,7 +71,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
|
|
|
71
71
|
dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
|
|
72
72
|
dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
|
|
73
73
|
dayhoff_tools/warehouse.py,sha256=UETBtZD3r7WgvURqfGbyHlT7cxoiVq8isjzMuerKw8I,24475
|
|
74
|
-
dayhoff_tools-1.14.
|
|
75
|
-
dayhoff_tools-1.14.13.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
|
|
76
|
-
dayhoff_tools-1.14.13.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
|
|
77
|
-
dayhoff_tools-1.14.13.dist-info/RECORD,,
|
|
74
|
+
dayhoff_tools-1.14.14.dist-info/METADATA,sha256=QdEyPEUN_WWKDpvsKdWnN38FDpX6bqc4rsF4jGFbBDY,3185
|
|
75
|
+
dayhoff_tools-1.14.14.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
|
|
76
|
+
dayhoff_tools-1.14.14.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
|
|
77
|
+
dayhoff_tools-1.14.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|