dayhoff-tools 1.14.13__py3-none-any.whl → 1.14.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,8 +34,13 @@ from ..manifest import (
34
34
  is_flag=True,
35
35
  help="For Boltz: copy entire output directory (default: only essential files)",
36
36
  )
37
+ @click.option(
38
+ "--skip-dedup",
39
+ is_flag=True,
40
+ help="Skip deduplication step (use if input has no duplicates)",
41
+ )
37
42
  @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
38
- def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
43
+ def finalize(job_id, output, force, keep_intermediates, full_output, skip_dedup, base_path):
39
44
  """Combine results and clean up job intermediates.
40
45
 
41
46
  For embedding jobs, combines H5 files into a single output file.
@@ -46,6 +51,9 @@ def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
46
51
  # Embedding job - combine H5 files
47
52
  dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5
48
53
 
54
+ # Skip deduplication (faster if input has no duplicates)
55
+ dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5 --skip-dedup
56
+
49
57
  # Boltz job - extract essential files only (default)
50
58
  dh batch finalize dma-boltz-20260113-190a --output /primordial/structures/
51
59
 
@@ -92,7 +100,7 @@ def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
92
100
  # Finalize based on pipeline type
93
101
  click.echo()
94
102
  if manifest.pipeline in ("embed-t5", "embed"):
95
- _finalize_embeddings(output_dir, output_path)
103
+ _finalize_embeddings(output_dir, output_path, skip_dedup=skip_dedup)
96
104
  elif manifest.pipeline == "boltz":
97
105
  _finalize_boltz(output_dir, output_path, full_output=full_output)
98
106
  else:
@@ -182,7 +190,7 @@ def _check_completion(job_id: str, base_path: str) -> list[int]:
182
190
  return incomplete
183
191
 
184
192
 
185
- def _finalize_embeddings(output_dir: Path, output_path: Path):
193
+ def _finalize_embeddings(output_dir: Path, output_path: Path, skip_dedup: bool = False):
186
194
  """Combine H5 embedding files into a single output."""
187
195
  h5_files = sorted(output_dir.glob("embed_*.h5"))
188
196
 
@@ -191,6 +199,8 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
191
199
  raise SystemExit(1)
192
200
 
193
201
  click.echo(f"Found {len(h5_files)} H5 files to combine")
202
+ if skip_dedup:
203
+ click.echo("Skipping deduplication (--skip-dedup)")
194
204
 
195
205
  # Check if output already exists
196
206
  if output_path.exists():
@@ -213,10 +223,9 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
213
223
  click.echo("Single chunk - copying directly...")
214
224
  shutil.copy2(h5_files[0], output_path)
215
225
  else:
216
- # Multiple files - combine, deduplicate, and optimize
226
+ # Multiple files - combine, optionally deduplicate, and optimize
217
227
  with tempfile.TemporaryDirectory() as tmpdir:
218
228
  combined_path = Path(tmpdir) / "combined.h5"
219
- deduped_path = Path(tmpdir) / "deduped.h5"
220
229
 
221
230
  # Combine H5 files
222
231
  click.echo("Combining H5 files...")
@@ -226,13 +235,17 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
226
235
  output_file=str(combined_path),
227
236
  )
228
237
 
229
- # Deduplicate
230
- click.echo("Deduplicating...")
231
- deduplicate_h5_file(str(combined_path), str(deduped_path))
232
-
233
- # Optimize chunks
234
- click.echo("Optimizing chunks...")
235
- optimize_protein_embedding_chunks(str(deduped_path), str(output_path))
238
+ if skip_dedup:
239
+ # Skip dedup - optimize directly from combined
240
+ click.echo("Optimizing chunks...")
241
+ optimize_protein_embedding_chunks(str(combined_path), str(output_path))
242
+ else:
243
+ # Full pipeline: combine -> dedup -> optimize
244
+ deduped_path = Path(tmpdir) / "deduped.h5"
245
+ click.echo("Deduplicating...")
246
+ deduplicate_h5_file(str(combined_path), str(deduped_path))
247
+ click.echo("Optimizing chunks...")
248
+ optimize_protein_embedding_chunks(str(deduped_path), str(output_path))
236
249
 
237
250
  click.echo(click.style("✓ H5 files combined successfully", fg="green"))
238
251
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dayhoff-tools
3
- Version: 1.14.13
3
+ Version: 1.14.14
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -14,7 +14,7 @@ dayhoff_tools/cli/batch/commands/boltz.py,sha256=N0LksmtOpkvnEsR0SAUHxtksKPAsQjR
14
14
  dayhoff_tools/cli/batch/commands/cancel.py,sha256=kjvmCcFaMShyHfQjvR4WlII4njg4Fm4uffpWcY1qRWg,5299
15
15
  dayhoff_tools/cli/batch/commands/clean.py,sha256=nWOKbVM2nDuLMpyC038Q9aylOQxk2bq4N0JF65qJg-s,4570
16
16
  dayhoff_tools/cli/batch/commands/embed_t5.py,sha256=QXFydAw0wndevdzXF1cxikxMmvn1BuQ5p9lwutQFajU,11453
17
- dayhoff_tools/cli/batch/commands/finalize.py,sha256=xr3GFcMbvtU6UYiJI3UXhQqeaACSFzKIZOxz4GK-Dmo,12785
17
+ dayhoff_tools/cli/batch/commands/finalize.py,sha256=OQUF9RiO8S55SCeQcFqExLKjYd-leL0Z_FOV0xMg7Dw,13497
18
18
  dayhoff_tools/cli/batch/commands/list_jobs.py,sha256=COfxZddDVUAHeTayNAB3ruYNhgrE3osgFxY2qzf33cg,4284
19
19
  dayhoff_tools/cli/batch/commands/local.py,sha256=dZeKhNakaM1jS-EoByAwg1nWspRRoOmYzcwzjEKBaIA,3226
20
20
  dayhoff_tools/cli/batch/commands/logs.py,sha256=ctgJksdzFmqBdD18ePPsZe2BpuJYtHz2xAaMPnUplmQ,5293
@@ -71,7 +71,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
71
71
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
72
72
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
73
73
  dayhoff_tools/warehouse.py,sha256=UETBtZD3r7WgvURqfGbyHlT7cxoiVq8isjzMuerKw8I,24475
74
- dayhoff_tools-1.14.13.dist-info/METADATA,sha256=703JkMV45GgnJlsqHJVmlkwBGZntVuYUp3zNmhvT7Us,3185
75
- dayhoff_tools-1.14.13.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
76
- dayhoff_tools-1.14.13.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
77
- dayhoff_tools-1.14.13.dist-info/RECORD,,
74
+ dayhoff_tools-1.14.14.dist-info/METADATA,sha256=QdEyPEUN_WWKDpvsKdWnN38FDpX6bqc4rsF4jGFbBDY,3185
75
+ dayhoff_tools-1.14.14.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
76
+ dayhoff_tools-1.14.14.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
77
+ dayhoff_tools-1.14.14.dist-info/RECORD,,