dayhoff-tools 1.14.6.tar.gz → 1.14.7.tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (75)
  1. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/PKG-INFO +1 -1
  2. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/batch/workers/boltz.py +38 -9
  3. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/aws_batch.py +4 -4
  4. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/boltz.py +31 -12
  5. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/cancel.py +20 -6
  6. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/embed_t5.py +86 -25
  7. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/finalize.py +43 -18
  8. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/list_jobs.py +3 -1
  9. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/local.py +27 -10
  10. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/logs.py +6 -2
  11. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/retry.py +6 -2
  12. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/status.py +21 -6
  13. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/submit.py +9 -3
  14. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/manifest.py +3 -1
  15. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/main.py +5 -1
  16. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/pyproject.toml +1 -1
  17. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/README.md +0 -0
  18. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/__init__.py +0 -0
  19. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/batch/__init__.py +0 -0
  20. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/batch/workers/__init__.py +0 -0
  21. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/batch/workers/base.py +0 -0
  22. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/batch/workers/embed_t5.py +0 -0
  23. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/chemistry/standardizer.py +0 -0
  24. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/chemistry/utils.py +0 -0
  25. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/__init__.py +0 -0
  26. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/__init__.py +0 -0
  27. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/commands/__init__.py +0 -0
  28. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/batch/job_id.py +0 -0
  29. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/cloud_commands.py +0 -0
  30. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/__init__.py +0 -0
  31. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
  32. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
  33. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
  34. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
  35. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/shared.py +0 -0
  36. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
  37. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/__init__.py +0 -0
  38. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/api_client.py +0 -0
  39. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/auth.py +0 -0
  40. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +0 -0
  41. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/engine_commands.py +0 -0
  42. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/progress.py +0 -0
  43. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +0 -0
  44. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/demo.sh +0 -0
  45. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +0 -0
  46. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +0 -0
  47. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +0 -0
  48. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +0 -0
  49. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +0 -0
  50. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +0 -0
  51. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/ssh_config.py +0 -0
  52. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -0
  53. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/github_commands.py +0 -0
  54. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/swarm_commands.py +0 -0
  55. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/cli/utility_commands.py +0 -0
  56. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/base.py +0 -0
  57. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  58. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  59. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  60. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/job_runner.py +0 -0
  61. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/processors.py +0 -0
  62. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/deployment/swarm.py +0 -0
  63. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/embedders.py +0 -0
  64. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/fasta.py +0 -0
  65. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/file_ops.py +0 -0
  66. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/h5.py +0 -0
  67. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/intake/gcp.py +0 -0
  68. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/intake/gtdb.py +0 -0
  69. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/intake/kegg.py +0 -0
  70. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/intake/mmseqs.py +0 -0
  71. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/intake/structure.py +0 -0
  72. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/intake/uniprot.py +0 -0
  73. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/logs.py +0 -0
  74. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/sqlite.py +0 -0
  75. {dayhoff_tools-1.14.6 → dayhoff_tools-1.14.7}/dayhoff_tools/warehouse.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dayhoff-tools
-Version: 1.14.6
+Version: 1.14.7
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com

dayhoff_tools/batch/workers/boltz.py
@@ -88,7 +88,9 @@ class BoltzProcessor:
         match = re.match(pattern1, base_name)
         if match:
             protein_id = match.group(1)
-            logger.debug(f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 1)")
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 1)"
+            )
             return protein_id

         # Pattern 2: PROTEINID_suffix (no leading number)
@@ -96,7 +98,9 @@ class BoltzProcessor:
         match = re.match(pattern2, base_name)
         if match:
             protein_id = match.group(1)
-            logger.debug(f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 2)")
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 2)"
+            )
             return protein_id

         # Pattern 3: Just PROTEINID (no suffix)
@@ -104,7 +108,9 @@ class BoltzProcessor:
         match = re.match(pattern3, base_name)
         if match:
             protein_id = match.group(1)
-            logger.debug(f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 3)")
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 3)"
+            )
             return protein_id

         logger.debug(f"Could not extract protein ID from filename '{filename}'")
@@ -222,16 +228,24 @@ class BoltzProcessor:
             raise FileNotFoundError(f"Input file not found: {input_file}")

         # Enhance with MSA if available
-        enhanced_input_file, msa_found, original_yaml_data = self._enhance_yaml_with_msa(
-            input_file
+        enhanced_input_file, msa_found, original_yaml_data = (
+            self._enhance_yaml_with_msa(input_file)
         )

         # Determine output directory
+        # Boltz always creates boltz_results_{input_name} inside --out_dir
         input_base = os.path.splitext(os.path.basename(input_file))[0]
+
         if output_dir is None:
+            # No output_dir specified, boltz creates in current directory
             expected_output_dir = f"boltz_results_{input_base}"
+            out_dir_arg = None
         else:
-            expected_output_dir = output_dir
+            # output_dir specified - use its parent for --out_dir
+            # and expect boltz_results_{input_base} inside it
+            parent_dir = os.path.dirname(output_dir)
+            expected_output_dir = os.path.join(parent_dir, f"boltz_results_{input_base}")
+            out_dir_arg = parent_dir if parent_dir else None

         logger.info(f"Running Boltz prediction for {input_file}")
         logger.info(f"Output directory: {expected_output_dir}")
@@ -239,6 +253,10 @@
         # Build command
         cmd = ["boltz", "predict", input_file]

+        # Add output directory if specified
+        if out_dir_arg:
+            cmd.extend(["--out_dir", out_dir_arg])
+
         # Add cache directory
         cmd.extend(["--cache", self.cache_dir])

@@ -259,7 +277,9 @@
         # Handle MSA server option
         if msa_found:
             if use_msa_server_in_opts:
-                additional_args = [arg for arg in additional_args if arg != "--use_msa_server"]
+                additional_args = [
+                    arg for arg in additional_args if arg != "--use_msa_server"
+                ]
                 logger.info("Removed --use_msa_server since local MSA was found")
         else:
             if not use_msa_server_in_opts:
@@ -270,6 +290,11 @@
         if not num_workers_in_opts:
             cmd.extend(["--num_workers", str(self.num_workers)])

+        # Disable cuequivariance kernels - they require cuda-devel image
+        # which is much larger. The performance difference is modest.
+        # TODO: Consider switching to cuda-devel base image if perf is critical
+        cmd.append("--no_kernels")
+
         cmd.extend(additional_args)

         # Log and run command
@@ -305,7 +330,9 @@

         # Copy input config to output directory
         try:
-            config_dest = os.path.join(expected_output_dir, os.path.basename(input_file))
+            config_dest = os.path.join(
+                expected_output_dir, os.path.basename(input_file)
+            )
             shutil.copy2(input_file, config_dest)
             logger.debug(f"Copied input config to results: {config_dest}")
         except Exception as e:
@@ -346,7 +373,9 @@ def main():
     input_files = sorted(input_dir.glob("*.yaml"))

     if index >= len(input_files):
-        logger.error(f"Index {index} out of range. Found {len(input_files)} input files.")
+        logger.error(
+            f"Index {index} out of range. Found {len(input_files)} input files."
+        )
         raise RuntimeError(f"Index {index} out of range")

     input_file = input_files[index]
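
The output-directory handling above is the main behavioral change in this worker: Boltz always nests its results in a boltz_results_{input_base} folder, so the code now points --out_dir at the parent of the requested directory and predicts where the results will land. A standalone sketch of that logic (the helper name is illustrative, not part of the package):

import os

def resolve_boltz_output(input_file: str, output_dir: str | None) -> tuple[str, str | None]:
    # Returns (expected_output_dir, out_dir_arg), mirroring the hunk above.
    input_base = os.path.splitext(os.path.basename(input_file))[0]
    if output_dir is None:
        # Without --out_dir, boltz writes boltz_results_{input_base} into the CWD.
        return f"boltz_results_{input_base}", None
    parent_dir = os.path.dirname(output_dir)
    expected = os.path.join(parent_dir, f"boltz_results_{input_base}")
    return expected, parent_dir if parent_dir else None

For example, input "1_P12345.yaml" with output_dir "/primordial/jobs/run1/out" yields expected_output_dir "/primordial/jobs/run1/boltz_results_1_P12345" and --out_dir /primordial/jobs/run1.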

dayhoff_tools/cli/batch/aws_batch.py
@@ -256,9 +256,7 @@ class BatchClient:
         # List child jobs with FAILED status
         try:
             paginator = self.batch.get_paginator("list_jobs")
-            for page in paginator.paginate(
-                arrayJobId=job_id, jobStatus="FAILED"
-            ):
+            for page in paginator.paginate(arrayJobId=job_id, jobStatus="FAILED"):
                 for job_summary in page.get("jobSummaryList", []):
                     # Extract array index from job ID (format: jobId:index)
                     child_id = job_summary.get("jobId", "")
@@ -361,7 +359,9 @@ class BatchClient:
                 timestamp = event.get("timestamp", 0)
                 message = event.get("message", "")
                 # Format timestamp
-                dt = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp / 1000))
+                dt = time.strftime(
+                    "%Y-%m-%d %H:%M:%S", time.localtime(timestamp / 1000)
+                )
                 messages.append(f"[{dt}] {message}")

         except ClientError as e:

dayhoff_tools/cli/batch/commands/boltz.py
@@ -289,7 +289,9 @@ def _run_local_mode(input_path: Path):
        click.echo(click.style("Error: No YAML files found", fg="red"), err=True)
        raise SystemExit(1)

-    click.echo(f"Found {len(yaml_files)} YAML files, will process: {yaml_files[0].name}")
+    click.echo(
+        f"Found {len(yaml_files)} YAML files, will process: {yaml_files[0].name}"
+    )

    # Create a temporary job directory structure
    temp_job_dir = input_path / ".local_boltz_job"
@@ -311,14 +313,25 @@ def _run_local_mode(input_path: Path):
    click.echo()

    cmd = [
-        "docker", "run", "--rm",
-        "--gpus", "all",
-        "-v", "/primordial:/primordial",
-        "-v", f"{temp_job_dir}:{temp_job_dir}",
-        "-e", f"JOB_DIR={temp_job_dir}",
-        "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0",
-        "-e", "BOLTZ_CACHE=/primordial/.cache/boltz",
-        "-e", "MSA_DIR=/primordial/.cache/msas",
+        "docker",
+        "run",
+        "--rm",
+        "--gpus",
+        "all",
+        "-v",
+        "/primordial:/primordial",
+        "-v",
+        f"{temp_job_dir}:{temp_job_dir}",
+        "-e",
+        f"JOB_DIR={temp_job_dir}",
+        "-e",
+        "AWS_BATCH_JOB_ARRAY_INDEX=0",
+        "-e",
+        "BOLTZ_CACHE=/primordial/.cache/boltz",
+        "-e",
+        "MSA_DIR=/primordial/.cache/msas",
+        "-e",
+        "BOLTZ_OPTIONS=--no_kernels",
        DEFAULT_IMAGE_URI,
    ]

@@ -329,13 +342,17 @@ def _run_local_mode(input_path: Path):
        result = subprocess.run(cmd)
        if result.returncode != 0:
            click.echo(
-                click.style(f"Container exited with code {result.returncode}", fg="red"),
+                click.style(
+                    f"Container exited with code {result.returncode}", fg="red"
+                ),
                err=True,
            )
            raise SystemExit(result.returncode)

        # Check for output
-        output_dirs = list(temp_output_dir.iterdir()) if temp_output_dir.exists() else []
+        output_dirs = (
+            list(temp_output_dir.iterdir()) if temp_output_dir.exists() else []
+        )
        if output_dirs:
            click.echo()
            click.echo(click.style("✓ Prediction complete!", fg="green"))
@@ -347,7 +364,9 @@ def _run_local_mode(input_path: Path):

    except FileNotFoundError:
        click.echo(
-            click.style("Error: Docker not found. Is Docker installed and running?", fg="red"),
+            click.style(
+                "Error: Docker not found. Is Docker installed and running?", fg="red"
+            ),
            err=True,
        )
        raise SystemExit(1)

dayhoff_tools/cli/batch/commands/cancel.py
@@ -33,8 +33,14 @@ def cancel(job_id, force, base_path):
        raise SystemExit(1)

    # Check if job can be cancelled
-    if manifest.status in (JobStatus.SUCCEEDED, JobStatus.FINALIZED, JobStatus.CANCELLED):
-        click.echo(f"Job {job_id} is already {manifest.status.value}, cannot cancel.", err=True)
+    if manifest.status in (
+        JobStatus.SUCCEEDED,
+        JobStatus.FINALIZED,
+        JobStatus.CANCELLED,
+    ):
+        click.echo(
+            f"Job {job_id} is already {manifest.status.value}, cannot cancel.", err=True
+        )
        raise SystemExit(1)

    # Get Batch job ID
@@ -53,10 +59,14 @@ def cancel(job_id, force, base_path):

    if force:
        click.echo(f"Terminating job {batch_job_id}...")
-        client.terminate_job(batch_job_id, reason="Terminated by user via dh batch cancel --force")
+        client.terminate_job(
+            batch_job_id, reason="Terminated by user via dh batch cancel --force"
+        )
    else:
        click.echo(f"Cancelling job {batch_job_id}...")
-        client.cancel_job(batch_job_id, reason="Cancelled by user via dh batch cancel")
+        client.cancel_job(
+            batch_job_id, reason="Cancelled by user via dh batch cancel"
+        )

    # Update manifest
    manifest.status = JobStatus.CANCELLED
@@ -70,9 +80,13 @@ def cancel(job_id, force, base_path):
        if retry_info.batch_job_id:
            try:
                if force:
-                    client.terminate_job(retry_info.batch_job_id, reason="Parent job cancelled")
+                    client.terminate_job(
+                        retry_info.batch_job_id, reason="Parent job cancelled"
+                    )
                else:
-                    client.cancel_job(retry_info.batch_job_id, reason="Parent job cancelled")
+                    client.cancel_job(
+                        retry_info.batch_job_id, reason="Parent job cancelled"
+                    )
                click.echo(f" Also cancelled retry job: {retry_info.retry_id}")
            except BatchError:
                pass  # Retry job may already be complete
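
The --force flag maps to two different AWS Batch operations: CancelJob only removes jobs that have not started running, while TerminateJob also stops running attempts. A minimal boto3 sketch of the calls that BatchClient presumably wraps (the wrapper's internals are not shown in this diff):

import boto3

batch = boto3.client("batch")

def cancel_or_terminate(batch_job_id: str, force: bool) -> None:
    # TerminateJob kills running containers; CancelJob only affects jobs still queued.
    if force:
        batch.terminate_job(jobId=batch_job_id, reason="Terminated by user via dh batch cancel --force")
    else:
        batch.cancel_job(jobId=batch_job_id, reason="Cancelled by user via dh batch cancel")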

dayhoff_tools/cli/batch/commands/embed_t5.py
@@ -32,14 +32,42 @@ DEFAULT_IMAGE_URI = "074735440724.dkr.ecr.us-east-1.amazonaws.com/dayhoff:embed-

 @click.command()
 @click.argument("input_fasta", type=click.Path(exists=True))
-@click.option("--workers", default=DEFAULT_WORKERS, type=int, help=f"Number of parallel workers [default: {DEFAULT_WORKERS}]")
-@click.option("--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]")
-@click.option("--seqs-per-chunk", default=DEFAULT_SEQS_PER_CHUNK, type=int, help=f"Sequences per chunk [default: {DEFAULT_SEQS_PER_CHUNK}]")
-@click.option("--local", "run_local", is_flag=True, help="Run single chunk locally instead of Batch")
-@click.option("--shell", "run_shell", is_flag=True, help="Drop into container shell for debugging")
+@click.option(
+    "--workers",
+    default=DEFAULT_WORKERS,
+    type=int,
+    help=f"Number of parallel workers [default: {DEFAULT_WORKERS}]",
+)
+@click.option(
+    "--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]"
+)
+@click.option(
+    "--seqs-per-chunk",
+    default=DEFAULT_SEQS_PER_CHUNK,
+    type=int,
+    help=f"Sequences per chunk [default: {DEFAULT_SEQS_PER_CHUNK}]",
+)
+@click.option(
+    "--local",
+    "run_local",
+    is_flag=True,
+    help="Run single chunk locally instead of Batch",
+)
+@click.option(
+    "--shell", "run_shell", is_flag=True, help="Drop into container shell for debugging"
+)
 @click.option("--dry-run", is_flag=True, help="Show plan without submitting")
 @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
-def embed_t5(input_fasta, workers, queue, seqs_per_chunk, run_local, run_shell, dry_run, base_path):
+def embed_t5(
+    input_fasta,
+    workers,
+    queue,
+    seqs_per_chunk,
+    run_local,
+    run_shell,
+    dry_run,
+    base_path,
+):
     """Generate T5 protein embeddings for a FASTA file.

     Splits the input FASTA into chunks and processes them in parallel using
@@ -115,7 +143,14 @@ def _split_fasta(input_path: Path, output_dir: Path, seqs_per_chunk: int) -> int
     return num_chunks


-def _submit_batch_job(input_path: Path, workers: int, queue: str, seqs_per_chunk: int, dry_run: bool, base_path: str):
+def _submit_batch_job(
+    input_path: Path,
+    workers: int,
+    queue: str,
+    seqs_per_chunk: int,
+    dry_run: bool,
+    base_path: str,
+):
     """Submit embedding job to AWS Batch."""
     # Count sequences
     click.echo(f"Counting sequences in {input_path}...")
@@ -123,7 +158,9 @@ def _submit_batch_job(input_path: Path, workers: int, queue: str, seqs_per_chunk
     click.echo(f"Found {num_sequences:,} sequences")

     if num_sequences == 0:
-        click.echo(click.style("Error: No sequences found in input file", fg="red"), err=True)
+        click.echo(
+            click.style("Error: No sequences found in input file", fg="red"), err=True
+        )
         raise SystemExit(1)

     # Calculate chunks
@@ -223,7 +260,9 @@ def _submit_batch_job(input_path: Path, workers: int, queue: str, seqs_per_chunk
        click.echo(f" Cancel: dh batch cancel {job_id}")
        click.echo()
        click.echo("After completion:")
-        click.echo(f" Finalize: dh batch finalize {job_id} --output /primordial/embeddings.h5")
+        click.echo(
+            f" Finalize: dh batch finalize {job_id} --output /primordial/embeddings.h5"
+        )

    except BatchError as e:
        manifest.status = JobStatus.FAILED
@@ -265,12 +304,19 @@ def _run_local_mode(input_path: Path):
    click.echo()

    cmd = [
-        "docker", "run", "--rm",
-        "--gpus", "all",
-        "-v", "/primordial:/primordial",
-        "-v", f"{temp_job_dir}:{temp_job_dir}",
-        "-e", f"JOB_DIR={temp_job_dir}",
-        "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0",
+        "docker",
+        "run",
+        "--rm",
+        "--gpus",
+        "all",
+        "-v",
+        "/primordial:/primordial",
+        "-v",
+        f"{temp_job_dir}:{temp_job_dir}",
+        "-e",
+        f"JOB_DIR={temp_job_dir}",
+        "-e",
+        "AWS_BATCH_JOB_ARRAY_INDEX=0",
        DEFAULT_IMAGE_URI,
    ]

@@ -281,7 +327,9 @@ def _run_local_mode(input_path: Path):
        result = subprocess.run(cmd)
        if result.returncode != 0:
            click.echo(
-                click.style(f"Container exited with code {result.returncode}", fg="red"),
+                click.style(
+                    f"Container exited with code {result.returncode}", fg="red"
+                ),
                err=True,
            )
            raise SystemExit(result.returncode)
@@ -303,7 +351,9 @@ def _run_local_mode(input_path: Path):

    except FileNotFoundError:
        click.echo(
-            click.style("Error: Docker not found. Is Docker installed and running?", fg="red"),
+            click.style(
+                "Error: Docker not found. Is Docker installed and running?", fg="red"
+            ),
            err=True,
        )
        raise SystemExit(1)
@@ -318,13 +368,22 @@ def _run_shell_mode(input_path: Path):
    input_dir = input_path.parent

    cmd = [
-        "docker", "run", "--rm", "-it",
-        "--gpus", "all",
-        "-v", "/primordial:/primordial",
-        "-v", f"{input_dir}:/input",
-        "-e", "JOB_DIR=/input",
-        "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0",
-        "--entrypoint", "/bin/bash",
+        "docker",
+        "run",
+        "--rm",
+        "-it",
+        "--gpus",
+        "all",
+        "-v",
+        "/primordial:/primordial",
+        "-v",
+        f"{input_dir}:/input",
+        "-e",
+        "JOB_DIR=/input",
+        "-e",
+        "AWS_BATCH_JOB_ARRAY_INDEX=0",
+        "--entrypoint",
+        "/bin/bash",
        DEFAULT_IMAGE_URI,
    ]

@@ -335,7 +394,9 @@ def _run_shell_mode(input_path: Path):
        subprocess.run(cmd)
    except FileNotFoundError:
        click.echo(
-            click.style("Error: Docker not found. Is Docker installed and running?", fg="red"),
+            click.style(
+                "Error: Docker not found. Is Docker installed and running?", fg="red"
+            ),
            err=True,
        )
        raise SystemExit(1)
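
The submission path splits the input FASTA into fixed-size chunks and runs them as an AWS Batch array job, one array index per chunk. The arithmetic implied by the options above, as an illustrative sketch rather than the package's own helper:

import math

def plan_chunks(num_sequences: int, seqs_per_chunk: int, workers: int) -> dict:
    # One Batch array index per chunk; workers presumably caps how many run at once.
    num_chunks = math.ceil(num_sequences / seqs_per_chunk)
    return {
        "num_chunks": num_chunks,
        "array_size": num_chunks,
        "max_concurrent_workers": min(workers, num_chunks),
    }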

dayhoff_tools/cli/batch/commands/finalize.py
@@ -17,9 +17,18 @@ from ..manifest import (

 @click.command()
 @click.argument("job_id")
-@click.option("--output", required=True, type=click.Path(), help="Output path for combined results")
+@click.option(
+    "--output",
+    required=True,
+    type=click.Path(),
+    help="Output path for combined results",
+)
 @click.option("--force", is_flag=True, help="Finalize even if some chunks failed")
-@click.option("--keep-intermediates", is_flag=True, help="Don't delete job directory after finalizing")
+@click.option(
+    "--keep-intermediates",
+    is_flag=True,
+    help="Don't delete job directory after finalizing",
+)
 @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
 def finalize(job_id, output, force, keep_intermediates, base_path):
     """Combine results and clean up job intermediates.
@@ -59,7 +68,9 @@ def finalize(job_id, output, force, keep_intermediates, base_path):
            click.echo(f" dh batch retry {job_id}")
            raise SystemExit(1)
        click.echo()
-        click.echo(click.style("Warning: Finalizing with incomplete chunks", fg="yellow"))
+        click.echo(
+            click.style("Warning: Finalizing with incomplete chunks", fg="yellow")
+        )

    # Update status
    manifest.status = JobStatus.FINALIZING
@@ -132,24 +143,38 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
     output_path.parent.mkdir(parents=True, exist_ok=True)

     try:
-        from dayhoff_tools.h5 import combine_h5_files, deduplicate_h5_file, optimize_protein_embedding_chunks
-
-        # Combine H5 files
-        click.echo("Combining H5 files...")
-        # Get list of h5 file paths as strings
-        h5_file_paths = [str(f) for f in h5_files]
-        combine_h5_files(
-            input_files=h5_file_paths,
-            output_file=str(output_path),
+        from dayhoff_tools.h5 import (
+            combine_h5_files,
+            deduplicate_h5_file,
+            optimize_protein_embedding_chunks,
         )
+        import tempfile
+
+        if len(h5_files) == 1:
+            # Single file - just copy, no need to combine/dedup/optimize
+            click.echo("Single chunk - copying directly...")
+            shutil.copy2(h5_files[0], output_path)
+        else:
+            # Multiple files - combine, deduplicate, and optimize
+            with tempfile.TemporaryDirectory() as tmpdir:
+                combined_path = Path(tmpdir) / "combined.h5"
+                deduped_path = Path(tmpdir) / "deduped.h5"
+
+                # Combine H5 files
+                click.echo("Combining H5 files...")
+                h5_file_paths = [str(f) for f in h5_files]
+                combine_h5_files(
+                    input_files=h5_file_paths,
+                    output_file=str(combined_path),
+                )

-        # Deduplicate
-        click.echo("Deduplicating...")
-        deduplicate_h5_file(str(output_path))
+                # Deduplicate
+                click.echo("Deduplicating...")
+                deduplicate_h5_file(str(combined_path), str(deduped_path))

-        # Optimize chunks
-        click.echo("Optimizing chunks...")
-        optimize_protein_embedding_chunks(str(output_path))
+                # Optimize chunks
+                click.echo("Optimizing chunks...")
+                optimize_protein_embedding_chunks(str(deduped_path), str(output_path))

         click.echo(click.style("✓ H5 files combined successfully", fg="green"))

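The rework above turns the H5 helpers from in-place operations into explicit (input, output) stages and short-circuits single-chunk jobs. Condensed into one standalone function, under the assumption that the dayhoff_tools.h5 helpers keep the signatures used in the hunk:

import shutil
import tempfile
from pathlib import Path

from dayhoff_tools.h5 import (
    combine_h5_files,
    deduplicate_h5_file,
    optimize_protein_embedding_chunks,
)

def merge_chunk_embeddings(h5_files: list[Path], output_path: Path) -> None:
    if len(h5_files) == 1:
        # Single chunk: nothing to combine or deduplicate.
        shutil.copy2(h5_files[0], output_path)
        return
    with tempfile.TemporaryDirectory() as tmpdir:
        combined = Path(tmpdir) / "combined.h5"
        deduped = Path(tmpdir) / "deduped.h5"
        combine_h5_files(input_files=[str(f) for f in h5_files], output_file=str(combined))
        deduplicate_h5_file(str(combined), str(deduped))
        optimize_protein_embedding_chunks(str(deduped), str(output_path))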

dayhoff_tools/cli/batch/commands/list_jobs.py
@@ -15,7 +15,9 @@ from .status import format_status, format_time_ago
     help="Filter by status",
 )
 @click.option("--pipeline", help="Filter by pipeline type")
-@click.option("--limit", default=20, type=int, help="Maximum number of jobs to show [default: 20]")
+@click.option(
+    "--limit", default=20, type=int, help="Maximum number of jobs to show [default: 20]"
+)
 @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
 def list_jobs(user, status_filter, pipeline, limit, base_path):
     """List recent batch jobs.

dayhoff_tools/cli/batch/commands/local.py
@@ -10,7 +10,12 @@ from ..manifest import BATCH_JOBS_BASE, get_job_dir, load_manifest
 @click.command()
 @click.argument("job_id")
 @click.option("--index", required=True, type=int, help="Array index to run")
-@click.option("--shell", "run_shell", is_flag=True, help="Drop into shell instead of running command")
+@click.option(
+    "--shell",
+    "run_shell",
+    is_flag=True,
+    help="Drop into shell instead of running command",
+)
 @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
 def local(job_id, index, run_shell, base_path):
     """Run a job chunk locally for debugging.
@@ -54,13 +59,21 @@ def local(job_id, index, run_shell, base_path):

     # Build Docker command
     cmd = [
-        "docker", "run", "--rm",
-        "--gpus", "all",
-        "-v", "/primordial:/primordial",
-        "-v", f"{job_dir}:{job_dir}",
-        "-e", f"AWS_BATCH_JOB_ARRAY_INDEX={index}",
-        "-e", f"JOB_DIR={job_dir}",
-        "-e", f"JOB_ID={job_id}",
+        "docker",
+        "run",
+        "--rm",
+        "--gpus",
+        "all",
+        "-v",
+        "/primordial:/primordial",
+        "-v",
+        f"{job_dir}:{job_dir}",
+        "-e",
+        f"AWS_BATCH_JOB_ARRAY_INDEX={index}",
+        "-e",
+        f"JOB_DIR={job_dir}",
+        "-e",
+        f"JOB_ID={job_id}",
     ]

     if run_shell:
@@ -81,7 +94,9 @@ def local(job_id, index, run_shell, base_path):
        result = subprocess.run(cmd)
        if result.returncode != 0:
            click.echo(
-                click.style(f"Container exited with code {result.returncode}", fg="red"),
+                click.style(
+                    f"Container exited with code {result.returncode}", fg="red"
+                ),
                err=True,
            )
            raise SystemExit(result.returncode)
@@ -89,7 +104,9 @@ def local(job_id, index, run_shell, base_path):
        click.echo(click.style("✓ Container completed successfully", fg="green"))
    except FileNotFoundError:
        click.echo(
-            click.style("Error: Docker not found. Is Docker installed and running?", fg="red"),
+            click.style(
+                "Error: Docker not found. Is Docker installed and running?", fg="red"
+            ),
            err=True,
        )
        raise SystemExit(1)

dayhoff_tools/cli/batch/commands/logs.py
@@ -77,7 +77,9 @@ def _show_job_logs(client: BatchClient, batch_job_id: str, tail: int, follow: bo
        click.echo(f" dh batch logs {batch_job_id.split('-')[0]} --failed")
        click.echo()
        click.echo("To view logs for a specific index:")
-        click.echo(f" dh batch logs {batch_job_id.split('-')[0]} --index {failed_indices[0]}")
+        click.echo(
+            f" dh batch logs {batch_job_id.split('-')[0]} --index {failed_indices[0]}"
+        )
        return

    # Single job - show logs
@@ -94,7 +96,9 @@ def _show_job_logs(client: BatchClient, batch_job_id: str, tail: int, follow: bo
        click.echo(click.style(f"Error fetching logs: {e}", fg="red"), err=True)


-def _show_index_logs(client: BatchClient, batch_job_id: str, index: int, tail: int, follow: bool):
+def _show_index_logs(
+    client: BatchClient, batch_job_id: str, index: int, tail: int, follow: bool
+):
     """Show logs for a specific array index."""
     child_job_id = f"{batch_job_id}:{index}"


dayhoff_tools/cli/batch/commands/retry.py
@@ -19,7 +19,9 @@ from ..manifest import (
 @click.command()
 @click.argument("job_id")
 @click.option("--indices", help="Specific indices to retry (comma-separated)")
-@click.option("--dry-run", is_flag=True, help="Show what would be retried without submitting")
+@click.option(
+    "--dry-run", is_flag=True, help="Show what would be retried without submitting"
+)
 @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
 def retry(job_id, indices, dry_run, base_path):
     """Retry failed chunks of a batch job.
@@ -112,7 +114,9 @@ def retry(job_id, indices, dry_run, base_path):
        click.echo(f" View logs: dh batch logs {job_id}")

    except BatchError as e:
-        click.echo(click.style(f"✗ Failed to submit retry job: {e}", fg="red"), err=True)
+        click.echo(
+            click.style(f"✗ Failed to submit retry job: {e}", fg="red"), err=True
+        )
        raise SystemExit(1)



dayhoff_tools/cli/batch/commands/status.py
@@ -3,7 +3,12 @@
 import click

 from ..aws_batch import BatchClient, BatchError
-from ..manifest import BATCH_JOBS_BASE, JobStatus, list_jobs as list_manifests, load_manifest
+from ..manifest import (
+    BATCH_JOBS_BASE,
+    JobStatus,
+    list_jobs as list_manifests,
+    load_manifest,
+)


 def format_status(status: JobStatus) -> str:
@@ -125,8 +130,12 @@ def _show_job_details(job_id: str, base_path: str):
     click.echo(f"Status: {format_status(manifest.status)}")
     click.echo(f"Pipeline: {manifest.pipeline}")
     click.echo(f"User: {manifest.user}")
-    click.echo(f"Created: {manifest.created.isoformat()} ({format_time_ago(manifest.created)})")
-    click.echo(f"Updated: {manifest.updated.isoformat()} ({format_time_ago(manifest.updated)})")
+    click.echo(
+        f"Created: {manifest.created.isoformat()} ({format_time_ago(manifest.created)})"
+    )
+    click.echo(
+        f"Updated: {manifest.updated.isoformat()} ({format_time_ago(manifest.updated)})"
+    )

     if manifest.input:
         click.echo()
@@ -182,7 +191,9 @@ def _show_job_details(job_id: str, base_path: str):
        click.echo(f" Retry: dh batch retry {job_id}")
    elif manifest.status == JobStatus.SUCCEEDED:
        click.echo("Next steps:")
-        click.echo(f" Finalize: dh batch finalize {job_id} --output /primordial/output.h5")
+        click.echo(
+            f" Finalize: dh batch finalize {job_id} --output /primordial/output.h5"
+        )


 def _show_array_status(batch_job_id: str):
@@ -205,10 +216,14 @@ def _show_array_status(batch_job_id: str):
        if array_status.is_complete:
            pct = array_status.success_rate * 100
            color = "green" if pct == 100 else "yellow" if pct > 90 else "red"
-            click.echo(f" Complete: {click.style(f'{pct:.1f}%', fg=color)} success rate")
+            click.echo(
+                f" Complete: {click.style(f'{pct:.1f}%', fg=color)} success rate"
+            )
        else:
            pct = array_status.completed / array_status.total * 100
-            click.echo(f" Progress: {pct:.1f}% ({array_status.completed}/{array_status.total})")
+            click.echo(
+                f" Progress: {pct:.1f}% ({array_status.completed}/{array_status.total})"
+            )

    except BatchError as e:
        click.echo(f" (Could not fetch live status: {e})")

dayhoff_tools/cli/batch/commands/submit.py
@@ -25,9 +25,13 @@ DEFAULT_QUEUE = "t4-1x-spot"


 @click.command()
-@click.option("-f", "--file", "config_file", type=click.Path(exists=True), help="Config file path")
+@click.option(
+    "-f", "--file", "config_file", type=click.Path(exists=True), help="Config file path"
+)
 @click.option("--command", help="Command to run (alternative to config file)")
-@click.option("--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]")
+@click.option(
+    "--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]"
+)
 @click.option("--memory", default="30G", help="Memory limit (e.g., 30G)")
 @click.option("--vcpus", default=8, type=int, help="Number of vCPUs")
 @click.option("--gpus", default=1, type=int, help="Number of GPUs")
@@ -91,7 +95,9 @@ def submit(
     # Override with command-line options
     job_command = command or config.get("command")
     if not job_command:
-        raise click.UsageError("Must specify --command or provide config file with 'command' field")
+        raise click.UsageError(
+            "Must specify --command or provide config file with 'command' field"
+        )

     job_queue = queue if queue != DEFAULT_QUEUE else config.get("queue", queue)
     job_memory = memory if memory != "30G" else config.get("memory", memory)

dayhoff_tools/cli/batch/manifest.py
@@ -33,7 +33,9 @@ class InputConfig(BaseModel):
     """Configuration for job input."""

     source: str = Field(..., description="Path to input file or directory")
-    num_sequences: int | None = Field(None, description="Number of sequences (for FASTA)")
+    num_sequences: int | None = Field(
+        None, description="Number of sequences (for FASTA)"
+    )
     num_chunks: int | None = Field(None, description="Number of chunks created")
     sequences_per_chunk: int | None = Field(None, description="Sequences per chunk")


dayhoff_tools/cli/main.py
@@ -6,7 +6,10 @@ from importlib.metadata import PackageNotFoundError, version
 import typer
 from dayhoff_tools.cli.cloud_commands import aws_app, gcp_app
 from dayhoff_tools.cli.github_commands import gh_app
-from dayhoff_tools.cli.engine1 import engine_app as engine1_app, studio_app as studio1_app
+from dayhoff_tools.cli.engine1 import (
+    engine_app as engine1_app,
+    studio_app as studio1_app,
+)
 from dayhoff_tools.cli.utility_commands import (
     add_dependency,
     build_and_upload_wheel,
@@ -70,6 +73,7 @@ app.add_typer(gcp_app, name="gcp", help="Manage GCP authentication and impersona
 app.add_typer(aws_app, name="aws", help="Manage AWS SSO authentication.")
 app.add_typer(gh_app, name="gh", help="Manage GitHub authentication.")

+
 # Engine and Studio commands (v2 - new default with progress tracking)
 # These use Click instead of Typer, so we need a passthrough wrapper
 @app.command(

pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

 [project]
 name = "dayhoff-tools"
-version = "1.14.6"
+version = "1.14.7"
 description = "Common tools for all the repos at Dayhoff Labs"
 authors = [
     {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}