dayhoff-tools 1.14.9__tar.gz → 1.14.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/PKG-INFO +1 -1
  2. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/base.py +30 -1
  3. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/boltz.py +93 -26
  4. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/boltz.py +2 -0
  5. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/finalize.py +49 -4
  6. dayhoff_tools-1.14.11/dayhoff_tools/cli/batch/commands/retry.py +288 -0
  7. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/manifest.py +6 -0
  8. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/pyproject.toml +1 -1
  9. dayhoff_tools-1.14.9/dayhoff_tools/cli/batch/commands/retry.py +0 -146
  10. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/README.md +0 -0
  11. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/__init__.py +0 -0
  12. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/__init__.py +0 -0
  13. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/__init__.py +0 -0
  14. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/embed_t5.py +0 -0
  15. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/chemistry/standardizer.py +0 -0
  16. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/chemistry/utils.py +0 -0
  17. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/__init__.py +0 -0
  18. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/__init__.py +0 -0
  19. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/aws_batch.py +0 -0
  20. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/__init__.py +0 -0
  21. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/cancel.py +0 -0
  22. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/clean.py +0 -0
  23. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/embed_t5.py +0 -0
  24. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/list_jobs.py +0 -0
  25. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/local.py +0 -0
  26. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/logs.py +0 -0
  27. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/status.py +0 -0
  28. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/submit.py +0 -0
  29. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/job_id.py +0 -0
  30. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/cloud_commands.py +0 -0
  31. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/__init__.py +0 -0
  32. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
  33. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
  34. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
  35. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
  36. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/shared.py +0 -0
  37. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
  38. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/__init__.py +0 -0
  39. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/api_client.py +0 -0
  40. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/auth.py +0 -0
  41. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +0 -0
  42. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/engine_commands.py +0 -0
  43. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/progress.py +0 -0
  44. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +0 -0
  45. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/demo.sh +0 -0
  46. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +0 -0
  47. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +0 -0
  48. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +0 -0
  49. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +0 -0
  50. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +0 -0
  51. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +0 -0
  52. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/ssh_config.py +0 -0
  53. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -0
  54. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/github_commands.py +0 -0
  55. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/main.py +0 -0
  56. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/swarm_commands.py +0 -0
  57. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/utility_commands.py +0 -0
  58. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/base.py +0 -0
  59. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  60. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  61. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  62. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/job_runner.py +0 -0
  63. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/processors.py +0 -0
  64. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/swarm.py +0 -0
  65. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/embedders.py +0 -0
  66. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/fasta.py +0 -0
  67. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/file_ops.py +0 -0
  68. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/h5.py +0 -0
  69. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/gcp.py +0 -0
  70. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/gtdb.py +0 -0
  71. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/kegg.py +0 -0
  72. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/mmseqs.py +0 -0
  73. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/structure.py +0 -0
  74. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/uniprot.py +0 -0
  75. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/logs.py +0 -0
  76. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/sqlite.py +0 -0
  77. {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/warehouse.py +0 -0
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dayhoff-tools
-Version: 1.14.9
+Version: 1.14.11
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com

{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/base.py
@@ -30,12 +30,18 @@ def get_array_index() -> int:
 
     For array jobs, reads AWS_BATCH_JOB_ARRAY_INDEX.
     For retry jobs, maps from BATCH_RETRY_INDICES.
+    For resliced retry jobs, uses the raw array index (chunks are renumbered).
     For single jobs (array_size=1), defaults to 0.
 
     Returns:
         The array index this worker should process
     """
-    # Check for retry mode first
+    # For resliced retries, use raw array index (chunks are renumbered 0..N-1)
+    if os.environ.get("RESLICE_PREFIX"):
+        array_idx = os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0")
+        return int(array_idx)
+
+    # Check for retry mode (non-resliced)
     retry_indices = os.environ.get("BATCH_RETRY_INDICES")
     if retry_indices:
         # In retry mode, we have a list of indices and use array index to pick
@@ -69,6 +75,20 @@ def get_job_dir() -> Path:
     return Path(job_dir)
 
 
+def get_reslice_prefix() -> str | None:
+    """Get the reslice prefix from environment if set.
+
+    When RESLICE_PREFIX is set (e.g., 'r1'), files are named like:
+    - Input: chunk_r1_000.fasta
+    - Output: embed_r1_000.h5
+    - Done marker: embed_r1_000.done
+
+    Returns:
+        The reslice prefix or None if not in reslice mode
+    """
+    return os.environ.get("RESLICE_PREFIX")
+
+
 def get_input_file(index: int, job_dir: Path, prefix: str = "chunk") -> Path:
     """Get the input file path for a given index.
 
@@ -80,6 +100,9 @@ def get_input_file(index: int, job_dir: Path, prefix: str = "chunk") -> Path:
     Returns:
         Path to input file
     """
+    reslice = get_reslice_prefix()
+    if reslice:
+        return job_dir / "input" / f"{prefix}_{reslice}_{index:03d}.fasta"
     return job_dir / "input" / f"{prefix}_{index:03d}.fasta"
 
 
@@ -97,6 +120,9 @@ def get_output_file(
     Returns:
         Path to output file
     """
+    reslice = get_reslice_prefix()
+    if reslice:
+        return job_dir / "output" / f"{prefix}_{reslice}_{index:03d}{suffix}"
     return job_dir / "output" / f"{prefix}_{index:03d}{suffix}"
 
 
@@ -111,6 +137,9 @@ def get_done_marker(index: int, job_dir: Path, prefix: str = "embed") -> Path:
     Returns:
         Path to done marker file
     """
+    reslice = get_reslice_prefix()
+    if reslice:
+        return job_dir / "output" / f"{prefix}_{reslice}_{index:03d}.done"
     return job_dir / "output" / f"{prefix}_{index:03d}.done"
 
 
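
The net effect of these helpers is a parallel naming scheme: when RESLICE_PREFIX is set, the prefix is inserted between the base name and the zero-padded index for inputs, outputs, and done markers alike. The following is a minimal standalone sketch of that scheme (it re-implements the logic above for illustration rather than importing the package; the job directory and the .h5 suffix are taken from the docstring example).

import os
from pathlib import Path


def resolve_paths(index: int, job_dir: Path) -> dict[str, Path]:
    """Mirror the naming used by get_input_file/get_output_file/get_done_marker."""
    reslice = os.environ.get("RESLICE_PREFIX")  # e.g. "r1" during a resliced retry
    tag = f"_{reslice}" if reslice else ""
    return {
        "input": job_dir / "input" / f"chunk{tag}_{index:03d}.fasta",
        "output": job_dir / "output" / f"embed{tag}_{index:03d}.h5",
        "done": job_dir / "output" / f"embed{tag}_{index:03d}.done",
    }


job_dir = Path("/primordial/batch_jobs/example-job")  # hypothetical job directory
print(resolve_paths(0, job_dir))   # chunk_000.fasta, embed_000.h5, embed_000.done
os.environ["RESLICE_PREFIX"] = "r1"
print(resolve_paths(0, job_dir))   # chunk_r1_000.fasta, embed_r1_000.h5, embed_r1_000.done
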

{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/boltz.py
@@ -341,10 +341,31 @@ class BoltzProcessor:
         return expected_output_dir
 
 
+def _get_done_marker_for_file(job_dir: Path, file_stem: str) -> Path:
+    """Get the done marker path for a specific input file."""
+    return job_dir / "output" / f"boltz_{file_stem}.done"
+
+
+def _check_file_complete(job_dir: Path, file_stem: str) -> bool:
+    """Check if a specific file has been processed."""
+    return _get_done_marker_for_file(job_dir, file_stem).exists()
+
+
+def _mark_file_complete(job_dir: Path, file_stem: str):
+    """Mark a specific file as complete."""
+    done_marker = _get_done_marker_for_file(job_dir, file_stem)
+    done_marker.parent.mkdir(parents=True, exist_ok=True)
+    done_marker.touch()
+    logger.info(f"File {file_stem} marked complete: {done_marker}")
+
+
 def main():
-    """Boltz worker main entrypoint for AWS Batch array jobs."""
+    """Boltz worker main entrypoint for AWS Batch array jobs.
+
+    Each worker processes multiple files based on array index and total workers.
+    With N files and M workers, worker i processes files where file_index % M == i.
+    """
     from .base import (
-        check_already_complete,
         configure_worker_logging,
         get_array_index,
         get_job_dir,
@@ -358,34 +379,36 @@ def main():
        # Get configuration from environment
        index = get_array_index()
        job_dir = get_job_dir()
+       array_size = int(os.environ.get("BATCH_ARRAY_SIZE", "1"))
+       num_files = int(os.environ.get("BATCH_NUM_FILES", "0"))
 
        logger.info(f"Worker configuration:")
        logger.info(f" Array index: {index}")
+       logger.info(f" Array size: {array_size}")
+       logger.info(f" Total files: {num_files}")
        logger.info(f" Job directory: {job_dir}")
 
-       # Check idempotency
-       if check_already_complete(index, job_dir, prefix="boltz"):
-           logger.info("Exiting - complex already processed")
-           return
-
-       # Find input file by index
+       # Find all input files
        input_dir = job_dir / "input"
        input_files = sorted(input_dir.glob("*.yaml"))
+       total_files = len(input_files)
 
-       if index >= len(input_files):
-           logger.error(
-               f"Index {index} out of range. Found {len(input_files)} input files."
-           )
-           raise RuntimeError(f"Index {index} out of range")
+       if total_files == 0:
+           logger.error("No input files found")
+           raise RuntimeError("No input files found")
 
-       input_file = input_files[index]
-       logger.info(f" Input file: {input_file}")
+       # Calculate which files this worker should process
+       # Worker i processes files where file_index % array_size == index
+       my_files = [f for i, f in enumerate(input_files) if i % array_size == index]
 
-       # Determine output directory
-       output_base = input_file.stem
-       output_dir = job_dir / "output" / output_base
+       logger.info(f" Files assigned to this worker: {len(my_files)}")
 
-       # Get MSA directories
+       if not my_files:
+           logger.info("No files assigned to this worker, exiting successfully")
+           mark_complete(index, job_dir, prefix="boltz")
+           return
+
+       # Get MSA directories (shared across all files)
        job_msa_dir = job_dir / "msas"
        global_msa_dir = Path(os.environ.get("MSA_DIR", "/primordial/.cache/msas"))
 
@@ -408,7 +431,7 @@
        if boltz_options:
            logger.info(f" Boltz options: {boltz_options}")
 
-       # Create processor and run
+       # Create processor (reused for all files)
        processor = BoltzProcessor(
            num_workers=None, # Auto-detect
            boltz_options=boltz_options,
@@ -416,16 +439,60 @@
            cache_dir=cache_dir,
        )
 
-       # Ensure output directory exists
-       output_dir.parent.mkdir(parents=True, exist_ok=True)
+       # Process each assigned file
+       completed = 0
+       failed = 0
+
+       for file_idx, input_file in enumerate(my_files):
+           file_stem = input_file.stem
+
+           # Check if this file is already complete (idempotency)
+           if _check_file_complete(job_dir, file_stem):
+               logger.info(
+                   f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
+                   "already complete, skipping"
+               )
+               completed += 1
+               continue
+
+           logger.info(
+               f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}..."
+           )
+
+           try:
+               # Determine output directory
+               output_dir = job_dir / "output" / file_stem
+               output_dir.parent.mkdir(parents=True, exist_ok=True)
+
+               result_dir = processor.run(str(input_file), str(output_dir))
+
+               # Mark this file as complete
+               _mark_file_complete(job_dir, file_stem)
+
+               logger.info(
+                   f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
+                   f"completed successfully -> {result_dir}"
+               )
+               completed += 1
+
+           except Exception as e:
+               logger.error(
+                   f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
+                   f"failed with error: {e}"
+               )
+               failed += 1
+               # Continue processing other files even if one fails
 
-       result_dir = processor.run(str(input_file), str(output_dir))
+       # Summary
+       logger.info(f"Worker {index} finished: {completed} completed, {failed} failed")
 
-       # Mark as complete
+       # Mark worker as complete
        mark_complete(index, job_dir, prefix="boltz")
 
-       logger.info(f"Complex {input_file.stem} completed successfully")
-       logger.info(f"Output: {result_dir}")
+       if failed > 0:
+           logger.warning(f"{failed} file(s) failed to process")
+           # Don't exit with error - some files succeeded and are marked complete
+           # The failed files can be retried later
 
    except Exception as e:
        logger.exception(f"Worker failed with error: {e}")
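
The assignment rule file_index % array_size == index partitions the sorted input list round-robin across workers, with no overlap and no gaps. A small self-contained sketch of just that rule (nothing here beyond what the docstring states; file names are illustrative):

def files_for_worker(input_files: list[str], array_size: int, index: int) -> list[str]:
    """Round-robin assignment: worker `index` takes every array_size-th file."""
    return [f for i, f in enumerate(sorted(input_files)) if i % array_size == index]


# Example: 7 YAML inputs spread over 3 workers.
files = [f"complex_{i}.yaml" for i in range(7)]
assignments = {w: files_for_worker(files, array_size=3, index=w) for w in range(3)}
# worker 0 -> complex_0, complex_3, complex_6
# worker 1 -> complex_1, complex_4
# worker 2 -> complex_2, complex_5
assert sorted(sum(assignments.values(), [])) == sorted(files)  # no overlap, no gaps
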

{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/boltz.py
@@ -232,6 +232,8 @@ def _submit_batch_job(
         "JOB_ID": job_id,
         "BOLTZ_CACHE": "/primordial/.cache/boltz",
         "MSA_DIR": "/primordial/.cache/msas",
+        "BATCH_ARRAY_SIZE": str(array_size),
+        "BATCH_NUM_FILES": str(num_files),
     }
 
     batch_job_id = client.submit_job(
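
These two variables are how the submit side tells each array child how many workers exist and how many inputs were staged; the worker hunk above reads them back with int(os.environ.get(...)) and defaults. A minimal sketch of the round trip (the values and the os.environ.update stand-in for AWS Batch's injection are illustrative):

import os

# Submit side: fan-out parameters passed to every array child (values illustrative).
environment = {
    "BATCH_ARRAY_SIZE": str(8),    # number of array workers requested
    "BATCH_NUM_FILES": str(123),   # total input files staged for the job
}

# Worker side: read them back with conservative defaults, as in the worker hunk.
os.environ.update(environment)  # stands in for AWS Batch injecting the variables
array_size = int(os.environ.get("BATCH_ARRAY_SIZE", "1"))
num_files = int(os.environ.get("BATCH_NUM_FILES", "0"))
assert (array_size, num_files) == (8, 123)
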

{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/finalize.py
@@ -118,7 +118,12 @@ def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
 
 
 def _check_completion(job_id: str, base_path: str) -> list[int]:
-    """Check which chunks are incomplete (no .done marker)."""
+    """Check which chunks are incomplete (no .done marker).
+
+    Handles both original chunks (chunk_000.fasta) and resliced chunks
+    (chunk_r1_000.fasta). For original chunks that were resliced in a retry,
+    checks if all resliced chunks completed.
+    """
     job_dir = get_job_dir(job_id, base_path)
     input_dir = job_dir / "input"
     output_dir = job_dir / "output"
@@ -126,13 +131,53 @@
     if not input_dir.exists():
         return []
 
+    # Load manifest to check for resliced retries
+    try:
+        manifest = load_manifest(job_id, base_path)
+        resliced_indices: set[int] = set()
+        reslice_info: dict[str, int] = {} # prefix -> expected count
+
+        for retry in manifest.retries:
+            if retry.reslice_prefix and retry.reslice_count:
+                resliced_indices.update(retry.indices)
+                reslice_info[retry.reslice_prefix] = retry.reslice_count
+    except FileNotFoundError:
+        resliced_indices = set()
+        reslice_info = {}
+
     incomplete = []
-    for chunk_path in sorted(input_dir.glob("chunk_*.fasta")):
+
+    # Check original chunks (chunk_000.fasta pattern)
+    for chunk_path in sorted(input_dir.glob("chunk_[0-9][0-9][0-9].fasta")):
         idx_str = chunk_path.stem.split("_")[1]
         idx = int(idx_str)
+
+        # Check for original done marker
         done_marker = output_dir / f"embed_{idx:03d}.done"
-        if not done_marker.exists():
-            incomplete.append(idx)
+        if done_marker.exists():
+            continue
+
+        # Check if this chunk was resliced
+        if idx in resliced_indices:
+            # Find which retry covered this index and check if complete
+            is_covered = False
+            for retry in manifest.retries:
+                if (
+                    retry.reslice_prefix
+                    and retry.reslice_count
+                    and idx in retry.indices
+                ):
+                    # Check if all resliced chunks for this retry completed
+                    done_count = len(
+                        list(output_dir.glob(f"embed_{retry.reslice_prefix}_*.done"))
+                    )
+                    if done_count >= retry.reslice_count:
+                        is_covered = True
+                        break
+            if is_covered:
+                continue
+
+        incomplete.append(idx)
 
     return incomplete
 
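
The completion rule is now two-tiered: an original chunk counts as done if its own marker exists, or if it was swept into a resliced retry whose marker count reached the recorded reslice_count. A condensed sketch of that decision, using plain dicts in place of the manifest's RetryInfo objects:

from pathlib import Path


def chunk_is_complete(idx: int, output_dir: Path, retries: list[dict]) -> bool:
    """True if chunk idx has its own marker, or a covering resliced retry finished."""
    if (output_dir / f"embed_{idx:03d}.done").exists():
        return True
    # Each dict mirrors RetryInfo: "indices", "reslice_prefix", "reslice_count".
    for retry in retries:
        if retry.get("reslice_prefix") and retry.get("reslice_count") and idx in retry["indices"]:
            done = len(list(output_dir.glob(f"embed_{retry['reslice_prefix']}_*.done")))
            if done >= retry["reslice_count"]:
                return True
    return False
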

dayhoff_tools-1.14.11/dayhoff_tools/cli/batch/commands/retry.py (new file)
@@ -0,0 +1,288 @@
+"""Retry command for re-running failed chunks."""
+
+from datetime import datetime
+from pathlib import Path
+
+import click
+
+from ..aws_batch import BatchClient, BatchError
+from ..job_id import generate_job_id
+from ..manifest import (
+    BATCH_JOBS_BASE,
+    JobStatus,
+    RetryInfo,
+    get_job_dir,
+    load_manifest,
+    save_manifest,
+)
+
+
+@click.command()
+@click.argument("job_id")
+@click.option("--indices", help="Specific indices to retry (comma-separated)")
+@click.option(
+    "--queue",
+    help="Override job queue (e.g., 't4-1x' for on-demand instead of spot)",
+)
+@click.option(
+    "--reslice",
+    type=int,
+    help="Reslice failed chunks into N thinner chunks (reduces interruption risk)",
+)
+@click.option(
+    "--dry-run", is_flag=True, help="Show what would be retried without submitting"
+)
+@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+def retry(job_id, indices, queue, reslice, dry_run, base_path):
+    """Retry failed chunks of a batch job.
+
+    Identifies failed array indices and submits a new job to retry only
+    those specific indices. Outputs go to the same job directory, so
+    finalization works normally after retries complete.
+
+    The --reslice option concatenates failed chunks and re-splits them into
+    thinner slices, reducing the time per worker and thus the risk of spot
+    interruptions. Resliced outputs are named with a prefix (e.g., embed_r1_000.h5)
+    and are automatically included in finalization.
+
+    \b
+    Examples:
+        dh batch retry dma-embed-20260109-a3f2 # Retry all failed
+        dh batch retry dma-embed-20260109-a3f2 --indices 5,12,27 # Retry specific indices
+        dh batch retry dma-embed-20260109-a3f2 --queue t4-1x # Use on-demand (no spot interruptions)
+        dh batch retry dma-embed-20260109-a3f2 --reslice 40 # Reslice into 40 thinner chunks
+        dh batch retry dma-embed-20260109-a3f2 --dry-run # Show what would be retried
+    """
+    # Load manifest
+    try:
+        manifest = load_manifest(job_id, base_path)
+    except FileNotFoundError:
+        click.echo(f"Job not found: {job_id}", err=True)
+        raise SystemExit(1)
+
+    # Get failed indices
+    if indices:
+        # User specified indices
+        retry_indices = [int(i.strip()) for i in indices.split(",")]
+    else:
+        # Auto-detect from .done markers
+        retry_indices = _find_incomplete_chunks(job_id, base_path)
+
+    if not retry_indices:
+        click.echo("No failed or incomplete chunks found. Nothing to retry.")
+        return
+
+    click.echo(f"Found {len(retry_indices)} chunks to retry: {retry_indices}")
+
+    # Check if we have the required info
+    if not manifest.batch:
+        click.echo("Job has no batch configuration.", err=True)
+        raise SystemExit(1)
+
+    # Generate retry job ID and reslice prefix
+    retry_num = len(manifest.retries) + 1
+    retry_id = f"{job_id}-r{retry_num}"
+    reslice_prefix = f"r{retry_num}" if reslice else None
+
+    job_dir = get_job_dir(job_id, base_path)
+
+    if reslice:
+        # Count sequences in failed chunks to estimate split
+        total_seqs = _count_sequences_in_chunks(job_dir, retry_indices)
+        seqs_per_chunk = max(1, total_seqs // reslice)
+        click.echo(f"Total sequences in failed chunks: {total_seqs:,}")
+        click.echo(f"Reslicing into {reslice} chunks (~{seqs_per_chunk:,} seqs each)")
+
+    if dry_run:
+        click.echo()
+        click.echo(click.style("Dry run - job not submitted", fg="yellow"))
+        return
+
+    click.echo()
+    click.echo(f"Retry job ID: {retry_id}")
+
+    # Handle reslicing if requested
+    if reslice:
+        click.echo(f"Reslice prefix: {reslice_prefix}")
+        actual_chunks = _reslice_failed_chunks(
+            job_dir, retry_indices, reslice_prefix, reslice
+        )
+        click.echo(f"Created {actual_chunks} resliced chunks")
+        array_size = actual_chunks
+    else:
+        array_size = len(retry_indices)
+
+    # Submit retry job
+    try:
+        client = BatchClient()
+
+        environment = {
+            "JOB_DIR": str(job_dir),
+            "JOB_ID": job_id,
+        }
+
+        # Use provided queue or fall back to original
+        job_queue = queue or manifest.batch.queue
+        if queue and queue != manifest.batch.queue:
+            click.echo(f"Using queue: {job_queue} (original: {manifest.batch.queue})")
+
+        if reslice:
+            # Resliced retry: use RESLICE_PREFIX, sequential indices 0..N-1
+            environment["RESLICE_PREFIX"] = reslice_prefix
+            batch_job_id = client.submit_job(
+                job_name=retry_id,
+                job_definition=manifest.batch.job_definition or "dayhoff-embed-t5",
+                job_queue=job_queue,
+                array_size=array_size,
+                environment=environment,
+                timeout_seconds=6 * 3600,
+                retry_attempts=5,
+            )
+        else:
+            # Standard retry: use BATCH_RETRY_INDICES mapping
+            environment["BATCH_RETRY_INDICES"] = ",".join(str(i) for i in retry_indices)
+            batch_job_id = client.submit_array_job_with_indices(
+                job_name=retry_id,
+                job_definition=manifest.batch.job_definition or "dayhoff-embed-t5",
+                job_queue=job_queue,
+                indices=retry_indices,
+                environment=environment,
+                timeout_seconds=6 * 3600,
+                retry_attempts=5,
+            )
+
+        # Update manifest with retry info
+        retry_info = RetryInfo(
+            retry_id=retry_id,
+            indices=retry_indices,
+            batch_job_id=batch_job_id,
+            reslice_prefix=reslice_prefix,
+            reslice_count=array_size if reslice else None,
+            created=datetime.utcnow(),
+        )
+        manifest.retries.append(retry_info)
+        manifest.status = JobStatus.RUNNING
+        save_manifest(manifest, base_path)
+
+        click.echo()
+        click.echo(click.style("✓ Retry job submitted successfully!", fg="green"))
+        click.echo()
+        click.echo(f"AWS Batch Job ID: {batch_job_id}")
+        click.echo()
+        click.echo("Next steps:")
+        click.echo(f" Check status: dh batch status {job_id}")
+        click.echo(f" View logs: dh batch logs {job_id}")
+
+    except BatchError as e:
+        click.echo(
+            click.style(f"✗ Failed to submit retry job: {e}", fg="red"), err=True
+        )
+        raise SystemExit(1)
+
+
+def _find_incomplete_chunks(job_id: str, base_path: str) -> list[int]:
+    """Find chunks that don't have .done markers."""
+    job_dir = get_job_dir(job_id, base_path)
+    input_dir = job_dir / "input"
+    output_dir = job_dir / "output"
+
+    if not input_dir.exists():
+        return []
+
+    # Find all original input chunks (not resliced ones)
+    input_chunks = sorted(input_dir.glob("chunk_[0-9][0-9][0-9].fasta"))
+    incomplete = []
+
+    for chunk_path in input_chunks:
+        # Extract index from filename (chunk_000.fasta -> 0)
+        idx_str = chunk_path.stem.split("_")[1]
+        idx = int(idx_str)
+
+        # Check for .done marker
+        done_marker = output_dir / f"embed_{idx:03d}.done"
+        if not done_marker.exists():
+            incomplete.append(idx)
+
+    return incomplete
+
+
+def _count_sequences_in_chunks(job_dir: Path, indices: list[int]) -> int:
+    """Count total sequences in the specified chunk files."""
+    input_dir = job_dir / "input"
+    total = 0
+
+    for idx in indices:
+        chunk_path = input_dir / f"chunk_{idx:03d}.fasta"
+        if chunk_path.exists():
+            with open(chunk_path) as f:
+                for line in f:
+                    if line.startswith(">"):
+                        total += 1
+
+    return total
+
+
+def _reslice_failed_chunks(
+    job_dir: Path, indices: list[int], reslice_prefix: str, num_chunks: int
+) -> int:
+    """Concatenate failed chunks and re-split into thinner slices.
+
+    Creates new chunk files named chunk_{prefix}_000.fasta, etc.
+
+    Args:
+        job_dir: Job directory path
+        indices: List of failed chunk indices
+        reslice_prefix: Prefix for new chunk files (e.g., 'r1')
+        num_chunks: Target number of new chunks
+
+    Returns:
+        Actual number of chunks created
+    """
+    from dayhoff_tools.fasta import split_fasta
+    import tempfile
+
+    input_dir = job_dir / "input"
+
+    # Concatenate all failed chunks into a temp file
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".fasta", delete=False
+    ) as tmp_file:
+        tmp_path = tmp_file.name
+        total_seqs = 0
+
+        for idx in indices:
+            chunk_path = input_dir / f"chunk_{idx:03d}.fasta"
+            if chunk_path.exists():
+                with open(chunk_path) as f:
+                    for line in f:
+                        tmp_file.write(line)
+                        if line.startswith(">"):
+                            total_seqs += 1
+
+    try:
+        # Calculate sequences per chunk
+        seqs_per_chunk = max(1, (total_seqs + num_chunks - 1) // num_chunks)
+
+        # Split into new chunks with reslice prefix
+        # split_fasta creates files like: chunk_r1_1.fasta, chunk_r1_2.fasta, etc.
+        actual_chunks = split_fasta(
+            fasta_file=tmp_path,
+            target_folder=str(input_dir),
+            base_name=f"chunk_{reslice_prefix}",
+            sequences_per_file=seqs_per_chunk,
+            max_files=num_chunks,
+            show_progress=True,
+        )
+
+        # Rename to zero-padded indices (chunk_r1_000.fasta, etc.)
+        for i in range(1, actual_chunks + 1):
+            old_name = input_dir / f"chunk_{reslice_prefix}_{i}.fasta"
+            new_name = input_dir / f"chunk_{reslice_prefix}_{i-1:03d}.fasta"
+            if old_name.exists():
+                old_name.rename(new_name)
+
+        return actual_chunks
+
+    finally:
+        # Clean up temp file
+        Path(tmp_path).unlink(missing_ok=True)
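
Two details in _reslice_failed_chunks are easy to miss: the per-chunk sequence count uses ceiling division so no sequence is dropped, and the split output is renamed from 1-based names to zero-padded 0-based names so the worker's {index:03d} lookup matches. A small sketch of just those two steps (file I/O elided; plan_reslice is a hypothetical helper for illustration, not part of the package):

def plan_reslice(total_seqs: int, num_chunks: int, prefix: str) -> tuple[int, dict[str, str]]:
    """Return (seqs_per_chunk, rename_map) describing a resliced retry."""
    # Ceiling division: e.g. 1001 sequences into 40 chunks -> 26 per chunk.
    seqs_per_chunk = max(1, (total_seqs + num_chunks - 1) // num_chunks)
    # split_fasta-style names are 1-based; workers expect zero-padded 0-based indices.
    rename_map = {
        f"chunk_{prefix}_{i}.fasta": f"chunk_{prefix}_{i - 1:03d}.fasta"
        for i in range(1, num_chunks + 1)
    }
    return seqs_per_chunk, rename_map


per_chunk, renames = plan_reslice(total_seqs=1001, num_chunks=40, prefix="r1")
assert per_chunk == 26
assert renames["chunk_r1_1.fasta"] == "chunk_r1_000.fasta"
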

{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/manifest.py
@@ -62,6 +62,12 @@ class RetryInfo(BaseModel):
     retry_id: str = Field(..., description="Retry job ID")
     indices: list[int] = Field(..., description="Array indices being retried")
     batch_job_id: str | None = Field(None, description="AWS Batch job ID for retry")
+    reslice_prefix: str | None = Field(
+        None, description="Reslice prefix if chunks were resliced (e.g., 'r1')"
+    )
+    reslice_count: int | None = Field(
+        None, description="Number of resliced chunks created"
+    )
     created: datetime = Field(default_factory=datetime.utcnow)
 
 
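
The two new optional fields let a manifest record whether a retry ran against the original indices or against renumbered resliced chunks. A trimmed-down standalone Pydantic sketch limited to the fields shown in this hunk (the real model lives in dayhoff_tools.cli.batch.manifest and has more fields):

from datetime import datetime
from pydantic import BaseModel, Field


class RetryInfoSketch(BaseModel):
    """Stand-in for RetryInfo, limited to the fields visible in this diff."""
    retry_id: str = Field(..., description="Retry job ID")
    indices: list[int] = Field(..., description="Array indices being retried")
    batch_job_id: str | None = Field(None, description="AWS Batch job ID for retry")
    reslice_prefix: str | None = Field(None, description="Reslice prefix, e.g. 'r1'")
    reslice_count: int | None = Field(None, description="Number of resliced chunks created")
    created: datetime = Field(default_factory=datetime.utcnow)


# A resliced retry records both the original failed indices and the new chunk count.
retry = RetryInfoSketch(retry_id="job-r1", indices=[5, 12, 27], reslice_prefix="r1", reslice_count=40)
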

{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "dayhoff-tools"
-version = "1.14.9"
+version = "1.14.11"
 description = "Common tools for all the repos at Dayhoff Labs"
 authors = [
     {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}

dayhoff_tools-1.14.9/dayhoff_tools/cli/batch/commands/retry.py (deleted)
@@ -1,146 +0,0 @@
-"""Retry command for re-running failed chunks."""
-
-from datetime import datetime
-
-import click
-
-from ..aws_batch import BatchClient, BatchError
-from ..job_id import generate_job_id
-from ..manifest import (
-    BATCH_JOBS_BASE,
-    JobStatus,
-    RetryInfo,
-    get_job_dir,
-    load_manifest,
-    save_manifest,
-)
-
-
-@click.command()
-@click.argument("job_id")
-@click.option("--indices", help="Specific indices to retry (comma-separated)")
-@click.option(
-    "--dry-run", is_flag=True, help="Show what would be retried without submitting"
-)
-@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
-def retry(job_id, indices, dry_run, base_path):
-    """Retry failed chunks of a batch job.
-
-    Identifies failed array indices and submits a new job to retry only
-    those specific indices.
-
-    \b
-    Examples:
-        dh batch retry dma-embed-20260109-a3f2 # Retry all failed
-        dh batch retry dma-embed-20260109-a3f2 --indices 5,12,27 # Retry specific indices
-        dh batch retry dma-embed-20260109-a3f2 --dry-run # Show what would be retried
-    """
-    # Load manifest
-    try:
-        manifest = load_manifest(job_id, base_path)
-    except FileNotFoundError:
-        click.echo(f"Job not found: {job_id}", err=True)
-        raise SystemExit(1)
-
-    # Get failed indices
-    if indices:
-        # User specified indices
-        retry_indices = [int(i.strip()) for i in indices.split(",")]
-    else:
-        # Auto-detect from .done markers
-        retry_indices = _find_incomplete_chunks(job_id, base_path)
-
-    if not retry_indices:
-        click.echo("No failed or incomplete chunks found. Nothing to retry.")
-        return
-
-    click.echo(f"Found {len(retry_indices)} chunks to retry: {retry_indices}")
-
-    if dry_run:
-        click.echo()
-        click.echo(click.style("Dry run - job not submitted", fg="yellow"))
-        return
-
-    # Check if we have the required info
-    if not manifest.batch:
-        click.echo("Job has no batch configuration.", err=True)
-        raise SystemExit(1)
-
-    # Generate retry job ID
-    retry_id = f"{job_id}-r{len(manifest.retries) + 1}"
-
-    click.echo()
-    click.echo(f"Retry job ID: {retry_id}")
-
-    # Submit retry job
-    try:
-        client = BatchClient()
-        job_dir = get_job_dir(job_id, base_path)
-
-        environment = {
-            "JOB_DIR": str(job_dir),
-            "JOB_ID": job_id,
-            "BATCH_RETRY_INDICES": ",".join(str(i) for i in retry_indices),
-        }
-
-        batch_job_id = client.submit_array_job_with_indices(
-            job_name=retry_id,
-            job_definition=manifest.batch.job_definition or "dayhoff-embed-t5",
-            job_queue=manifest.batch.queue,
-            indices=retry_indices,
-            environment=environment,
-            timeout_seconds=6 * 3600,
-            retry_attempts=3,
-        )
-
-        # Update manifest with retry info
-        retry_info = RetryInfo(
-            retry_id=retry_id,
-            indices=retry_indices,
-            batch_job_id=batch_job_id,
-            created=datetime.utcnow(),
-        )
-        manifest.retries.append(retry_info)
-        manifest.status = JobStatus.RUNNING
-        save_manifest(manifest, base_path)
-
-        click.echo()
-        click.echo(click.style("✓ Retry job submitted successfully!", fg="green"))
-        click.echo()
-        click.echo(f"AWS Batch Job ID: {batch_job_id}")
-        click.echo()
-        click.echo("Next steps:")
-        click.echo(f" Check status: dh batch status {job_id}")
-        click.echo(f" View logs: dh batch logs {job_id}")
-
-    except BatchError as e:
-        click.echo(
-            click.style(f"✗ Failed to submit retry job: {e}", fg="red"), err=True
-        )
-        raise SystemExit(1)
-
-
-def _find_incomplete_chunks(job_id: str, base_path: str) -> list[int]:
-    """Find chunks that don't have .done markers."""
-    job_dir = get_job_dir(job_id, base_path)
-    input_dir = job_dir / "input"
-    output_dir = job_dir / "output"
-
-    if not input_dir.exists():
-        return []
-
-    # Find all input chunks
-    input_chunks = sorted(input_dir.glob("chunk_*.fasta"))
-    incomplete = []
-
-    for chunk_path in input_chunks:
-        # Extract index from filename (chunk_000.fasta -> 0)
-        idx_str = chunk_path.stem.split("_")[1]
-        idx = int(idx_str)
-
-        # Check for .done marker
-        done_marker = output_dir / f"embed_{idx:03d}.done"
-        if not done_marker.exists():
-            incomplete.append(idx)
-
-    return incomplete