dayhoff-tools 1.14.9__tar.gz → 1.14.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/PKG-INFO +1 -1
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/base.py +30 -1
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/boltz.py +93 -26
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/boltz.py +2 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/finalize.py +49 -4
- dayhoff_tools-1.14.11/dayhoff_tools/cli/batch/commands/retry.py +288 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/manifest.py +6 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/pyproject.toml +1 -1
- dayhoff_tools-1.14.9/dayhoff_tools/cli/batch/commands/retry.py +0 -146
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/README.md +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/batch/workers/embed_t5.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/chemistry/standardizer.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/chemistry/utils.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/aws_batch.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/cancel.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/clean.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/embed_t5.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/list_jobs.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/local.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/logs.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/status.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/submit.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/job_id.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/cloud_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/shared.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/__init__.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/api_client.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/auth.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/engine_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/progress.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/demo.sh +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/ssh_config.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/github_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/main.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/swarm_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/utility_commands.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/base.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/deploy_aws.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/deploy_utils.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/job_runner.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/processors.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/deployment/swarm.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/embedders.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/fasta.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/file_ops.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/h5.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/gcp.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/gtdb.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/kegg.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/mmseqs.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/structure.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/intake/uniprot.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/logs.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/sqlite.py +0 -0
- {dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/warehouse.py +0 -0
|
@@ -30,12 +30,18 @@ def get_array_index() -> int:
|
|
|
30
30
|
|
|
31
31
|
For array jobs, reads AWS_BATCH_JOB_ARRAY_INDEX.
|
|
32
32
|
For retry jobs, maps from BATCH_RETRY_INDICES.
|
|
33
|
+
For resliced retry jobs, uses the raw array index (chunks are renumbered).
|
|
33
34
|
For single jobs (array_size=1), defaults to 0.
|
|
34
35
|
|
|
35
36
|
Returns:
|
|
36
37
|
The array index this worker should process
|
|
37
38
|
"""
|
|
38
|
-
#
|
|
39
|
+
# For resliced retries, use raw array index (chunks are renumbered 0..N-1)
|
|
40
|
+
if os.environ.get("RESLICE_PREFIX"):
|
|
41
|
+
array_idx = os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0")
|
|
42
|
+
return int(array_idx)
|
|
43
|
+
|
|
44
|
+
# Check for retry mode (non-resliced)
|
|
39
45
|
retry_indices = os.environ.get("BATCH_RETRY_INDICES")
|
|
40
46
|
if retry_indices:
|
|
41
47
|
# In retry mode, we have a list of indices and use array index to pick
|
|
@@ -69,6 +75,20 @@ def get_job_dir() -> Path:
|
|
|
69
75
|
return Path(job_dir)
|
|
70
76
|
|
|
71
77
|
|
|
78
|
+
def get_reslice_prefix() -> str | None:
|
|
79
|
+
"""Get the reslice prefix from environment if set.
|
|
80
|
+
|
|
81
|
+
When RESLICE_PREFIX is set (e.g., 'r1'), files are named like:
|
|
82
|
+
- Input: chunk_r1_000.fasta
|
|
83
|
+
- Output: embed_r1_000.h5
|
|
84
|
+
- Done marker: embed_r1_000.done
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
The reslice prefix or None if not in reslice mode
|
|
88
|
+
"""
|
|
89
|
+
return os.environ.get("RESLICE_PREFIX")
|
|
90
|
+
|
|
91
|
+
|
|
72
92
|
def get_input_file(index: int, job_dir: Path, prefix: str = "chunk") -> Path:
|
|
73
93
|
"""Get the input file path for a given index.
|
|
74
94
|
|
|
@@ -80,6 +100,9 @@ def get_input_file(index: int, job_dir: Path, prefix: str = "chunk") -> Path:
|
|
|
80
100
|
Returns:
|
|
81
101
|
Path to input file
|
|
82
102
|
"""
|
|
103
|
+
reslice = get_reslice_prefix()
|
|
104
|
+
if reslice:
|
|
105
|
+
return job_dir / "input" / f"{prefix}_{reslice}_{index:03d}.fasta"
|
|
83
106
|
return job_dir / "input" / f"{prefix}_{index:03d}.fasta"
|
|
84
107
|
|
|
85
108
|
|
|
@@ -97,6 +120,9 @@ def get_output_file(
|
|
|
97
120
|
Returns:
|
|
98
121
|
Path to output file
|
|
99
122
|
"""
|
|
123
|
+
reslice = get_reslice_prefix()
|
|
124
|
+
if reslice:
|
|
125
|
+
return job_dir / "output" / f"{prefix}_{reslice}_{index:03d}{suffix}"
|
|
100
126
|
return job_dir / "output" / f"{prefix}_{index:03d}{suffix}"
|
|
101
127
|
|
|
102
128
|
|
|
@@ -111,6 +137,9 @@ def get_done_marker(index: int, job_dir: Path, prefix: str = "embed") -> Path:
|
|
|
111
137
|
Returns:
|
|
112
138
|
Path to done marker file
|
|
113
139
|
"""
|
|
140
|
+
reslice = get_reslice_prefix()
|
|
141
|
+
if reslice:
|
|
142
|
+
return job_dir / "output" / f"{prefix}_{reslice}_{index:03d}.done"
|
|
114
143
|
return job_dir / "output" / f"{prefix}_{index:03d}.done"
|
|
115
144
|
|
|
116
145
|
|
|
@@ -341,10 +341,31 @@ class BoltzProcessor:
|
|
|
341
341
|
return expected_output_dir
|
|
342
342
|
|
|
343
343
|
|
|
344
|
+
def _get_done_marker_for_file(job_dir: Path, file_stem: str) -> Path:
|
|
345
|
+
"""Get the done marker path for a specific input file."""
|
|
346
|
+
return job_dir / "output" / f"boltz_{file_stem}.done"
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _check_file_complete(job_dir: Path, file_stem: str) -> bool:
|
|
350
|
+
"""Check if a specific file has been processed."""
|
|
351
|
+
return _get_done_marker_for_file(job_dir, file_stem).exists()
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _mark_file_complete(job_dir: Path, file_stem: str):
|
|
355
|
+
"""Mark a specific file as complete."""
|
|
356
|
+
done_marker = _get_done_marker_for_file(job_dir, file_stem)
|
|
357
|
+
done_marker.parent.mkdir(parents=True, exist_ok=True)
|
|
358
|
+
done_marker.touch()
|
|
359
|
+
logger.info(f"File {file_stem} marked complete: {done_marker}")
|
|
360
|
+
|
|
361
|
+
|
|
344
362
|
def main():
|
|
345
|
-
"""Boltz worker main entrypoint for AWS Batch array jobs.
|
|
363
|
+
"""Boltz worker main entrypoint for AWS Batch array jobs.
|
|
364
|
+
|
|
365
|
+
Each worker processes multiple files based on array index and total workers.
|
|
366
|
+
With N files and M workers, worker i processes files where file_index % M == i.
|
|
367
|
+
"""
|
|
346
368
|
from .base import (
|
|
347
|
-
check_already_complete,
|
|
348
369
|
configure_worker_logging,
|
|
349
370
|
get_array_index,
|
|
350
371
|
get_job_dir,
|
|
@@ -358,34 +379,36 @@ def main():
|
|
|
358
379
|
# Get configuration from environment
|
|
359
380
|
index = get_array_index()
|
|
360
381
|
job_dir = get_job_dir()
|
|
382
|
+
array_size = int(os.environ.get("BATCH_ARRAY_SIZE", "1"))
|
|
383
|
+
num_files = int(os.environ.get("BATCH_NUM_FILES", "0"))
|
|
361
384
|
|
|
362
385
|
logger.info(f"Worker configuration:")
|
|
363
386
|
logger.info(f" Array index: {index}")
|
|
387
|
+
logger.info(f" Array size: {array_size}")
|
|
388
|
+
logger.info(f" Total files: {num_files}")
|
|
364
389
|
logger.info(f" Job directory: {job_dir}")
|
|
365
390
|
|
|
366
|
-
#
|
|
367
|
-
if check_already_complete(index, job_dir, prefix="boltz"):
|
|
368
|
-
logger.info("Exiting - complex already processed")
|
|
369
|
-
return
|
|
370
|
-
|
|
371
|
-
# Find input file by index
|
|
391
|
+
# Find all input files
|
|
372
392
|
input_dir = job_dir / "input"
|
|
373
393
|
input_files = sorted(input_dir.glob("*.yaml"))
|
|
394
|
+
total_files = len(input_files)
|
|
374
395
|
|
|
375
|
-
if
|
|
376
|
-
logger.error(
|
|
377
|
-
|
|
378
|
-
)
|
|
379
|
-
raise RuntimeError(f"Index {index} out of range")
|
|
396
|
+
if total_files == 0:
|
|
397
|
+
logger.error("No input files found")
|
|
398
|
+
raise RuntimeError("No input files found")
|
|
380
399
|
|
|
381
|
-
|
|
382
|
-
|
|
400
|
+
# Calculate which files this worker should process
|
|
401
|
+
# Worker i processes files where file_index % array_size == index
|
|
402
|
+
my_files = [f for i, f in enumerate(input_files) if i % array_size == index]
|
|
383
403
|
|
|
384
|
-
|
|
385
|
-
output_base = input_file.stem
|
|
386
|
-
output_dir = job_dir / "output" / output_base
|
|
404
|
+
logger.info(f" Files assigned to this worker: {len(my_files)}")
|
|
387
405
|
|
|
388
|
-
|
|
406
|
+
if not my_files:
|
|
407
|
+
logger.info("No files assigned to this worker, exiting successfully")
|
|
408
|
+
mark_complete(index, job_dir, prefix="boltz")
|
|
409
|
+
return
|
|
410
|
+
|
|
411
|
+
# Get MSA directories (shared across all files)
|
|
389
412
|
job_msa_dir = job_dir / "msas"
|
|
390
413
|
global_msa_dir = Path(os.environ.get("MSA_DIR", "/primordial/.cache/msas"))
|
|
391
414
|
|
|
@@ -408,7 +431,7 @@ def main():
|
|
|
408
431
|
if boltz_options:
|
|
409
432
|
logger.info(f" Boltz options: {boltz_options}")
|
|
410
433
|
|
|
411
|
-
# Create processor
|
|
434
|
+
# Create processor (reused for all files)
|
|
412
435
|
processor = BoltzProcessor(
|
|
413
436
|
num_workers=None, # Auto-detect
|
|
414
437
|
boltz_options=boltz_options,
|
|
@@ -416,16 +439,60 @@ def main():
|
|
|
416
439
|
cache_dir=cache_dir,
|
|
417
440
|
)
|
|
418
441
|
|
|
419
|
-
#
|
|
420
|
-
|
|
442
|
+
# Process each assigned file
|
|
443
|
+
completed = 0
|
|
444
|
+
failed = 0
|
|
445
|
+
|
|
446
|
+
for file_idx, input_file in enumerate(my_files):
|
|
447
|
+
file_stem = input_file.stem
|
|
448
|
+
|
|
449
|
+
# Check if this file is already complete (idempotency)
|
|
450
|
+
if _check_file_complete(job_dir, file_stem):
|
|
451
|
+
logger.info(
|
|
452
|
+
f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
|
|
453
|
+
"already complete, skipping"
|
|
454
|
+
)
|
|
455
|
+
completed += 1
|
|
456
|
+
continue
|
|
457
|
+
|
|
458
|
+
logger.info(
|
|
459
|
+
f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}..."
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
try:
|
|
463
|
+
# Determine output directory
|
|
464
|
+
output_dir = job_dir / "output" / file_stem
|
|
465
|
+
output_dir.parent.mkdir(parents=True, exist_ok=True)
|
|
466
|
+
|
|
467
|
+
result_dir = processor.run(str(input_file), str(output_dir))
|
|
468
|
+
|
|
469
|
+
# Mark this file as complete
|
|
470
|
+
_mark_file_complete(job_dir, file_stem)
|
|
471
|
+
|
|
472
|
+
logger.info(
|
|
473
|
+
f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
|
|
474
|
+
f"completed successfully -> {result_dir}"
|
|
475
|
+
)
|
|
476
|
+
completed += 1
|
|
477
|
+
|
|
478
|
+
except Exception as e:
|
|
479
|
+
logger.error(
|
|
480
|
+
f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
|
|
481
|
+
f"failed with error: {e}"
|
|
482
|
+
)
|
|
483
|
+
failed += 1
|
|
484
|
+
# Continue processing other files even if one fails
|
|
421
485
|
|
|
422
|
-
|
|
486
|
+
# Summary
|
|
487
|
+
logger.info(f"Worker {index} finished: {completed} completed, {failed} failed")
|
|
423
488
|
|
|
424
|
-
# Mark as complete
|
|
489
|
+
# Mark worker as complete
|
|
425
490
|
mark_complete(index, job_dir, prefix="boltz")
|
|
426
491
|
|
|
427
|
-
|
|
428
|
-
|
|
492
|
+
if failed > 0:
|
|
493
|
+
logger.warning(f"{failed} file(s) failed to process")
|
|
494
|
+
# Don't exit with error - some files succeeded and are marked complete
|
|
495
|
+
# The failed files can be retried later
|
|
429
496
|
|
|
430
497
|
except Exception as e:
|
|
431
498
|
logger.exception(f"Worker failed with error: {e}")
|
|
@@ -232,6 +232,8 @@ def _submit_batch_job(
|
|
|
232
232
|
"JOB_ID": job_id,
|
|
233
233
|
"BOLTZ_CACHE": "/primordial/.cache/boltz",
|
|
234
234
|
"MSA_DIR": "/primordial/.cache/msas",
|
|
235
|
+
"BATCH_ARRAY_SIZE": str(array_size),
|
|
236
|
+
"BATCH_NUM_FILES": str(num_files),
|
|
235
237
|
}
|
|
236
238
|
|
|
237
239
|
batch_job_id = client.submit_job(
|
|
@@ -118,7 +118,12 @@ def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
|
|
|
118
118
|
|
|
119
119
|
|
|
120
120
|
def _check_completion(job_id: str, base_path: str) -> list[int]:
|
|
121
|
-
"""Check which chunks are incomplete (no .done marker).
|
|
121
|
+
"""Check which chunks are incomplete (no .done marker).
|
|
122
|
+
|
|
123
|
+
Handles both original chunks (chunk_000.fasta) and resliced chunks
|
|
124
|
+
(chunk_r1_000.fasta). For original chunks that were resliced in a retry,
|
|
125
|
+
checks if all resliced chunks completed.
|
|
126
|
+
"""
|
|
122
127
|
job_dir = get_job_dir(job_id, base_path)
|
|
123
128
|
input_dir = job_dir / "input"
|
|
124
129
|
output_dir = job_dir / "output"
|
|
@@ -126,13 +131,53 @@ def _check_completion(job_id: str, base_path: str) -> list[int]:
|
|
|
126
131
|
if not input_dir.exists():
|
|
127
132
|
return []
|
|
128
133
|
|
|
134
|
+
# Load manifest to check for resliced retries
|
|
135
|
+
try:
|
|
136
|
+
manifest = load_manifest(job_id, base_path)
|
|
137
|
+
resliced_indices: set[int] = set()
|
|
138
|
+
reslice_info: dict[str, int] = {} # prefix -> expected count
|
|
139
|
+
|
|
140
|
+
for retry in manifest.retries:
|
|
141
|
+
if retry.reslice_prefix and retry.reslice_count:
|
|
142
|
+
resliced_indices.update(retry.indices)
|
|
143
|
+
reslice_info[retry.reslice_prefix] = retry.reslice_count
|
|
144
|
+
except FileNotFoundError:
|
|
145
|
+
resliced_indices = set()
|
|
146
|
+
reslice_info = {}
|
|
147
|
+
|
|
129
148
|
incomplete = []
|
|
130
|
-
|
|
149
|
+
|
|
150
|
+
# Check original chunks (chunk_000.fasta pattern)
|
|
151
|
+
for chunk_path in sorted(input_dir.glob("chunk_[0-9][0-9][0-9].fasta")):
|
|
131
152
|
idx_str = chunk_path.stem.split("_")[1]
|
|
132
153
|
idx = int(idx_str)
|
|
154
|
+
|
|
155
|
+
# Check for original done marker
|
|
133
156
|
done_marker = output_dir / f"embed_{idx:03d}.done"
|
|
134
|
-
if not done_marker.exists():
|
|
135
|
-
|
|
157
|
+
if done_marker.exists():
|
|
158
|
+
continue
|
|
159
|
+
|
|
160
|
+
# Check if this chunk was resliced
|
|
161
|
+
if idx in resliced_indices:
|
|
162
|
+
# Find which retry covered this index and check if complete
|
|
163
|
+
is_covered = False
|
|
164
|
+
for retry in manifest.retries:
|
|
165
|
+
if (
|
|
166
|
+
retry.reslice_prefix
|
|
167
|
+
and retry.reslice_count
|
|
168
|
+
and idx in retry.indices
|
|
169
|
+
):
|
|
170
|
+
# Check if all resliced chunks for this retry completed
|
|
171
|
+
done_count = len(
|
|
172
|
+
list(output_dir.glob(f"embed_{retry.reslice_prefix}_*.done"))
|
|
173
|
+
)
|
|
174
|
+
if done_count >= retry.reslice_count:
|
|
175
|
+
is_covered = True
|
|
176
|
+
break
|
|
177
|
+
if is_covered:
|
|
178
|
+
continue
|
|
179
|
+
|
|
180
|
+
incomplete.append(idx)
|
|
136
181
|
|
|
137
182
|
return incomplete
|
|
138
183
|
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""Retry command for re-running failed chunks."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from ..aws_batch import BatchClient, BatchError
|
|
9
|
+
from ..job_id import generate_job_id
|
|
10
|
+
from ..manifest import (
|
|
11
|
+
BATCH_JOBS_BASE,
|
|
12
|
+
JobStatus,
|
|
13
|
+
RetryInfo,
|
|
14
|
+
get_job_dir,
|
|
15
|
+
load_manifest,
|
|
16
|
+
save_manifest,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@click.command()
|
|
21
|
+
@click.argument("job_id")
|
|
22
|
+
@click.option("--indices", help="Specific indices to retry (comma-separated)")
|
|
23
|
+
@click.option(
|
|
24
|
+
"--queue",
|
|
25
|
+
help="Override job queue (e.g., 't4-1x' for on-demand instead of spot)",
|
|
26
|
+
)
|
|
27
|
+
@click.option(
|
|
28
|
+
"--reslice",
|
|
29
|
+
type=int,
|
|
30
|
+
help="Reslice failed chunks into N thinner chunks (reduces interruption risk)",
|
|
31
|
+
)
|
|
32
|
+
@click.option(
|
|
33
|
+
"--dry-run", is_flag=True, help="Show what would be retried without submitting"
|
|
34
|
+
)
|
|
35
|
+
@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
|
|
36
|
+
def retry(job_id, indices, queue, reslice, dry_run, base_path):
|
|
37
|
+
"""Retry failed chunks of a batch job.
|
|
38
|
+
|
|
39
|
+
Identifies failed array indices and submits a new job to retry only
|
|
40
|
+
those specific indices. Outputs go to the same job directory, so
|
|
41
|
+
finalization works normally after retries complete.
|
|
42
|
+
|
|
43
|
+
The --reslice option concatenates failed chunks and re-splits them into
|
|
44
|
+
thinner slices, reducing the time per worker and thus the risk of spot
|
|
45
|
+
interruptions. Resliced outputs are named with a prefix (e.g., embed_r1_000.h5)
|
|
46
|
+
and are automatically included in finalization.
|
|
47
|
+
|
|
48
|
+
\b
|
|
49
|
+
Examples:
|
|
50
|
+
dh batch retry dma-embed-20260109-a3f2 # Retry all failed
|
|
51
|
+
dh batch retry dma-embed-20260109-a3f2 --indices 5,12,27 # Retry specific indices
|
|
52
|
+
dh batch retry dma-embed-20260109-a3f2 --queue t4-1x # Use on-demand (no spot interruptions)
|
|
53
|
+
dh batch retry dma-embed-20260109-a3f2 --reslice 40 # Reslice into 40 thinner chunks
|
|
54
|
+
dh batch retry dma-embed-20260109-a3f2 --dry-run # Show what would be retried
|
|
55
|
+
"""
|
|
56
|
+
# Load manifest
|
|
57
|
+
try:
|
|
58
|
+
manifest = load_manifest(job_id, base_path)
|
|
59
|
+
except FileNotFoundError:
|
|
60
|
+
click.echo(f"Job not found: {job_id}", err=True)
|
|
61
|
+
raise SystemExit(1)
|
|
62
|
+
|
|
63
|
+
# Get failed indices
|
|
64
|
+
if indices:
|
|
65
|
+
# User specified indices
|
|
66
|
+
retry_indices = [int(i.strip()) for i in indices.split(",")]
|
|
67
|
+
else:
|
|
68
|
+
# Auto-detect from .done markers
|
|
69
|
+
retry_indices = _find_incomplete_chunks(job_id, base_path)
|
|
70
|
+
|
|
71
|
+
if not retry_indices:
|
|
72
|
+
click.echo("No failed or incomplete chunks found. Nothing to retry.")
|
|
73
|
+
return
|
|
74
|
+
|
|
75
|
+
click.echo(f"Found {len(retry_indices)} chunks to retry: {retry_indices}")
|
|
76
|
+
|
|
77
|
+
# Check if we have the required info
|
|
78
|
+
if not manifest.batch:
|
|
79
|
+
click.echo("Job has no batch configuration.", err=True)
|
|
80
|
+
raise SystemExit(1)
|
|
81
|
+
|
|
82
|
+
# Generate retry job ID and reslice prefix
|
|
83
|
+
retry_num = len(manifest.retries) + 1
|
|
84
|
+
retry_id = f"{job_id}-r{retry_num}"
|
|
85
|
+
reslice_prefix = f"r{retry_num}" if reslice else None
|
|
86
|
+
|
|
87
|
+
job_dir = get_job_dir(job_id, base_path)
|
|
88
|
+
|
|
89
|
+
if reslice:
|
|
90
|
+
# Count sequences in failed chunks to estimate split
|
|
91
|
+
total_seqs = _count_sequences_in_chunks(job_dir, retry_indices)
|
|
92
|
+
seqs_per_chunk = max(1, total_seqs // reslice)
|
|
93
|
+
click.echo(f"Total sequences in failed chunks: {total_seqs:,}")
|
|
94
|
+
click.echo(f"Reslicing into {reslice} chunks (~{seqs_per_chunk:,} seqs each)")
|
|
95
|
+
|
|
96
|
+
if dry_run:
|
|
97
|
+
click.echo()
|
|
98
|
+
click.echo(click.style("Dry run - job not submitted", fg="yellow"))
|
|
99
|
+
return
|
|
100
|
+
|
|
101
|
+
click.echo()
|
|
102
|
+
click.echo(f"Retry job ID: {retry_id}")
|
|
103
|
+
|
|
104
|
+
# Handle reslicing if requested
|
|
105
|
+
if reslice:
|
|
106
|
+
click.echo(f"Reslice prefix: {reslice_prefix}")
|
|
107
|
+
actual_chunks = _reslice_failed_chunks(
|
|
108
|
+
job_dir, retry_indices, reslice_prefix, reslice
|
|
109
|
+
)
|
|
110
|
+
click.echo(f"Created {actual_chunks} resliced chunks")
|
|
111
|
+
array_size = actual_chunks
|
|
112
|
+
else:
|
|
113
|
+
array_size = len(retry_indices)
|
|
114
|
+
|
|
115
|
+
# Submit retry job
|
|
116
|
+
try:
|
|
117
|
+
client = BatchClient()
|
|
118
|
+
|
|
119
|
+
environment = {
|
|
120
|
+
"JOB_DIR": str(job_dir),
|
|
121
|
+
"JOB_ID": job_id,
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
# Use provided queue or fall back to original
|
|
125
|
+
job_queue = queue or manifest.batch.queue
|
|
126
|
+
if queue and queue != manifest.batch.queue:
|
|
127
|
+
click.echo(f"Using queue: {job_queue} (original: {manifest.batch.queue})")
|
|
128
|
+
|
|
129
|
+
if reslice:
|
|
130
|
+
# Resliced retry: use RESLICE_PREFIX, sequential indices 0..N-1
|
|
131
|
+
environment["RESLICE_PREFIX"] = reslice_prefix
|
|
132
|
+
batch_job_id = client.submit_job(
|
|
133
|
+
job_name=retry_id,
|
|
134
|
+
job_definition=manifest.batch.job_definition or "dayhoff-embed-t5",
|
|
135
|
+
job_queue=job_queue,
|
|
136
|
+
array_size=array_size,
|
|
137
|
+
environment=environment,
|
|
138
|
+
timeout_seconds=6 * 3600,
|
|
139
|
+
retry_attempts=5,
|
|
140
|
+
)
|
|
141
|
+
else:
|
|
142
|
+
# Standard retry: use BATCH_RETRY_INDICES mapping
|
|
143
|
+
environment["BATCH_RETRY_INDICES"] = ",".join(str(i) for i in retry_indices)
|
|
144
|
+
batch_job_id = client.submit_array_job_with_indices(
|
|
145
|
+
job_name=retry_id,
|
|
146
|
+
job_definition=manifest.batch.job_definition or "dayhoff-embed-t5",
|
|
147
|
+
job_queue=job_queue,
|
|
148
|
+
indices=retry_indices,
|
|
149
|
+
environment=environment,
|
|
150
|
+
timeout_seconds=6 * 3600,
|
|
151
|
+
retry_attempts=5,
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# Update manifest with retry info
|
|
155
|
+
retry_info = RetryInfo(
|
|
156
|
+
retry_id=retry_id,
|
|
157
|
+
indices=retry_indices,
|
|
158
|
+
batch_job_id=batch_job_id,
|
|
159
|
+
reslice_prefix=reslice_prefix,
|
|
160
|
+
reslice_count=array_size if reslice else None,
|
|
161
|
+
created=datetime.utcnow(),
|
|
162
|
+
)
|
|
163
|
+
manifest.retries.append(retry_info)
|
|
164
|
+
manifest.status = JobStatus.RUNNING
|
|
165
|
+
save_manifest(manifest, base_path)
|
|
166
|
+
|
|
167
|
+
click.echo()
|
|
168
|
+
click.echo(click.style("✓ Retry job submitted successfully!", fg="green"))
|
|
169
|
+
click.echo()
|
|
170
|
+
click.echo(f"AWS Batch Job ID: {batch_job_id}")
|
|
171
|
+
click.echo()
|
|
172
|
+
click.echo("Next steps:")
|
|
173
|
+
click.echo(f" Check status: dh batch status {job_id}")
|
|
174
|
+
click.echo(f" View logs: dh batch logs {job_id}")
|
|
175
|
+
|
|
176
|
+
except BatchError as e:
|
|
177
|
+
click.echo(
|
|
178
|
+
click.style(f"✗ Failed to submit retry job: {e}", fg="red"), err=True
|
|
179
|
+
)
|
|
180
|
+
raise SystemExit(1)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _find_incomplete_chunks(job_id: str, base_path: str) -> list[int]:
|
|
184
|
+
"""Find chunks that don't have .done markers."""
|
|
185
|
+
job_dir = get_job_dir(job_id, base_path)
|
|
186
|
+
input_dir = job_dir / "input"
|
|
187
|
+
output_dir = job_dir / "output"
|
|
188
|
+
|
|
189
|
+
if not input_dir.exists():
|
|
190
|
+
return []
|
|
191
|
+
|
|
192
|
+
# Find all original input chunks (not resliced ones)
|
|
193
|
+
input_chunks = sorted(input_dir.glob("chunk_[0-9][0-9][0-9].fasta"))
|
|
194
|
+
incomplete = []
|
|
195
|
+
|
|
196
|
+
for chunk_path in input_chunks:
|
|
197
|
+
# Extract index from filename (chunk_000.fasta -> 0)
|
|
198
|
+
idx_str = chunk_path.stem.split("_")[1]
|
|
199
|
+
idx = int(idx_str)
|
|
200
|
+
|
|
201
|
+
# Check for .done marker
|
|
202
|
+
done_marker = output_dir / f"embed_{idx:03d}.done"
|
|
203
|
+
if not done_marker.exists():
|
|
204
|
+
incomplete.append(idx)
|
|
205
|
+
|
|
206
|
+
return incomplete
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _count_sequences_in_chunks(job_dir: Path, indices: list[int]) -> int:
|
|
210
|
+
"""Count total sequences in the specified chunk files."""
|
|
211
|
+
input_dir = job_dir / "input"
|
|
212
|
+
total = 0
|
|
213
|
+
|
|
214
|
+
for idx in indices:
|
|
215
|
+
chunk_path = input_dir / f"chunk_{idx:03d}.fasta"
|
|
216
|
+
if chunk_path.exists():
|
|
217
|
+
with open(chunk_path) as f:
|
|
218
|
+
for line in f:
|
|
219
|
+
if line.startswith(">"):
|
|
220
|
+
total += 1
|
|
221
|
+
|
|
222
|
+
return total
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _reslice_failed_chunks(
|
|
226
|
+
job_dir: Path, indices: list[int], reslice_prefix: str, num_chunks: int
|
|
227
|
+
) -> int:
|
|
228
|
+
"""Concatenate failed chunks and re-split into thinner slices.
|
|
229
|
+
|
|
230
|
+
Creates new chunk files named chunk_{prefix}_000.fasta, etc.
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
job_dir: Job directory path
|
|
234
|
+
indices: List of failed chunk indices
|
|
235
|
+
reslice_prefix: Prefix for new chunk files (e.g., 'r1')
|
|
236
|
+
num_chunks: Target number of new chunks
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
Actual number of chunks created
|
|
240
|
+
"""
|
|
241
|
+
from dayhoff_tools.fasta import split_fasta
|
|
242
|
+
import tempfile
|
|
243
|
+
|
|
244
|
+
input_dir = job_dir / "input"
|
|
245
|
+
|
|
246
|
+
# Concatenate all failed chunks into a temp file
|
|
247
|
+
with tempfile.NamedTemporaryFile(
|
|
248
|
+
mode="w", suffix=".fasta", delete=False
|
|
249
|
+
) as tmp_file:
|
|
250
|
+
tmp_path = tmp_file.name
|
|
251
|
+
total_seqs = 0
|
|
252
|
+
|
|
253
|
+
for idx in indices:
|
|
254
|
+
chunk_path = input_dir / f"chunk_{idx:03d}.fasta"
|
|
255
|
+
if chunk_path.exists():
|
|
256
|
+
with open(chunk_path) as f:
|
|
257
|
+
for line in f:
|
|
258
|
+
tmp_file.write(line)
|
|
259
|
+
if line.startswith(">"):
|
|
260
|
+
total_seqs += 1
|
|
261
|
+
|
|
262
|
+
try:
|
|
263
|
+
# Calculate sequences per chunk
|
|
264
|
+
seqs_per_chunk = max(1, (total_seqs + num_chunks - 1) // num_chunks)
|
|
265
|
+
|
|
266
|
+
# Split into new chunks with reslice prefix
|
|
267
|
+
# split_fasta creates files like: chunk_r1_1.fasta, chunk_r1_2.fasta, etc.
|
|
268
|
+
actual_chunks = split_fasta(
|
|
269
|
+
fasta_file=tmp_path,
|
|
270
|
+
target_folder=str(input_dir),
|
|
271
|
+
base_name=f"chunk_{reslice_prefix}",
|
|
272
|
+
sequences_per_file=seqs_per_chunk,
|
|
273
|
+
max_files=num_chunks,
|
|
274
|
+
show_progress=True,
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
# Rename to zero-padded indices (chunk_r1_000.fasta, etc.)
|
|
278
|
+
for i in range(1, actual_chunks + 1):
|
|
279
|
+
old_name = input_dir / f"chunk_{reslice_prefix}_{i}.fasta"
|
|
280
|
+
new_name = input_dir / f"chunk_{reslice_prefix}_{i-1:03d}.fasta"
|
|
281
|
+
if old_name.exists():
|
|
282
|
+
old_name.rename(new_name)
|
|
283
|
+
|
|
284
|
+
return actual_chunks
|
|
285
|
+
|
|
286
|
+
finally:
|
|
287
|
+
# Clean up temp file
|
|
288
|
+
Path(tmp_path).unlink(missing_ok=True)
|
|
@@ -62,6 +62,12 @@ class RetryInfo(BaseModel):
|
|
|
62
62
|
retry_id: str = Field(..., description="Retry job ID")
|
|
63
63
|
indices: list[int] = Field(..., description="Array indices being retried")
|
|
64
64
|
batch_job_id: str | None = Field(None, description="AWS Batch job ID for retry")
|
|
65
|
+
reslice_prefix: str | None = Field(
|
|
66
|
+
None, description="Reslice prefix if chunks were resliced (e.g., 'r1')"
|
|
67
|
+
)
|
|
68
|
+
reslice_count: int | None = Field(
|
|
69
|
+
None, description="Number of resliced chunks created"
|
|
70
|
+
)
|
|
65
71
|
created: datetime = Field(default_factory=datetime.utcnow)
|
|
66
72
|
|
|
67
73
|
|
|
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "dayhoff-tools"
|
|
8
|
-
version = "1.14.9"
|
|
8
|
+
version = "1.14.11"
|
|
9
9
|
description = "Common tools for all the repos at Dayhoff Labs"
|
|
10
10
|
authors = [
|
|
11
11
|
{name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}
|
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
"""Retry command for re-running failed chunks."""
|
|
2
|
-
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
|
|
5
|
-
import click
|
|
6
|
-
|
|
7
|
-
from ..aws_batch import BatchClient, BatchError
|
|
8
|
-
from ..job_id import generate_job_id
|
|
9
|
-
from ..manifest import (
|
|
10
|
-
BATCH_JOBS_BASE,
|
|
11
|
-
JobStatus,
|
|
12
|
-
RetryInfo,
|
|
13
|
-
get_job_dir,
|
|
14
|
-
load_manifest,
|
|
15
|
-
save_manifest,
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@click.command()
|
|
20
|
-
@click.argument("job_id")
|
|
21
|
-
@click.option("--indices", help="Specific indices to retry (comma-separated)")
|
|
22
|
-
@click.option(
|
|
23
|
-
"--dry-run", is_flag=True, help="Show what would be retried without submitting"
|
|
24
|
-
)
|
|
25
|
-
@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
|
|
26
|
-
def retry(job_id, indices, dry_run, base_path):
|
|
27
|
-
"""Retry failed chunks of a batch job.
|
|
28
|
-
|
|
29
|
-
Identifies failed array indices and submits a new job to retry only
|
|
30
|
-
those specific indices.
|
|
31
|
-
|
|
32
|
-
\b
|
|
33
|
-
Examples:
|
|
34
|
-
dh batch retry dma-embed-20260109-a3f2 # Retry all failed
|
|
35
|
-
dh batch retry dma-embed-20260109-a3f2 --indices 5,12,27 # Retry specific indices
|
|
36
|
-
dh batch retry dma-embed-20260109-a3f2 --dry-run # Show what would be retried
|
|
37
|
-
"""
|
|
38
|
-
# Load manifest
|
|
39
|
-
try:
|
|
40
|
-
manifest = load_manifest(job_id, base_path)
|
|
41
|
-
except FileNotFoundError:
|
|
42
|
-
click.echo(f"Job not found: {job_id}", err=True)
|
|
43
|
-
raise SystemExit(1)
|
|
44
|
-
|
|
45
|
-
# Get failed indices
|
|
46
|
-
if indices:
|
|
47
|
-
# User specified indices
|
|
48
|
-
retry_indices = [int(i.strip()) for i in indices.split(",")]
|
|
49
|
-
else:
|
|
50
|
-
# Auto-detect from .done markers
|
|
51
|
-
retry_indices = _find_incomplete_chunks(job_id, base_path)
|
|
52
|
-
|
|
53
|
-
if not retry_indices:
|
|
54
|
-
click.echo("No failed or incomplete chunks found. Nothing to retry.")
|
|
55
|
-
return
|
|
56
|
-
|
|
57
|
-
click.echo(f"Found {len(retry_indices)} chunks to retry: {retry_indices}")
|
|
58
|
-
|
|
59
|
-
if dry_run:
|
|
60
|
-
click.echo()
|
|
61
|
-
click.echo(click.style("Dry run - job not submitted", fg="yellow"))
|
|
62
|
-
return
|
|
63
|
-
|
|
64
|
-
# Check if we have the required info
|
|
65
|
-
if not manifest.batch:
|
|
66
|
-
click.echo("Job has no batch configuration.", err=True)
|
|
67
|
-
raise SystemExit(1)
|
|
68
|
-
|
|
69
|
-
# Generate retry job ID
|
|
70
|
-
retry_id = f"{job_id}-r{len(manifest.retries) + 1}"
|
|
71
|
-
|
|
72
|
-
click.echo()
|
|
73
|
-
click.echo(f"Retry job ID: {retry_id}")
|
|
74
|
-
|
|
75
|
-
# Submit retry job
|
|
76
|
-
try:
|
|
77
|
-
client = BatchClient()
|
|
78
|
-
job_dir = get_job_dir(job_id, base_path)
|
|
79
|
-
|
|
80
|
-
environment = {
|
|
81
|
-
"JOB_DIR": str(job_dir),
|
|
82
|
-
"JOB_ID": job_id,
|
|
83
|
-
"BATCH_RETRY_INDICES": ",".join(str(i) for i in retry_indices),
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
batch_job_id = client.submit_array_job_with_indices(
|
|
87
|
-
job_name=retry_id,
|
|
88
|
-
job_definition=manifest.batch.job_definition or "dayhoff-embed-t5",
|
|
89
|
-
job_queue=manifest.batch.queue,
|
|
90
|
-
indices=retry_indices,
|
|
91
|
-
environment=environment,
|
|
92
|
-
timeout_seconds=6 * 3600,
|
|
93
|
-
retry_attempts=3,
|
|
94
|
-
)
|
|
95
|
-
|
|
96
|
-
# Update manifest with retry info
|
|
97
|
-
retry_info = RetryInfo(
|
|
98
|
-
retry_id=retry_id,
|
|
99
|
-
indices=retry_indices,
|
|
100
|
-
batch_job_id=batch_job_id,
|
|
101
|
-
created=datetime.utcnow(),
|
|
102
|
-
)
|
|
103
|
-
manifest.retries.append(retry_info)
|
|
104
|
-
manifest.status = JobStatus.RUNNING
|
|
105
|
-
save_manifest(manifest, base_path)
|
|
106
|
-
|
|
107
|
-
click.echo()
|
|
108
|
-
click.echo(click.style("✓ Retry job submitted successfully!", fg="green"))
|
|
109
|
-
click.echo()
|
|
110
|
-
click.echo(f"AWS Batch Job ID: {batch_job_id}")
|
|
111
|
-
click.echo()
|
|
112
|
-
click.echo("Next steps:")
|
|
113
|
-
click.echo(f" Check status: dh batch status {job_id}")
|
|
114
|
-
click.echo(f" View logs: dh batch logs {job_id}")
|
|
115
|
-
|
|
116
|
-
except BatchError as e:
|
|
117
|
-
click.echo(
|
|
118
|
-
click.style(f"✗ Failed to submit retry job: {e}", fg="red"), err=True
|
|
119
|
-
)
|
|
120
|
-
raise SystemExit(1)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
def _find_incomplete_chunks(job_id: str, base_path: str) -> list[int]:
|
|
124
|
-
"""Find chunks that don't have .done markers."""
|
|
125
|
-
job_dir = get_job_dir(job_id, base_path)
|
|
126
|
-
input_dir = job_dir / "input"
|
|
127
|
-
output_dir = job_dir / "output"
|
|
128
|
-
|
|
129
|
-
if not input_dir.exists():
|
|
130
|
-
return []
|
|
131
|
-
|
|
132
|
-
# Find all input chunks
|
|
133
|
-
input_chunks = sorted(input_dir.glob("chunk_*.fasta"))
|
|
134
|
-
incomplete = []
|
|
135
|
-
|
|
136
|
-
for chunk_path in input_chunks:
|
|
137
|
-
# Extract index from filename (chunk_000.fasta -> 0)
|
|
138
|
-
idx_str = chunk_path.stem.split("_")[1]
|
|
139
|
-
idx = int(idx_str)
|
|
140
|
-
|
|
141
|
-
# Check for .done marker
|
|
142
|
-
done_marker = output_dir / f"embed_{idx:03d}.done"
|
|
143
|
-
if not done_marker.exists():
|
|
144
|
-
incomplete.append(idx)
|
|
145
|
-
|
|
146
|
-
return incomplete
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/batch/commands/list_jobs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_lifecycle.py
RENAMED
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_maintenance.py
RENAMED
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engine1/engine_management.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/__init__.py
RENAMED
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/api_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/engine_commands.py
RENAMED
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/progress.py
RENAMED
|
File without changes
|
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/simulators/demo.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/ssh_config.py
RENAMED
|
File without changes
|
{dayhoff_tools-1.14.9 → dayhoff_tools-1.14.11}/dayhoff_tools/cli/engines_studios/studio_commands.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|