dayhoff-tools 1.14.1__py3-none-any.whl → 1.14.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (29)
  1. dayhoff_tools/batch/__init__.py +8 -0
  2. dayhoff_tools/batch/workers/__init__.py +12 -0
  3. dayhoff_tools/batch/workers/base.py +150 -0
  4. dayhoff_tools/batch/workers/boltz.py +407 -0
  5. dayhoff_tools/batch/workers/embed_t5.py +92 -0
  6. dayhoff_tools/cli/batch/__init__.py +85 -0
  7. dayhoff_tools/cli/batch/aws_batch.py +401 -0
  8. dayhoff_tools/cli/batch/commands/__init__.py +25 -0
  9. dayhoff_tools/cli/batch/commands/boltz.py +362 -0
  10. dayhoff_tools/cli/batch/commands/cancel.py +82 -0
  11. dayhoff_tools/cli/batch/commands/embed_t5.py +303 -0
  12. dayhoff_tools/cli/batch/commands/finalize.py +206 -0
  13. dayhoff_tools/cli/batch/commands/list_jobs.py +78 -0
  14. dayhoff_tools/cli/batch/commands/local.py +95 -0
  15. dayhoff_tools/cli/batch/commands/logs.py +142 -0
  16. dayhoff_tools/cli/batch/commands/retry.py +142 -0
  17. dayhoff_tools/cli/batch/commands/status.py +214 -0
  18. dayhoff_tools/cli/batch/commands/submit.py +215 -0
  19. dayhoff_tools/cli/batch/job_id.py +151 -0
  20. dayhoff_tools/cli/batch/manifest.py +293 -0
  21. dayhoff_tools/cli/engines_studios/engine-studio-cli.md +26 -21
  22. dayhoff_tools/cli/engines_studios/engine_commands.py +16 -89
  23. dayhoff_tools/cli/engines_studios/ssh_config.py +96 -0
  24. dayhoff_tools/cli/engines_studios/studio_commands.py +13 -2
  25. dayhoff_tools/cli/main.py +14 -0
  26. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/METADATA +6 -1
  27. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/RECORD +29 -8
  28. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/WHEEL +0 -0
  29. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/entry_points.txt +0 -0
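Only four of the 29 files are expanded below: the new batch command modules embed_t5.py, finalize.py, list_jobs.py, and local.py (matched to the file list above by their added-line counts). For orientation, here is a minimal sketch of how these commands are presumably registered as a Click group; the group name and registration calls are assumptions inferred from the file layout (cli/batch/__init__.py, cli/main.py), not code from this release.

# Hypothetical wiring sketch only; module and attribute names are assumed
# from the file list above, not taken from the released code.
import click

from dayhoff_tools.cli.batch.commands import embed_t5, finalize, list_jobs, local


@click.group()
def batch():
    """Manage AWS Batch pipeline jobs."""


batch.add_command(embed_t5.embed_t5)    # dh batch embed-t5
batch.add_command(finalize.finalize)    # dh batch finalize
batch.add_command(list_jobs.list_jobs)  # dh batch list
batch.add_command(local.local)          # dh batch local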
dayhoff_tools/cli/batch/commands/embed_t5.py
@@ -0,0 +1,303 @@
+ """T5 embedding pipeline command."""
+
+ import os
+ from pathlib import Path
+
+ import click
+
+ from ..aws_batch import BatchClient, BatchError
+ from ..job_id import generate_job_id
+ from ..manifest import (
+     BATCH_JOBS_BASE,
+     BatchConfig,
+     InputConfig,
+     JobManifest,
+     JobStatus,
+     OutputConfig,
+     create_job_directory,
+     get_job_dir,
+     save_manifest,
+ )
+
+
+ # Default settings for T5 embedding
+ DEFAULT_QUEUE = "t4-1x-spot"
+ DEFAULT_WORKERS = 50
+ DEFAULT_SEQS_PER_CHUNK = 5000
+ DEFAULT_JOB_DEFINITION = "dayhoff-embed-t5"
+ DEFAULT_IMAGE_URI = "074735440724.dkr.ecr.us-east-1.amazonaws.com/dayhoff:embed-latest"
+
+
+ @click.command()
+ @click.argument("input_fasta", type=click.Path(exists=True))
+ @click.option("--workers", default=DEFAULT_WORKERS, type=int, help=f"Number of parallel workers [default: {DEFAULT_WORKERS}]")
+ @click.option("--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]")
+ @click.option("--seqs-per-chunk", default=DEFAULT_SEQS_PER_CHUNK, type=int, help=f"Sequences per chunk [default: {DEFAULT_SEQS_PER_CHUNK}]")
+ @click.option("--local", "run_local", is_flag=True, help="Run single chunk locally instead of Batch")
+ @click.option("--shell", "run_shell", is_flag=True, help="Drop into container shell for debugging")
+ @click.option("--dry-run", is_flag=True, help="Show plan without submitting")
+ @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+ def embed_t5(input_fasta, workers, queue, seqs_per_chunk, run_local, run_shell, dry_run, base_path):
+     """Generate T5 protein embeddings for a FASTA file.
+
+     Splits the input FASTA into chunks and processes them in parallel using
+     AWS Batch array jobs. Each worker generates embeddings for its chunk
+     and writes an H5 file.
+
+     \b
+     Examples:
+         # Submit to AWS Batch with 50 workers
+         dh batch embed-t5 /primordial/proteins.fasta --workers 50
+
+         # Use a faster queue with more workers
+         dh batch embed-t5 /primordial/big.fasta --workers 100 --queue a10g-1x-spot
+
+         # Test locally with a single chunk
+         dh batch embed-t5 /primordial/test.fasta --local
+
+         # Debug by dropping into container shell
+         dh batch embed-t5 /primordial/test.fasta --shell
+
+     \b
+     After job completes:
+         dh batch status <job-id>  # Check status
+         dh batch finalize <job-id> --output out.h5  # Combine results
+     """
+     input_path = Path(input_fasta).resolve()
+
+     if run_shell:
+         _run_shell_mode(input_path)
+         return
+
+     if run_local:
+         _run_local_mode(input_path)
+         return
+
+     # Batch submission mode
+     _submit_batch_job(input_path, workers, queue, seqs_per_chunk, dry_run, base_path)
+
+
+ def _count_sequences(fasta_path: Path) -> int:
+     """Count sequences in a FASTA file (fast, just counts > lines)."""
+     count = 0
+     with open(fasta_path) as f:
+         for line in f:
+             if line.startswith(">"):
+                 count += 1
+     return count
+
+
+ def _split_fasta(input_path: Path, output_dir: Path, seqs_per_chunk: int) -> int:
+     """Split FASTA file into chunks.
+
+     Returns:
+         Number of chunks created
+     """
+     from dayhoff_tools.fasta import split_fasta
+
+     num_chunks = split_fasta(
+         fasta_file=str(input_path),
+         target_folder=str(output_dir),
+         base_name="chunk",
+         sequences_per_file=seqs_per_chunk,
+         show_progress=True,
+     )
+
+     # Rename files to use zero-padded indices (chunk_000.fasta, etc.)
+     for i in range(1, num_chunks + 1):
+         old_name = output_dir / f"chunk_{i}.fasta"
+         new_name = output_dir / f"chunk_{i-1:03d}.fasta"
+         if old_name.exists():
+             old_name.rename(new_name)
+
+     return num_chunks
+
+
+ def _submit_batch_job(input_path: Path, workers: int, queue: str, seqs_per_chunk: int, dry_run: bool, base_path: str):
+     """Submit embedding job to AWS Batch."""
+     # Count sequences
+     click.echo(f"Counting sequences in {input_path}...")
+     num_sequences = _count_sequences(input_path)
+     click.echo(f"Found {num_sequences:,} sequences")
+
+     if num_sequences == 0:
+         click.echo(click.style("Error: No sequences found in input file", fg="red"), err=True)
+         raise SystemExit(1)
+
+     # Calculate chunks
+     num_chunks = min((num_sequences + seqs_per_chunk - 1) // seqs_per_chunk, workers)
+     actual_seqs_per_chunk = (num_sequences + num_chunks - 1) // num_chunks
+
+     # Generate job ID
+     job_id = generate_job_id("embed")
+
+     # Show plan
+     click.echo()
+     click.echo(f"Job ID: {job_id}")
+     click.echo(f"Input: {input_path}")
+     click.echo(f"Sequences: {num_sequences:,}")
+     click.echo(f"Chunks: {num_chunks}")
+     click.echo(f"Seqs per chunk: ~{actual_seqs_per_chunk:,}")
+     click.echo(f"Queue: {queue}")
+     click.echo(f"Job definition: {DEFAULT_JOB_DEFINITION}")
+
+     if dry_run:
+         click.echo()
+         click.echo(click.style("Dry run - job not submitted", fg="yellow"))
+         return
+
+     click.echo()
+
+     # Create job directory
+     job_dir = create_job_directory(job_id, base_path)
+     input_dir = job_dir / "input"
+     output_dir = job_dir / "output"
+
+     click.echo(f"Created job directory: {job_dir}")
+
+     # Split FASTA into chunks
+     click.echo("Splitting FASTA into chunks...")
+     actual_chunks = _split_fasta(input_path, input_dir, actual_seqs_per_chunk)
+     click.echo(f"Created {actual_chunks} chunks")
+
+     # Create manifest
+     manifest = JobManifest(
+         job_id=job_id,
+         user=job_id.split("-")[0],
+         pipeline="embed-t5",
+         status=JobStatus.PENDING,
+         image_uri=DEFAULT_IMAGE_URI,
+         input=InputConfig(
+             source=str(input_path),
+             num_sequences=num_sequences,
+             num_chunks=actual_chunks,
+             sequences_per_chunk=actual_seqs_per_chunk,
+         ),
+         batch=BatchConfig(
+             queue=queue,
+             job_definition=DEFAULT_JOB_DEFINITION,
+             array_size=actual_chunks,
+         ),
+         output=OutputConfig(
+             destination=None,
+             finalized=False,
+         ),
+     )
+
+     save_manifest(manifest, base_path)
+
+     # Submit to AWS Batch
+     try:
+         client = BatchClient()
+
+         environment = {
+             "JOB_DIR": str(job_dir),
+             "JOB_ID": job_id,
+         }
+
+         batch_job_id = client.submit_job(
+             job_name=job_id,
+             job_definition=DEFAULT_JOB_DEFINITION,
+             job_queue=queue,
+             array_size=actual_chunks,
+             environment=environment,
+             timeout_seconds=6 * 3600,  # 6 hours
+             retry_attempts=3,
+         )
+
+         # Update manifest
+         manifest.status = JobStatus.SUBMITTED
+         manifest.batch.job_id = batch_job_id
+         save_manifest(manifest, base_path)
+
+         click.echo()
+         click.echo(click.style("✓ Job submitted successfully!", fg="green"))
+         click.echo()
+         click.echo(f"AWS Batch Job ID: {batch_job_id}")
+         click.echo()
+         click.echo("Next steps:")
+         click.echo(f" Check status: dh batch status {job_id}")
+         click.echo(f" View logs: dh batch logs {job_id}")
+         click.echo(f" Cancel: dh batch cancel {job_id}")
+         click.echo()
+         click.echo("After completion:")
+         click.echo(f" Finalize: dh batch finalize {job_id} --output /primordial/embeddings.h5")
+
+     except BatchError as e:
+         manifest.status = JobStatus.FAILED
+         manifest.error_message = str(e)
+         save_manifest(manifest, base_path)
+         click.echo(click.style(f"✗ Failed to submit job: {e}", fg="red"), err=True)
+         raise SystemExit(1)
+
+
+ def _run_local_mode(input_path: Path):
+     """Run embedding locally for a single chunk."""
+     import subprocess
+
+     click.echo("Running T5 embedding locally...")
+     click.echo(f"Input: {input_path}")
+
+     # Check if we have the embedder available
+     try:
+         from dayhoff_tools.embedders import T5Embedder
+
+         output_file = input_path.with_suffix(".h5")
+         click.echo(f"Output: {output_file}")
+         click.echo()
+
+         embedder = T5Embedder(
+             max_seq_length=4500,
+             large_protein_threshold=2500,
+             batch_residue_limit=4500,
+         )
+         embedder.run(str(input_path), str(output_file))
+
+         click.echo()
+         click.echo(click.style("✓ Embedding complete!", fg="green"))
+         click.echo(f"Output: {output_file}")
+
+     except ImportError:
+         click.echo(
+             click.style(
+                 "Error: T5Embedder requires 'embedders' extra. "
+                 "Install with: pip install 'dayhoff-tools[embedders]'",
+                 fg="red",
+             ),
+             err=True,
+         )
+         raise SystemExit(1)
+
+
+ def _run_shell_mode(input_path: Path):
+     """Drop into container shell for debugging."""
+     import subprocess
+
+     click.echo("Dropping into container shell...")
+     click.echo(f"Input will be available at: /input/{input_path.name}")
+     click.echo()
+
+     input_dir = input_path.parent
+
+     cmd = [
+         "docker", "run", "--rm", "-it",
+         "--gpus", "all",
+         "-v", "/primordial:/primordial",
+         "-v", f"{input_dir}:/input",
+         "-e", "JOB_DIR=/input",
+         "-e", "AWS_BATCH_JOB_ARRAY_INDEX=0",
+         "--entrypoint", "/bin/bash",
+         DEFAULT_IMAGE_URI,
+     ]
+
+     click.echo(f"Running: {' '.join(cmd)}")
+     click.echo()
+
+     try:
+         subprocess.run(cmd)
+     except FileNotFoundError:
+         click.echo(
+             click.style("Error: Docker not found. Is Docker installed and running?", fg="red"),
+             err=True,
+         )
+         raise SystemExit(1)
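
The chunk sizing in _submit_batch_job above uses ceiling division twice: first to compute how many chunks the requested --seqs-per-chunk would need (capped at --workers), then to rebalance the per-chunk size against the capped chunk count. A worked example, with illustrative numbers only:

# Mirrors the arithmetic in _submit_batch_job; the inputs are made up.
num_sequences = 600_000
seqs_per_chunk = 5_000   # DEFAULT_SEQS_PER_CHUNK
workers = 50             # DEFAULT_WORKERS

# ceil(600000 / 5000) = 120 chunks wanted, capped at 50 workers
num_chunks = min((num_sequences + seqs_per_chunk - 1) // seqs_per_chunk, workers)

# Rebalance: ceil(600000 / 50) = 12000 sequences per chunk
actual_seqs_per_chunk = (num_sequences + num_chunks - 1) // num_chunks

assert (num_chunks, actual_seqs_per_chunk) == (50, 12_000)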
dayhoff_tools/cli/batch/commands/finalize.py
@@ -0,0 +1,206 @@
+ """Finalize command for combining results and cleaning up."""
+
+ import shutil
+ from pathlib import Path
+
+ import click
+
+ from ..manifest import (
+     BATCH_JOBS_BASE,
+     JobStatus,
+     delete_job_directory,
+     get_job_dir,
+     load_manifest,
+     save_manifest,
+ )
+
+
+ @click.command()
+ @click.argument("job_id")
+ @click.option("--output", required=True, type=click.Path(), help="Output path for combined results")
+ @click.option("--force", is_flag=True, help="Finalize even if some chunks failed")
+ @click.option("--keep-intermediates", is_flag=True, help="Don't delete job directory after finalizing")
+ @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+ def finalize(job_id, output, force, keep_intermediates, base_path):
+     """Combine results and clean up job intermediates.
+
+     For embedding jobs, combines H5 files into a single output file.
+     For structure prediction, moves outputs to the destination.
+
+     \b
+     Examples:
+         dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5
+         dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5 --force
+         dh batch finalize dma-embed-20260109-a3f2 --output /primordial/out.h5 --keep-intermediates
+     """
+     # Load manifest
+     try:
+         manifest = load_manifest(job_id, base_path)
+     except FileNotFoundError:
+         click.echo(f"Job not found: {job_id}", err=True)
+         raise SystemExit(1)
+
+     # Check job status
+     if manifest.status == JobStatus.FINALIZED:
+         click.echo(f"Job {job_id} is already finalized.", err=True)
+         raise SystemExit(1)
+
+     job_dir = get_job_dir(job_id, base_path)
+     output_dir = job_dir / "output"
+     output_path = Path(output).resolve()
+
+     # Check completion status
+     incomplete = _check_completion(job_id, base_path)
+     if incomplete:
+         click.echo(f"Found {len(incomplete)} incomplete chunks: {incomplete[:10]}...")
+         if not force:
+             click.echo()
+             click.echo("Use --force to finalize anyway, or retry failed chunks:")
+             click.echo(f" dh batch retry {job_id}")
+             raise SystemExit(1)
+         click.echo()
+         click.echo(click.style("Warning: Finalizing with incomplete chunks", fg="yellow"))
+
+     # Update status
+     manifest.status = JobStatus.FINALIZING
+     save_manifest(manifest, base_path)
+
+     # Finalize based on pipeline type
+     click.echo()
+     if manifest.pipeline in ("embed-t5", "embed"):
+         _finalize_embeddings(output_dir, output_path)
+     elif manifest.pipeline == "boltz":
+         _finalize_boltz(output_dir, output_path)
+     else:
+         _finalize_generic(output_dir, output_path)
+
+     # Update manifest
+     manifest.status = JobStatus.FINALIZED
+     if manifest.output:
+         manifest.output.destination = str(output_path)
+         manifest.output.finalized = True
+     save_manifest(manifest, base_path)
+
+     click.echo()
+     click.echo(click.style(f"✓ Results saved to: {output_path}", fg="green"))
+
+     # Clean up
+     if not keep_intermediates:
+         click.echo(f"Cleaning up job directory: {job_dir}")
+         delete_job_directory(job_id, base_path)
+         click.echo(click.style("✓ Job directory deleted", fg="green"))
+     else:
+         click.echo(f"Job directory preserved: {job_dir}")
+
+
+ def _check_completion(job_id: str, base_path: str) -> list[int]:
+     """Check which chunks are incomplete (no .done marker)."""
+     job_dir = get_job_dir(job_id, base_path)
+     input_dir = job_dir / "input"
+     output_dir = job_dir / "output"
+
+     if not input_dir.exists():
+         return []
+
+     incomplete = []
+     for chunk_path in sorted(input_dir.glob("chunk_*.fasta")):
+         idx_str = chunk_path.stem.split("_")[1]
+         idx = int(idx_str)
+         done_marker = output_dir / f"embed_{idx:03d}.done"
+         if not done_marker.exists():
+             incomplete.append(idx)
+
+     return incomplete
+
+
+ def _finalize_embeddings(output_dir: Path, output_path: Path):
+     """Combine H5 embedding files into a single output."""
+     h5_files = sorted(output_dir.glob("embed_*.h5"))
+
+     if not h5_files:
+         click.echo("No H5 files found in output directory.", err=True)
+         raise SystemExit(1)
+
+     click.echo(f"Found {len(h5_files)} H5 files to combine")
+
+     # Check if output already exists
+     if output_path.exists():
+         click.echo(f"Output file already exists: {output_path}", err=True)
+         raise SystemExit(1)
+
+     # Ensure output directory exists
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     try:
+         from dayhoff_tools.h5 import combine_h5_files, deduplicate_h5_file, optimize_protein_embedding_chunks
+
+         # Combine H5 files
+         click.echo("Combining H5 files...")
+         combine_h5_files(
+             input_folder=str(output_dir),
+             output_file=str(output_path),
+             glob_pattern="embed_*.h5",
+         )
+
+         # Deduplicate
+         click.echo("Deduplicating...")
+         deduplicate_h5_file(str(output_path))
+
+         # Optimize chunks
+         click.echo("Optimizing chunks...")
+         optimize_protein_embedding_chunks(str(output_path))
+
+         click.echo(click.style("✓ H5 files combined successfully", fg="green"))
+
+     except ImportError:
+         # Fall back to simple concatenation
+         click.echo("h5 module not available, using simple copy...")
+         if len(h5_files) == 1:
+             shutil.copy2(h5_files[0], output_path)
+         else:
+             # For multiple files without h5 module, just copy first file
+             # This is a fallback - the h5 module should be available
+             click.echo(
+                 click.style(
+                     "Warning: Cannot combine multiple H5 files without dayhoff_tools.h5 module. "
+                     "Only copying first file.",
+                     fg="yellow",
+                 )
+             )
+             shutil.copy2(h5_files[0], output_path)
+
+
+ def _finalize_boltz(output_dir: Path, output_path: Path):
+     """Move Boltz output directories to destination."""
+     # Find all output directories (one per complex)
+     complex_dirs = [d for d in output_dir.iterdir() if d.is_dir()]
+
+     if not complex_dirs:
+         click.echo("No output directories found.", err=True)
+         raise SystemExit(1)
+
+     click.echo(f"Found {len(complex_dirs)} structure predictions to move")
+
+     # Ensure output directory exists
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     for complex_dir in complex_dirs:
+         dest = output_path / complex_dir.name
+         if dest.exists():
+             click.echo(f" Skipping {complex_dir.name} (already exists)")
+             continue
+         shutil.move(str(complex_dir), str(dest))
+         click.echo(f" Moved {complex_dir.name}")
+
+     click.echo(click.style("✓ Structures moved successfully", fg="green"))
+
+
+ def _finalize_generic(output_dir: Path, output_path: Path):
+     """Generic finalization - copy output directory."""
+     if output_path.exists():
+         click.echo(f"Output path already exists: {output_path}", err=True)
+         raise SystemExit(1)
+
+     click.echo(f"Copying output directory to {output_path}...")
+     shutil.copytree(output_dir, output_path)
+     click.echo(click.style("✓ Output copied successfully", fg="green"))
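
_check_completion above pairs each input chunk_NNN.fasta with an embed_NNN.done marker in the output directory, so the worker side of the contract (not expanded in this diff, presumably dayhoff_tools/batch/workers/embed_t5.py) must write the marker only after its H5 file is complete. A sketch of that assumed contract, inferred from the globs in _check_completion and _finalize_embeddings:

# Assumed worker-side behavior; the actual worker code is not shown in this diff.
import os
from pathlib import Path

job_dir = Path(os.environ["JOB_DIR"])
index = int(os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0"))

chunk = job_dir / "input" / f"chunk_{index:03d}.fasta"    # read by the worker
h5_out = job_dir / "output" / f"embed_{index:03d}.h5"     # written first
marker = job_dir / "output" / f"embed_{index:03d}.done"   # touched last

# ... generate embeddings for `chunk` into `h5_out` ...
marker.touch()  # marker last, so its existence implies a complete H5 file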
dayhoff_tools/cli/batch/commands/list_jobs.py
@@ -0,0 +1,78 @@
+ """List command for showing recent jobs."""
+
+ import click
+
+ from ..manifest import BATCH_JOBS_BASE, JobStatus, list_jobs as list_manifests
+ from .status import format_status, format_time_ago
+
+
+ @click.command("list")
+ @click.option("--user", help="Filter by username")
+ @click.option(
+     "--status",
+     "status_filter",
+     type=click.Choice([s.value for s in JobStatus]),
+     help="Filter by status",
+ )
+ @click.option("--pipeline", help="Filter by pipeline type")
+ @click.option("--limit", default=20, type=int, help="Maximum number of jobs to show [default: 20]")
+ @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+ def list_jobs(user, status_filter, pipeline, limit, base_path):
+     """List recent batch jobs.
+
+     Shows a table of recent jobs with their status, pipeline type, and creation time.
+
+     \b
+     Examples:
+         dh batch list  # All recent jobs
+         dh batch list --user dma  # Filter by user
+         dh batch list --status running  # Filter by status
+         dh batch list --pipeline embed-t5  # Filter by pipeline type
+         dh batch list --limit 50  # Show more jobs
+     """
+     status_enum = JobStatus(status_filter) if status_filter else None
+
+     manifests = list_manifests(
+         base_path=base_path,
+         user=user,
+         status=status_enum,
+         pipeline=pipeline,
+         limit=limit,
+     )
+
+     if not manifests:
+         click.echo("No jobs found.")
+         if user or status_filter or pipeline:
+             click.echo("Try removing filters to see all jobs.")
+         return
+
+     # Print header
+     click.echo()
+     click.echo(
+         f"{'JOB ID':<35} {'STATUS':<12} {'PIPELINE':<12} {'USER':<10} {'CREATED':<12}"
+     )
+     click.echo("-" * 85)
+
+     for manifest in manifests:
+         click.echo(
+             f"{manifest.job_id:<35} "
+             f"{format_status(manifest.status):<21} "  # Extra space for ANSI color codes
+             f"{manifest.pipeline:<12} "
+             f"{manifest.user:<10} "
+             f"{format_time_ago(manifest.created):<12}"
+         )
+
+     click.echo()
+     click.echo(f"Showing {len(manifests)} jobs.")
+
+     # Show filter hints
+     hints = []
+     if not user:
+         hints.append("--user <name>")
+     if not status_filter:
+         hints.append("--status <status>")
+     if not pipeline:
+         hints.append("--pipeline <type>")
+
+     if hints:
+         click.echo(f"Filter with: {' '.join(hints)}")
dayhoff_tools/cli/batch/commands/local.py
@@ -0,0 +1,95 @@
+ """Local command for debugging job chunks locally."""
+
+ import subprocess
+
+ import click
+
+ from ..manifest import BATCH_JOBS_BASE, get_job_dir, load_manifest
+
+
+ @click.command()
+ @click.argument("job_id")
+ @click.option("--index", required=True, type=int, help="Array index to run")
+ @click.option("--shell", "run_shell", is_flag=True, help="Drop into shell instead of running command")
+ @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+ def local(job_id, index, run_shell, base_path):
+     """Run a job chunk locally for debugging.
+
+     Runs a specific array index of a job in a local Docker container,
+     allowing you to debug failed chunks or test changes.
+
+     \b
+     Examples:
+         dh batch local dma-embed-20260109-a3f2 --index 27
+         dh batch local dma-embed-20260109-a3f2 --index 27 --shell
+     """
+     # Load manifest
+     try:
+         manifest = load_manifest(job_id, base_path)
+     except FileNotFoundError:
+         click.echo(f"Job not found: {job_id}", err=True)
+         raise SystemExit(1)
+
+     # Get job directory and image
+     job_dir = get_job_dir(job_id, base_path)
+     image_uri = manifest.image_uri
+
+     if not image_uri:
+         click.echo("Job has no image URI, cannot run locally.", err=True)
+         raise SystemExit(1)
+
+     # Validate index
+     if manifest.input and manifest.input.num_chunks:
+         if index >= manifest.input.num_chunks:
+             click.echo(
+                 f"Index {index} out of range. Job has {manifest.input.num_chunks} chunks (0-{manifest.input.num_chunks - 1}).",
+                 err=True,
+             )
+             raise SystemExit(1)
+
+     click.echo(f"Running job {job_id} index {index} locally")
+     click.echo(f"Image: {image_uri}")
+     click.echo(f"Job directory: {job_dir}")
+     click.echo()
+
+     # Build Docker command
+     cmd = [
+         "docker", "run", "--rm",
+         "--gpus", "all",
+         "-v", "/primordial:/primordial",
+         "-v", f"{job_dir}:{job_dir}",
+         "-e", f"AWS_BATCH_JOB_ARRAY_INDEX={index}",
+         "-e", f"JOB_DIR={job_dir}",
+         "-e", f"JOB_ID={job_id}",
+     ]
+
+     if run_shell:
+         cmd.extend(["-it", "--entrypoint", "/bin/bash"])
+         click.echo("Dropping into container shell...")
+         click.echo(f" JOB_DIR={job_dir}")
+         click.echo(f" AWS_BATCH_JOB_ARRAY_INDEX={index}")
+     else:
+         click.echo("Running worker command...")
+
+     cmd.append(image_uri)
+
+     click.echo()
+     click.echo(f"Command: {' '.join(cmd)}")
+     click.echo()
+
+     try:
+         result = subprocess.run(cmd)
+         if result.returncode != 0:
+             click.echo(
+                 click.style(f"Container exited with code {result.returncode}", fg="red"),
+                 err=True,
+             )
+             raise SystemExit(result.returncode)
+         else:
+             click.echo(click.style("✓ Container completed successfully", fg="green"))
+     except FileNotFoundError:
+         click.echo(
+             click.style("Error: Docker not found. Is Docker installed and running?", fg="red"),
+             err=True,
+         )
+         raise SystemExit(1)