dayhoff-tools 1.14.1__py3-none-any.whl → 1.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. dayhoff_tools/batch/__init__.py +8 -0
  2. dayhoff_tools/batch/workers/__init__.py +12 -0
  3. dayhoff_tools/batch/workers/base.py +150 -0
  4. dayhoff_tools/batch/workers/boltz.py +407 -0
  5. dayhoff_tools/batch/workers/embed_t5.py +92 -0
  6. dayhoff_tools/cli/batch/__init__.py +85 -0
  7. dayhoff_tools/cli/batch/aws_batch.py +401 -0
  8. dayhoff_tools/cli/batch/commands/__init__.py +25 -0
  9. dayhoff_tools/cli/batch/commands/boltz.py +362 -0
  10. dayhoff_tools/cli/batch/commands/cancel.py +82 -0
  11. dayhoff_tools/cli/batch/commands/embed_t5.py +303 -0
  12. dayhoff_tools/cli/batch/commands/finalize.py +206 -0
  13. dayhoff_tools/cli/batch/commands/list_jobs.py +78 -0
  14. dayhoff_tools/cli/batch/commands/local.py +95 -0
  15. dayhoff_tools/cli/batch/commands/logs.py +142 -0
  16. dayhoff_tools/cli/batch/commands/retry.py +142 -0
  17. dayhoff_tools/cli/batch/commands/status.py +214 -0
  18. dayhoff_tools/cli/batch/commands/submit.py +215 -0
  19. dayhoff_tools/cli/batch/job_id.py +151 -0
  20. dayhoff_tools/cli/batch/manifest.py +293 -0
  21. dayhoff_tools/cli/engines_studios/engine-studio-cli.md +26 -21
  22. dayhoff_tools/cli/engines_studios/engine_commands.py +16 -89
  23. dayhoff_tools/cli/engines_studios/ssh_config.py +96 -0
  24. dayhoff_tools/cli/engines_studios/studio_commands.py +15 -4
  25. dayhoff_tools/cli/main.py +14 -0
  26. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.3.dist-info}/METADATA +6 -1
  27. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.3.dist-info}/RECORD +29 -8
  28. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.3.dist-info}/WHEEL +0 -0
  29. {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.3.dist-info}/entry_points.txt +0 -0
dayhoff_tools/cli/batch/commands/submit.py
@@ -0,0 +1,215 @@
+ """Submit command for generic batch jobs."""
+
+ import os
+ from pathlib import Path
+
+ import click
+ import yaml
+
+ from ..aws_batch import BatchClient, BatchError
+ from ..job_id import generate_job_id
+ from ..manifest import (
+     BATCH_JOBS_BASE,
+     BatchConfig,
+     InputConfig,
+     JobManifest,
+     JobStatus,
+     create_job_directory,
+     save_manifest,
+ )
+
+
+ # Default job definition for generic jobs
+ DEFAULT_JOB_DEFINITION = "dayhoff-batch-base"
+ DEFAULT_QUEUE = "t4-1x-spot"
+
+
+ @click.command()
+ @click.option("-f", "--file", "config_file", type=click.Path(exists=True), help="Config file path")
+ @click.option("--command", help="Command to run (alternative to config file)")
+ @click.option("--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]")
+ @click.option("--memory", default="30G", help="Memory limit (e.g., 30G)")
+ @click.option("--vcpus", default=8, type=int, help="Number of vCPUs")
+ @click.option("--gpus", default=1, type=int, help="Number of GPUs")
+ @click.option("--array", default=1, type=int, help="Number of array tasks")
+ @click.option("--retry", default=3, type=int, help="Retry attempts")
+ @click.option("--timeout", default="6h", help="Job timeout (e.g., 6h, 1d)")
+ @click.option("--image", help="Pre-built image URI")
+ @click.option("--env", multiple=True, help="Environment variables (KEY=VALUE)")
+ @click.option("--dry-run", is_flag=True, help="Show plan without submitting")
+ @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+ def submit(
+     config_file,
+     command,
+     queue,
+     memory,
+     vcpus,
+     gpus,
+     array,
+     retry,
+     timeout,
+     image,
+     env,
+     dry_run,
+     base_path,
+ ):
+     """Submit a custom batch job.
+
+     Jobs can be defined via a config file (-f) or inline options.
+
+     \b
+     Examples:
+         # Submit from config file
+         dh batch submit -f config.yaml
+
+         # Submit with inline command
+         dh batch submit --command "python train.py --epochs 100" --queue a10g-1x-spot
+
+         # Array job
+         dh batch submit -f config.yaml --array 10
+
+     \b
+     Config file format (YAML):
+         command: python scripts/train.py --epochs 100
+         queue: t4-1x-spot
+         memory: 30G
+         vcpus: 8
+         gpus: 1
+         array: 10
+         retry: 3
+         timeout: 6h
+         image: custom-image:tag
+         env:
+           MY_VAR: value
+     """
+     # Parse config file if provided
+     config = {}
+     if config_file:
+         with open(config_file) as f:
+             config = yaml.safe_load(f)
+
+     # Override with command-line options
+     job_command = command or config.get("command")
+     if not job_command:
+         raise click.UsageError("Must specify --command or provide config file with 'command' field")
+
+     job_queue = queue if queue != DEFAULT_QUEUE else config.get("queue", queue)
+     job_memory = memory if memory != "30G" else config.get("memory", memory)
+     job_vcpus = vcpus if vcpus != 8 else config.get("vcpus", vcpus)
+     job_gpus = gpus if gpus != 1 else config.get("gpus", gpus)
+     job_array = array if array != 1 else config.get("array", array)
+     job_retry = retry if retry != 3 else config.get("retry", retry)
+     job_timeout = timeout if timeout != "6h" else config.get("timeout", timeout)
+     job_image = image or config.get("image")
+
+     # Parse environment variables
+     job_env = dict(config.get("env", {}))
+     for e in env:
+         if "=" in e:
+             key, value = e.split("=", 1)
+             job_env[key] = value
+
+     # Generate job ID
+     job_id = generate_job_id("batch")
+
+     # Parse timeout
+     timeout_seconds = _parse_timeout(job_timeout)
+
+     # Show plan
+     click.echo()
+     click.echo(f"Job ID: {job_id}")
+     click.echo(f"Command: {job_command}")
+     click.echo(f"Queue: {job_queue}")
+     click.echo(f"Resources: {job_vcpus} vCPUs, {job_memory} memory, {job_gpus} GPUs")
+     click.echo(f"Array Size: {job_array}")
+     click.echo(f"Retry: {job_retry}")
+     click.echo(f"Timeout: {job_timeout} ({timeout_seconds}s)")
+     if job_image:
+         click.echo(f"Image: {job_image}")
+     if job_env:
+         click.echo(f"Environment: {len(job_env)} variables")
+
+     if dry_run:
+         click.echo()
+         click.echo(click.style("Dry run - job not submitted", fg="yellow"))
+         return
+
+     click.echo()
+
+     # Create job directory and manifest
+     job_dir = create_job_directory(job_id, base_path)
+     click.echo(f"Created job directory: {job_dir}")
+
+     manifest = JobManifest(
+         job_id=job_id,
+         user=job_id.split("-")[0],  # Extract username from job ID
+         pipeline="batch",
+         status=JobStatus.PENDING,
+         command=job_command,
+         image_uri=job_image,
+         batch=BatchConfig(
+             queue=job_queue,
+             array_size=job_array if job_array > 1 else None,
+         ),
+     )
+
+     # Submit to AWS Batch
+     try:
+         client = BatchClient()
+
+         # Prepare environment
+         submit_env = {
+             "JOB_DIR": str(job_dir),
+             "JOB_ID": job_id,
+             **job_env,
+         }
+
+         batch_job_id = client.submit_job(
+             job_name=job_id,
+             job_definition=job_image or DEFAULT_JOB_DEFINITION,
+             job_queue=job_queue,
+             array_size=job_array if job_array > 1 else None,
+             environment=submit_env,
+             timeout_seconds=timeout_seconds,
+             retry_attempts=job_retry,
+         )
+
+         # Update manifest with Batch job ID
+         manifest.status = JobStatus.SUBMITTED
+         manifest.batch.job_id = batch_job_id
+         save_manifest(manifest, base_path)
+
+         click.echo(click.style("✓ Job submitted successfully!", fg="green"))
+         click.echo()
+         click.echo(f"AWS Batch Job ID: {batch_job_id}")
+         click.echo()
+         click.echo("Next steps:")
+         click.echo(f" Check status: dh batch status {job_id}")
+         click.echo(f" View logs: dh batch logs {job_id}")
+         click.echo(f" Cancel: dh batch cancel {job_id}")
+
+     except BatchError as e:
+         manifest.status = JobStatus.FAILED
+         manifest.error_message = str(e)
+         save_manifest(manifest, base_path)
+         click.echo(click.style(f"✗ Failed to submit job: {e}", fg="red"), err=True)
+         raise SystemExit(1)
+
+
+ def _parse_timeout(timeout_str: str) -> int:
+     """Parse timeout string to seconds.
+
+     Supports formats like: 6h, 1d, 30m, 3600
+     """
+     timeout_str = timeout_str.strip().lower()
+
+     if timeout_str.endswith("h"):
+         return int(timeout_str[:-1]) * 3600
+     elif timeout_str.endswith("d"):
+         return int(timeout_str[:-1]) * 86400
+     elif timeout_str.endswith("m"):
+         return int(timeout_str[:-1]) * 60
+     elif timeout_str.endswith("s"):
+         return int(timeout_str[:-1])
+     else:
+         return int(timeout_str)
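
For orientation, a minimal sketch of how the timeout strings accepted by --timeout resolve to seconds under the _parse_timeout helper above. The import path follows this wheel's layout, but _parse_timeout is a private helper, so treat this as illustrative rather than a public API:

    from dayhoff_tools.cli.batch.commands.submit import _parse_timeout

    assert _parse_timeout("6h") == 21600    # hours -> seconds
    assert _parse_timeout("1d") == 86400    # days -> seconds
    assert _parse_timeout("30m") == 1800    # minutes -> seconds
    assert _parse_timeout("45s") == 45      # explicit seconds suffix
    assert _parse_timeout("3600") == 3600   # bare integers are taken as seconds
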
dayhoff_tools/cli/batch/job_id.py
@@ -0,0 +1,151 @@
+ """Job ID generation for batch jobs.
+
+ Job IDs follow the format: {username}-{pipeline}-{YYYYMMDD}-{random4}
+ Examples:
+     - dma-embed-20260109-a3f2
+     - josh-boltz-20260109-b7c1
+     - sam-batch-20260109-c9d2 (for generic submit jobs)
+ """
+
+ import json
+ import os
+ import secrets
+ import subprocess
+ from datetime import datetime
+ from functools import lru_cache
+
+
+ class JobIdError(Exception):
+     """Error generating job ID."""
+
+     pass
+
+
+ @lru_cache(maxsize=1)
+ def get_aws_username() -> str:
+     """Extract username from AWS SSO session.
+
+     Attempts multiple methods in order:
+     1. AWS_SSO_USER environment variable (if set by dh aws login)
+     2. Parse from `aws sts get-caller-identity` ARN
+
+     Returns:
+         Username string (lowercase, alphanumeric only)
+
+     Raises:
+         JobIdError: If username cannot be determined
+     """
+     # Method 1: Check environment variable (fastest)
+     env_user = os.environ.get("AWS_SSO_USER")
+     if env_user:
+         return _sanitize_username(env_user)
+
+     # Method 2: Parse from STS caller identity
+     try:
+         result = subprocess.run(
+             ["aws", "sts", "get-caller-identity", "--output", "json"],
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+         if result.returncode == 0:
+             identity = json.loads(result.stdout)
+             arn = identity.get("Arn", "")
+             # ARN format: arn:aws:sts::ACCOUNT:assumed-role/AWSReservedSSO_ROLE/username
+             # or: arn:aws:iam::ACCOUNT:user/username
+             if "/AWSReservedSSO_" in arn:
+                 # SSO assumed role - username is last part
+                 username = arn.split("/")[-1]
+                 return _sanitize_username(username)
+             elif ":user/" in arn:
+                 # IAM user
+                 username = arn.split("/")[-1]
+                 return _sanitize_username(username)
+     except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
+         pass
+
+     # Method 3: Fall back to system username
+     import getpass
+
+     try:
+         username = getpass.getuser()
+         return _sanitize_username(username)
+     except Exception:
+         pass
+
+     raise JobIdError(
+         "Could not determine AWS username. "
+         "Ensure you're logged in with 'dh aws login' or set AWS_SSO_USER environment variable."
+     )
+
+
+ def _sanitize_username(username: str) -> str:
+     """Sanitize username to be safe for job IDs.
+
+     - Convert to lowercase
+     - Keep only alphanumeric characters
+     - Truncate to 20 characters
+     """
+     sanitized = "".join(c for c in username.lower() if c.isalnum())
+     return sanitized[:20] if sanitized else "unknown"
+
+
+ def generate_job_id(pipeline: str = "batch") -> str:
+     """Generate a unique job ID.
+
+     Args:
+         pipeline: Pipeline type (e.g., 'embed', 'boltz', 'batch')
+
+     Returns:
+         Job ID in format: {username}-{pipeline}-{YYYYMMDD}-{random4}
+
+     Examples:
+         >>> generate_job_id("embed")
+         'dma-embed-20260109-a3f2'
+         >>> generate_job_id()
+         'dma-batch-20260109-b7c1'
+     """
+     username = get_aws_username()
+     date_str = datetime.now().strftime("%Y%m%d")
+     random_suffix = secrets.token_hex(2)  # 4 hex characters
+
+     # Sanitize pipeline name
+     pipeline_clean = "".join(c for c in pipeline.lower() if c.isalnum())[:10]
+
+     return f"{username}-{pipeline_clean}-{date_str}-{random_suffix}"
+
+
+ def parse_job_id(job_id: str) -> dict:
+     """Parse a job ID into its components.
+
+     Args:
+         job_id: Job ID string
+
+     Returns:
+         Dictionary with keys: username, pipeline, date, suffix
+
+     Raises:
+         ValueError: If job ID format is invalid
+     """
+     parts = job_id.split("-")
+     if len(parts) < 4:
+         raise ValueError(f"Invalid job ID format: {job_id}")
+
+     # Handle usernames with dashes by taking last 3 parts as known components
+     suffix = parts[-1]
+     date_str = parts[-2]
+     pipeline = parts[-3]
+     username = "-".join(parts[:-3])
+
+     # Validate date format
+     try:
+         datetime.strptime(date_str, "%Y%m%d")
+     except ValueError:
+         raise ValueError(f"Invalid date in job ID: {date_str}")
+
+     return {
+         "username": username,
+         "pipeline": pipeline,
+         "date": date_str,
+         "suffix": suffix,
+     }
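
A short usage sketch of the job-ID helpers above, assuming the package is installed and an AWS SSO session or system username can be resolved (the concrete ID shown is illustrative):

    from dayhoff_tools.cli.batch.job_id import generate_job_id, parse_job_id

    job_id = generate_job_id("boltz")  # e.g. "dma-boltz-20260109-a3f2"
    parts = parse_job_id(job_id)
    # parts == {"username": "dma", "pipeline": "boltz", "date": "20260109", "suffix": "a3f2"}
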
dayhoff_tools/cli/batch/manifest.py
@@ -0,0 +1,293 @@
+ """Manifest management for batch jobs.
+
+ Manifests are JSON files stored in Primordial that track job metadata,
+ status, and configuration. They provide the single source of truth for
+ job state.
+ """
+
+ import json
+ import os
+ import tempfile
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class JobStatus(str, Enum):
+     """Possible job statuses."""
+
+     PENDING = "pending"
+     SUBMITTED = "submitted"
+     RUNNING = "running"
+     SUCCEEDED = "succeeded"
+     FAILED = "failed"
+     CANCELLED = "cancelled"
+     FINALIZING = "finalizing"
+     FINALIZED = "finalized"
+
+
+ class InputConfig(BaseModel):
+     """Configuration for job input."""
+
+     source: str = Field(..., description="Path to input file or directory")
+     num_sequences: int | None = Field(None, description="Number of sequences (for FASTA)")
+     num_chunks: int | None = Field(None, description="Number of chunks created")
+     sequences_per_chunk: int | None = Field(None, description="Sequences per chunk")
+
+
+ class BatchConfig(BaseModel):
+     """AWS Batch job configuration."""
+
+     job_id: str | None = Field(None, description="AWS Batch job ID")
+     job_definition: str | None = Field(None, description="Job definition name:revision")
+     queue: str = Field(..., description="Batch queue name")
+     array_size: int | None = Field(None, description="Array job size")
+
+
+ class OutputConfig(BaseModel):
+     """Configuration for job output."""
+
+     destination: str | None = Field(None, description="Final output path")
+     finalized: bool = Field(False, description="Whether output has been finalized")
+
+
+ class RetryInfo(BaseModel):
+     """Information about a retry attempt."""
+
+     retry_id: str = Field(..., description="Retry job ID")
+     indices: list[int] = Field(..., description="Array indices being retried")
+     batch_job_id: str | None = Field(None, description="AWS Batch job ID for retry")
+     created: datetime = Field(default_factory=datetime.utcnow)
+
+
+ class JobManifest(BaseModel):
+     """Complete manifest for a batch job."""
+
+     job_id: str = Field(..., description="Job ID")
+     user: str = Field(..., description="Username who submitted the job")
+     pipeline: str = Field(..., description="Pipeline type (embed-t5, boltz, batch)")
+     status: JobStatus = Field(JobStatus.PENDING, description="Current job status")
+     created: datetime = Field(default_factory=datetime.utcnow)
+     updated: datetime = Field(default_factory=datetime.utcnow)
+
+     input: InputConfig | None = Field(None, description="Input configuration")
+     batch: BatchConfig | None = Field(None, description="Batch job configuration")
+     output: OutputConfig | None = Field(None, description="Output configuration")
+
+     retries: list[RetryInfo] = Field(default_factory=list, description="Retry history")
+
+     # Additional metadata
+     image_uri: str | None = Field(None, description="Container image URI")
+     command: str | None = Field(None, description="Command to run")
+     error_message: str | None = Field(None, description="Error message if failed")
+
+     class Config:
+         json_encoders = {datetime: lambda v: v.isoformat()}
+
+
+ # Default base path for job data
+ BATCH_JOBS_BASE = "/primordial/.batch-jobs"
+
+
+ def get_job_dir(job_id: str, base_path: str = BATCH_JOBS_BASE) -> Path:
+     """Get the directory path for a job.
+
+     Args:
+         job_id: Job ID
+         base_path: Base path for batch jobs (default: /primordial/.batch-jobs)
+
+     Returns:
+         Path to job directory
+     """
+     return Path(base_path) / job_id
+
+
+ def get_manifest_path(job_id: str, base_path: str = BATCH_JOBS_BASE) -> Path:
+     """Get the manifest file path for a job.
+
+     Args:
+         job_id: Job ID
+         base_path: Base path for batch jobs
+
+     Returns:
+         Path to manifest.json
+     """
+     return get_job_dir(job_id, base_path) / "manifest.json"
+
+
+ def create_job_directory(job_id: str, base_path: str = BATCH_JOBS_BASE) -> Path:
+     """Create the directory structure for a new job.
+
+     Creates:
+         - {base_path}/{job_id}/
+         - {base_path}/{job_id}/input/
+         - {base_path}/{job_id}/output/
+
+     Args:
+         job_id: Job ID
+         base_path: Base path for batch jobs
+
+     Returns:
+         Path to job directory
+     """
+     job_dir = get_job_dir(job_id, base_path)
+     (job_dir / "input").mkdir(parents=True, exist_ok=True)
+     (job_dir / "output").mkdir(parents=True, exist_ok=True)
+     return job_dir
+
+
+ def save_manifest(manifest: JobManifest, base_path: str = BATCH_JOBS_BASE) -> Path:
+     """Save a manifest to disk atomically.
+
+     Uses write-to-temp-then-rename for atomicity to prevent corruption
+     if interrupted.
+
+     Args:
+         manifest: JobManifest to save
+         base_path: Base path for batch jobs
+
+     Returns:
+         Path to saved manifest
+     """
+     manifest.updated = datetime.utcnow()
+     manifest_path = get_manifest_path(manifest.job_id, base_path)
+     manifest_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Write to temp file first, then rename for atomicity
+     temp_fd, temp_path = tempfile.mkstemp(
+         dir=manifest_path.parent, prefix=".manifest_", suffix=".json"
+     )
+     try:
+         with os.fdopen(temp_fd, "w") as f:
+             f.write(manifest.model_dump_json(indent=2))
+         os.rename(temp_path, manifest_path)
+     except Exception:
+         # Clean up temp file on error
+         if os.path.exists(temp_path):
+             os.unlink(temp_path)
+         raise
+
+     return manifest_path
+
+
+ def load_manifest(job_id: str, base_path: str = BATCH_JOBS_BASE) -> JobManifest:
+     """Load a manifest from disk.
+
+     Args:
+         job_id: Job ID
+         base_path: Base path for batch jobs
+
+     Returns:
+         JobManifest
+
+     Raises:
+         FileNotFoundError: If manifest doesn't exist
+         ValueError: If manifest is invalid
+     """
+     manifest_path = get_manifest_path(job_id, base_path)
+     if not manifest_path.exists():
+         raise FileNotFoundError(f"Manifest not found for job: {job_id}")
+
+     with open(manifest_path) as f:
+         data = json.load(f)
+
+     return JobManifest(**data)
+
+
+ def update_manifest(
+     job_id: str, updates: dict[str, Any], base_path: str = BATCH_JOBS_BASE
+ ) -> JobManifest:
+     """Update specific fields in a manifest.
+
+     Args:
+         job_id: Job ID
+         updates: Dictionary of fields to update
+         base_path: Base path for batch jobs
+
+     Returns:
+         Updated JobManifest
+     """
+     manifest = load_manifest(job_id, base_path)
+
+     # Apply updates
+     for key, value in updates.items():
+         if hasattr(manifest, key):
+             setattr(manifest, key, value)
+         else:
+             raise ValueError(f"Unknown manifest field: {key}")
+
+     save_manifest(manifest, base_path)
+     return manifest
+
+
+ def list_jobs(
+     base_path: str = BATCH_JOBS_BASE,
+     user: str | None = None,
+     status: JobStatus | None = None,
+     pipeline: str | None = None,
+     limit: int = 50,
+ ) -> list[JobManifest]:
+     """List jobs from the batch jobs directory.
+
+     Args:
+         base_path: Base path for batch jobs
+         user: Filter by username
+         status: Filter by status
+         pipeline: Filter by pipeline type
+         limit: Maximum number of jobs to return
+
+     Returns:
+         List of JobManifest objects, sorted by created date (newest first)
+     """
+     base = Path(base_path)
+     if not base.exists():
+         return []
+
+     manifests = []
+     for job_dir in base.iterdir():
+         if not job_dir.is_dir():
+             continue
+
+         manifest_path = job_dir / "manifest.json"
+         if not manifest_path.exists():
+             continue
+
+         try:
+             with open(manifest_path) as f:
+                 data = json.load(f)
+             manifest = JobManifest(**data)
+
+             # Apply filters
+             if user and manifest.user != user:
+                 continue
+             if status and manifest.status != status:
+                 continue
+             if pipeline and manifest.pipeline != pipeline:
+                 continue
+
+             manifests.append(manifest)
+         except (json.JSONDecodeError, ValueError):
+             # Skip invalid manifests
+             continue
+
+     # Sort by created date, newest first
+     manifests.sort(key=lambda m: m.created, reverse=True)
+
+     return manifests[:limit]
+
+
+ def delete_job_directory(job_id: str, base_path: str = BATCH_JOBS_BASE) -> None:
+     """Delete a job directory and all its contents.
+
+     Args:
+         job_id: Job ID
+         base_path: Base path for batch jobs
+     """
+     import shutil
+
+     job_dir = get_job_dir(job_id, base_path)
+     if job_dir.exists():
+         shutil.rmtree(job_dir)
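
A rough sketch of the manifest lifecycle implemented above: create the job directory, write the manifest atomically, and read it back. The base path here is a hypothetical writable directory standing in for the default /primordial/.batch-jobs, and the job ID is illustrative:

    from dayhoff_tools.cli.batch.manifest import (
        BatchConfig,
        JobManifest,
        JobStatus,
        create_job_directory,
        load_manifest,
        save_manifest,
    )

    base = "/tmp/batch-jobs"            # hypothetical base path for illustration
    job_id = "dma-batch-20260109-a3f2"  # illustrative job ID
    create_job_directory(job_id, base)  # creates input/ and output/ subdirectories
    manifest = JobManifest(
        job_id=job_id,
        user="dma",
        pipeline="batch",
        batch=BatchConfig(queue="t4-1x-spot"),
    )
    save_manifest(manifest, base)       # temp-file write + rename into manifest.json
    assert load_manifest(job_id, base).status is JobStatus.PENDING
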