dayhoff-tools 1.14.1__py3-none-any.whl → 1.14.2__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- dayhoff_tools/batch/__init__.py +8 -0
- dayhoff_tools/batch/workers/__init__.py +12 -0
- dayhoff_tools/batch/workers/base.py +150 -0
- dayhoff_tools/batch/workers/boltz.py +407 -0
- dayhoff_tools/batch/workers/embed_t5.py +92 -0
- dayhoff_tools/cli/batch/__init__.py +85 -0
- dayhoff_tools/cli/batch/aws_batch.py +401 -0
- dayhoff_tools/cli/batch/commands/__init__.py +25 -0
- dayhoff_tools/cli/batch/commands/boltz.py +362 -0
- dayhoff_tools/cli/batch/commands/cancel.py +82 -0
- dayhoff_tools/cli/batch/commands/embed_t5.py +303 -0
- dayhoff_tools/cli/batch/commands/finalize.py +206 -0
- dayhoff_tools/cli/batch/commands/list_jobs.py +78 -0
- dayhoff_tools/cli/batch/commands/local.py +95 -0
- dayhoff_tools/cli/batch/commands/logs.py +142 -0
- dayhoff_tools/cli/batch/commands/retry.py +142 -0
- dayhoff_tools/cli/batch/commands/status.py +214 -0
- dayhoff_tools/cli/batch/commands/submit.py +215 -0
- dayhoff_tools/cli/batch/job_id.py +151 -0
- dayhoff_tools/cli/batch/manifest.py +293 -0
- dayhoff_tools/cli/engines_studios/engine-studio-cli.md +26 -21
- dayhoff_tools/cli/engines_studios/engine_commands.py +16 -89
- dayhoff_tools/cli/engines_studios/ssh_config.py +96 -0
- dayhoff_tools/cli/engines_studios/studio_commands.py +13 -2
- dayhoff_tools/cli/main.py +14 -0
- {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/METADATA +6 -1
- {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/RECORD +29 -8
- {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.14.1.dist-info → dayhoff_tools-1.14.2.dist-info}/entry_points.txt +0 -0
dayhoff_tools/cli/batch/commands/submit.py (new file)
@@ -0,0 +1,215 @@
+"""Submit command for generic batch jobs."""
+
+import os
+from pathlib import Path
+
+import click
+import yaml
+
+from ..aws_batch import BatchClient, BatchError
+from ..job_id import generate_job_id
+from ..manifest import (
+    BATCH_JOBS_BASE,
+    BatchConfig,
+    InputConfig,
+    JobManifest,
+    JobStatus,
+    create_job_directory,
+    save_manifest,
+)
+
+
+# Default job definition for generic jobs
+DEFAULT_JOB_DEFINITION = "dayhoff-batch-base"
+DEFAULT_QUEUE = "t4-1x-spot"
+
+
+@click.command()
+@click.option("-f", "--file", "config_file", type=click.Path(exists=True), help="Config file path")
+@click.option("--command", help="Command to run (alternative to config file)")
+@click.option("--queue", default=DEFAULT_QUEUE, help=f"Batch queue [default: {DEFAULT_QUEUE}]")
+@click.option("--memory", default="30G", help="Memory limit (e.g., 30G)")
+@click.option("--vcpus", default=8, type=int, help="Number of vCPUs")
+@click.option("--gpus", default=1, type=int, help="Number of GPUs")
+@click.option("--array", default=1, type=int, help="Number of array tasks")
+@click.option("--retry", default=3, type=int, help="Retry attempts")
+@click.option("--timeout", default="6h", help="Job timeout (e.g., 6h, 1d)")
+@click.option("--image", help="Pre-built image URI")
+@click.option("--env", multiple=True, help="Environment variables (KEY=VALUE)")
+@click.option("--dry-run", is_flag=True, help="Show plan without submitting")
+@click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
+def submit(
+    config_file,
+    command,
+    queue,
+    memory,
+    vcpus,
+    gpus,
+    array,
+    retry,
+    timeout,
+    image,
+    env,
+    dry_run,
+    base_path,
+):
+    """Submit a custom batch job.
+
+    Jobs can be defined via a config file (-f) or inline options.
+
+    \b
+    Examples:
+        # Submit from config file
+        dh batch submit -f config.yaml
+
+        # Submit with inline command
+        dh batch submit --command "python train.py --epochs 100" --queue a10g-1x-spot
+
+        # Array job
+        dh batch submit -f config.yaml --array 10
+
+    \b
+    Config file format (YAML):
+        command: python scripts/train.py --epochs 100
+        queue: t4-1x-spot
+        memory: 30G
+        vcpus: 8
+        gpus: 1
+        array: 10
+        retry: 3
+        timeout: 6h
+        image: custom-image:tag
+        env:
+          MY_VAR: value
+    """
+    # Parse config file if provided
+    config = {}
+    if config_file:
+        with open(config_file) as f:
+            config = yaml.safe_load(f)
+
+    # Override with command-line options
+    job_command = command or config.get("command")
+    if not job_command:
+        raise click.UsageError("Must specify --command or provide config file with 'command' field")
+
+    job_queue = queue if queue != DEFAULT_QUEUE else config.get("queue", queue)
+    job_memory = memory if memory != "30G" else config.get("memory", memory)
+    job_vcpus = vcpus if vcpus != 8 else config.get("vcpus", vcpus)
+    job_gpus = gpus if gpus != 1 else config.get("gpus", gpus)
+    job_array = array if array != 1 else config.get("array", array)
+    job_retry = retry if retry != 3 else config.get("retry", retry)
+    job_timeout = timeout if timeout != "6h" else config.get("timeout", timeout)
+    job_image = image or config.get("image")
+
+    # Parse environment variables
+    job_env = dict(config.get("env", {}))
+    for e in env:
+        if "=" in e:
+            key, value = e.split("=", 1)
+            job_env[key] = value
+
+    # Generate job ID
+    job_id = generate_job_id("batch")
+
+    # Parse timeout
+    timeout_seconds = _parse_timeout(job_timeout)
+
+    # Show plan
+    click.echo()
+    click.echo(f"Job ID: {job_id}")
+    click.echo(f"Command: {job_command}")
+    click.echo(f"Queue: {job_queue}")
+    click.echo(f"Resources: {job_vcpus} vCPUs, {job_memory} memory, {job_gpus} GPUs")
+    click.echo(f"Array Size: {job_array}")
+    click.echo(f"Retry: {job_retry}")
+    click.echo(f"Timeout: {job_timeout} ({timeout_seconds}s)")
+    if job_image:
+        click.echo(f"Image: {job_image}")
+    if job_env:
+        click.echo(f"Environment: {len(job_env)} variables")
+
+    if dry_run:
+        click.echo()
+        click.echo(click.style("Dry run - job not submitted", fg="yellow"))
+        return
+
+    click.echo()
+
+    # Create job directory and manifest
+    job_dir = create_job_directory(job_id, base_path)
+    click.echo(f"Created job directory: {job_dir}")
+
+    manifest = JobManifest(
+        job_id=job_id,
+        user=job_id.split("-")[0],  # Extract username from job ID
+        pipeline="batch",
+        status=JobStatus.PENDING,
+        command=job_command,
+        image_uri=job_image,
+        batch=BatchConfig(
+            queue=job_queue,
+            array_size=job_array if job_array > 1 else None,
+        ),
+    )
+
+    # Submit to AWS Batch
+    try:
+        client = BatchClient()
+
+        # Prepare environment
+        submit_env = {
+            "JOB_DIR": str(job_dir),
+            "JOB_ID": job_id,
+            **job_env,
+        }
+
+        batch_job_id = client.submit_job(
+            job_name=job_id,
+            job_definition=job_image or DEFAULT_JOB_DEFINITION,
+            job_queue=job_queue,
+            array_size=job_array if job_array > 1 else None,
+            environment=submit_env,
+            timeout_seconds=timeout_seconds,
+            retry_attempts=job_retry,
+        )
+
+        # Update manifest with Batch job ID
+        manifest.status = JobStatus.SUBMITTED
+        manifest.batch.job_id = batch_job_id
+        save_manifest(manifest, base_path)
+
+        click.echo(click.style("✓ Job submitted successfully!", fg="green"))
+        click.echo()
+        click.echo(f"AWS Batch Job ID: {batch_job_id}")
+        click.echo()
+        click.echo("Next steps:")
+        click.echo(f" Check status: dh batch status {job_id}")
+        click.echo(f" View logs: dh batch logs {job_id}")
+        click.echo(f" Cancel: dh batch cancel {job_id}")
+
+    except BatchError as e:
+        manifest.status = JobStatus.FAILED
+        manifest.error_message = str(e)
+        save_manifest(manifest, base_path)
+        click.echo(click.style(f"✗ Failed to submit job: {e}", fg="red"), err=True)
+        raise SystemExit(1)
+
+
+def _parse_timeout(timeout_str: str) -> int:
+    """Parse timeout string to seconds.
+
+    Supports formats like: 6h, 1d, 30m, 3600
+    """
+    timeout_str = timeout_str.strip().lower()
+
+    if timeout_str.endswith("h"):
+        return int(timeout_str[:-1]) * 3600
+    elif timeout_str.endswith("d"):
+        return int(timeout_str[:-1]) * 86400
+    elif timeout_str.endswith("m"):
+        return int(timeout_str[:-1]) * 60
+    elif timeout_str.endswith("s"):
+        return int(timeout_str[:-1])
+    else:
+        return int(timeout_str)
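Two behaviors above are worth spelling out: each CLI flag overrides the config file only when it differs from its built-in default (otherwise the config value wins, falling back to the default), and `_parse_timeout` accepts a unit suffix or bare seconds. A minimal sketch of the expected conversions, importing the private helper purely for illustration:

```python
# Illustrative only; assumes the module path shown in the file list above.
from dayhoff_tools.cli.batch.commands.submit import _parse_timeout

assert _parse_timeout("6h") == 6 * 3600   # hours -> seconds
assert _parse_timeout("1d") == 86400      # days -> seconds
assert _parse_timeout("30m") == 1800      # minutes -> seconds
assert _parse_timeout("45s") == 45        # explicit seconds suffix
assert _parse_timeout("3600") == 3600     # bare integer treated as seconds
```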
dayhoff_tools/cli/batch/job_id.py (new file)
@@ -0,0 +1,151 @@
+"""Job ID generation for batch jobs.
+
+Job IDs follow the format: {username}-{pipeline}-{YYYYMMDD}-{random4}
+Examples:
+- dma-embed-20260109-a3f2
+- josh-boltz-20260109-b7c1
+- sam-batch-20260109-c9d2 (for generic submit jobs)
+"""
+
+import json
+import os
+import secrets
+import subprocess
+from datetime import datetime
+from functools import lru_cache
+
+
+class JobIdError(Exception):
+    """Error generating job ID."""
+
+    pass
+
+
+@lru_cache(maxsize=1)
+def get_aws_username() -> str:
+    """Extract username from AWS SSO session.
+
+    Attempts multiple methods in order:
+    1. AWS_SSO_USER environment variable (if set by dh aws login)
+    2. Parse from `aws sts get-caller-identity` ARN
+
+    Returns:
+        Username string (lowercase, alphanumeric only)
+
+    Raises:
+        JobIdError: If username cannot be determined
+    """
+    # Method 1: Check environment variable (fastest)
+    env_user = os.environ.get("AWS_SSO_USER")
+    if env_user:
+        return _sanitize_username(env_user)
+
+    # Method 2: Parse from STS caller identity
+    try:
+        result = subprocess.run(
+            ["aws", "sts", "get-caller-identity", "--output", "json"],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        if result.returncode == 0:
+            identity = json.loads(result.stdout)
+            arn = identity.get("Arn", "")
+            # ARN format: arn:aws:sts::ACCOUNT:assumed-role/AWSReservedSSO_ROLE/username
+            # or: arn:aws:iam::ACCOUNT:user/username
+            if "/AWSReservedSSO_" in arn:
+                # SSO assumed role - username is last part
+                username = arn.split("/")[-1]
+                return _sanitize_username(username)
+            elif ":user/" in arn:
+                # IAM user
+                username = arn.split("/")[-1]
+                return _sanitize_username(username)
+    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
+        pass
+
+    # Method 3: Fall back to system username
+    import getpass
+
+    try:
+        username = getpass.getuser()
+        return _sanitize_username(username)
+    except Exception:
+        pass
+
+    raise JobIdError(
+        "Could not determine AWS username. "
+        "Ensure you're logged in with 'dh aws login' or set AWS_SSO_USER environment variable."
+    )
+
+
+def _sanitize_username(username: str) -> str:
+    """Sanitize username to be safe for job IDs.
+
+    - Convert to lowercase
+    - Keep only alphanumeric characters
+    - Truncate to 20 characters
+    """
+    sanitized = "".join(c for c in username.lower() if c.isalnum())
+    return sanitized[:20] if sanitized else "unknown"
+
+
+def generate_job_id(pipeline: str = "batch") -> str:
+    """Generate a unique job ID.
+
+    Args:
+        pipeline: Pipeline type (e.g., 'embed', 'boltz', 'batch')
+
+    Returns:
+        Job ID in format: {username}-{pipeline}-{YYYYMMDD}-{random4}
+
+    Examples:
+        >>> generate_job_id("embed")
+        'dma-embed-20260109-a3f2'
+        >>> generate_job_id()
+        'dma-batch-20260109-b7c1'
+    """
+    username = get_aws_username()
+    date_str = datetime.now().strftime("%Y%m%d")
+    random_suffix = secrets.token_hex(2)  # 4 hex characters
+
+    # Sanitize pipeline name
+    pipeline_clean = "".join(c for c in pipeline.lower() if c.isalnum())[:10]
+
+    return f"{username}-{pipeline_clean}-{date_str}-{random_suffix}"
+
+
+def parse_job_id(job_id: str) -> dict:
+    """Parse a job ID into its components.
+
+    Args:
+        job_id: Job ID string
+
+    Returns:
+        Dictionary with keys: username, pipeline, date, suffix
+
+    Raises:
+        ValueError: If job ID format is invalid
+    """
+    parts = job_id.split("-")
+    if len(parts) < 4:
+        raise ValueError(f"Invalid job ID format: {job_id}")
+
+    # Handle usernames with dashes by taking last 3 parts as known components
+    suffix = parts[-1]
+    date_str = parts[-2]
+    pipeline = parts[-3]
+    username = "-".join(parts[:-3])
+
+    # Validate date format
+    try:
+        datetime.strptime(date_str, "%Y%m%d")
+    except ValueError:
+        raise ValueError(f"Invalid date in job ID: {date_str}")
+
+    return {
+        "username": username,
+        "pipeline": pipeline,
+        "date": date_str,
+        "suffix": suffix,
+    }
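A hypothetical round trip through the two public helpers, assuming `AWS_SSO_USER` is set so `get_aws_username` short-circuits without shelling out to the AWS CLI:

```python
import os

from dayhoff_tools.cli.batch.job_id import generate_job_id, parse_job_id

os.environ["AWS_SSO_USER"] = "dma"  # skips the STS lookup entirely

job_id = generate_job_id("embed")   # e.g. "dma-embed-20260109-a3f2"
parts = parse_job_id(job_id)
assert parts["username"] == "dma"
assert parts["pipeline"] == "embed"
assert len(parts["suffix"]) == 4    # secrets.token_hex(2) -> 4 hex chars
```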
dayhoff_tools/cli/batch/manifest.py (new file)
@@ -0,0 +1,293 @@
+"""Manifest management for batch jobs.
+
+Manifests are JSON files stored in Primordial that track job metadata,
+status, and configuration. They provide the single source of truth for
+job state.
+"""
+
+import json
+import os
+import tempfile
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class JobStatus(str, Enum):
+    """Possible job statuses."""
+
+    PENDING = "pending"
+    SUBMITTED = "submitted"
+    RUNNING = "running"
+    SUCCEEDED = "succeeded"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+    FINALIZING = "finalizing"
+    FINALIZED = "finalized"
+
+
+class InputConfig(BaseModel):
+    """Configuration for job input."""
+
+    source: str = Field(..., description="Path to input file or directory")
+    num_sequences: int | None = Field(None, description="Number of sequences (for FASTA)")
+    num_chunks: int | None = Field(None, description="Number of chunks created")
+    sequences_per_chunk: int | None = Field(None, description="Sequences per chunk")
+
+
+class BatchConfig(BaseModel):
+    """AWS Batch job configuration."""
+
+    job_id: str | None = Field(None, description="AWS Batch job ID")
+    job_definition: str | None = Field(None, description="Job definition name:revision")
+    queue: str = Field(..., description="Batch queue name")
+    array_size: int | None = Field(None, description="Array job size")
+
+
+class OutputConfig(BaseModel):
+    """Configuration for job output."""
+
+    destination: str | None = Field(None, description="Final output path")
+    finalized: bool = Field(False, description="Whether output has been finalized")
+
+
+class RetryInfo(BaseModel):
+    """Information about a retry attempt."""
+
+    retry_id: str = Field(..., description="Retry job ID")
+    indices: list[int] = Field(..., description="Array indices being retried")
+    batch_job_id: str | None = Field(None, description="AWS Batch job ID for retry")
+    created: datetime = Field(default_factory=datetime.utcnow)
+
+
+class JobManifest(BaseModel):
+    """Complete manifest for a batch job."""
+
+    job_id: str = Field(..., description="Job ID")
+    user: str = Field(..., description="Username who submitted the job")
+    pipeline: str = Field(..., description="Pipeline type (embed-t5, boltz, batch)")
+    status: JobStatus = Field(JobStatus.PENDING, description="Current job status")
+    created: datetime = Field(default_factory=datetime.utcnow)
+    updated: datetime = Field(default_factory=datetime.utcnow)
+
+    input: InputConfig | None = Field(None, description="Input configuration")
+    batch: BatchConfig | None = Field(None, description="Batch job configuration")
+    output: OutputConfig | None = Field(None, description="Output configuration")
+
+    retries: list[RetryInfo] = Field(default_factory=list, description="Retry history")
+
+    # Additional metadata
+    image_uri: str | None = Field(None, description="Container image URI")
+    command: str | None = Field(None, description="Command to run")
+    error_message: str | None = Field(None, description="Error message if failed")
+
+    class Config:
+        json_encoders = {datetime: lambda v: v.isoformat()}
+
+
+# Default base path for job data
+BATCH_JOBS_BASE = "/primordial/.batch-jobs"
+
+
+def get_job_dir(job_id: str, base_path: str = BATCH_JOBS_BASE) -> Path:
+    """Get the directory path for a job.
+
+    Args:
+        job_id: Job ID
+        base_path: Base path for batch jobs (default: /primordial/.batch-jobs)
+
+    Returns:
+        Path to job directory
+    """
+    return Path(base_path) / job_id
+
+
+def get_manifest_path(job_id: str, base_path: str = BATCH_JOBS_BASE) -> Path:
+    """Get the manifest file path for a job.
+
+    Args:
+        job_id: Job ID
+        base_path: Base path for batch jobs
+
+    Returns:
+        Path to manifest.json
+    """
+    return get_job_dir(job_id, base_path) / "manifest.json"
+
+
+def create_job_directory(job_id: str, base_path: str = BATCH_JOBS_BASE) -> Path:
+    """Create the directory structure for a new job.
+
+    Creates:
+    - {base_path}/{job_id}/
+    - {base_path}/{job_id}/input/
+    - {base_path}/{job_id}/output/
+
+    Args:
+        job_id: Job ID
+        base_path: Base path for batch jobs
+
+    Returns:
+        Path to job directory
+    """
+    job_dir = get_job_dir(job_id, base_path)
+    (job_dir / "input").mkdir(parents=True, exist_ok=True)
+    (job_dir / "output").mkdir(parents=True, exist_ok=True)
+    return job_dir
+
+
+def save_manifest(manifest: JobManifest, base_path: str = BATCH_JOBS_BASE) -> Path:
+    """Save a manifest to disk atomically.
+
+    Uses write-to-temp-then-rename for atomicity to prevent corruption
+    if interrupted.
+
+    Args:
+        manifest: JobManifest to save
+        base_path: Base path for batch jobs
+
+    Returns:
+        Path to saved manifest
+    """
+    manifest.updated = datetime.utcnow()
+    manifest_path = get_manifest_path(manifest.job_id, base_path)
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Write to temp file first, then rename for atomicity
+    temp_fd, temp_path = tempfile.mkstemp(
+        dir=manifest_path.parent, prefix=".manifest_", suffix=".json"
+    )
+    try:
+        with os.fdopen(temp_fd, "w") as f:
+            f.write(manifest.model_dump_json(indent=2))
+        os.rename(temp_path, manifest_path)
+    except Exception:
+        # Clean up temp file on error
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)
+        raise
+
+    return manifest_path
+
+
+def load_manifest(job_id: str, base_path: str = BATCH_JOBS_BASE) -> JobManifest:
+    """Load a manifest from disk.
+
+    Args:
+        job_id: Job ID
+        base_path: Base path for batch jobs
+
+    Returns:
+        JobManifest
+
+    Raises:
+        FileNotFoundError: If manifest doesn't exist
+        ValueError: If manifest is invalid
+    """
+    manifest_path = get_manifest_path(job_id, base_path)
+    if not manifest_path.exists():
+        raise FileNotFoundError(f"Manifest not found for job: {job_id}")
+
+    with open(manifest_path) as f:
+        data = json.load(f)
+
+    return JobManifest(**data)
+
+
+def update_manifest(
+    job_id: str, updates: dict[str, Any], base_path: str = BATCH_JOBS_BASE
+) -> JobManifest:
+    """Update specific fields in a manifest.
+
+    Args:
+        job_id: Job ID
+        updates: Dictionary of fields to update
+        base_path: Base path for batch jobs
+
+    Returns:
+        Updated JobManifest
+    """
+    manifest = load_manifest(job_id, base_path)
+
+    # Apply updates
+    for key, value in updates.items():
+        if hasattr(manifest, key):
+            setattr(manifest, key, value)
+        else:
+            raise ValueError(f"Unknown manifest field: {key}")
+
+    save_manifest(manifest, base_path)
+    return manifest
+
+
+def list_jobs(
+    base_path: str = BATCH_JOBS_BASE,
+    user: str | None = None,
+    status: JobStatus | None = None,
+    pipeline: str | None = None,
+    limit: int = 50,
+) -> list[JobManifest]:
+    """List jobs from the batch jobs directory.
+
+    Args:
+        base_path: Base path for batch jobs
+        user: Filter by username
+        status: Filter by status
+        pipeline: Filter by pipeline type
+        limit: Maximum number of jobs to return
+
+    Returns:
+        List of JobManifest objects, sorted by created date (newest first)
+    """
+    base = Path(base_path)
+    if not base.exists():
+        return []
+
+    manifests = []
+    for job_dir in base.iterdir():
+        if not job_dir.is_dir():
+            continue
+
+        manifest_path = job_dir / "manifest.json"
+        if not manifest_path.exists():
+            continue
+
+        try:
+            with open(manifest_path) as f:
+                data = json.load(f)
+            manifest = JobManifest(**data)
+
+            # Apply filters
+            if user and manifest.user != user:
+                continue
+            if status and manifest.status != status:
+                continue
+            if pipeline and manifest.pipeline != pipeline:
+                continue
+
+            manifests.append(manifest)
+        except (json.JSONDecodeError, ValueError):
+            # Skip invalid manifests
+            continue
+
+    # Sort by created date, newest first
+    manifests.sort(key=lambda m: m.created, reverse=True)
+
+    return manifests[:limit]
+
+
+def delete_job_directory(job_id: str, base_path: str = BATCH_JOBS_BASE) -> None:
+    """Delete a job directory and all its contents.
+
+    Args:
+        job_id: Job ID
+        base_path: Base path for batch jobs
+    """
+    import shutil
+
+    job_dir = get_job_dir(job_id, base_path)
+    if job_dir.exists():
+        shutil.rmtree(job_dir)
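A sketch of the manifest lifecycle these helpers imply, with `base_path` pointed at a scratch directory so nothing is written under /primordial (the default `BATCH_JOBS_BASE`); the job ID and values are illustrative:

```python
from dayhoff_tools.cli.batch.manifest import (
    BatchConfig,
    JobManifest,
    JobStatus,
    create_job_directory,
    list_jobs,
    save_manifest,
    update_manifest,
)

base = "/tmp/batch-jobs"  # scratch location for illustration

manifest = JobManifest(
    job_id="dma-batch-20260109-c9d2",
    user="dma",
    pipeline="batch",
    batch=BatchConfig(queue="t4-1x-spot"),
)
create_job_directory(manifest.job_id, base)  # makes input/ and output/
save_manifest(manifest, base)                # atomic write of manifest.json

# Later: flip the status and query it back.
update_manifest(manifest.job_id, {"status": JobStatus.RUNNING}, base)
running = list_jobs(base, user="dma", status=JobStatus.RUNNING)
assert running[0].job_id == manifest.job_id
```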