dayhoff-tools 1.1.10__py3-none-any.whl → 1.13.12__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
- dayhoff_tools/__init__.py +10 -0
- dayhoff_tools/cli/cloud_commands.py +179 -43
- dayhoff_tools/cli/engine1/__init__.py +323 -0
- dayhoff_tools/cli/engine1/engine_core.py +703 -0
- dayhoff_tools/cli/engine1/engine_lifecycle.py +136 -0
- dayhoff_tools/cli/engine1/engine_maintenance.py +431 -0
- dayhoff_tools/cli/engine1/engine_management.py +505 -0
- dayhoff_tools/cli/engine1/shared.py +501 -0
- dayhoff_tools/cli/engine1/studio_commands.py +825 -0
- dayhoff_tools/cli/engines_studios/__init__.py +6 -0
- dayhoff_tools/cli/engines_studios/api_client.py +351 -0
- dayhoff_tools/cli/engines_studios/auth.py +144 -0
- dayhoff_tools/cli/engines_studios/engine-studio-cli.md +1230 -0
- dayhoff_tools/cli/engines_studios/engine_commands.py +1151 -0
- dayhoff_tools/cli/engines_studios/progress.py +260 -0
- dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
- dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
- dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
- dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
- dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +476 -0
- dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
- dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
- dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
- dayhoff_tools/cli/engines_studios/studio_commands.py +755 -0
- dayhoff_tools/cli/main.py +106 -7
- dayhoff_tools/cli/utility_commands.py +896 -179
- dayhoff_tools/deployment/base.py +70 -6
- dayhoff_tools/deployment/deploy_aws.py +165 -25
- dayhoff_tools/deployment/deploy_gcp.py +78 -5
- dayhoff_tools/deployment/deploy_utils.py +20 -7
- dayhoff_tools/deployment/job_runner.py +9 -4
- dayhoff_tools/deployment/processors.py +230 -418
- dayhoff_tools/deployment/swarm.py +47 -12
- dayhoff_tools/embedders.py +28 -26
- dayhoff_tools/fasta.py +181 -64
- dayhoff_tools/warehouse.py +268 -1
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/METADATA +20 -5
- dayhoff_tools-1.13.12.dist-info/RECORD +54 -0
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/WHEEL +1 -1
- dayhoff_tools-1.1.10.dist-info/RECORD +0 -32
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/entry_points.txt +0 -0
dayhoff_tools/deployment/base.py
CHANGED
@@ -128,8 +128,9 @@ def build_job_image(config: dict) -> str:

     This function handles the complete image building process:
     1. Ensures we're in the repo root
-    2.
-    3.
+    2. Cleans Docker config to avoid credential helper conflicts
+    3. Constructs the image URI based on config
+    4. Builds the image using docker build

     Args:
         config: Dictionary containing the configuration loaded from YAML.
@@ -146,13 +147,20 @@ def build_job_image(config: dict) -> str:
     """
     move_to_repo_root()

+    # Clean Docker config to avoid VS Code dev container credential helper conflicts
+    from dayhoff_tools.deployment.deploy_utils import clean_docker_config
+
+    clean_docker_config()
+
     # Get image URI
     image_uri = _build_image_uri(config)
     docker_config = config["docker"]

     print("\nBuilding Docker image: ", image_uri)
     print(f"Using Dockerfile: {docker_config['dockerfile']}")
-    print(f"Using shared memory: {docker_config['shared_memory']}
+    print(f"Using shared memory: {docker_config['shared_memory']}")
+    platform = docker_config.get("platform", "linux/amd64")
+    print(f"Building for platform: {platform}\n")

     # Build the image
     build_image_command = [
@@ -163,8 +171,20 @@ def build_job_image(config: dict) -> str:
         docker_config["dockerfile"],
         "-t",
         image_uri,
-        ".",  # Use the root of the repo as image context
     ]
+
+    # Add platform specification if provided, default to linux/amd64 for cloud deployments
+    platform = docker_config.get("platform", "linux/amd64")
+    build_image_command.extend(["--platform", platform])
+
+    # Add build args if provided (for parameterized Dockerfiles)
+    build_args = docker_config.get("build_args", {})
+    for arg_name, arg_value in build_args.items():
+        build_image_command.extend(["--build-arg", f"{arg_name}={arg_value}"])
+
+    # Add build context (defaults to "." for backward compatibility)
+    build_context = docker_config.get("build_context", ".")
+    build_image_command.append(build_context)
     subprocess.run(build_image_command, check=True)

     # Get and print image size
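Note: the `docker` config section now accepts optional `platform`, `build_args`, and `build_context` keys. A minimal sketch of how these feed into the final build command, using hypothetical values (the image tag, Dockerfile path, and build args below are illustrative, and the leading `docker build -f` elements are assumed from context rather than shown in this hunk):

    # Illustrative values only; the key names and defaults mirror the hunk above.
    docker_config = {
        "dockerfile": "docker/job.Dockerfile",
        "shared_memory": "8g",
        "platform": "linux/amd64",                          # optional; defaults to linux/amd64
        "build_args": {"BASE_IMAGE": "python:3.11-slim"},   # optional
        "build_context": ".",                               # optional; defaults to "."
    }

    build_image_command = ["docker", "build", "-f", docker_config["dockerfile"], "-t", "demo:latest"]
    build_image_command.extend(["--platform", docker_config.get("platform", "linux/amd64")])
    for arg_name, arg_value in docker_config.get("build_args", {}).items():
        build_image_command.extend(["--build-arg", f"{arg_name}={arg_value}"])
    build_image_command.append(docker_config.get("build_context", "."))
    print(" ".join(build_image_command))
    # docker build -f docker/job.Dockerfile -t demo:latest --platform linux/amd64
    #   --build-arg BASE_IMAGE=python:3.11-slim .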
@@ -281,7 +301,7 @@ def run_container(config: dict, image_uri: str, mode: str) -> None:
     4. Handles container logs for detached mode

     The container name is generated using:
-    - Username (from LOCAL_USER env var)
+    - Username (from LOCAL_USER or USER env var)
     - Timestamp (YYYYMMDD_HHMMSS format)

     Args:
@@ -299,7 +319,7 @@ def run_container(config: dict, image_uri: str, mode: str) -> None:
     )

     # Generate unique container name
-    username = os.getenv("LOCAL_USER", "unknown_user")
+    username = os.getenv("LOCAL_USER") or os.getenv("USER", "unknown_user")
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     container_name = f"{username}_job_{timestamp}"

@@ -394,6 +414,50 @@ def deploy(
         get_boto_session(config)
         print("AWS credentials verified.")

+    # Check GCP credentials early if using GCP
+    elif cloud == "gcp":
+        print("\nVerifying GCP credentials...")
+        from dayhoff_tools.cli.cloud_commands import (
+            _is_adc_authenticated,
+            _is_gcp_user_authenticated,
+        )
+
+        user_creds_valid = _is_gcp_user_authenticated()
+        adc_creds_valid = _is_adc_authenticated()
+
+        if not user_creds_valid:
+            print(
+                "\n⚠️ Warning: Your GCP user credentials appear to be stale/expired."
+            )
+            print(
+                " This may cause authentication issues when deploying to GCP Batch."
+            )
+            print(" Consider running 'dh gcp login' to refresh your credentials.")
+
+        if not adc_creds_valid:
+            print(
+                "\n⚠️ Warning: Your Application Default Credentials (ADC) appear to be stale/expired."
+            )
+            print(
+                " This may cause authentication issues with API client libraries."
+            )
+            print(
+                " Consider running 'dh gcp use-user-adc' or 'dh gcp use-devcon-adc' to refresh your ADC."
+            )
+
+        if user_creds_valid and adc_creds_valid:
+            print("GCP credentials verified.")
+        else:
+            # Ask for confirmation before proceeding with possibly invalid credentials
+            proceed = (
+                input("\nProceed with potentially invalid credentials? (y/n): ")
+                .lower()
+                .strip()
+            )
+            if proceed != "y":
+                print("Deployment aborted.")
+                return
+
     # Track if we built a new image
     had_image_uri = bool(config["docker"]["image_uri"])

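The `_is_gcp_user_authenticated` and `_is_adc_authenticated` helpers are imported from `dayhoff_tools/cli/cloud_commands.py`, whose implementation is not shown in this hunk. Purely as a sketch of what an ADC freshness probe can look like with the standard `google-auth` library (not necessarily how the package implements it):

    # Hypothetical sketch only -- the real check lives in cloud_commands.py and may differ.
    import google.auth
    import google.auth.transport.requests


    def adc_looks_valid() -> bool:
        """Return True if Application Default Credentials can be loaded and refreshed."""
        try:
            credentials, _project = google.auth.default()
            credentials.refresh(google.auth.transport.requests.Request())
            return credentials.valid
        except Exception:
            return False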
dayhoff_tools/deployment/deploy_aws.py
CHANGED
@@ -286,15 +286,15 @@ def create_or_update_job_definition(
         raise ValueError("docker.container_entrypoint is required in configuration")

     # Create linux parameters with devices
-    linux_params: dict[str, Any] = {
-
+    linux_params: dict[str, Any] = {}
+    if compute_specs.get("gpus", 0) > 0:
+        linux_params["devices"] = [
             {
                 "hostPath": "/dev/nvidia0",
                 "containerPath": "/dev/nvidia0",
                 "permissions": ["READ", "WRITE"],
             },
-        ]
-    }
+        ]

     # Add shared memory configuration if specified in docker config
     if "shared_memory" in config.get("docker", {}):
@@ -318,6 +318,82 @@ def create_or_update_job_definition(
         linux_params["sharedMemorySize"] = shared_memory_mib
         print(f"Setting shared memory size to {shared_memory_mib} MiB")

+    # Prepare containerProperties
+    container_properties = {
+        "image": image_uri,
+        "vcpus": compute_specs["vcpus"],
+        "memory": compute_specs["memory"],
+        "resourceRequirements": gpu_requirements,
+        "executionRoleArn": aws_config["execution_role_arn"],
+        "jobRoleArn": aws_config["job_role_arn"],
+        "privileged": compute_specs.get("gpus", 0) > 0,
+        "command": entrypoint_command,
+    }
+
+    if linux_params:
+        container_properties["linuxParameters"] = linux_params
+
+    # Add volumes and mount points if defined in AWS batch_job config
+    batch_job_config = aws_config.get("batch_job", {})
+    if "volumes" in batch_job_config:
+        container_properties["volumes"] = batch_job_config["volumes"]
+        print(f"Adding volumes to job definition: {batch_job_config['volumes']}")
+    if "mountPoints" in batch_job_config:
+        container_properties["mountPoints"] = batch_job_config["mountPoints"]
+        print(
+            f"Adding mount points to job definition: {batch_job_config['mountPoints']}"
+        )
+
+    # Mount Primordial Drive if explicitly enabled via feature flag
+    # Add 'mount_primordial_drive' to features list to mount shared EFS at /primordial/
+    features = config.get("features", []) or []
+    features_set = {f if isinstance(f, str) else next(iter(f)) for f in features}
+    mount_primordial = "mount_primordial_drive" in features_set
+
+    if mount_primordial:
+        primordial_fs_id = get_primordial_fs_id(session)
+        if primordial_fs_id:
+            print(f"Adding Primordial Drive configuration (fs_id: {primordial_fs_id})")
+
+            # Add volume configuration
+            efs_volume = {
+                "name": "primordial",
+                "efsVolumeConfiguration": {
+                    "fileSystemId": primordial_fs_id,
+                    "rootDirectory": "/",
+                },
+            }
+
+            if "volumes" not in container_properties:
+                container_properties["volumes"] = []
+
+            # Check if already added to avoid duplicates
+            if not any(
+                v.get("name") == "primordial" for v in container_properties["volumes"]
+            ):
+                container_properties["volumes"].append(efs_volume)
+
+            # Add mount point
+            mount_point = {
+                "sourceVolume": "primordial",
+                "containerPath": "/primordial",
+                "readOnly": False,
+            }
+
+            if "mountPoints" not in container_properties:
+                container_properties["mountPoints"] = []
+
+            # Check if already added
+            if not any(
+                mp.get("containerPath") == "/primordial"
+                for mp in container_properties["mountPoints"]
+            ):
+                container_properties["mountPoints"].append(mount_point)
+        else:
+            print(
+                "Warning: mount_primordial_drive enabled but Primordial Drive not found in this environment"
+            )
+
     # Check if job definition already exists using the session client
     try:
         existing = batch.describe_job_definitions(
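The Primordial Drive mount is opt-in via the top-level `features` list, and the set comprehension above accepts both bare strings and single-key mappings. A small self-contained illustration with invented feature entries:

    # Hypothetical config values; only the "features" handling mirrors the diff above.
    config = {
        "features": [
            "mount_primordial_drive",             # bare string entry
            {"some_other_feature": {"opt": 1}},   # single-key mapping entry
        ],
    }

    features = config.get("features", []) or []
    features_set = {f if isinstance(f, str) else next(iter(f)) for f in features}
    assert features_set == {"mount_primordial_drive", "some_other_feature"}
    assert "mount_primordial_drive" in features_set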
@@ -330,41 +406,67 @@ def create_or_update_job_definition(
             print(f"\nCreating new job definition: {job_def_name}")

     except batch.exceptions.ClientError as e:
-
-
-
-            e
-        ):  #
+        if e.response.get("Error", {}).get(
+            "Code"
+        ) == "ClientError" and "JobDefinitionNotFoundException" in str(
+            e
+        ):  # More specific check for not found
             print(f"\nCreating new job definition: {job_def_name}")
         else:
-            # Re-raise unexpected client errors
             raise

-    # Prepare job definition
+    # Prepare job definition arguments
     job_definition_args = {
         "jobDefinitionName": job_def_name,
         "type": "container",
-        "containerProperties":
-            "image": image_uri,
-            "vcpus": compute_specs["vcpus"],
-            "memory": compute_specs["memory"],
-            "resourceRequirements": gpu_requirements,
-            "executionRoleArn": aws_config["execution_role_arn"],
-            "jobRoleArn": aws_config["job_role_arn"],
-            "privileged": compute_specs.get("gpus", 0) > 0,
-            "command": entrypoint_command,
-            **({"linuxParameters": linux_params} if linux_params else {}),
-        },
+        "containerProperties": container_properties,
         "platformCapabilities": ["EC2"],
         "timeout": {"attemptDurationSeconds": aws_config.get("timeout_seconds", 86400)},
     }

+    # Add tags if specified in config
+    if "tags" in aws_config:
+        job_definition_args["tags"] = aws_config["tags"]
+        print(f"Adding tags to job definition: {aws_config['tags']}")
+
     # Register new revision using the session client
     response = batch.register_job_definition(**job_definition_args)

     return response["jobDefinitionName"]


+def get_primordial_fs_id(session: boto3.Session) -> Optional[str]:
+    """Fetch Primordial Drive EFS ID from SSM.
+
+    Args:
+        session: Boto3 session
+
+    Returns:
+        FileSystemId if found, None otherwise
+    """
+    ssm = session.client("ssm")
+
+    # Determine environment from profile name
+    # Default to dev if cannot determine
+    env = "dev"
+    if session.profile_name and "sand" in session.profile_name:
+        env = "sand"
+
+    param_name = f"/{env}/primordial/fs_id"
+
+    try:
+        response = ssm.get_parameter(Name=param_name)
+        return response["Parameter"]["Value"]
+    except ClientError as e:
+        # Silently fail if not found - Primordial might not be deployed in this env
+        # or we might not have permissions
+        # ParameterNotFound is a ClientError with error code "ParameterNotFound"
+        return None
+    except Exception as e:
+        print(f"Warning: Failed to check for Primordial Drive: {e}")
+        return None
+
+
 def submit_aws_batch_job(
     image_uri: str,
     config: dict[str, Any],
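`get_primordial_fs_id` resolves the EFS id from an SSM parameter named `/{env}/primordial/fs_id`, where `env` is inferred from the boto3 profile name. A hedged sketch of seeding such a parameter with boto3 (the profile name and filesystem id are placeholders):

    # Illustration only: the parameter name follows the convention in get_primordial_fs_id;
    # the filesystem id and profile name are placeholders.
    import boto3

    session = boto3.Session(profile_name="dev")  # profiles containing "sand" resolve to /sand/...
    ssm = session.client("ssm")
    ssm.put_parameter(
        Name="/dev/primordial/fs_id",
        Value="fs-0123456789abcdef0",
        Type="String",
        Overwrite=True,
    )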
@@ -403,9 +505,30 @@ def submit_aws_batch_job(
     print(f"- Job Role: {aws_config['job_role_arn']}")

     # Get all environment variables, including special ones like WANDB_API_KEY and GCP credentials
-
+    env_vars_map = get_container_env_vars(config)  # This returns a dict
+
+    # If EFS is configured for InterProScan, override INTERPROSCAN_INSTALL_DIR
+    # Check based on the conventional volume name used in interp_bulk.yaml
+    efs_interproscan_mount_path = None
+    aws_batch_job_config = aws_config.get("batch_job", {})
+    if "mountPoints" in aws_batch_job_config:
+        for mp in aws_batch_job_config["mountPoints"]:
+            if (
+                mp.get("sourceVolume") == "interproscan-efs-volume"
+            ):  # Convention from YAML
+                efs_interproscan_mount_path = mp.get("containerPath")
+                break
+
+    if efs_interproscan_mount_path:
+        env_vars_map["INTERPROSCAN_INSTALL_DIR"] = efs_interproscan_mount_path
+        print(
+            f"INTERPROSCAN_INSTALL_DIR overridden to EFS mount path: {efs_interproscan_mount_path}"
+        )

-    print(
+    print(
+        "Environment Variables (after potential EFS override):",
+        list(env_vars_map.keys()),
+    )

     # Create/Update Job Definition using the config (now implicitly uses the correct session)
     job_definition = create_or_update_job_definition(image_uri, config)
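The `INTERPROSCAN_INSTALL_DIR` override keys off a volume conventionally named `interproscan-efs-volume` in `aws.batch_job` (the comment points to `interp_bulk.yaml`). A sketch of the relevant config shape, with placeholder ids and paths:

    # Placeholder values; only the key names and the volume-name convention come from the diff.
    aws_config = {
        "batch_job": {
            "volumes": [
                {
                    "name": "interproscan-efs-volume",
                    "efsVolumeConfiguration": {"fileSystemId": "fs-0abc123456789def0"},
                }
            ],
            "mountPoints": [
                {
                    "sourceVolume": "interproscan-efs-volume",
                    "containerPath": "/mnt/interproscan",
                    "readOnly": True,
                }
            ],
        }
    }
    # With this shape, submit_aws_batch_job would set
    # INTERPROSCAN_INSTALL_DIR=/mnt/interproscan in the container environment.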
@@ -418,7 +541,8 @@ def submit_aws_batch_job(
         "jobDefinition": job_definition,
         "containerOverrides": {
             "environment": [
-                {"name": key, "value": str(value)}
+                {"name": key, "value": str(value)}
+                for key, value in env_vars_map.items()
             ],
         },
     }
@@ -435,6 +559,22 @@ def submit_aws_batch_job(
         print(f"Setting retry attempts to {retry_attempts}")
         job_submit_args["retryStrategy"] = {"attempts": retry_attempts}

+    # Automatically add User tag for cost tracking
+    username = os.getenv("LOCAL_USER", "unknown_user")
+    default_tags = {"User": username}
+
+    # Merge with any tags specified in config (config tags take precedence)
+    if "tags" in aws_config:
+        tags = {**default_tags, **aws_config["tags"]}
+    else:
+        tags = default_tags
+
+    job_submit_args["tags"] = tags
+    job_submit_args["propagateTags"] = (
+        True  # Propagate tags to ECS tasks and EC2 instances
+    )
+    print(f"Adding tags to batch job: {tags}")
+
     # Submit the job using the session client
     response = batch.submit_job(**job_submit_args)

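Because config tags are unpacked after the automatic `User` tag, they win on key collisions. A tiny self-contained check with made-up values:

    # Made-up values to show merge precedence; mirrors the dict unpacking in the diff.
    default_tags = {"User": "alice"}
    aws_config = {"tags": {"Project": "enzymes", "User": "team-shared"}}

    tags = {**default_tags, **aws_config["tags"]}
    assert tags == {"User": "team-shared", "Project": "enzymes"}  # config "User" overrides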
dayhoff_tools/deployment/deploy_gcp.py
CHANGED
@@ -50,9 +50,56 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:

     Returns:
         Dictionary containing GCP Batch job configuration
+
+    Raises:
+        ValueError: If the configuration contains unexpected keys.
     """
     gcp_config = config["gcp"]

+    # Validate top-level gcp_config keys used for Batch job JSON construction
+    EXPECTED_GCP_CONFIG_KEYS = {
+        "allocation_policy",  # Goes into batch_config.allocationPolicy
+        "logs_policy",  # Goes into batch_config.logsPolicy
+        "batch_job",  # Contains detailed task and resource specs
+        "image_uri",
+        # Keys like job_name, region, registry_uri, repository are used by other functions
+        # or for other purposes, not directly for constructing the core batch_config JSON here.
+    }
+    actual_gcp_keys = set(gcp_config.keys())
+    # Filter out keys not relevant to this function's direct Batch config construction
+    # These keys are used by the calling context or other parts of the deployment.
+    keys_to_ignore_for_this_check = {"job_name", "region", "registry_uri", "repository"}
+    relevant_gcp_keys = {
+        key for key in actual_gcp_keys if key not in keys_to_ignore_for_this_check
+    }
+
+    unhandled_gcp_keys = relevant_gcp_keys - EXPECTED_GCP_CONFIG_KEYS
+    if unhandled_gcp_keys:
+        raise ValueError(
+            f"Unexpected keys in 'gcp' configuration section: {unhandled_gcp_keys}. "
+            f"Expected keys for Batch job JSON construction are: {EXPECTED_GCP_CONFIG_KEYS}"
+        )
+
+    # Validate keys within gcp_config["batch_job"]
+    if "batch_job" not in gcp_config:
+        raise ValueError("Missing 'batch_job' section in 'gcp' configuration.")
+
+    gcp_batch_job_config = gcp_config["batch_job"]
+    EXPECTED_GCP_BATCH_JOB_KEYS = {
+        "taskCount",
+        "parallelism",
+        "computeResource",
+        "instance",  # Contains machineType, accelerators
+        "volumes",
+    }
+    actual_batch_job_keys = set(gcp_batch_job_config.keys())
+    unhandled_batch_job_keys = actual_batch_job_keys - EXPECTED_GCP_BATCH_JOB_KEYS
+    if unhandled_batch_job_keys:
+        raise ValueError(
+            f"Unexpected keys in 'gcp.batch_job' configuration section: {unhandled_batch_job_keys}. "
+            f"Expected keys are: {EXPECTED_GCP_BATCH_JOB_KEYS}"
+        )
+
     # Start with the allocation and logs policies
     batch_config = {
         "allocationPolicy": gcp_config["allocation_policy"],
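For reference, a configuration shape that passes both validation checks; all values here are placeholders, and the `allocation_policy`/`logs_policy` payloads are elided rather than specified:

    # Placeholder values; the key names match the EXPECTED_* sets in the hunk above.
    gcp_section = {
        # consumed elsewhere in the deployment, ignored by this validation
        "job_name": "demo-job",
        "region": "us-central1",
        "registry_uri": "us-central1-docker.pkg.dev/demo-project/demo-repo",
        "repository": "demo-repo",
        # validated here
        "image_uri": "",
        "allocation_policy": {},  # GCP Batch allocationPolicy JSON (contents omitted)
        "logs_policy": {},        # GCP Batch logsPolicy JSON (contents omitted)
        "batch_job": {
            "taskCount": 1,
            "parallelism": 1,
            "computeResource": {"cpuMilli": 4000, "memoryMib": 16384},
            "instance": {"machineType": "n1-standard-4"},
            "volumes": [],
        },
    }
    # Any other key under "gcp", or under "gcp.batch_job", raises ValueError.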
@@ -75,16 +122,27 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
         "commands": ["-c", " ".join(entrypoint_command)],
     }

+    # Handle container options - both shared memory and any custom options
+    docker_options = []
+
     # Add shared memory option if specified
     if "shared_memory" in config.get("docker", {}):
-
+        docker_options.append(f"--shm-size={config['docker']['shared_memory']}")
+
+    # Add any custom Docker options if specified
+    if "options" in config.get("docker", {}):
+        docker_options.append(config["docker"]["options"])
+
+    # Set the options field if any options were collected
+    if docker_options:
+        container_config["options"] = " ".join(docker_options)

     # Build the task group configuration
     task_group = {
-        "taskCount":
-        "parallelism":
+        "taskCount": gcp_batch_job_config["taskCount"],
+        "parallelism": gcp_batch_job_config["parallelism"],
         "taskSpec": {
-            "computeResource":
+            "computeResource": gcp_batch_job_config["computeResource"],
             "runnables": [{"container": container_config}],
         },
     }
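Both the shared-memory flag and any free-form options end up in the container's single `options` string. For example, with hypothetical values:

    # Hypothetical docker section; shows how the options string is assembled.
    config = {"docker": {"shared_memory": "16g", "options": "--ulimit nofile=65536:65536"}}

    docker_options = []
    if "shared_memory" in config.get("docker", {}):
        docker_options.append(f"--shm-size={config['docker']['shared_memory']}")
    if "options" in config.get("docker", {}):
        docker_options.append(config["docker"]["options"])

    assert " ".join(docker_options) == "--shm-size=16g --ulimit nofile=65536:65536"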
@@ -96,8 +154,12 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
     if env_vars:
         task_group["taskSpec"]["runnables"][0]["environment"] = {"variables": env_vars}

+    # Add volumes to the taskSpec if specified in the config
+    if "volumes" in gcp_batch_job_config and gcp_batch_job_config["volumes"]:
+        task_group["taskSpec"]["volumes"] = gcp_batch_job_config["volumes"]
+
     # Add machine type and optional accelerators from instance config
-    instance_config =
+    instance_config = gcp_batch_job_config["instance"]
     if "machineType" in instance_config:
         # Add machine type to the allocation policy
         if "policy" not in batch_config["allocationPolicy"]["instances"]:
@@ -123,6 +185,17 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
         print("Machine Type:", policy.get("machineType", "Not specified"))
         print("Accelerators:", policy.get("accelerators", "Not specified"))
         print("Environment Variables:", list(env_vars.keys()))
+        if (
+            "runnables" in task_group["taskSpec"]
+            and task_group["taskSpec"]["runnables"]
+        ):
+            print(
+                "Container Options:",
+                task_group["taskSpec"]["runnables"][0]
+                .get("container", {})
+                .get("options", "Not specified"),
+            )
+
     except KeyError as e:
         print(f"Warning: Could not find {e} in configuration")

dayhoff_tools/deployment/deploy_utils.py
CHANGED
@@ -614,6 +614,24 @@ def setup_rxnfp() -> None:
     )


+def clean_docker_config() -> None:
+    """Clean Docker configuration to avoid credential helper conflicts.
+
+    VS Code dev containers can set up credential helpers that interfere with
+    Docker builds and registry operations. This function creates a clean
+    configuration that disables problematic credential helpers.
+
+    This is automatically called before Docker builds and registry operations
+    to prevent authentication failures.
+    """
+    docker_config_dir = os.path.expanduser("~/.docker")
+    os.makedirs(docker_config_dir, exist_ok=True)
+
+    # Write a minimal config file that disables credential helpers
+    with open(os.path.join(docker_config_dir, "config.json"), "w") as f:
+        json.dump({"auths": {}, "credsStore": ""}, f)
+
+
 def docker_login(registry: str, username: str, password: str) -> None:
     """Login to a Docker registry using provided credentials.

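After `clean_docker_config()` runs, `~/.docker/config.json` holds only the stub written by the `json.dump` call above, so Docker no longer consults an external credential helper. A quick way to confirm what was written:

    # Reads back the stub written by clean_docker_config();
    # expected output: {'auths': {}, 'credsStore': ''}
    import json
    import os

    with open(os.path.expanduser("~/.docker/config.json")) as f:
        print(json.load(f))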
@@ -625,13 +643,8 @@ def docker_login(registry: str, username: str, password: str) -> None:
     Raises:
         subprocess.CalledProcessError: If Docker login fails
     """
-    #
-
-    os.makedirs(docker_config_dir, exist_ok=True)
-
-    # Write a minimal config file that disables credential helpers
-    with open(os.path.join(docker_config_dir, "config.json"), "w") as f:
-        json.dump({"auths": {}, "credsStore": ""}, f)
+    # Clean Docker config to avoid credential helper conflicts
+    clean_docker_config()

     # Login to Docker using the credentials
     login_process = subprocess.run(
dayhoff_tools/deployment/job_runner.py
CHANGED
@@ -94,9 +94,9 @@ def run_command() -> None:
             stderr=None,  # Use parent's stderr
         )

-        logger.info("
+        logger.info("Job command completed successfully")
     except subprocess.CalledProcessError as e:
-        logger.error(f"
+        logger.error(f"Job command failed with return code: {e.returncode}")
         raise
     except Exception as e:
         logger.error(f"Error executing command: {str(e)}")
@@ -133,6 +133,13 @@ def run_job(
     logger = logging.getLogger(__name__)

     logger.info(f"Job runner starting in mode: {mode}")
+    import importlib.metadata
+
+    try:
+        version = importlib.metadata.version("dayhoff-tools")
+        logger.info(f"dayhoff-tools version: {version}")
+    except importlib.metadata.PackageNotFoundError:
+        logger.warning("Could not determine dayhoff-tools version")

     if mode not in ["setup", "execute", "setup_and_execute"]:
         logger.error(f"Invalid mode: {mode}")
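The version banner uses only the standard library (`importlib.metadata`, Python 3.8+); the same lookup works in any environment where the wheel is installed:

    # Standard-library lookup of the installed distribution's version.
    import importlib.metadata

    try:
        print(importlib.metadata.version("dayhoff-tools"))  # e.g. "1.13.12"
    except importlib.metadata.PackageNotFoundError:
        print("dayhoff-tools is not installed in this environment")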
@@ -146,8 +153,6 @@ def run_job(
         if mode in ["execute", "setup_and_execute"]:
             run_command()

-        logger.info("Job completed successfully")
-
     except Exception as e:
         logger.error(f"Job failed with error: {str(e)}", exc_info=True)
         sys.exit(1)