dayhoff-tools 1.1.10__py3-none-any.whl → 1.13.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dayhoff_tools/__init__.py +10 -0
  2. dayhoff_tools/cli/cloud_commands.py +179 -43
  3. dayhoff_tools/cli/engine1/__init__.py +323 -0
  4. dayhoff_tools/cli/engine1/engine_core.py +703 -0
  5. dayhoff_tools/cli/engine1/engine_lifecycle.py +136 -0
  6. dayhoff_tools/cli/engine1/engine_maintenance.py +431 -0
  7. dayhoff_tools/cli/engine1/engine_management.py +505 -0
  8. dayhoff_tools/cli/engine1/shared.py +501 -0
  9. dayhoff_tools/cli/engine1/studio_commands.py +825 -0
  10. dayhoff_tools/cli/engines_studios/__init__.py +6 -0
  11. dayhoff_tools/cli/engines_studios/api_client.py +351 -0
  12. dayhoff_tools/cli/engines_studios/auth.py +144 -0
  13. dayhoff_tools/cli/engines_studios/engine-studio-cli.md +1230 -0
  14. dayhoff_tools/cli/engines_studios/engine_commands.py +1151 -0
  15. dayhoff_tools/cli/engines_studios/progress.py +260 -0
  16. dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
  17. dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
  18. dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
  19. dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
  20. dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +476 -0
  21. dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
  22. dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
  23. dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
  24. dayhoff_tools/cli/engines_studios/studio_commands.py +755 -0
  25. dayhoff_tools/cli/main.py +106 -7
  26. dayhoff_tools/cli/utility_commands.py +896 -179
  27. dayhoff_tools/deployment/base.py +70 -6
  28. dayhoff_tools/deployment/deploy_aws.py +165 -25
  29. dayhoff_tools/deployment/deploy_gcp.py +78 -5
  30. dayhoff_tools/deployment/deploy_utils.py +20 -7
  31. dayhoff_tools/deployment/job_runner.py +9 -4
  32. dayhoff_tools/deployment/processors.py +230 -418
  33. dayhoff_tools/deployment/swarm.py +47 -12
  34. dayhoff_tools/embedders.py +28 -26
  35. dayhoff_tools/fasta.py +181 -64
  36. dayhoff_tools/warehouse.py +268 -1
  37. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/METADATA +20 -5
  38. dayhoff_tools-1.13.12.dist-info/RECORD +54 -0
  39. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/WHEEL +1 -1
  40. dayhoff_tools-1.1.10.dist-info/RECORD +0 -32
  41. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/entry_points.txt +0 -0
@@ -128,8 +128,9 @@ def build_job_image(config: dict) -> str:
 
     This function handles the complete image building process:
     1. Ensures we're in the repo root
-    2. Constructs the image URI based on config
-    3. Builds the image using docker build
+    2. Cleans Docker config to avoid credential helper conflicts
+    3. Constructs the image URI based on config
+    4. Builds the image using docker build
 
     Args:
         config: Dictionary containing the configuration loaded from YAML.
@@ -146,13 +147,20 @@ def build_job_image(config: dict) -> str:
     """
     move_to_repo_root()
 
+    # Clean Docker config to avoid VS Code dev container credential helper conflicts
+    from dayhoff_tools.deployment.deploy_utils import clean_docker_config
+
+    clean_docker_config()
+
     # Get image URI
     image_uri = _build_image_uri(config)
     docker_config = config["docker"]
 
     print("\nBuilding Docker image: ", image_uri)
     print(f"Using Dockerfile: {docker_config['dockerfile']}")
-    print(f"Using shared memory: {docker_config['shared_memory']}\n")
+    print(f"Using shared memory: {docker_config['shared_memory']}")
+    platform = docker_config.get("platform", "linux/amd64")
+    print(f"Building for platform: {platform}\n")
 
     # Build the image
     build_image_command = [
@@ -163,8 +171,20 @@ def build_job_image(config: dict) -> str:
         docker_config["dockerfile"],
         "-t",
         image_uri,
-        ".",  # Use the root of the repo as image context
     ]
+
+    # Add platform specification if provided, default to linux/amd64 for cloud deployments
+    platform = docker_config.get("platform", "linux/amd64")
+    build_image_command.extend(["--platform", platform])
+
+    # Add build args if provided (for parameterized Dockerfiles)
+    build_args = docker_config.get("build_args", {})
+    for arg_name, arg_value in build_args.items():
+        build_image_command.extend(["--build-arg", f"{arg_name}={arg_value}"])
+
+    # Add build context (defaults to "." for backward compatibility)
+    build_context = docker_config.get("build_context", ".")
+    build_image_command.append(build_context)
     subprocess.run(build_image_command, check=True)
 
     # Get and print image size
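
For reference, a docker section that exercises the new build options might look like the sketch below, written as the Python dict this code receives after YAML loading. All values are illustrative and not taken from the package.

    # Hypothetical `docker` config section (values illustrative)
    docker_config = {
        "dockerfile": "Dockerfile",
        "shared_memory": "8g",
        "platform": "linux/amd64",                    # default if omitted
        "build_args": {"BASE_IMAGE": "python:3.11"},  # becomes: --build-arg BASE_IMAGE=python:3.11
        "build_context": ".",                         # default if omitted
    }
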
@@ -281,7 +301,7 @@ def run_container(config: dict, image_uri: str, mode: str) -> None:
     4. Handles container logs for detached mode
 
     The container name is generated using:
-    - Username (from LOCAL_USER env var)
+    - Username (from LOCAL_USER or USER env var)
     - Timestamp (YYYYMMDD_HHMMSS format)
 
     Args:
@@ -299,7 +319,7 @@ def run_container(config: dict, image_uri: str, mode: str) -> None:
     )
 
     # Generate unique container name
-    username = os.getenv("LOCAL_USER", "unknown_user")
+    username = os.getenv("LOCAL_USER") or os.getenv("USER", "unknown_user")
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     container_name = f"{username}_job_{timestamp}"
 
@@ -394,6 +414,50 @@ def deploy(
         get_boto_session(config)
         print("AWS credentials verified.")
 
+    # Check GCP credentials early if using GCP
+    elif cloud == "gcp":
+        print("\nVerifying GCP credentials...")
+        from dayhoff_tools.cli.cloud_commands import (
+            _is_adc_authenticated,
+            _is_gcp_user_authenticated,
+        )
+
+        user_creds_valid = _is_gcp_user_authenticated()
+        adc_creds_valid = _is_adc_authenticated()
+
+        if not user_creds_valid:
+            print(
+                "\n⚠️ Warning: Your GCP user credentials appear to be stale/expired."
+            )
+            print(
+                " This may cause authentication issues when deploying to GCP Batch."
+            )
+            print(" Consider running 'dh gcp login' to refresh your credentials.")
+
+        if not adc_creds_valid:
+            print(
+                "\n⚠️ Warning: Your Application Default Credentials (ADC) appear to be stale/expired."
+            )
+            print(
+                " This may cause authentication issues with API client libraries."
+            )
+            print(
+                " Consider running 'dh gcp use-user-adc' or 'dh gcp use-devcon-adc' to refresh your ADC."
+            )
+
+        if user_creds_valid and adc_creds_valid:
+            print("GCP credentials verified.")
+        else:
+            # Ask for confirmation before proceeding with possibly invalid credentials
+            proceed = (
+                input("\nProceed with potentially invalid credentials? (y/n): ")
+                .lower()
+                .strip()
+            )
+            if proceed != "y":
+                print("Deployment aborted.")
+                return
+
     # Track if we built a new image
     had_image_uri = bool(config["docker"]["image_uri"])
 
@@ -286,15 +286,15 @@ def create_or_update_job_definition(
         raise ValueError("docker.container_entrypoint is required in configuration")
 
     # Create linux parameters with devices
-    linux_params: dict[str, Any] = {
-        "devices": [
+    linux_params: dict[str, Any] = {}
+    if compute_specs.get("gpus", 0) > 0:
+        linux_params["devices"] = [
             {
                 "hostPath": "/dev/nvidia0",
                 "containerPath": "/dev/nvidia0",
                 "permissions": ["READ", "WRITE"],
             },
-        ],
-    }
+        ]
 
     # Add shared memory configuration if specified in docker config
     if "shared_memory" in config.get("docker", {}):
@@ -318,6 +318,82 @@ def create_or_update_job_definition(
         linux_params["sharedMemorySize"] = shared_memory_mib
         print(f"Setting shared memory size to {shared_memory_mib} MiB")
 
+    # Prepare containerProperties
+    container_properties = {
+        "image": image_uri,
+        "vcpus": compute_specs["vcpus"],
+        "memory": compute_specs["memory"],
+        "resourceRequirements": gpu_requirements,
+        "executionRoleArn": aws_config["execution_role_arn"],
+        "jobRoleArn": aws_config["job_role_arn"],
+        "privileged": compute_specs.get("gpus", 0) > 0,
+        "command": entrypoint_command,
+    }
+
+    if linux_params:
+        container_properties["linuxParameters"] = linux_params
+
+    # Add volumes and mount points if defined in AWS batch_job config
+    batch_job_config = aws_config.get("batch_job", {})
+    if "volumes" in batch_job_config:
+        container_properties["volumes"] = batch_job_config["volumes"]
+        print(f"Adding volumes to job definition: {batch_job_config['volumes']}")
+    if "mountPoints" in batch_job_config:
+        container_properties["mountPoints"] = batch_job_config["mountPoints"]
+        print(
+            f"Adding mount points to job definition: {batch_job_config['mountPoints']}"
+        )
+
+    # Mount Primordial Drive if explicitly enabled via feature flag
+    # Add 'mount_primordial_drive' to features list to mount shared EFS at /primordial/
+    features = config.get("features", []) or []
+    features_set = {f if isinstance(f, str) else next(iter(f)) for f in features}
+    mount_primordial = "mount_primordial_drive" in features_set
+
+    if mount_primordial:
+        primordial_fs_id = get_primordial_fs_id(session)
+        if primordial_fs_id:
+            print(f"Adding Primordial Drive configuration (fs_id: {primordial_fs_id})")
+
+            # Add volume configuration
+            efs_volume = {
+                "name": "primordial",
+                "efsVolumeConfiguration": {
+                    "fileSystemId": primordial_fs_id,
+                    "rootDirectory": "/",
+                },
+            }
+
+            if "volumes" not in container_properties:
+                container_properties["volumes"] = []
+
+            # Check if already added to avoid duplicates
+            if not any(
+                v.get("name") == "primordial" for v in container_properties["volumes"]
+            ):
+                container_properties["volumes"].append(efs_volume)
+
+            # Add mount point
+            mount_point = {
+                "sourceVolume": "primordial",
+                "containerPath": "/primordial",
+                "readOnly": False,
+            }
+
+            if "mountPoints" not in container_properties:
+                container_properties["mountPoints"] = []
+
+            # Check if already added
+            if not any(
+                mp.get("containerPath") == "/primordial"
+                for mp in container_properties["mountPoints"]
+            ):
+                container_properties["mountPoints"].append(mount_point)
+        else:
+            print(
+                "Warning: mount_primordial_drive enabled but Primordial Drive not found in this environment"
+            )
+
     # Check if job definition already exists using the session client
     try:
         existing = batch.describe_job_definitions(
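
The feature flag accepts either plain strings or single-key mappings in the config's features list (both forms are normalized by the set comprehension above). A minimal sketch of the opt-in, with illustrative values:

    # In the loaded config dict (illustrative):
    config["features"] = ["mount_primordial_drive"]         # plain-string form
    config["features"] = [{"mount_primordial_drive": {}}]   # single-key mapping form, also accepted
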
@@ -330,41 +406,67 @@ def create_or_update_job_definition(
             print(f"\nCreating new job definition: {job_def_name}")
 
     except batch.exceptions.ClientError as e:
-        # Handle case where the error is specifically 'JobDefinitionNotFoundException'
-        # Boto3 typically includes error codes in the response
-        if (
-            e.response.get("Error", {}).get("Code") == "ClientError"
-        ):  # Simple check, might need refinement
+        if e.response.get("Error", {}).get(
+            "Code"
+        ) == "ClientError" and "JobDefinitionNotFoundException" in str(
+            e
+        ):  # More specific check for not found
             print(f"\nCreating new job definition: {job_def_name}")
         else:
-            # Re-raise unexpected client errors
             raise
 
-    # Prepare job definition properties
+    # Prepare job definition arguments
     job_definition_args = {
         "jobDefinitionName": job_def_name,
         "type": "container",
-        "containerProperties": {
-            "image": image_uri,
-            "vcpus": compute_specs["vcpus"],
-            "memory": compute_specs["memory"],
-            "resourceRequirements": gpu_requirements,
-            "executionRoleArn": aws_config["execution_role_arn"],
-            "jobRoleArn": aws_config["job_role_arn"],
-            "privileged": compute_specs.get("gpus", 0) > 0,
-            "command": entrypoint_command,
-            **({"linuxParameters": linux_params} if linux_params else {}),
-        },
+        "containerProperties": container_properties,
         "platformCapabilities": ["EC2"],
         "timeout": {"attemptDurationSeconds": aws_config.get("timeout_seconds", 86400)},
     }
 
+    # Add tags if specified in config
+    if "tags" in aws_config:
+        job_definition_args["tags"] = aws_config["tags"]
+        print(f"Adding tags to job definition: {aws_config['tags']}")
+
     # Register new revision using the session client
     response = batch.register_job_definition(**job_definition_args)
 
     return response["jobDefinitionName"]
 
 
+def get_primordial_fs_id(session: boto3.Session) -> Optional[str]:
+    """Fetch Primordial Drive EFS ID from SSM.
+
+    Args:
+        session: Boto3 session
+
+    Returns:
+        FileSystemId if found, None otherwise
+    """
+    ssm = session.client("ssm")
+
+    # Determine environment from profile name
+    # Default to dev if cannot determine
+    env = "dev"
+    if session.profile_name and "sand" in session.profile_name:
+        env = "sand"
+
+    param_name = f"/{env}/primordial/fs_id"
+
+    try:
+        response = ssm.get_parameter(Name=param_name)
+        return response["Parameter"]["Value"]
+    except ClientError as e:
+        # Silently fail if not found - Primordial might not be deployed in this env
+        # or we might not have permissions
+        # ParameterNotFound is a ClientError with error code "ParameterNotFound"
+        return None
+    except Exception as e:
+        print(f"Warning: Failed to check for Primordial Drive: {e}")
+        return None
+
+
 def submit_aws_batch_job(
     image_uri: str,
     config: dict[str, Any],
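
A usage sketch for the new helper. The profile name and the returned filesystem id are hypothetical; the helper itself, the "sand"-substring check, and the /{env}/primordial/fs_id parameter path come from the code above.

    import boto3

    # "sand" appears in the profile name, so the lookup targets /sand/primordial/fs_id
    session = boto3.Session(profile_name="sandbox")
    fs_id = get_primordial_fs_id(session)  # e.g. "fs-0123456789abcdef0", or None if the parameter is absent
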
@@ -403,9 +505,30 @@ def submit_aws_batch_job(
     print(f"- Job Role: {aws_config['job_role_arn']}")
 
     # Get all environment variables, including special ones like WANDB_API_KEY and GCP credentials
-    env_vars = get_container_env_vars(config)
+    env_vars_map = get_container_env_vars(config)  # This returns a dict
+
+    # If EFS is configured for InterProScan, override INTERPROSCAN_INSTALL_DIR
+    # Check based on the conventional volume name used in interp_bulk.yaml
+    efs_interproscan_mount_path = None
+    aws_batch_job_config = aws_config.get("batch_job", {})
+    if "mountPoints" in aws_batch_job_config:
+        for mp in aws_batch_job_config["mountPoints"]:
+            if (
+                mp.get("sourceVolume") == "interproscan-efs-volume"
+            ):  # Convention from YAML
+                efs_interproscan_mount_path = mp.get("containerPath")
+                break
+
+    if efs_interproscan_mount_path:
+        env_vars_map["INTERPROSCAN_INSTALL_DIR"] = efs_interproscan_mount_path
+        print(
+            f"INTERPROSCAN_INSTALL_DIR overridden to EFS mount path: {efs_interproscan_mount_path}"
+        )
 
-    print("Environment Variables:", list(env_vars.keys()))
+    print(
+        "Environment Variables (after potential EFS override):",
+        list(env_vars_map.keys()),
+    )
 
     # Create/Update Job Definition using the config (now implicitly uses the correct session)
     job_definition = create_or_update_job_definition(image_uri, config)
@@ -418,7 +541,8 @@ def submit_aws_batch_job(
         "jobDefinition": job_definition,
         "containerOverrides": {
             "environment": [
-                {"name": key, "value": str(value)} for key, value in env_vars.items()
+                {"name": key, "value": str(value)}
+                for key, value in env_vars_map.items()
             ],
         },
     }
@@ -435,6 +559,22 @@ def submit_aws_batch_job(
         print(f"Setting retry attempts to {retry_attempts}")
         job_submit_args["retryStrategy"] = {"attempts": retry_attempts}
 
+    # Automatically add User tag for cost tracking
+    username = os.getenv("LOCAL_USER", "unknown_user")
+    default_tags = {"User": username}
+
+    # Merge with any tags specified in config (config tags take precedence)
+    if "tags" in aws_config:
+        tags = {**default_tags, **aws_config["tags"]}
+    else:
+        tags = default_tags
+
+    job_submit_args["tags"] = tags
+    job_submit_args["propagateTags"] = (
+        True  # Propagate tags to ECS tasks and EC2 instances
+    )
+    print(f"Adding tags to batch job: {tags}")
+
     # Submit the job using the session client
     response = batch.submit_job(**job_submit_args)
 
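
The merge gives config-supplied tags precedence over the automatic User tag. For example (environment and config values illustrative):

    # LOCAL_USER=alice, aws_config["tags"] == {"Project": "demo"}
    #   -> tags == {"User": "alice", "Project": "demo"}
    # LOCAL_USER=alice, aws_config["tags"] == {"User": "team-batch"}
    #   -> tags == {"User": "team-batch"}   (config value wins)
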
@@ -50,9 +50,56 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
 
     Returns:
         Dictionary containing GCP Batch job configuration
+
+    Raises:
+        ValueError: If the configuration contains unexpected keys.
     """
     gcp_config = config["gcp"]
 
+    # Validate top-level gcp_config keys used for Batch job JSON construction
+    EXPECTED_GCP_CONFIG_KEYS = {
+        "allocation_policy",  # Goes into batch_config.allocationPolicy
+        "logs_policy",  # Goes into batch_config.logsPolicy
+        "batch_job",  # Contains detailed task and resource specs
+        "image_uri",
+        # Keys like job_name, region, registry_uri, repository are used by other functions
+        # or for other purposes, not directly for constructing the core batch_config JSON here.
+    }
+    actual_gcp_keys = set(gcp_config.keys())
+    # Filter out keys not relevant to this function's direct Batch config construction
+    # These keys are used by the calling context or other parts of the deployment.
+    keys_to_ignore_for_this_check = {"job_name", "region", "registry_uri", "repository"}
+    relevant_gcp_keys = {
+        key for key in actual_gcp_keys if key not in keys_to_ignore_for_this_check
+    }
+
+    unhandled_gcp_keys = relevant_gcp_keys - EXPECTED_GCP_CONFIG_KEYS
+    if unhandled_gcp_keys:
+        raise ValueError(
+            f"Unexpected keys in 'gcp' configuration section: {unhandled_gcp_keys}. "
+            f"Expected keys for Batch job JSON construction are: {EXPECTED_GCP_CONFIG_KEYS}"
+        )
+
+    # Validate keys within gcp_config["batch_job"]
+    if "batch_job" not in gcp_config:
+        raise ValueError("Missing 'batch_job' section in 'gcp' configuration.")
+
+    gcp_batch_job_config = gcp_config["batch_job"]
+    EXPECTED_GCP_BATCH_JOB_KEYS = {
+        "taskCount",
+        "parallelism",
+        "computeResource",
+        "instance",  # Contains machineType, accelerators
+        "volumes",
+    }
+    actual_batch_job_keys = set(gcp_batch_job_config.keys())
+    unhandled_batch_job_keys = actual_batch_job_keys - EXPECTED_GCP_BATCH_JOB_KEYS
+    if unhandled_batch_job_keys:
+        raise ValueError(
+            f"Unexpected keys in 'gcp.batch_job' configuration section: {unhandled_batch_job_keys}. "
+            f"Expected keys are: {EXPECTED_GCP_BATCH_JOB_KEYS}"
+        )
+
     # Start with the allocation and logs policies
     batch_config = {
         "allocationPolicy": gcp_config["allocation_policy"],
@@ -75,16 +122,27 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
         "commands": ["-c", " ".join(entrypoint_command)],
     }
 
+    # Handle container options - both shared memory and any custom options
+    docker_options = []
+
     # Add shared memory option if specified
     if "shared_memory" in config.get("docker", {}):
-        container_config["options"] = f"--shm-size={config['docker']['shared_memory']}"
+        docker_options.append(f"--shm-size={config['docker']['shared_memory']}")
+
+    # Add any custom Docker options if specified
+    if "options" in config.get("docker", {}):
+        docker_options.append(config["docker"]["options"])
+
+    # Set the options field if any options were collected
+    if docker_options:
+        container_config["options"] = " ".join(docker_options)
 
     # Build the task group configuration
     task_group = {
-        "taskCount": gcp_config["batch_job"]["taskCount"],
-        "parallelism": gcp_config["batch_job"]["parallelism"],
+        "taskCount": gcp_batch_job_config["taskCount"],
+        "parallelism": gcp_batch_job_config["parallelism"],
         "taskSpec": {
-            "computeResource": gcp_config["batch_job"]["computeResource"],
+            "computeResource": gcp_batch_job_config["computeResource"],
             "runnables": [{"container": container_config}],
         },
     }
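
Both sources end up space-joined into the Batch container options string. For example, with illustrative docker config values:

    # config["docker"] == {"shared_memory": "16g", "options": "--ulimit nofile=65535:65535"}
    #   -> container_config["options"] == "--shm-size=16g --ulimit nofile=65535:65535"
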
@@ -96,8 +154,12 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
     if env_vars:
         task_group["taskSpec"]["runnables"][0]["environment"] = {"variables": env_vars}
 
+    # Add volumes to the taskSpec if specified in the config
+    if "volumes" in gcp_batch_job_config and gcp_batch_job_config["volumes"]:
+        task_group["taskSpec"]["volumes"] = gcp_batch_job_config["volumes"]
+
     # Add machine type and optional accelerators from instance config
-    instance_config = gcp_config["batch_job"]["instance"]
+    instance_config = gcp_batch_job_config["instance"]
     if "machineType" in instance_config:
         # Add machine type to the allocation policy
         if "policy" not in batch_config["allocationPolicy"]["instances"]:
@@ -123,6 +185,17 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
         print("Machine Type:", policy.get("machineType", "Not specified"))
         print("Accelerators:", policy.get("accelerators", "Not specified"))
         print("Environment Variables:", list(env_vars.keys()))
+        if (
+            "runnables" in task_group["taskSpec"]
+            and task_group["taskSpec"]["runnables"]
+        ):
+            print(
+                "Container Options:",
+                task_group["taskSpec"]["runnables"][0]
+                .get("container", {})
+                .get("options", "Not specified"),
+            )
+
     except KeyError as e:
         print(f"Warning: Could not find {e} in configuration")
 
@@ -614,6 +614,24 @@ def setup_rxnfp() -> None:
     )
 
 
+def clean_docker_config() -> None:
+    """Clean Docker configuration to avoid credential helper conflicts.
+
+    VS Code dev containers can set up credential helpers that interfere with
+    Docker builds and registry operations. This function creates a clean
+    configuration that disables problematic credential helpers.
+
+    This is automatically called before Docker builds and registry operations
+    to prevent authentication failures.
+    """
+    docker_config_dir = os.path.expanduser("~/.docker")
+    os.makedirs(docker_config_dir, exist_ok=True)
+
+    # Write a minimal config file that disables credential helpers
+    with open(os.path.join(docker_config_dir, "config.json"), "w") as f:
+        json.dump({"auths": {}, "credsStore": ""}, f)
+
+
 def docker_login(registry: str, username: str, password: str) -> None:
     """Login to a Docker registry using provided credentials.
 
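
After the helper runs, ~/.docker/config.json contains only the minimal settings written above. A quick way to confirm this locally (sketch; it simply re-reads the file the function just wrote):

    import json
    import os

    with open(os.path.expanduser("~/.docker/config.json")) as f:
        print(json.load(f))  # -> {'auths': {}, 'credsStore': ''}
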
@@ -625,13 +643,8 @@ def docker_login(registry: str, username: str, password: str) -> None:
     Raises:
         subprocess.CalledProcessError: If Docker login fails
     """
-    # Create .docker directory if it doesn't exist
-    docker_config_dir = os.path.expanduser("~/.docker")
-    os.makedirs(docker_config_dir, exist_ok=True)
-
-    # Write a minimal config file that disables credential helpers
-    with open(os.path.join(docker_config_dir, "config.json"), "w") as f:
-        json.dump({"auths": {}, "credsStore": ""}, f)
+    # Clean Docker config to avoid credential helper conflicts
+    clean_docker_config()
 
     # Login to Docker using the credentials
     login_process = subprocess.run(
@@ -94,9 +94,9 @@ def run_command() -> None:
             stderr=None,  # Use parent's stderr
         )
 
-        logger.info("Command completed successfully")
+        logger.info("Job command completed successfully")
     except subprocess.CalledProcessError as e:
-        logger.error(f"Command failed with return code: {e.returncode}")
+        logger.error(f"Job command failed with return code: {e.returncode}")
         raise
     except Exception as e:
         logger.error(f"Error executing command: {str(e)}")
@@ -133,6 +133,13 @@ def run_job(
     logger = logging.getLogger(__name__)
 
     logger.info(f"Job runner starting in mode: {mode}")
+    import importlib.metadata
+
+    try:
+        version = importlib.metadata.version("dayhoff-tools")
+        logger.info(f"dayhoff-tools version: {version}")
+    except importlib.metadata.PackageNotFoundError:
+        logger.warning("Could not determine dayhoff-tools version")
 
     if mode not in ["setup", "execute", "setup_and_execute"]:
         logger.error(f"Invalid mode: {mode}")
@@ -146,8 +153,6 @@ def run_job(
         if mode in ["execute", "setup_and_execute"]:
             run_command()
 
-        logger.info("Job completed successfully")
-
     except Exception as e:
         logger.error(f"Job failed with error: {str(e)}", exc_info=True)
         sys.exit(1)