dayhoff-tools 1.12.9.tar.gz → 1.14.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/PKG-INFO +6 -1
  2. dayhoff_tools-1.14.8/dayhoff_tools/batch/__init__.py +8 -0
  3. dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/__init__.py +12 -0
  4. dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/base.py +146 -0
  5. dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/boltz.py +436 -0
  6. dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/embed_t5.py +92 -0
  7. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/__init__.py +88 -0
  8. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/aws_batch.py +459 -0
  9. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/__init__.py +25 -0
  10. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/boltz.py +419 -0
  11. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/cancel.py +96 -0
  12. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/clean.py +139 -0
  13. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/embed_t5.py +402 -0
  14. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/finalize.py +232 -0
  15. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/list_jobs.py +128 -0
  16. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/local.py +112 -0
  17. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/logs.py +146 -0
  18. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/retry.py +146 -0
  19. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/status.py +287 -0
  20. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/submit.py +221 -0
  21. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/job_id.py +151 -0
  22. dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/manifest.py +295 -0
  23. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/shared.py +7 -3
  24. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/__init__.py +1 -3
  25. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/api_client.py +350 -0
  26. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/auth.py +144 -0
  27. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +142 -94
  28. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/engine_commands.py +469 -187
  29. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/progress.py +45 -17
  30. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
  31. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
  32. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
  33. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
  34. {dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios → dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators}/idle_status_simulator.py +6 -6
  35. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
  36. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
  37. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
  38. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/ssh_config.py +96 -0
  39. dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/studio_commands.py +769 -0
  40. dayhoff_tools-1.14.8/dayhoff_tools/cli/github_commands.py +286 -0
  41. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/main.py +59 -10
  42. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/base.py +8 -2
  43. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/deploy_aws.py +82 -0
  44. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/pyproject.toml +10 -1
  45. dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios/api_client.py +0 -266
  46. dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios/idle-status-simulator.md +0 -140
  47. dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -408
  48. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/README.md +0 -0
  49. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/__init__.py +0 -0
  50. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/chemistry/standardizer.py +0 -0
  51. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/chemistry/utils.py +0 -0
  52. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/__init__.py +0 -0
  53. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/cloud_commands.py +0 -0
  54. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/__init__.py +0 -0
  55. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
  56. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
  57. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
  58. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
  59. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
  60. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/swarm_commands.py +0 -0
  61. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/utility_commands.py +0 -0
  62. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  63. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  64. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/job_runner.py +0 -0
  65. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/processors.py +0 -0
  66. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/swarm.py +0 -0
  67. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/embedders.py +0 -0
  68. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/fasta.py +0 -0
  69. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/file_ops.py +0 -0
  70. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/h5.py +0 -0
  71. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/gcp.py +0 -0
  72. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/gtdb.py +0 -0
  73. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/kegg.py +0 -0
  74. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/mmseqs.py +0 -0
  75. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/structure.py +0 -0
  76. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/uniprot.py +0 -0
  77. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/logs.py +0 -0
  78. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/sqlite.py +0 -0
  79. {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/warehouse.py +0 -0
--- dayhoff_tools-1.12.9/PKG-INFO
+++ dayhoff_tools-1.14.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dayhoff-tools
-Version: 1.12.9
+Version: 1.14.8
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -11,11 +11,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
+Provides-Extra: batch
+Provides-Extra: boltz
 Provides-Extra: embedders
 Provides-Extra: full
 Requires-Dist: biopython (>=1.84) ; extra == "full"
 Requires-Dist: biopython (>=1.85) ; extra == "embedders"
 Requires-Dist: boto3 (>=1.36.8)
+Requires-Dist: click (>=8.0.0) ; extra == "batch"
 Requires-Dist: docker (>=7.1.0) ; extra == "full"
 Requires-Dist: fair-esm (>=2.0.0) ; extra == "embedders"
 Requires-Dist: fair-esm (>=2.0.0) ; extra == "full"
@@ -25,10 +28,12 @@ Requires-Dist: h5py (>=3.13.0) ; extra == "embedders"
 Requires-Dist: numpy (>=1.26.4) ; extra == "embedders"
 Requires-Dist: pandas (>=2.2.0,<2.2.3) ; extra == "embedders"
 Requires-Dist: pandas (>=2.2.0,<2.2.3) ; extra == "full"
+Requires-Dist: pydantic (>=2.0.0) ; extra == "batch"
 Requires-Dist: pyyaml (>=6.0)
 Requires-Dist: questionary (>=2.0.1)
 Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "full"
 Requires-Dist: requests (>=2.31.0)
+Requires-Dist: ruamel.yaml (>=0.17.0) ; extra == "boltz"
 Requires-Dist: sentencepiece (>=0.2.0) ; extra == "embedders"
 Requires-Dist: sentencepiece (>=0.2.0) ; extra == "full"
 Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
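
The two new extras declared above are batch (adding click and pydantic) and boltz (adding ruamel.yaml). The worker code in this release treats the boltz extra as optional and degrades gracefully when it is missing; a minimal sketch of that guarded-import pattern, mirroring the check _enhance_yaml_with_msa performs below (the helper name boltz_extra_available is illustrative, not part of the package):

def boltz_extra_available() -> bool:
    # Installed via: pip install "dayhoff-tools[boltz]"
    try:
        from ruamel.yaml import YAML  # noqa: F401
        return True
    except ImportError:
        # Mirrors _enhance_yaml_with_msa below, which logs a warning and
        # skips MSA enhancement instead of failing the whole prediction.
        return False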
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/__init__.py
@@ -0,0 +1,8 @@
+"""Batch job infrastructure for AWS Batch.
+
+This module contains:
+- Worker code for container entrypoints
+- Utilities for batch job coordination
+"""
+
+__all__ = []
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/__init__.py
@@ -0,0 +1,12 @@
+"""Worker entrypoints for AWS Batch jobs.
+
+These modules are designed to run inside containers as the main entrypoint.
+They use AWS_BATCH_JOB_ARRAY_INDEX for work distribution.
+
+Available workers:
+- embed_t5: T5 protein sequence embedding
+- boltz: Boltz protein structure prediction
+- base: Common utilities for all workers
+"""
+
+__all__ = ["embed_t5", "boltz", "base"]
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/base.py
@@ -0,0 +1,146 @@
+"""Base utilities for batch workers.
+
+These utilities are shared across all worker implementations.
+"""
+
+import logging
+import os
+import sys
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def configure_worker_logging():
+    """Configure logging for batch workers.
+
+    Sets up logging to output to stdout with timestamps and log levels,
+    which CloudWatch will capture.
+    """
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+
+
+def get_array_index() -> int:
+    """Get the array index for this worker.
+
+    For array jobs, reads AWS_BATCH_JOB_ARRAY_INDEX.
+    For retry jobs, maps from BATCH_RETRY_INDICES.
+    For single jobs (array_size=1), defaults to 0.
+
+    Returns:
+        The array index this worker should process
+    """
+    # Check for retry mode first
+    retry_indices = os.environ.get("BATCH_RETRY_INDICES")
+    if retry_indices:
+        # In retry mode, we have a list of indices and use array index to pick
+        indices = [int(i) for i in retry_indices.split(",")]
+        array_idx = int(os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0"))
+        if array_idx >= len(indices):
+            raise RuntimeError(
+                f"Array index {array_idx} out of range for retry indices {indices}"
+            )
+        return indices[array_idx]
+
+    # Standard array job mode - default to 0 for single jobs
+    # Note: When array_size=1, AWS Batch runs a single job (not an array),
+    # so AWS_BATCH_JOB_ARRAY_INDEX is not set. Default to 0.
+    array_idx = os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0")
+    return int(array_idx)
+
+
+def get_job_dir() -> Path:
+    """Get the job directory from environment.
+
+    Returns:
+        Path to the job directory
+
+    Raises:
+        RuntimeError: If JOB_DIR is not set
+    """
+    job_dir = os.environ.get("JOB_DIR")
+    if not job_dir:
+        raise RuntimeError("JOB_DIR environment variable not set")
+    return Path(job_dir)
+
+
+def get_input_file(index: int, job_dir: Path, prefix: str = "chunk") -> Path:
+    """Get the input file path for a given index.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'chunk')
+
+    Returns:
+        Path to input file
+    """
+    return job_dir / "input" / f"{prefix}_{index:03d}.fasta"
+
+
+def get_output_file(
+    index: int, job_dir: Path, prefix: str = "embed", suffix: str = ".h5"
+) -> Path:
+    """Get the output file path for a given index.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+        suffix: File suffix (default: '.h5')
+
+    Returns:
+        Path to output file
+    """
+    return job_dir / "output" / f"{prefix}_{index:03d}{suffix}"
+
+
+def get_done_marker(index: int, job_dir: Path, prefix: str = "embed") -> Path:
+    """Get the done marker path for a given index.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+
+    Returns:
+        Path to done marker file
+    """
+    return job_dir / "output" / f"{prefix}_{index:03d}.done"
+
+
+def check_already_complete(index: int, job_dir: Path, prefix: str = "embed") -> bool:
+    """Check if this chunk is already complete (idempotency).
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+
+    Returns:
+        True if already complete, False otherwise
+    """
+    done_marker = get_done_marker(index, job_dir, prefix)
+    if done_marker.exists():
+        logger.info(f"Chunk {index} already complete (found {done_marker}), skipping")
+        return True
+    return False


+def mark_complete(index: int, job_dir: Path, prefix: str = "embed"):
+    """Mark a chunk as complete by creating the done marker.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+    """
+    done_marker = get_done_marker(index, job_dir, prefix)
+    done_marker.parent.mkdir(parents=True, exist_ok=True)
+    done_marker.touch()
+    logger.info(f"Chunk {index} marked complete: {done_marker}")
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/boltz.py
@@ -0,0 +1,436 @@
+"""Boltz structure prediction worker for AWS Batch array jobs.
+
+This module contains:
+1. BoltzProcessor - Core processor class for running Boltz predictions
+2. Worker entrypoint for AWS Batch array jobs
+
+The worker processes a single YAML config file based on AWS_BATCH_JOB_ARRAY_INDEX.
+
+Usage:
+    python -m dayhoff_tools.batch.workers.boltz
+
+Environment variables:
+    AWS_BATCH_JOB_ARRAY_INDEX: The index of the input file to process
+    JOB_DIR: Path to job directory (contains input/ and output/ subdirectories)
+    BOLTZ_CACHE: Path to Boltz model cache (default: /primordial/.cache/boltz)
+    MSA_DIR: Path to global MSA cache (default: /primordial/.cache/msas)
+    BOLTZ_OPTIONS: Additional Boltz command-line options
+    BATCH_RETRY_INDICES: (optional) Comma-separated list of indices for retry mode
+"""
+
+import logging
+import os
+import re
+import shlex
+import shutil
+import subprocess
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class BoltzProcessor:
+    """Processor for running Boltz structure predictions.
+
+    This class wraps the Boltz prediction tool to predict protein structures
+    from YAML configuration files containing sequence data.
+
+    Attributes:
+        num_workers: Number of CPU workers for Boltz internal parallelization
+        boltz_options: Additional command-line options for Boltz
+        msa_folder: Path to folder containing pre-computed MSA files (.a3m)
+        cache_dir: Path to Boltz model cache directory
+    """
+
+    def __init__(
+        self,
+        num_workers: int | None = None,
+        boltz_options: str | None = None,
+        msa_folder: str | None = None,
+        cache_dir: str | None = None,
+    ):
+        """Initialize the BoltzProcessor.
+
+        Args:
+            num_workers: Number of worker threads for Boltz. If None, uses CPU count - 1.
+            boltz_options: Additional command-line options to pass to Boltz
+                (e.g., "--recycling_steps 3 --sampling_steps 200")
+            msa_folder: Path to folder containing MSA files (.a3m format).
+                If provided, searches for MSAs matching protein IDs.
+            cache_dir: Path to Boltz model cache. Defaults to /primordial/.cache/boltz
+        """
+        if num_workers is None:
+            num_workers = max(1, (os.cpu_count() or 4) - 1)
+
+        self.num_workers = num_workers
+        self.boltz_options = boltz_options
+        self.msa_folder = msa_folder
+        self.cache_dir = cache_dir or "/primordial/.cache/boltz"
+
+    def _extract_protein_id_from_filename(self, filename: str) -> str | None:
+        """Extract protein ID from input filename.
+
+        Supports multiple filename formats:
+        - {number}_{PROTEIN_ID}_{suffix}.yaml (e.g., '567_IR0041_p.yaml' -> 'IR0041')
+        - {PROTEIN_ID}.yaml (e.g., 'IR0041.yaml' -> 'IR0041')
+        - {PROTEIN_ID}_{suffix}.yaml (e.g., 'IR0041_2mer.yaml' -> 'IR0041')
+
+        Args:
+            filename: The input filename (without path)
+
+        Returns:
+            The extracted protein ID, or None if pattern doesn't match
+        """
+        base_name = os.path.splitext(filename)[0]
+
+        # Pattern 1: number_PROTEINID_suffix
+        pattern1 = r"^\d+_([A-Za-z0-9]+)_.+$"
+        match = re.match(pattern1, base_name)
+        if match:
+            protein_id = match.group(1)
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 1)"
+            )
+            return protein_id
+
+        # Pattern 2: PROTEINID_suffix (no leading number)
+        pattern2 = r"^([A-Za-z0-9]+)_\d*mer$"
+        match = re.match(pattern2, base_name)
+        if match:
+            protein_id = match.group(1)
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 2)"
+            )
+            return protein_id
+
+        # Pattern 3: Just PROTEINID (no suffix)
+        pattern3 = r"^([A-Za-z0-9]+)$"
+        match = re.match(pattern3, base_name)
+        if match:
+            protein_id = match.group(1)
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 3)"
+            )
+            return protein_id
+
+        logger.debug(f"Could not extract protein ID from filename '{filename}'")
+        return None
+
+    def _find_msa_file(self, protein_id: str) -> str | None:
+        """Find MSA file for a given protein ID.
+
+        Searches for files in the format: {protein_id}.a3m
+
+        Args:
+            protein_id: The protein ID to search for
+
+        Returns:
+            Full path to the MSA file, or None if not found
+        """
+        if not self.msa_folder or not os.path.exists(self.msa_folder):
+            return None
+
+        msa_filename = f"{protein_id}.a3m"
+        msa_path = os.path.join(self.msa_folder, msa_filename)
+
+        if os.path.exists(msa_path):
+            logger.info(f"Found MSA file for protein {protein_id}: {msa_path}")
+            return msa_path
+        else:
+            logger.debug(f"MSA file not found: {msa_path}")
+            return None
+
+    def _enhance_yaml_with_msa(self, input_file: str) -> tuple[str, bool, str | None]:
+        """Enhance input YAML file with MSA information if available.
+
+        Modifies the input YAML file in place, adding MSA paths to protein entries.
+        Returns the original content so it can be restored later.
+
+        Args:
+            input_file: Path to the input YAML file to modify
+
+        Returns:
+            Tuple of (input file path, whether MSA was added, original content for restoration)
+        """
+        try:
+            from ruamel.yaml import YAML
+        except ImportError:
+            logger.warning("ruamel.yaml not available, skipping MSA enhancement")
+            return input_file, False, None
+
+        filename = os.path.basename(input_file)
+        protein_id = self._extract_protein_id_from_filename(filename)
+
+        if not protein_id:
+            logger.debug(f"No protein ID extracted from {filename}")
+            return input_file, False, None
+
+        msa_path = self._find_msa_file(protein_id)
+        if not msa_path:
+            return input_file, False, None
+
+        # Read original content for backup
+        try:
+            with open(input_file, "r") as f:
+                original_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading YAML file {input_file}: {e}")
+            return input_file, False, None
+
+        # Parse and modify YAML
+        yaml_parser = YAML()
+        yaml_parser.preserve_quotes = True
+        yaml_parser.width = 4096
+
+        try:
+            with open(input_file, "r") as f:
+                yaml_data = yaml_parser.load(f)
+        except Exception as e:
+            logger.error(f"Error parsing YAML file {input_file}: {e}")
+            return input_file, False, None
+
+        # Add MSA path to protein entries
+        msa_added = False
+        if "sequences" in yaml_data and isinstance(yaml_data["sequences"], list):
+            for sequence in yaml_data["sequences"]:
+                if "protein" in sequence and isinstance(sequence["protein"], dict):
+                    sequence["protein"]["msa"] = msa_path
+                    logger.info(f"Added MSA path {msa_path} to protein in YAML")
+                    msa_added = True
+
+        if not msa_added:
+            return input_file, False, None
+
+        # Write modified YAML
+        try:
+            with open(input_file, "w") as f:
+                yaml_parser.dump(yaml_data, f)
+            return input_file, True, original_content
+        except Exception as e:
+            logger.error(f"Error writing enhanced YAML: {e}")
+            return input_file, False, None
+
+    def run(self, input_file: str, output_dir: str | None = None) -> str:
+        """Run Boltz prediction on the input file.
+
+        Args:
+            input_file: Path to input YAML file containing sequences
+            output_dir: Optional output directory. If None, uses boltz_results_{basename}
+
+        Returns:
+            Path to the output directory created by Boltz
+
+        Raises:
+            subprocess.CalledProcessError: If Boltz prediction fails
+            FileNotFoundError: If input file doesn't exist
+        """
+        if not os.path.exists(input_file):
+            raise FileNotFoundError(f"Input file not found: {input_file}")
+
+        # Enhance with MSA if available
+        enhanced_input_file, msa_found, original_yaml_data = (
+            self._enhance_yaml_with_msa(input_file)
+        )
+
+        # Determine output directory
+        # Boltz always creates boltz_results_{input_name} inside --out_dir
+        input_base = os.path.splitext(os.path.basename(input_file))[0]
+
+        if output_dir is None:
+            # No output_dir specified, boltz creates in current directory
+            expected_output_dir = f"boltz_results_{input_base}"
+            out_dir_arg = None
+        else:
+            # output_dir specified - use its parent for --out_dir
+            # and expect boltz_results_{input_base} inside it
+            parent_dir = os.path.dirname(output_dir)
+            expected_output_dir = os.path.join(parent_dir, f"boltz_results_{input_base}")
+            out_dir_arg = parent_dir if parent_dir else None
+
+        logger.info(f"Running Boltz prediction for {input_file}")
+        logger.info(f"Output directory: {expected_output_dir}")
+
+        # Build command
+        cmd = ["boltz", "predict", input_file]
+
+        # Add output directory if specified
+        if out_dir_arg:
+            cmd.extend(["--out_dir", out_dir_arg])
+
+        # Add cache directory
+        cmd.extend(["--cache", self.cache_dir])
+
+        # Parse additional options
+        additional_args = []
+        num_workers_in_opts = False
+        use_msa_server_in_opts = False
+
+        if self.boltz_options:
+            try:
+                parsed_opts = shlex.split(self.boltz_options)
+                additional_args.extend(parsed_opts)
+                num_workers_in_opts = "--num_workers" in parsed_opts
+                use_msa_server_in_opts = "--use_msa_server" in parsed_opts
+            except ValueError as e:
+                logger.error(f"Error parsing boltz_options '{self.boltz_options}': {e}")
+
+        # Handle MSA server option
+        if msa_found:
+            if use_msa_server_in_opts:
+                additional_args = [
+                    arg for arg in additional_args if arg != "--use_msa_server"
+                ]
+                logger.info("Removed --use_msa_server since local MSA was found")
+        else:
+            if not use_msa_server_in_opts:
+                additional_args.append("--use_msa_server")
+                logger.info("Added --use_msa_server since no local MSA found")
+
+        # Add num_workers if not in options
+        if not num_workers_in_opts:
+            cmd.extend(["--num_workers", str(self.num_workers)])
+
+        # Disable cuequivariance kernels - they require cuda-devel image
+        # which is much larger. The performance difference is modest.
+        # TODO: Consider switching to cuda-devel base image if perf is critical
+        cmd.append("--no_kernels")
+
+        cmd.extend(additional_args)
+
+        # Log and run command
+        logger.info(f"Running command: {shlex.join(cmd)}")
+
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+
+        if process.stdout:
+            for line in iter(process.stdout.readline, ""):
+                logger.info(f"BOLTZ: {line.rstrip()}")
+
+        return_code = process.wait()
+        if return_code != 0:
+            logger.error(f"Boltz prediction failed with exit code {return_code}")
+            raise subprocess.CalledProcessError(return_code, cmd)
+
+        logger.info(f"Boltz prediction completed successfully")
+
+        # Restore original YAML if modified
+        if original_yaml_data is not None:
+            try:
+                with open(input_file, "w") as f:
+                    f.write(original_yaml_data)
+                logger.debug(f"Restored original YAML content")
+            except Exception as e:
+                logger.warning(f"Failed to restore original YAML: {e}")
+
+        # Copy input config to output directory
+        try:
+            config_dest = os.path.join(
+                expected_output_dir, os.path.basename(input_file)
+            )
+            shutil.copy2(input_file, config_dest)
+            logger.debug(f"Copied input config to results: {config_dest}")
+        except Exception as e:
+            logger.warning(f"Failed to copy input config: {e}")
+
+        return expected_output_dir
+
+
+def main():
+    """Boltz worker main entrypoint for AWS Batch array jobs."""
+    from .base import (
+        check_already_complete,
+        configure_worker_logging,
+        get_array_index,
+        get_job_dir,
+        mark_complete,
+    )
+
+    configure_worker_logging()
+    logger.info("Starting Boltz prediction worker")
+
+    try:
+        # Get configuration from environment
+        index = get_array_index()
+        job_dir = get_job_dir()
+
+        logger.info(f"Worker configuration:")
+        logger.info(f"  Array index: {index}")
+        logger.info(f"  Job directory: {job_dir}")
+
+        # Check idempotency
+        if check_already_complete(index, job_dir, prefix="boltz"):
+            logger.info("Exiting - complex already processed")
+            return
+
+        # Find input file by index
+        input_dir = job_dir / "input"
+        input_files = sorted(input_dir.glob("*.yaml"))
+
+        if index >= len(input_files):
+            logger.error(
+                f"Index {index} out of range. Found {len(input_files)} input files."
+            )
+            raise RuntimeError(f"Index {index} out of range")
+
+        input_file = input_files[index]
+        logger.info(f"  Input file: {input_file}")
+
+        # Determine output directory
+        output_base = input_file.stem
+        output_dir = job_dir / "output" / output_base
+
+        # Get MSA directories
+        job_msa_dir = job_dir / "msas"
+        global_msa_dir = Path(os.environ.get("MSA_DIR", "/primordial/.cache/msas"))
+
+        if job_msa_dir.exists():
+            msa_folder = str(job_msa_dir)
+            logger.info(f"  Using job-specific MSAs: {msa_folder}")
+        elif global_msa_dir.exists():
+            msa_folder = str(global_msa_dir)
+            logger.info(f"  Using global MSA cache: {msa_folder}")
+        else:
+            msa_folder = None
+            logger.info("  No MSA folder available, will use MSA server")
+
+        # Get cache directory
+        cache_dir = os.environ.get("BOLTZ_CACHE", "/primordial/.cache/boltz")
+        logger.info(f"  Cache directory: {cache_dir}")
+
+        # Get additional options
+        boltz_options = os.environ.get("BOLTZ_OPTIONS")
+        if boltz_options:
+            logger.info(f"  Boltz options: {boltz_options}")
+
+        # Create processor and run
+        processor = BoltzProcessor(
+            num_workers=None,  # Auto-detect
+            boltz_options=boltz_options,
+            msa_folder=msa_folder,
+            cache_dir=cache_dir,
+        )
+
+        # Ensure output directory exists
+        output_dir.parent.mkdir(parents=True, exist_ok=True)
+
+        result_dir = processor.run(str(input_file), str(output_dir))
+
+        # Mark as complete
+        mark_complete(index, job_dir, prefix="boltz")
+
+        logger.info(f"Complex {input_file.stem} completed successfully")
+        logger.info(f"Output: {result_dir}")
+
+    except Exception as e:
+        logger.exception(f"Worker failed with error: {e}")
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
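
For orientation, a hedged sketch of exercising this worker outside AWS Batch, following the module docstring's environment contract (the /tmp paths and option values are illustrative):

import os
from dayhoff_tools.batch.workers import boltz

os.environ["JOB_DIR"] = "/tmp/boltz_job"             # must contain input/*.yaml
os.environ["AWS_BATCH_JOB_ARRAY_INDEX"] = "0"        # process the first config, sorted by name
os.environ["BOLTZ_OPTIONS"] = "--recycling_steps 3"  # forwarded to `boltz predict`

# Runs one prediction, then leaves output/boltz_results_<name>/ and the
# idempotency marker output/boltz_000.done under JOB_DIR.
boltz.main()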