dayhoff-tools 1.14.1__tar.gz → 1.14.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/PKG-INFO +6 -1
  2. dayhoff_tools-1.14.2/dayhoff_tools/batch/__init__.py +8 -0
  3. dayhoff_tools-1.14.2/dayhoff_tools/batch/workers/__init__.py +12 -0
  4. dayhoff_tools-1.14.2/dayhoff_tools/batch/workers/base.py +150 -0
  5. dayhoff_tools-1.14.2/dayhoff_tools/batch/workers/boltz.py +407 -0
  6. dayhoff_tools-1.14.2/dayhoff_tools/batch/workers/embed_t5.py +92 -0
  7. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/__init__.py +85 -0
  8. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/aws_batch.py +401 -0
  9. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/__init__.py +25 -0
  10. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/boltz.py +362 -0
  11. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/cancel.py +82 -0
  12. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/embed_t5.py +303 -0
  13. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/finalize.py +206 -0
  14. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/list_jobs.py +78 -0
  15. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/local.py +95 -0
  16. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/logs.py +142 -0
  17. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/retry.py +142 -0
  18. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/status.py +214 -0
  19. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/commands/submit.py +215 -0
  20. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/job_id.py +151 -0
  21. dayhoff_tools-1.14.2/dayhoff_tools/cli/batch/manifest.py +293 -0
  22. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +26 -21
  23. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/engine_commands.py +16 -89
  24. dayhoff_tools-1.14.2/dayhoff_tools/cli/engines_studios/ssh_config.py +96 -0
  25. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/studio_commands.py +13 -2
  26. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/main.py +14 -0
  27. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/pyproject.toml +10 -1
  28. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/README.md +0 -0
  29. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/__init__.py +0 -0
  30. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/chemistry/standardizer.py +0 -0
  31. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/chemistry/utils.py +0 -0
  32. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/__init__.py +0 -0
  33. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/cloud_commands.py +0 -0
  34. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/__init__.py +0 -0
  35. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
  36. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
  37. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
  38. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
  39. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/shared.py +0 -0
  40. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
  41. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/__init__.py +0 -0
  42. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/api_client.py +0 -0
  43. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/auth.py +0 -0
  44. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/progress.py +0 -0
  45. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +0 -0
  46. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/demo.sh +0 -0
  47. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +0 -0
  48. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +0 -0
  49. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +0 -0
  50. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +0 -0
  51. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +0 -0
  52. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +0 -0
  53. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/github_commands.py +0 -0
  54. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/swarm_commands.py +0 -0
  55. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/cli/utility_commands.py +0 -0
  56. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/base.py +0 -0
  57. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  58. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  59. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  60. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/job_runner.py +0 -0
  61. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/processors.py +0 -0
  62. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/deployment/swarm.py +0 -0
  63. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/embedders.py +0 -0
  64. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/fasta.py +0 -0
  65. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/file_ops.py +0 -0
  66. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/h5.py +0 -0
  67. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/intake/gcp.py +0 -0
  68. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/intake/gtdb.py +0 -0
  69. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/intake/kegg.py +0 -0
  70. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/intake/mmseqs.py +0 -0
  71. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/intake/structure.py +0 -0
  72. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/intake/uniprot.py +0 -0
  73. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/logs.py +0 -0
  74. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/sqlite.py +0 -0
  75. {dayhoff_tools-1.14.1 → dayhoff_tools-1.14.2}/dayhoff_tools/warehouse.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dayhoff-tools
3
- Version: 1.14.1
3
+ Version: 1.14.2
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -11,11 +11,14 @@ Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Classifier: Programming Language :: Python :: 3.13
13
13
  Classifier: Programming Language :: Python :: 3.14
14
+ Provides-Extra: batch
15
+ Provides-Extra: boltz
14
16
  Provides-Extra: embedders
15
17
  Provides-Extra: full
16
18
  Requires-Dist: biopython (>=1.84) ; extra == "full"
17
19
  Requires-Dist: biopython (>=1.85) ; extra == "embedders"
18
20
  Requires-Dist: boto3 (>=1.36.8)
21
+ Requires-Dist: click (>=8.0.0) ; extra == "batch"
19
22
  Requires-Dist: docker (>=7.1.0) ; extra == "full"
20
23
  Requires-Dist: fair-esm (>=2.0.0) ; extra == "embedders"
21
24
  Requires-Dist: fair-esm (>=2.0.0) ; extra == "full"
@@ -25,10 +28,12 @@ Requires-Dist: h5py (>=3.13.0) ; extra == "embedders"
25
28
  Requires-Dist: numpy (>=1.26.4) ; extra == "embedders"
26
29
  Requires-Dist: pandas (>=2.2.0,<2.2.3) ; extra == "embedders"
27
30
  Requires-Dist: pandas (>=2.2.0,<2.2.3) ; extra == "full"
31
+ Requires-Dist: pydantic (>=2.0.0) ; extra == "batch"
28
32
  Requires-Dist: pyyaml (>=6.0)
29
33
  Requires-Dist: questionary (>=2.0.1)
30
34
  Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "full"
31
35
  Requires-Dist: requests (>=2.31.0)
36
+ Requires-Dist: ruamel.yaml (>=0.17.0) ; extra == "boltz"
32
37
  Requires-Dist: sentencepiece (>=0.2.0) ; extra == "embedders"
33
38
  Requires-Dist: sentencepiece (>=0.2.0) ; extra == "full"
34
39
  Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
@@ -0,0 +1,8 @@
1
+ """Batch job infrastructure for AWS Batch.
2
+
3
+ This module contains:
4
+ - Worker code for container entrypoints
5
+ - Utilities for batch job coordination
6
+ """
7
+
8
+ __all__ = []
@@ -0,0 +1,12 @@
1
+ """Worker entrypoints for AWS Batch jobs.
2
+
3
+ These modules are designed to run inside containers as the main entrypoint.
4
+ They use AWS_BATCH_JOB_ARRAY_INDEX for work distribution.
5
+
6
+ Available workers:
7
+ - embed_t5: T5 protein sequence embedding
8
+ - boltz: Boltz protein structure prediction
9
+ - base: Common utilities for all workers
10
+ """
11
+
12
+ __all__ = ["embed_t5", "boltz", "base"]
@@ -0,0 +1,150 @@
1
+ """Base utilities for batch workers.
2
+
3
+ These utilities are shared across all worker implementations.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def configure_worker_logging():
    """Set up root logging for a batch worker process.

    Installs a single stdout stream handler on the root logger at INFO
    level, with a timestamped ``[LEVEL] name: message`` line format.
    Writing to stdout is what lets CloudWatch pick the lines up.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    line_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    logging.basicConfig(
        level=logging.INFO,
        format=line_format,
        datefmt=timestamp_format,
        handlers=[stdout_handler],
    )
26
+
27
+
28
def get_array_index() -> int:
    """Get the array index for this worker.

    For array jobs, reads AWS_BATCH_JOB_ARRAY_INDEX.
    For retry jobs, BATCH_RETRY_INDICES holds a comma-separated list of the
    original indices to re-run, and AWS_BATCH_JOB_ARRAY_INDEX (default 0)
    selects the position within that list.

    Returns:
        The array index this worker should process

    Raises:
        RuntimeError: If no array index can be determined, if either
            environment variable is not a valid non-negative integer (list),
            or if the array index falls outside the retry list
    """
    raw_array_idx = os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX")

    # Check for retry mode first
    retry_indices = os.environ.get("BATCH_RETRY_INDICES")
    if retry_indices:
        # Blank tokens (e.g. from a trailing comma) are ignored rather than
        # crashing the worker with an unrelated-looking ValueError.
        try:
            indices = [int(tok) for tok in retry_indices.split(",") if tok.strip()]
        except ValueError as e:
            raise RuntimeError(
                f"Invalid BATCH_RETRY_INDICES value {retry_indices!r}: {e}"
            ) from e
        if any(i < 0 for i in indices):
            raise RuntimeError(
                f"BATCH_RETRY_INDICES must be non-negative, got {indices}"
            )

        try:
            array_idx = int(raw_array_idx if raw_array_idx is not None else "0")
        except ValueError as e:
            raise RuntimeError(
                f"Invalid AWS_BATCH_JOB_ARRAY_INDEX value {raw_array_idx!r}: {e}"
            ) from e

        # A two-sided check: a negative position would otherwise silently
        # wrap around and pick an entry from the end of the list.
        if not 0 <= array_idx < len(indices):
            raise RuntimeError(
                f"Array index {array_idx} out of range for retry indices {indices}"
            )
        return indices[array_idx]

    # Standard array job mode
    if raw_array_idx is not None:
        try:
            return int(raw_array_idx)
        except ValueError as e:
            raise RuntimeError(
                f"Invalid AWS_BATCH_JOB_ARRAY_INDEX value {raw_array_idx!r}: {e}"
            ) from e

    raise RuntimeError(
        "Could not determine array index. "
        "Set AWS_BATCH_JOB_ARRAY_INDEX or BATCH_RETRY_INDICES environment variable."
    )
61
+
62
+
63
def get_job_dir() -> Path:
    """Resolve the job directory from the JOB_DIR environment variable.

    Returns:
        The value of JOB_DIR as a Path

    Raises:
        RuntimeError: If JOB_DIR is unset or empty
    """
    configured = os.environ.get("JOB_DIR")
    if configured:
        return Path(configured)
    raise RuntimeError("JOB_DIR environment variable not set")
76
+
77
+
78
def get_input_file(
    index: int, job_dir: Path, prefix: str = "chunk", suffix: str = ".fasta"
) -> Path:
    """Get the input file path for a given index.

    The file lives under ``<job_dir>/input`` and is named
    ``<prefix>_<index><suffix>``, with the index zero-padded to at least
    three digits.

    Args:
        index: Array index
        job_dir: Job directory path
        prefix: File prefix (default: 'chunk')
        suffix: File suffix including the leading dot (default: '.fasta').
            Mirrors the ``suffix`` parameter of ``get_output_file`` so that
            workers with non-FASTA inputs can reuse this helper.

    Returns:
        Path to input file
    """
    return job_dir / "input" / f"{prefix}_{index:03d}{suffix}"
90
+
91
+
92
def get_output_file(index: int, job_dir: Path, prefix: str = "embed", suffix: str = ".h5") -> Path:
    """Build the output file path for one array index.

    Args:
        index: Array index, zero-padded to at least three digits in the name
        job_dir: Job directory path
        prefix: File prefix (default: 'embed')
        suffix: File suffix (default: '.h5')

    Returns:
        ``<job_dir>/output/<prefix>_<index><suffix>``
    """
    filename = f"{prefix}_{index:03d}{suffix}"
    return job_dir.joinpath("output", filename)
105
+
106
+
107
def get_done_marker(index: int, job_dir: Path, prefix: str = "embed") -> Path:
    """Build the path of the ``.done`` marker file for one array index.

    Args:
        index: Array index, zero-padded to at least three digits in the name
        job_dir: Job directory path
        prefix: File prefix (default: 'embed')

    Returns:
        ``<job_dir>/output/<prefix>_<index>.done``
    """
    marker_name = f"{prefix}_{index:03d}.done"
    return job_dir.joinpath("output", marker_name)
119
+
120
+
121
def check_already_complete(index: int, job_dir: Path, prefix: str = "embed") -> bool:
    """Report whether the done marker for this chunk already exists.

    Used for idempotency: a worker that finds the marker can skip its work.
    Logs an INFO line when the marker is found.

    Args:
        index: Array index
        job_dir: Job directory path
        prefix: File prefix (default: 'embed')

    Returns:
        True if the done marker exists, False otherwise
    """
    done_marker = get_done_marker(index, job_dir, prefix)
    finished = done_marker.exists()
    if finished:
        logger.info(f"Chunk {index} already complete (found {done_marker}), skipping")
    return finished
137
+
138
+
139
def mark_complete(index: int, job_dir: Path, prefix: str = "embed"):
    """Record that a chunk finished by touching its done marker file.

    The marker's parent directory is created first if it does not exist.

    Args:
        index: Array index
        job_dir: Job directory path
        prefix: File prefix (default: 'embed')
    """
    marker_path = get_done_marker(index, job_dir, prefix)
    marker_folder = marker_path.parent
    marker_folder.mkdir(parents=True, exist_ok=True)
    marker_path.touch()
    logger.info(f"Chunk {index} marked complete: {marker_path}")
@@ -0,0 +1,407 @@
1
+ """Boltz structure prediction worker for AWS Batch array jobs.
2
+
3
+ This module contains:
4
+ 1. BoltzProcessor - Core processor class for running Boltz predictions
5
+ 2. Worker entrypoint for AWS Batch array jobs
6
+
7
+ The worker processes a single YAML config file based on AWS_BATCH_JOB_ARRAY_INDEX.
8
+
9
+ Usage:
10
+ python -m dayhoff_tools.batch.workers.boltz
11
+
12
+ Environment variables:
13
+ AWS_BATCH_JOB_ARRAY_INDEX: The index of the input file to process
14
+ JOB_DIR: Path to job directory (contains input/ and output/ subdirectories)
15
+ BOLTZ_CACHE: Path to Boltz model cache (default: /primordial/.cache/boltz)
16
+ MSA_DIR: Path to global MSA cache (default: /primordial/.cache/msas)
17
+ BOLTZ_OPTIONS: Additional Boltz command-line options
18
+ BATCH_RETRY_INDICES: (optional) Comma-separated list of indices for retry mode
19
+ """
20
+
21
+ import logging
22
+ import os
23
+ import re
24
+ import shlex
25
+ import shutil
26
+ import subprocess
27
+ from pathlib import Path
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class BoltzProcessor:
33
+ """Processor for running Boltz structure predictions.
34
+
35
+ This class wraps the Boltz prediction tool to predict protein structures
36
+ from YAML configuration files containing sequence data.
37
+
38
+ Attributes:
39
+ num_workers: Number of CPU workers for Boltz internal parallelization
40
+ boltz_options: Additional command-line options for Boltz
41
+ msa_folder: Path to folder containing pre-computed MSA files (.a3m)
42
+ cache_dir: Path to Boltz model cache directory
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ num_workers: int | None = None,
48
+ boltz_options: str | None = None,
49
+ msa_folder: str | None = None,
50
+ cache_dir: str | None = None,
51
+ ):
52
+ """Initialize the BoltzProcessor.
53
+
54
+ Args:
55
+ num_workers: Number of worker threads for Boltz. If None, uses CPU count - 1.
56
+ boltz_options: Additional command-line options to pass to Boltz
57
+ (e.g., "--recycling_steps 3 --sampling_steps 200")
58
+ msa_folder: Path to folder containing MSA files (.a3m format).
59
+ If provided, searches for MSAs matching protein IDs.
60
+ cache_dir: Path to Boltz model cache. Defaults to /primordial/.cache/boltz
61
+ """
62
+ if num_workers is None:
63
+ num_workers = max(1, (os.cpu_count() or 4) - 1)
64
+
65
+ self.num_workers = num_workers
66
+ self.boltz_options = boltz_options
67
+ self.msa_folder = msa_folder
68
+ self.cache_dir = cache_dir or "/primordial/.cache/boltz"
69
+
70
+ def _extract_protein_id_from_filename(self, filename: str) -> str | None:
71
+ """Extract protein ID from input filename.
72
+
73
+ Supports multiple filename formats:
74
+ - {number}_{PROTEIN_ID}_{suffix}.yaml (e.g., '567_IR0041_p.yaml' -> 'IR0041')
75
+ - {PROTEIN_ID}.yaml (e.g., 'IR0041.yaml' -> 'IR0041')
76
+ - {PROTEIN_ID}_{suffix}.yaml (e.g., 'IR0041_2mer.yaml' -> 'IR0041')
77
+
78
+ Args:
79
+ filename: The input filename (without path)
80
+
81
+ Returns:
82
+ The extracted protein ID, or None if pattern doesn't match
83
+ """
84
+ base_name = os.path.splitext(filename)[0]
85
+
86
+ # Pattern 1: number_PROTEINID_suffix
87
+ pattern1 = r"^\d+_([A-Za-z0-9]+)_.+$"
88
+ match = re.match(pattern1, base_name)
89
+ if match:
90
+ protein_id = match.group(1)
91
+ logger.debug(f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 1)")
92
+ return protein_id
93
+
94
+ # Pattern 2: PROTEINID_suffix (no leading number)
95
+ pattern2 = r"^([A-Za-z0-9]+)_\d*mer$"
96
+ match = re.match(pattern2, base_name)
97
+ if match:
98
+ protein_id = match.group(1)
99
+ logger.debug(f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 2)")
100
+ return protein_id
101
+
102
+ # Pattern 3: Just PROTEINID (no suffix)
103
+ pattern3 = r"^([A-Za-z0-9]+)$"
104
+ match = re.match(pattern3, base_name)
105
+ if match:
106
+ protein_id = match.group(1)
107
+ logger.debug(f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 3)")
108
+ return protein_id
109
+
110
+ logger.debug(f"Could not extract protein ID from filename '{filename}'")
111
+ return None
112
+
113
+ def _find_msa_file(self, protein_id: str) -> str | None:
114
+ """Find MSA file for a given protein ID.
115
+
116
+ Searches for files in the format: {protein_id}.a3m
117
+
118
+ Args:
119
+ protein_id: The protein ID to search for
120
+
121
+ Returns:
122
+ Full path to the MSA file, or None if not found
123
+ """
124
+ if not self.msa_folder or not os.path.exists(self.msa_folder):
125
+ return None
126
+
127
+ msa_filename = f"{protein_id}.a3m"
128
+ msa_path = os.path.join(self.msa_folder, msa_filename)
129
+
130
+ if os.path.exists(msa_path):
131
+ logger.info(f"Found MSA file for protein {protein_id}: {msa_path}")
132
+ return msa_path
133
+ else:
134
+ logger.debug(f"MSA file not found: {msa_path}")
135
+ return None
136
+
137
+ def _enhance_yaml_with_msa(self, input_file: str) -> tuple[str, bool, str | None]:
138
+ """Enhance input YAML file with MSA information if available.
139
+
140
+ Modifies the input YAML file in place, adding MSA paths to protein entries.
141
+ Returns the original content so it can be restored later.
142
+
143
+ Args:
144
+ input_file: Path to the input YAML file to modify
145
+
146
+ Returns:
147
+ Tuple of (input file path, whether MSA was added, original content for restoration)
148
+ """
149
+ try:
150
+ from ruamel.yaml import YAML
151
+ except ImportError:
152
+ logger.warning("ruamel.yaml not available, skipping MSA enhancement")
153
+ return input_file, False, None
154
+
155
+ filename = os.path.basename(input_file)
156
+ protein_id = self._extract_protein_id_from_filename(filename)
157
+
158
+ if not protein_id:
159
+ logger.debug(f"No protein ID extracted from {filename}")
160
+ return input_file, False, None
161
+
162
+ msa_path = self._find_msa_file(protein_id)
163
+ if not msa_path:
164
+ return input_file, False, None
165
+
166
+ # Read original content for backup
167
+ try:
168
+ with open(input_file, "r") as f:
169
+ original_content = f.read()
170
+ except Exception as e:
171
+ logger.error(f"Error reading YAML file {input_file}: {e}")
172
+ return input_file, False, None
173
+
174
+ # Parse and modify YAML
175
+ yaml_parser = YAML()
176
+ yaml_parser.preserve_quotes = True
177
+ yaml_parser.width = 4096
178
+
179
+ try:
180
+ with open(input_file, "r") as f:
181
+ yaml_data = yaml_parser.load(f)
182
+ except Exception as e:
183
+ logger.error(f"Error parsing YAML file {input_file}: {e}")
184
+ return input_file, False, None
185
+
186
+ # Add MSA path to protein entries
187
+ msa_added = False
188
+ if "sequences" in yaml_data and isinstance(yaml_data["sequences"], list):
189
+ for sequence in yaml_data["sequences"]:
190
+ if "protein" in sequence and isinstance(sequence["protein"], dict):
191
+ sequence["protein"]["msa"] = msa_path
192
+ logger.info(f"Added MSA path {msa_path} to protein in YAML")
193
+ msa_added = True
194
+
195
+ if not msa_added:
196
+ return input_file, False, None
197
+
198
+ # Write modified YAML
199
+ try:
200
+ with open(input_file, "w") as f:
201
+ yaml_parser.dump(yaml_data, f)
202
+ return input_file, True, original_content
203
+ except Exception as e:
204
+ logger.error(f"Error writing enhanced YAML: {e}")
205
+ return input_file, False, None
206
+
207
+ def run(self, input_file: str, output_dir: str | None = None) -> str:
208
+ """Run Boltz prediction on the input file.
209
+
210
+ Args:
211
+ input_file: Path to input YAML file containing sequences
212
+ output_dir: Optional output directory. If None, uses boltz_results_{basename}
213
+
214
+ Returns:
215
+ Path to the output directory created by Boltz
216
+
217
+ Raises:
218
+ subprocess.CalledProcessError: If Boltz prediction fails
219
+ FileNotFoundError: If input file doesn't exist
220
+ """
221
+ if not os.path.exists(input_file):
222
+ raise FileNotFoundError(f"Input file not found: {input_file}")
223
+
224
+ # Enhance with MSA if available
225
+ enhanced_input_file, msa_found, original_yaml_data = self._enhance_yaml_with_msa(
226
+ input_file
227
+ )
228
+
229
+ # Determine output directory
230
+ input_base = os.path.splitext(os.path.basename(input_file))[0]
231
+ if output_dir is None:
232
+ expected_output_dir = f"boltz_results_{input_base}"
233
+ else:
234
+ expected_output_dir = output_dir
235
+
236
+ logger.info(f"Running Boltz prediction for {input_file}")
237
+ logger.info(f"Output directory: {expected_output_dir}")
238
+
239
+ # Build command
240
+ cmd = ["boltz", "predict", input_file]
241
+
242
+ # Add cache directory
243
+ cmd.extend(["--cache", self.cache_dir])
244
+
245
+ # Parse additional options
246
+ additional_args = []
247
+ num_workers_in_opts = False
248
+ use_msa_server_in_opts = False
249
+
250
+ if self.boltz_options:
251
+ try:
252
+ parsed_opts = shlex.split(self.boltz_options)
253
+ additional_args.extend(parsed_opts)
254
+ num_workers_in_opts = "--num_workers" in parsed_opts
255
+ use_msa_server_in_opts = "--use_msa_server" in parsed_opts
256
+ except ValueError as e:
257
+ logger.error(f"Error parsing boltz_options '{self.boltz_options}': {e}")
258
+
259
+ # Handle MSA server option
260
+ if msa_found:
261
+ if use_msa_server_in_opts:
262
+ additional_args = [arg for arg in additional_args if arg != "--use_msa_server"]
263
+ logger.info("Removed --use_msa_server since local MSA was found")
264
+ else:
265
+ if not use_msa_server_in_opts:
266
+ additional_args.append("--use_msa_server")
267
+ logger.info("Added --use_msa_server since no local MSA found")
268
+
269
+ # Add num_workers if not in options
270
+ if not num_workers_in_opts:
271
+ cmd.extend(["--num_workers", str(self.num_workers)])
272
+
273
+ cmd.extend(additional_args)
274
+
275
+ # Log and run command
276
+ logger.info(f"Running command: {shlex.join(cmd)}")
277
+
278
+ process = subprocess.Popen(
279
+ cmd,
280
+ stdout=subprocess.PIPE,
281
+ stderr=subprocess.STDOUT,
282
+ text=True,
283
+ bufsize=1,
284
+ )
285
+
286
+ if process.stdout:
287
+ for line in iter(process.stdout.readline, ""):
288
+ logger.info(f"BOLTZ: {line.rstrip()}")
289
+
290
+ return_code = process.wait()
291
+ if return_code != 0:
292
+ logger.error(f"Boltz prediction failed with exit code {return_code}")
293
+ raise subprocess.CalledProcessError(return_code, cmd)
294
+
295
+ logger.info(f"Boltz prediction completed successfully")
296
+
297
+ # Restore original YAML if modified
298
+ if original_yaml_data is not None:
299
+ try:
300
+ with open(input_file, "w") as f:
301
+ f.write(original_yaml_data)
302
+ logger.debug(f"Restored original YAML content")
303
+ except Exception as e:
304
+ logger.warning(f"Failed to restore original YAML: {e}")
305
+
306
+ # Copy input config to output directory
307
+ try:
308
+ config_dest = os.path.join(expected_output_dir, os.path.basename(input_file))
309
+ shutil.copy2(input_file, config_dest)
310
+ logger.debug(f"Copied input config to results: {config_dest}")
311
+ except Exception as e:
312
+ logger.warning(f"Failed to copy input config: {e}")
313
+
314
+ return expected_output_dir
315
+
316
+
317
def main():
    """Boltz worker main entrypoint for AWS Batch array jobs.

    Resolves the array index and job directory from the environment, picks
    the index-th ``*.yaml`` file (sorted by name) from ``<job_dir>/input``,
    runs a BoltzProcessor on it and writes a ``boltz`` done marker on
    success. Exits early if the done marker already exists.

    Raises:
        SystemExit: With exit code 1 if any step fails (the error is logged
            with its traceback first)
    """
    from .base import (
        check_already_complete,
        configure_worker_logging,
        get_array_index,
        get_job_dir,
        mark_complete,
    )

    configure_worker_logging()
    logger.info("Starting Boltz prediction worker")

    try:
        # Get configuration from environment
        index = get_array_index()
        job_dir = get_job_dir()

        logger.info("Worker configuration:")
        logger.info(f"  Array index: {index}")
        logger.info(f"  Job directory: {job_dir}")

        # Check idempotency
        if check_already_complete(index, job_dir, prefix="boltz"):
            logger.info("Exiting - complex already processed")
            return

        # Find input file by index
        input_dir = job_dir / "input"
        input_files = sorted(input_dir.glob("*.yaml"))

        # Two-sided check: a negative index would otherwise silently wrap
        # around and process a file from the end of the list.
        if not 0 <= index < len(input_files):
            logger.error(f"Index {index} out of range. Found {len(input_files)} input files.")
            raise RuntimeError(f"Index {index} out of range")

        input_file = input_files[index]
        logger.info(f"  Input file: {input_file}")

        # Determine output directory
        output_base = input_file.stem
        output_dir = job_dir / "output" / output_base

        # Get MSA directories: job-specific MSAs take precedence over the
        # global cache.
        job_msa_dir = job_dir / "msas"
        global_msa_dir = Path(os.environ.get("MSA_DIR", "/primordial/.cache/msas"))

        if job_msa_dir.exists():
            msa_folder = str(job_msa_dir)
            logger.info(f"  Using job-specific MSAs: {msa_folder}")
        elif global_msa_dir.exists():
            msa_folder = str(global_msa_dir)
            logger.info(f"  Using global MSA cache: {msa_folder}")
        else:
            msa_folder = None
            logger.info("  No MSA folder available, will use MSA server")

        # Get cache directory
        cache_dir = os.environ.get("BOLTZ_CACHE", "/primordial/.cache/boltz")
        logger.info(f"  Cache directory: {cache_dir}")

        # Get additional options
        boltz_options = os.environ.get("BOLTZ_OPTIONS")
        if boltz_options:
            logger.info(f"  Boltz options: {boltz_options}")

        # Create processor and run
        processor = BoltzProcessor(
            num_workers=None,  # Auto-detect
            boltz_options=boltz_options,
            msa_folder=msa_folder,
            cache_dir=cache_dir,
        )

        # Ensure the parent of the output directory exists
        output_dir.parent.mkdir(parents=True, exist_ok=True)

        result_dir = processor.run(str(input_file), str(output_dir))

        # Mark as complete
        mark_complete(index, job_dir, prefix="boltz")

        logger.info(f"Complex {input_file.stem} completed successfully")
        logger.info(f"Output: {result_dir}")

    except Exception as e:
        logger.exception(f"Worker failed with error: {e}")
        raise SystemExit(1) from e
404
+
405
+
406
+ if __name__ == "__main__":
407
+ main()