dayhoff-tools 1.12.9__tar.gz → 1.14.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/PKG-INFO +6 -1
- dayhoff_tools-1.14.8/dayhoff_tools/batch/__init__.py +8 -0
- dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/__init__.py +12 -0
- dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/base.py +146 -0
- dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/boltz.py +436 -0
- dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/embed_t5.py +92 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/__init__.py +88 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/aws_batch.py +459 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/__init__.py +25 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/boltz.py +419 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/cancel.py +96 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/clean.py +139 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/embed_t5.py +402 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/finalize.py +232 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/list_jobs.py +128 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/local.py +112 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/logs.py +146 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/retry.py +146 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/status.py +287 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/commands/submit.py +221 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/job_id.py +151 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/batch/manifest.py +295 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/shared.py +7 -3
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/__init__.py +1 -3
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/api_client.py +350 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/auth.py +144 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +142 -94
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/engine_commands.py +469 -187
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engines_studios/progress.py +45 -17
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
- {dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios → dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators}/idle_status_simulator.py +6 -6
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/ssh_config.py +96 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/engines_studios/studio_commands.py +769 -0
- dayhoff_tools-1.14.8/dayhoff_tools/cli/github_commands.py +286 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/main.py +59 -10
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/base.py +8 -2
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/deploy_aws.py +82 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/pyproject.toml +10 -1
- dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios/api_client.py +0 -266
- dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios/idle-status-simulator.md +0 -140
- dayhoff_tools-1.12.9/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -408
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/README.md +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/__init__.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/chemistry/standardizer.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/chemistry/utils.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/__init__.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/cloud_commands.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/__init__.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/swarm_commands.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/cli/utility_commands.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/deploy_utils.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/job_runner.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/processors.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/deployment/swarm.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/embedders.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/fasta.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/file_ops.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/h5.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/gcp.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/gtdb.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/kegg.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/mmseqs.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/structure.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/intake/uniprot.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/logs.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/sqlite.py +0 -0
- {dayhoff_tools-1.12.9 → dayhoff_tools-1.14.8}/dayhoff_tools/warehouse.py +0 -0
--- dayhoff_tools-1.12.9/PKG-INFO
+++ dayhoff_tools-1.14.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dayhoff-tools
-Version: 1.12.9
+Version: 1.14.8
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -11,11 +11,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
+Provides-Extra: batch
+Provides-Extra: boltz
 Provides-Extra: embedders
 Provides-Extra: full
 Requires-Dist: biopython (>=1.84) ; extra == "full"
 Requires-Dist: biopython (>=1.85) ; extra == "embedders"
 Requires-Dist: boto3 (>=1.36.8)
+Requires-Dist: click (>=8.0.0) ; extra == "batch"
 Requires-Dist: docker (>=7.1.0) ; extra == "full"
 Requires-Dist: fair-esm (>=2.0.0) ; extra == "embedders"
 Requires-Dist: fair-esm (>=2.0.0) ; extra == "full"
@@ -25,10 +28,12 @@ Requires-Dist: h5py (>=3.13.0) ; extra == "embedders"
 Requires-Dist: numpy (>=1.26.4) ; extra == "embedders"
 Requires-Dist: pandas (>=2.2.0,<2.2.3) ; extra == "embedders"
 Requires-Dist: pandas (>=2.2.0,<2.2.3) ; extra == "full"
+Requires-Dist: pydantic (>=2.0.0) ; extra == "batch"
 Requires-Dist: pyyaml (>=6.0)
 Requires-Dist: questionary (>=2.0.1)
 Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "full"
 Requires-Dist: requests (>=2.31.0)
+Requires-Dist: ruamel.yaml (>=0.17.0) ; extra == "boltz"
 Requires-Dist: sentencepiece (>=0.2.0) ; extra == "embedders"
 Requires-Dist: sentencepiece (>=0.2.0) ; extra == "full"
 Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
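Note: the two new extras track the two new subpackages. The "batch" extra requires click and pydantic, and the "boltz" extra requires ruamel.yaml (used by the Boltz worker to rewrite its input YAML). They install with the usual extras syntax, e.g. pip install "dayhoff-tools[batch,boltz]".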
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/__init__.py
@@ -0,0 +1,12 @@
+"""Worker entrypoints for AWS Batch jobs.
+
+These modules are designed to run inside containers as the main entrypoint.
+They use AWS_BATCH_JOB_ARRAY_INDEX for work distribution.
+
+Available workers:
+- embed_t5: T5 protein sequence embedding
+- boltz: Boltz protein structure prediction
+- base: Common utilities for all workers
+"""
+
+__all__ = ["embed_t5", "boltz", "base"]
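Inside the container, each worker is launched as a module entrypoint (e.g. python -m dayhoff_tools.batch.workers.boltz, per the docstring of boltz.py below), with AWS Batch supplying AWS_BATCH_JOB_ARRAY_INDEX to route each array task to its chunk.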
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/base.py
@@ -0,0 +1,146 @@
+"""Base utilities for batch workers.
+
+These utilities are shared across all worker implementations.
+"""
+
+import logging
+import os
+import sys
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def configure_worker_logging():
+    """Configure logging for batch workers.
+
+    Sets up logging to output to stdout with timestamps and log levels,
+    which CloudWatch will capture.
+    """
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+
+
+def get_array_index() -> int:
+    """Get the array index for this worker.
+
+    For array jobs, reads AWS_BATCH_JOB_ARRAY_INDEX.
+    For retry jobs, maps from BATCH_RETRY_INDICES.
+    For single jobs (array_size=1), defaults to 0.
+
+    Returns:
+        The array index this worker should process
+    """
+    # Check for retry mode first
+    retry_indices = os.environ.get("BATCH_RETRY_INDICES")
+    if retry_indices:
+        # In retry mode, we have a list of indices and use array index to pick
+        indices = [int(i) for i in retry_indices.split(",")]
+        array_idx = int(os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0"))
+        if array_idx >= len(indices):
+            raise RuntimeError(
+                f"Array index {array_idx} out of range for retry indices {indices}"
+            )
+        return indices[array_idx]
+
+    # Standard array job mode - default to 0 for single jobs
+    # Note: When array_size=1, AWS Batch runs a single job (not an array),
+    # so AWS_BATCH_JOB_ARRAY_INDEX is not set. Default to 0.
+    array_idx = os.environ.get("AWS_BATCH_JOB_ARRAY_INDEX", "0")
+    return int(array_idx)
+
+
+def get_job_dir() -> Path:
+    """Get the job directory from environment.
+
+    Returns:
+        Path to the job directory
+
+    Raises:
+        RuntimeError: If JOB_DIR is not set
+    """
+    job_dir = os.environ.get("JOB_DIR")
+    if not job_dir:
+        raise RuntimeError("JOB_DIR environment variable not set")
+    return Path(job_dir)
+
+
+def get_input_file(index: int, job_dir: Path, prefix: str = "chunk") -> Path:
+    """Get the input file path for a given index.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'chunk')
+
+    Returns:
+        Path to input file
+    """
+    return job_dir / "input" / f"{prefix}_{index:03d}.fasta"
+
+
+def get_output_file(
+    index: int, job_dir: Path, prefix: str = "embed", suffix: str = ".h5"
+) -> Path:
+    """Get the output file path for a given index.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+        suffix: File suffix (default: '.h5')
+
+    Returns:
+        Path to output file
+    """
+    return job_dir / "output" / f"{prefix}_{index:03d}{suffix}"
+
+
+def get_done_marker(index: int, job_dir: Path, prefix: str = "embed") -> Path:
+    """Get the done marker path for a given index.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+
+    Returns:
+        Path to done marker file
+    """
+    return job_dir / "output" / f"{prefix}_{index:03d}.done"
+
+
+def check_already_complete(index: int, job_dir: Path, prefix: str = "embed") -> bool:
+    """Check if this chunk is already complete (idempotency).
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+
+    Returns:
+        True if already complete, False otherwise
+    """
+    done_marker = get_done_marker(index, job_dir, prefix)
+    if done_marker.exists():
+        logger.info(f"Chunk {index} already complete (found {done_marker}), skipping")
+        return True
+    return False
+
+
+def mark_complete(index: int, job_dir: Path, prefix: str = "embed"):
+    """Mark a chunk as complete by creating the done marker.
+
+    Args:
+        index: Array index
+        job_dir: Job directory path
+        prefix: File prefix (default: 'embed')
+    """
+    done_marker = get_done_marker(index, job_dir, prefix)
+    done_marker.parent.mkdir(parents=True, exist_ok=True)
+    done_marker.touch()
+    logger.info(f"Chunk {index} marked complete: {done_marker}")
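For orientation, here is a minimal sketch (not part of the package) of how a worker entrypoint composes these helpers; process_fasta is a hypothetical stand-in for the per-chunk work, and the "embed"/"chunk" prefixes follow the defaults above:

import logging
from pathlib import Path

from dayhoff_tools.batch.workers.base import (
    check_already_complete,
    configure_worker_logging,
    get_array_index,
    get_input_file,
    get_job_dir,
    get_output_file,
    mark_complete,
)

logger = logging.getLogger(__name__)


def process_fasta(in_path: Path, out_path: Path) -> None:
    """Hypothetical per-chunk work; a real worker would compute embeddings here."""
    out_path.write_bytes(in_path.read_bytes())  # placeholder


def main():
    configure_worker_logging()
    index = get_array_index()  # honors BATCH_RETRY_INDICES remapping when set
    job_dir = get_job_dir()    # raises RuntimeError if JOB_DIR is unset

    if check_already_complete(index, job_dir, prefix="embed"):
        return  # idempotent re-run: done marker already present

    in_path = get_input_file(index, job_dir)    # e.g. input/chunk_003.fasta
    out_path = get_output_file(index, job_dir)  # e.g. output/embed_003.h5
    out_path.parent.mkdir(parents=True, exist_ok=True)

    process_fasta(in_path, out_path)

    mark_complete(index, job_dir, prefix="embed")  # touches output/embed_003.done


if __name__ == "__main__":
    main()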
--- /dev/null
+++ dayhoff_tools-1.14.8/dayhoff_tools/batch/workers/boltz.py
@@ -0,0 +1,436 @@
+"""Boltz structure prediction worker for AWS Batch array jobs.
+
+This module contains:
+1. BoltzProcessor - Core processor class for running Boltz predictions
+2. Worker entrypoint for AWS Batch array jobs
+
+The worker processes a single YAML config file based on AWS_BATCH_JOB_ARRAY_INDEX.
+
+Usage:
+    python -m dayhoff_tools.batch.workers.boltz
+
+Environment variables:
+    AWS_BATCH_JOB_ARRAY_INDEX: The index of the input file to process
+    JOB_DIR: Path to job directory (contains input/ and output/ subdirectories)
+    BOLTZ_CACHE: Path to Boltz model cache (default: /primordial/.cache/boltz)
+    MSA_DIR: Path to global MSA cache (default: /primordial/.cache/msas)
+    BOLTZ_OPTIONS: Additional Boltz command-line options
+    BATCH_RETRY_INDICES: (optional) Comma-separated list of indices for retry mode
+"""
+
+import logging
+import os
+import re
+import shlex
+import shutil
+import subprocess
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class BoltzProcessor:
+    """Processor for running Boltz structure predictions.
+
+    This class wraps the Boltz prediction tool to predict protein structures
+    from YAML configuration files containing sequence data.
+
+    Attributes:
+        num_workers: Number of CPU workers for Boltz internal parallelization
+        boltz_options: Additional command-line options for Boltz
+        msa_folder: Path to folder containing pre-computed MSA files (.a3m)
+        cache_dir: Path to Boltz model cache directory
+    """
+
+    def __init__(
+        self,
+        num_workers: int | None = None,
+        boltz_options: str | None = None,
+        msa_folder: str | None = None,
+        cache_dir: str | None = None,
+    ):
+        """Initialize the BoltzProcessor.
+
+        Args:
+            num_workers: Number of worker threads for Boltz. If None, uses CPU count - 1.
+            boltz_options: Additional command-line options to pass to Boltz
+                (e.g., "--recycling_steps 3 --sampling_steps 200")
+            msa_folder: Path to folder containing MSA files (.a3m format).
+                If provided, searches for MSAs matching protein IDs.
+            cache_dir: Path to Boltz model cache. Defaults to /primordial/.cache/boltz
+        """
+        if num_workers is None:
+            num_workers = max(1, (os.cpu_count() or 4) - 1)
+
+        self.num_workers = num_workers
+        self.boltz_options = boltz_options
+        self.msa_folder = msa_folder
+        self.cache_dir = cache_dir or "/primordial/.cache/boltz"
+
+    def _extract_protein_id_from_filename(self, filename: str) -> str | None:
+        """Extract protein ID from input filename.
+
+        Supports multiple filename formats:
+        - {number}_{PROTEIN_ID}_{suffix}.yaml (e.g., '567_IR0041_p.yaml' -> 'IR0041')
+        - {PROTEIN_ID}.yaml (e.g., 'IR0041.yaml' -> 'IR0041')
+        - {PROTEIN_ID}_{suffix}.yaml (e.g., 'IR0041_2mer.yaml' -> 'IR0041')
+
+        Args:
+            filename: The input filename (without path)
+
+        Returns:
+            The extracted protein ID, or None if pattern doesn't match
+        """
+        base_name = os.path.splitext(filename)[0]
+
+        # Pattern 1: number_PROTEINID_suffix
+        pattern1 = r"^\d+_([A-Za-z0-9]+)_.+$"
+        match = re.match(pattern1, base_name)
+        if match:
+            protein_id = match.group(1)
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 1)"
+            )
+            return protein_id
+
+        # Pattern 2: PROTEINID_suffix (no leading number)
+        pattern2 = r"^([A-Za-z0-9]+)_\d*mer$"
+        match = re.match(pattern2, base_name)
+        if match:
+            protein_id = match.group(1)
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 2)"
+            )
+            return protein_id
+
+        # Pattern 3: Just PROTEINID (no suffix)
+        pattern3 = r"^([A-Za-z0-9]+)$"
+        match = re.match(pattern3, base_name)
+        if match:
+            protein_id = match.group(1)
+            logger.debug(
+                f"Extracted protein ID '{protein_id}' from '{filename}' (pattern 3)"
+            )
+            return protein_id
+
+        logger.debug(f"Could not extract protein ID from filename '{filename}'")
+        return None
+
+    def _find_msa_file(self, protein_id: str) -> str | None:
+        """Find MSA file for a given protein ID.
+
+        Searches for files in the format: {protein_id}.a3m
+
+        Args:
+            protein_id: The protein ID to search for
+
+        Returns:
+            Full path to the MSA file, or None if not found
+        """
+        if not self.msa_folder or not os.path.exists(self.msa_folder):
+            return None
+
+        msa_filename = f"{protein_id}.a3m"
+        msa_path = os.path.join(self.msa_folder, msa_filename)
+
+        if os.path.exists(msa_path):
+            logger.info(f"Found MSA file for protein {protein_id}: {msa_path}")
+            return msa_path
+        else:
+            logger.debug(f"MSA file not found: {msa_path}")
+            return None
+
+    def _enhance_yaml_with_msa(self, input_file: str) -> tuple[str, bool, str | None]:
+        """Enhance input YAML file with MSA information if available.
+
+        Modifies the input YAML file in place, adding MSA paths to protein entries.
+        Returns the original content so it can be restored later.
+
+        Args:
+            input_file: Path to the input YAML file to modify
+
+        Returns:
+            Tuple of (input file path, whether MSA was added, original content for restoration)
+        """
+        try:
+            from ruamel.yaml import YAML
+        except ImportError:
+            logger.warning("ruamel.yaml not available, skipping MSA enhancement")
+            return input_file, False, None
+
+        filename = os.path.basename(input_file)
+        protein_id = self._extract_protein_id_from_filename(filename)
+
+        if not protein_id:
+            logger.debug(f"No protein ID extracted from {filename}")
+            return input_file, False, None
+
+        msa_path = self._find_msa_file(protein_id)
+        if not msa_path:
+            return input_file, False, None
+
+        # Read original content for backup
+        try:
+            with open(input_file, "r") as f:
+                original_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading YAML file {input_file}: {e}")
+            return input_file, False, None
+
+        # Parse and modify YAML
+        yaml_parser = YAML()
+        yaml_parser.preserve_quotes = True
+        yaml_parser.width = 4096
+
+        try:
+            with open(input_file, "r") as f:
+                yaml_data = yaml_parser.load(f)
+        except Exception as e:
+            logger.error(f"Error parsing YAML file {input_file}: {e}")
+            return input_file, False, None
+
+        # Add MSA path to protein entries
+        msa_added = False
+        if "sequences" in yaml_data and isinstance(yaml_data["sequences"], list):
+            for sequence in yaml_data["sequences"]:
+                if "protein" in sequence and isinstance(sequence["protein"], dict):
+                    sequence["protein"]["msa"] = msa_path
+                    logger.info(f"Added MSA path {msa_path} to protein in YAML")
+                    msa_added = True
+
+        if not msa_added:
+            return input_file, False, None
+
+        # Write modified YAML
+        try:
+            with open(input_file, "w") as f:
+                yaml_parser.dump(yaml_data, f)
+            return input_file, True, original_content
+        except Exception as e:
+            logger.error(f"Error writing enhanced YAML: {e}")
+            return input_file, False, None
+
+    def run(self, input_file: str, output_dir: str | None = None) -> str:
+        """Run Boltz prediction on the input file.
+
+        Args:
+            input_file: Path to input YAML file containing sequences
+            output_dir: Optional output directory. If None, uses boltz_results_{basename}
+
+        Returns:
+            Path to the output directory created by Boltz
+
+        Raises:
+            subprocess.CalledProcessError: If Boltz prediction fails
+            FileNotFoundError: If input file doesn't exist
+        """
+        if not os.path.exists(input_file):
+            raise FileNotFoundError(f"Input file not found: {input_file}")
+
+        # Enhance with MSA if available
+        enhanced_input_file, msa_found, original_yaml_data = (
+            self._enhance_yaml_with_msa(input_file)
+        )
+
+        # Determine output directory
+        # Boltz always creates boltz_results_{input_name} inside --out_dir
+        input_base = os.path.splitext(os.path.basename(input_file))[0]
+
+        if output_dir is None:
+            # No output_dir specified, boltz creates in current directory
+            expected_output_dir = f"boltz_results_{input_base}"
+            out_dir_arg = None
+        else:
+            # output_dir specified - use its parent for --out_dir
+            # and expect boltz_results_{input_base} inside it
+            parent_dir = os.path.dirname(output_dir)
+            expected_output_dir = os.path.join(parent_dir, f"boltz_results_{input_base}")
+            out_dir_arg = parent_dir if parent_dir else None
+
+        logger.info(f"Running Boltz prediction for {input_file}")
+        logger.info(f"Output directory: {expected_output_dir}")
+
+        # Build command
+        cmd = ["boltz", "predict", input_file]
+
+        # Add output directory if specified
+        if out_dir_arg:
+            cmd.extend(["--out_dir", out_dir_arg])
+
+        # Add cache directory
+        cmd.extend(["--cache", self.cache_dir])
+
+        # Parse additional options
+        additional_args = []
+        num_workers_in_opts = False
+        use_msa_server_in_opts = False
+
+        if self.boltz_options:
+            try:
+                parsed_opts = shlex.split(self.boltz_options)
+                additional_args.extend(parsed_opts)
+                num_workers_in_opts = "--num_workers" in parsed_opts
+                use_msa_server_in_opts = "--use_msa_server" in parsed_opts
+            except ValueError as e:
+                logger.error(f"Error parsing boltz_options '{self.boltz_options}': {e}")
+
+        # Handle MSA server option
+        if msa_found:
+            if use_msa_server_in_opts:
+                additional_args = [
+                    arg for arg in additional_args if arg != "--use_msa_server"
+                ]
+                logger.info("Removed --use_msa_server since local MSA was found")
+        else:
+            if not use_msa_server_in_opts:
+                additional_args.append("--use_msa_server")
+                logger.info("Added --use_msa_server since no local MSA found")
+
+        # Add num_workers if not in options
+        if not num_workers_in_opts:
+            cmd.extend(["--num_workers", str(self.num_workers)])
+
+        # Disable cuequivariance kernels - they require cuda-devel image
+        # which is much larger. The performance difference is modest.
+        # TODO: Consider switching to cuda-devel base image if perf is critical
+        cmd.append("--no_kernels")
+
+        cmd.extend(additional_args)
+
+        # Log and run command
+        logger.info(f"Running command: {shlex.join(cmd)}")
+
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+
+        if process.stdout:
+            for line in iter(process.stdout.readline, ""):
+                logger.info(f"BOLTZ: {line.rstrip()}")
+
+        return_code = process.wait()
+        if return_code != 0:
+            logger.error(f"Boltz prediction failed with exit code {return_code}")
+            raise subprocess.CalledProcessError(return_code, cmd)
+
+        logger.info(f"Boltz prediction completed successfully")
+
+        # Restore original YAML if modified
+        if original_yaml_data is not None:
+            try:
+                with open(input_file, "w") as f:
+                    f.write(original_yaml_data)
+                logger.debug(f"Restored original YAML content")
+            except Exception as e:
+                logger.warning(f"Failed to restore original YAML: {e}")
+
+        # Copy input config to output directory
+        try:
+            config_dest = os.path.join(
+                expected_output_dir, os.path.basename(input_file)
+            )
+            shutil.copy2(input_file, config_dest)
+            logger.debug(f"Copied input config to results: {config_dest}")
+        except Exception as e:
+            logger.warning(f"Failed to copy input config: {e}")
+
+        return expected_output_dir
+
+
+def main():
+    """Boltz worker main entrypoint for AWS Batch array jobs."""
+    from .base import (
+        check_already_complete,
+        configure_worker_logging,
+        get_array_index,
+        get_job_dir,
+        mark_complete,
+    )
+
+    configure_worker_logging()
+    logger.info("Starting Boltz prediction worker")
+
+    try:
+        # Get configuration from environment
+        index = get_array_index()
+        job_dir = get_job_dir()
+
+        logger.info(f"Worker configuration:")
+        logger.info(f"  Array index: {index}")
+        logger.info(f"  Job directory: {job_dir}")
+
+        # Check idempotency
+        if check_already_complete(index, job_dir, prefix="boltz"):
+            logger.info("Exiting - complex already processed")
+            return
+
+        # Find input file by index
+        input_dir = job_dir / "input"
+        input_files = sorted(input_dir.glob("*.yaml"))
+
+        if index >= len(input_files):
+            logger.error(
+                f"Index {index} out of range. Found {len(input_files)} input files."
+            )
+            raise RuntimeError(f"Index {index} out of range")
+
+        input_file = input_files[index]
+        logger.info(f"  Input file: {input_file}")
+
+        # Determine output directory
+        output_base = input_file.stem
+        output_dir = job_dir / "output" / output_base
+
+        # Get MSA directories
+        job_msa_dir = job_dir / "msas"
+        global_msa_dir = Path(os.environ.get("MSA_DIR", "/primordial/.cache/msas"))
+
+        if job_msa_dir.exists():
+            msa_folder = str(job_msa_dir)
+            logger.info(f"  Using job-specific MSAs: {msa_folder}")
+        elif global_msa_dir.exists():
+            msa_folder = str(global_msa_dir)
+            logger.info(f"  Using global MSA cache: {msa_folder}")
+        else:
+            msa_folder = None
+            logger.info("  No MSA folder available, will use MSA server")
+
+        # Get cache directory
+        cache_dir = os.environ.get("BOLTZ_CACHE", "/primordial/.cache/boltz")
+        logger.info(f"  Cache directory: {cache_dir}")
+
+        # Get additional options
+        boltz_options = os.environ.get("BOLTZ_OPTIONS")
+        if boltz_options:
+            logger.info(f"  Boltz options: {boltz_options}")
+
+        # Create processor and run
+        processor = BoltzProcessor(
+            num_workers=None,  # Auto-detect
+            boltz_options=boltz_options,
+            msa_folder=msa_folder,
+            cache_dir=cache_dir,
+        )
+
+        # Ensure output directory exists
+        output_dir.parent.mkdir(parents=True, exist_ok=True)
+
+        result_dir = processor.run(str(input_file), str(output_dir))
+
+        # Mark as complete
+        mark_complete(index, job_dir, prefix="boltz")
+
+        logger.info(f"Complex {input_file.stem} completed successfully")
+        logger.info(f"Output: {result_dir}")
+
+    except Exception as e:
+        logger.exception(f"Worker failed with error: {e}")
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
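As a usage sketch of BoltzProcessor outside of Batch, assuming the boltz CLI and the boltz extra are installed (the paths and input filename below are illustrative, and the options string mirrors the example in the class docstring):

from dayhoff_tools.batch.workers.boltz import BoltzProcessor

# With msa_folder set, the processor looks for IR0041.a3m (protein ID parsed
# from the filename) and injects it into the YAML; otherwise it appends
# --use_msa_server to the boltz command.
processor = BoltzProcessor(
    boltz_options="--recycling_steps 3 --sampling_steps 200",
    msa_folder="/primordial/.cache/msas",
    cache_dir="/primordial/.cache/boltz",
)

# Runs `boltz predict IR0041_2mer.yaml --cache ... --num_workers N --no_kernels ...`
# and returns the boltz_results_IR0041_2mer directory Boltz creates.
result_dir = processor.run("IR0041_2mer.yaml")
print(result_dir)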