PyPI - dayhoff-tools - Versions diffs - 1.14.8__tar.gz → 1.14.10__tar.gz - Mend

dayhoff-tools 1.14.8.tar.gz → 1.14.10.tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (76) hide show

{dayhoff_tools-1.14.8 → dayhoff_tools-1.14.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dayhoff-tools
-Version: 1.14.8
+Version: 1.14.10
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com

{dayhoff_tools-1.14.8 → dayhoff_tools-1.14.10}/dayhoff_tools/batch/workers/boltz.py RENAMED Viewed

@@ -341,10 +341,31 @@ class BoltzProcessor:
         return expected_output_dir
+def _get_done_marker_for_file(job_dir: Path, file_stem: str) -> Path:
+    """Get the done marker path for a specific input file."""
+    return job_dir / "output" / f"boltz_{file_stem}.done"
+def _check_file_complete(job_dir: Path, file_stem: str) -> bool:
+    """Check if a specific file has been processed."""
+    return _get_done_marker_for_file(job_dir, file_stem).exists()
+def _mark_file_complete(job_dir: Path, file_stem: str):
+    """Mark a specific file as complete."""
+    done_marker = _get_done_marker_for_file(job_dir, file_stem)
+    done_marker.parent.mkdir(parents=True, exist_ok=True)
+    done_marker.touch()
+    logger.info(f"File {file_stem} marked complete: {done_marker}")
 def main():
-    """Boltz worker main entrypoint for AWS Batch array jobs."""
+    """Boltz worker main entrypoint for AWS Batch array jobs.
+    Each worker processes multiple files based on array index and total workers.
+    With N files and M workers, worker i processes files where file_index % M == i.
+    """
     from .base import (
-        check_already_complete,
         configure_worker_logging,
         get_array_index,
         get_job_dir,
@@ -358,34 +379,36 @@ def main():
         # Get configuration from environment
         index = get_array_index()
         job_dir = get_job_dir()
+        array_size = int(os.environ.get("BATCH_ARRAY_SIZE", "1"))
+        num_files = int(os.environ.get("BATCH_NUM_FILES", "0"))
         logger.info(f"Worker configuration:")
         logger.info(f"  Array index: {index}")
+        logger.info(f"  Array size: {array_size}")
+        logger.info(f"  Total files: {num_files}")
         logger.info(f"  Job directory: {job_dir}")
-        # Check idempotency
-        if check_already_complete(index, job_dir, prefix="boltz"):
-            logger.info("Exiting - complex already processed")
-            return
-        # Find input file by index
+        # Find all input files
         input_dir = job_dir / "input"
         input_files = sorted(input_dir.glob("*.yaml"))
+        total_files = len(input_files)
-        if index >= len(input_files):
-            logger.error(
-                f"Index {index} out of range. Found {len(input_files)} input files."
-            )
-            raise RuntimeError(f"Index {index} out of range")
+        if total_files == 0:
+            logger.error("No input files found")
+            raise RuntimeError("No input files found")
-        input_file = input_files[index]
-        logger.info(f"  Input file: {input_file}")
+        # Calculate which files this worker should process
+        # Worker i processes files where file_index % array_size == index
+        my_files = [f for i, f in enumerate(input_files) if i % array_size == index]
-        # Determine output directory
-        output_base = input_file.stem
-        output_dir = job_dir / "output" / output_base
+        logger.info(f"  Files assigned to this worker: {len(my_files)}")
-        # Get MSA directories
+        if not my_files:
+            logger.info("No files assigned to this worker, exiting successfully")
+            mark_complete(index, job_dir, prefix="boltz")
+            return
+        # Get MSA directories (shared across all files)
         job_msa_dir = job_dir / "msas"
         global_msa_dir = Path(os.environ.get("MSA_DIR", "/primordial/.cache/msas"))
@@ -408,7 +431,7 @@ def main():
         if boltz_options:
             logger.info(f"  Boltz options: {boltz_options}")
-        # Create processor and run
+        # Create processor (reused for all files)
         processor = BoltzProcessor(
             num_workers=None,  # Auto-detect
             boltz_options=boltz_options,
@@ -416,16 +439,60 @@ def main():
             cache_dir=cache_dir,
         )
-        # Ensure output directory exists
-        output_dir.parent.mkdir(parents=True, exist_ok=True)
+        # Process each assigned file
+        completed = 0
+        failed = 0
+        for file_idx, input_file in enumerate(my_files):
+            file_stem = input_file.stem
+            # Check if this file is already complete (idempotency)
+            if _check_file_complete(job_dir, file_stem):
+                logger.info(
+                    f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
+                    "already complete, skipping"
+                )
+                completed += 1
+                continue
+            logger.info(
+                f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}..."
+            )
+            try:
+                # Determine output directory
+                output_dir = job_dir / "output" / file_stem
+                output_dir.parent.mkdir(parents=True, exist_ok=True)
+                result_dir = processor.run(str(input_file), str(output_dir))
+                # Mark this file as complete
+                _mark_file_complete(job_dir, file_stem)
+                logger.info(
+                    f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
+                    f"completed successfully -> {result_dir}"
+                )
+                completed += 1
+            except Exception as e:
+                logger.error(
+                    f"[{file_idx + 1}/{len(my_files)}] {file_stem}: "
+                    f"failed with error: {e}"
+                )
+                failed += 1
+                # Continue processing other files even if one fails
-        result_dir = processor.run(str(input_file), str(output_dir))
+        # Summary
+        logger.info(f"Worker {index} finished: {completed} completed, {failed} failed")
-        # Mark as complete
+        # Mark worker as complete
         mark_complete(index, job_dir, prefix="boltz")
-        logger.info(f"Complex {input_file.stem} completed successfully")
-        logger.info(f"Output: {result_dir}")
+        if failed > 0:
+            logger.warning(f"{failed} file(s) failed to process")
+            # Don't exit with error - some files succeeded and are marked complete
+            # The failed files can be retried later
     except Exception as e:
         logger.exception(f"Worker failed with error: {e}")

{dayhoff_tools-1.14.8 → dayhoff_tools-1.14.10}/dayhoff_tools/cli/batch/commands/boltz.py RENAMED Viewed

@@ -232,6 +232,8 @@ def _submit_batch_job(
             "JOB_ID": job_id,
             "BOLTZ_CACHE": "/primordial/.cache/boltz",
             "MSA_DIR": "/primordial/.cache/msas",
+            "BATCH_ARRAY_SIZE": str(array_size),
+            "BATCH_NUM_FILES": str(num_files),
         }
         batch_job_id = client.submit_job(

{dayhoff_tools-1.14.8 → dayhoff_tools-1.14.10}/dayhoff_tools/cli/batch/commands/clean.py RENAMED Viewed

@@ -80,9 +80,9 @@ def clean(user, older_than, dry_run, force, base_path):
             client = BatchClient()
             live_statuses = client.get_job_statuses_batch(batch_job_ids)
         except BatchError as e:
-            click.echo(f"Error: Could not fetch status from AWS Batch: {e}")
-            click.echo("Cannot safely clean jobs without knowing their status.")
-            return
+            click.echo(f"Error: Could not fetch status from AWS Batch: {e}", err=True)
+            click.echo("Cannot safely clean jobs without knowing their status.", err=True)
+            raise SystemExit(1)
     # Find jobs that are safe to clean (SUCCEEDED or FAILED)
     safe_to_clean = []

{dayhoff_tools-1.14.8 → dayhoff_tools-1.14.10}/dayhoff_tools/cli/batch/commands/finalize.py RENAMED Viewed

@@ -29,17 +29,30 @@ from ..manifest import (
     is_flag=True,
     help="Don't delete job directory after finalizing",
 )
+@click.option(
+    "--full-output",
+    is_flag=True,
+    help="For Boltz: copy entire output directory (default: only essential files)",
+)
 @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
-def finalize(job_id, output, force, keep_intermediates, base_path):
+def finalize(job_id, output, force, keep_intermediates, full_output, base_path):
     """Combine results and clean up job intermediates.
     For embedding jobs, combines H5 files into a single output file.
-    For structure prediction, moves outputs to the destination.
+    For Boltz jobs, extracts essential files (CIF structures and confidence JSON).
     \b
     Examples:
+      # Embedding job - combine H5 files
       dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5
-      dh batch finalize dma-embed-20260109-a3f2 --output /primordial/embeddings.h5 --force
+      # Boltz job - extract essential files only (default)
+      dh batch finalize dma-boltz-20260113-190a --output /primordial/structures/
+      # Boltz job - copy all output files
+      dh batch finalize dma-boltz-20260113-190a --output /primordial/structures/ --full-output
+      # Keep job directory after finalizing
       dh batch finalize dma-embed-20260109-a3f2 --output /primordial/out.h5 --keep-intermediates
     """
     # Load manifest
@@ -81,7 +94,7 @@ def finalize(job_id, output, force, keep_intermediates, base_path):
     if manifest.pipeline in ("embed-t5", "embed"):
         _finalize_embeddings(output_dir, output_path)
     elif manifest.pipeline == "boltz":
-        _finalize_boltz(output_dir, output_path)
+        _finalize_boltz(output_dir, output_path, full_output=full_output)
     else:
         _finalize_generic(output_dir, output_path)
@@ -196,29 +209,100 @@ def _finalize_embeddings(output_dir: Path, output_path: Path):
             shutil.copy2(h5_files[0], output_path)
-def _finalize_boltz(output_dir: Path, output_path: Path):
-    """Move Boltz output directories to destination."""
+def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
+    """Move Boltz output to destination.
+    Args:
+        output_dir: Source directory containing boltz_results_* folders
+        output_path: Destination directory for outputs
+        full_output: If True, copy entire output directories. If False (default),
+                    extract only essential files (CIF structures and confidence JSON).
+    """
     # Find all output directories (one per complex)
-    complex_dirs = [d for d in output_dir.iterdir() if d.is_dir()]
+    complex_dirs = [d for d in output_dir.iterdir() if d.is_dir() and d.name.startswith("boltz_results_")]
     if not complex_dirs:
         click.echo("No output directories found.", err=True)
         raise SystemExit(1)
-    click.echo(f"Found {len(complex_dirs)} structure predictions to move")
+    click.echo(f"Found {len(complex_dirs)} structure predictions")
+    if full_output:
+        click.echo("Mode: Copying full output (all files)")
+    else:
+        click.echo("Mode: Extracting essential files only (CIF + confidence JSON)")
+        click.echo("       Use --full-output to copy all files")
+    # Confirm before proceeding
+    click.echo()
+    if not click.confirm(f"Copy results to {output_path}?"):
+        click.echo("Cancelled.")
+        raise SystemExit(0)
     # Ensure output directory exists
     output_path.mkdir(parents=True, exist_ok=True)
+    copied_count = 0
+    skipped_count = 0
     for complex_dir in complex_dirs:
-        dest = output_path / complex_dir.name
+        complex_name = complex_dir.name.replace("boltz_results_", "")
+        dest = output_path / complex_name
         if dest.exists():
-            click.echo(f"  Skipping {complex_dir.name} (already exists)")
+            click.echo(f"  Skipping {complex_name} (already exists)")
+            skipped_count += 1
             continue
-        shutil.move(str(complex_dir), str(dest))
-        click.echo(f"  Moved {complex_dir.name}")
+        if full_output:
+            # Copy entire directory
+            shutil.copytree(complex_dir, dest)
+            click.echo(f"  Copied {complex_name} (full output)")
+        else:
+            # Extract only essential files
+            _extract_essential_boltz_files(complex_dir, dest, complex_name)
+            click.echo(f"  Extracted {complex_name} (essential files)")
+        copied_count += 1
-    click.echo(click.style("✓ Structures moved successfully", fg="green"))
+    click.echo()
+    if skipped_count > 0:
+        click.echo(f"Copied {copied_count} predictions, skipped {skipped_count} existing")
+    else:
+        click.echo(click.style(f"✓ Copied {copied_count} structure predictions successfully", fg="green"))
+def _extract_essential_boltz_files(source_dir: Path, dest_dir: Path, complex_name: str):
+    """Extract only essential files from Boltz output.
+    Essential files are:
+    - predictions/*/*.cif (structure files)
+    - predictions/*/confidence_*.json (confidence metrics)
+    Args:
+        source_dir: Source boltz_results_* directory
+        dest_dir: Destination directory to create
+        complex_name: Name of the complex (for better error messages)
+    """
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    predictions_dir = source_dir / "predictions"
+    if not predictions_dir.exists():
+        click.echo(f"    Warning: No predictions directory found for {complex_name}", err=True)
+        return
+    # Find all subdirectories in predictions/ (usually just one named after the complex)
+    for pred_subdir in predictions_dir.iterdir():
+        if not pred_subdir.is_dir():
+            continue
+        # Copy CIF files (structures)
+        for cif_file in pred_subdir.glob("*.cif"):
+            shutil.copy2(cif_file, dest_dir / cif_file.name)
+        # Copy confidence JSON files
+        for json_file in pred_subdir.glob("confidence_*.json"):
+            shutil.copy2(json_file, dest_dir / json_file.name)
 def _finalize_generic(output_dir: Path, output_path: Path):

{dayhoff_tools-1.14.8 → dayhoff_tools-1.14.10}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 [project]
 name = "dayhoff-tools"
-version = "1.14.8"
+version = "1.14.10"
 description = "Common tools for all the repos at Dayhoff Labs"
 authors = [
     {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}