dayhoff-tools 1.15.0__tar.gz → 1.15.1__tar.gz

This diff compares the contents of two package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (76)
  1. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/PKG-INFO +2 -2
  2. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/boltz.py +5 -5
  3. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/cancel.py +6 -2
  4. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/clean.py +6 -2
  5. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/embed_t5.py +0 -1
  6. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/finalize.py +39 -20
  7. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/status.py +9 -3
  8. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/submit.py +0 -1
  9. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/utility_commands.py +0 -2
  10. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/embedders.py +2 -2
  11. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/fasta.py +2 -4
  12. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/kegg.py +1 -3
  13. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/structure.py +4 -4
  14. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/pyproject.toml +2 -2
  15. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/README.md +0 -0
  16. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/__init__.py +0 -0
  17. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/__init__.py +0 -0
  18. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/__init__.py +0 -0
  19. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/base.py +0 -0
  20. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/embed_t5.py +0 -0
  21. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/chemistry/standardizer.py +0 -0
  22. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/chemistry/utils.py +0 -0
  23. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/__init__.py +0 -0
  24. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/__init__.py +0 -0
  25. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/aws_batch.py +0 -0
  26. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/__init__.py +0 -0
  27. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/boltz.py +0 -0
  28. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/list_jobs.py +0 -0
  29. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/local.py +0 -0
  30. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/logs.py +0 -0
  31. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/retry.py +0 -0
  32. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/job_id.py +0 -0
  33. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/manifest.py +0 -0
  34. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/cloud_commands.py +0 -0
  35. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/__init__.py +0 -0
  36. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
  37. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
  38. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
  39. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
  40. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/shared.py +0 -0
  41. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
  42. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/__init__.py +0 -0
  43. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/api_client.py +0 -0
  44. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/auth.py +0 -0
  45. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +0 -0
  46. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/engine_commands.py +0 -0
  47. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/progress.py +0 -0
  48. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +0 -0
  49. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/demo.sh +0 -0
  50. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +0 -0
  51. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +0 -0
  52. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +0 -0
  53. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +0 -0
  54. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +0 -0
  55. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +0 -0
  56. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/ssh_config.py +0 -0
  57. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -0
  58. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/github_commands.py +0 -0
  59. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/main.py +0 -0
  60. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/swarm_commands.py +0 -0
  61. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/base.py +0 -0
  62. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  63. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  64. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  65. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/job_runner.py +0 -0
  66. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/processors.py +0 -0
  67. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/swarm.py +0 -0
  68. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/file_ops.py +0 -0
  69. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/h5.py +0 -0
  70. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/gcp.py +0 -0
  71. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/gtdb.py +0 -0
  72. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/mmseqs.py +0 -0
  73. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/uniprot.py +0 -0
  74. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/logs.py +0 -0
  75. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/sqlite.py +0 -0
  76. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/warehouse.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dayhoff-tools
- Version: 1.15.0
+ Version: 1.15.1
  Summary: Common tools for all the repos at Dayhoff Labs
  Author: Daniel Martin-Alarcon
  Author-email: dma@dayhofflabs.com
@@ -40,8 +40,8 @@ Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
  Requires-Dist: toml (>=0.10)
  Requires-Dist: tqdm (>=4.67.1) ; extra == "embedders"
  Requires-Dist: tqdm (>=4.67.1) ; extra == "full"
- Requires-Dist: transformers (==4.36.2) ; extra == "full"
  Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
+ Requires-Dist: transformers (>=4.36.2) ; extra == "full"
  Requires-Dist: typer (>=0.9.0)
  Requires-Dist: tzdata (>=2025.2)
  Description-Content-Type: text/markdown
dayhoff_tools/batch/workers/boltz.py
@@ -235,7 +235,7 @@ class BoltzProcessor:
          # Determine output directory
          # Boltz always creates boltz_results_{input_name} inside --out_dir
          input_base = os.path.splitext(os.path.basename(input_file))[0]
-
+
          if output_dir is None:
              # No output_dir specified, boltz creates in current directory
              expected_output_dir = f"boltz_results_{input_base}"
@@ -244,7 +244,9 @@ class BoltzProcessor:
              # output_dir specified - use its parent for --out_dir
              # and expect boltz_results_{input_base} inside it
              parent_dir = os.path.dirname(output_dir)
-             expected_output_dir = os.path.join(parent_dir, f"boltz_results_{input_base}")
+             expected_output_dir = os.path.join(
+                 parent_dir, f"boltz_results_{input_base}"
+             )
              out_dir_arg = parent_dir if parent_dir else None

          logger.info(f"Running Boltz prediction for {input_file}")
@@ -455,9 +457,7 @@ def main():
              completed += 1
              continue

-         logger.info(
-             f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}..."
-         )
+         logger.info(f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}...")

          try:
              # Determine output directory
dayhoff_tools/cli/batch/commands/cancel.py
@@ -152,9 +152,13 @@ def _cancel_retry_job(manifest, retry_id: str, force: bool, base_path: str):
          )

          click.echo()
-         click.echo(click.style(f"✓ Retry job {retry_id} cancelled successfully", fg="green"))
+         click.echo(
+             click.style(f"✓ Retry job {retry_id} cancelled successfully", fg="green")
+         )
          click.echo(f"Parent job: {manifest.job_id}")

      except BatchError as e:
-         click.echo(click.style(f"✗ Failed to cancel retry job: {e}", fg="red"), err=True)
+         click.echo(
+             click.style(f"✗ Failed to cancel retry job: {e}", fg="red"), err=True
+         )
          raise SystemExit(1)
dayhoff_tools/cli/batch/commands/clean.py
@@ -20,7 +20,9 @@ from .status import format_time_ago, _aws_status_to_job_status
      default=7,
      help="Only clean jobs older than N days [default: 7]",
  )
- @click.option("--dry-run", is_flag=True, help="Show what would be cleaned without deleting")
+ @click.option(
+     "--dry-run", is_flag=True, help="Show what would be cleaned without deleting"
+ )
  @click.option("--force", is_flag=True, help="Delete without confirmation")
  @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
  def clean(user, older_than, dry_run, force, base_path):
@@ -81,7 +83,9 @@ def clean(user, older_than, dry_run, force, base_path):
          live_statuses = client.get_job_statuses_batch(batch_job_ids)
      except BatchError as e:
          click.echo(f"Error: Could not fetch status from AWS Batch: {e}", err=True)
-         click.echo("Cannot safely clean jobs without knowing their status.", err=True)
+         click.echo(
+             "Cannot safely clean jobs without knowing their status.", err=True
+         )
          raise SystemExit(1)

      # Find jobs that are safe to clean (SUCCEEDED or FAILED)
dayhoff_tools/cli/batch/commands/embed_t5.py
@@ -21,7 +21,6 @@ from ..manifest import (
      save_manifest,
  )

-
  # Default settings for T5 embedding
  DEFAULT_QUEUE = "t4-1x-spot"
  DEFAULT_WORKERS = 50
dayhoff_tools/cli/batch/commands/finalize.py
@@ -40,7 +40,9 @@ from ..manifest import (
      help="Skip deduplication step (use if input has no duplicates)",
  )
  @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
- def finalize(job_id, output, force, keep_intermediates, full_output, skip_dedup, base_path):
+ def finalize(
+     job_id, output, force, keep_intermediates, full_output, skip_dedup, base_path
+ ):
      """Combine results and clean up job intermediates.

      For embedding jobs, combines H5 files into a single output file.
@@ -238,14 +240,18 @@ def _finalize_embeddings(output_dir: Path, output_path: Path, skip_dedup: bool = False):
          if skip_dedup:
              # Skip dedup - optimize directly from combined
              click.echo("Optimizing chunks...")
-             optimize_protein_embedding_chunks(str(combined_path), str(output_path))
+             optimize_protein_embedding_chunks(
+                 str(combined_path), str(output_path)
+             )
          else:
              # Full pipeline: combine -> dedup -> optimize
              deduped_path = Path(tmpdir) / "deduped.h5"
              click.echo("Deduplicating...")
              deduplicate_h5_file(str(combined_path), str(deduped_path))
              click.echo("Optimizing chunks...")
-             optimize_protein_embedding_chunks(str(deduped_path), str(output_path))
+             optimize_protein_embedding_chunks(
+                 str(deduped_path), str(output_path)
+             )

      click.echo(click.style("✓ H5 files combined successfully", fg="green"))

@@ -269,7 +275,7 @@ def _finalize_embeddings(output_dir: Path, output_path: Path, skip_dedup: bool = False):

  def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
      """Move Boltz output to destination.
-
+
      Args:
          output_dir: Source directory containing boltz_results_* folders
          output_path: Destination directory for outputs
@@ -277,20 +283,24 @@ def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
      extract only essential files (CIF structures and confidence JSON).
      """
      # Find all output directories (one per complex)
-     complex_dirs = [d for d in output_dir.iterdir() if d.is_dir() and d.name.startswith("boltz_results_")]
+     complex_dirs = [
+         d
+         for d in output_dir.iterdir()
+         if d.is_dir() and d.name.startswith("boltz_results_")
+     ]

      if not complex_dirs:
          click.echo("No output directories found.", err=True)
          raise SystemExit(1)

      click.echo(f"Found {len(complex_dirs)} structure predictions")
-
+
      if full_output:
          click.echo("Mode: Copying full output (all files)")
      else:
          click.echo("Mode: Extracting essential files only (CIF + confidence JSON)")
          click.echo(" Use --full-output to copy all files")
-
+
      # Confirm before proceeding
      click.echo()
      if not click.confirm(f"Copy results to {output_path}?"):
@@ -302,16 +312,16 @@ def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):

      copied_count = 0
      skipped_count = 0
-
+
      for complex_dir in complex_dirs:
          complex_name = complex_dir.name.replace("boltz_results_", "")
          dest = output_path / complex_name
-
+
          if dest.exists():
              click.echo(f" Skipping {complex_name} (already exists)")
              skipped_count += 1
              continue
-
+
          if full_output:
              # Copy entire directory
              shutil.copytree(complex_dir, dest)
@@ -320,44 +330,53 @@ def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
              # Extract only essential files
              _extract_essential_boltz_files(complex_dir, dest, complex_name)
              click.echo(f" Extracted {complex_name} (essential files)")
-
+
          copied_count += 1

      click.echo()
      if skipped_count > 0:
-         click.echo(f"Copied {copied_count} predictions, skipped {skipped_count} existing")
+         click.echo(
+             f"Copied {copied_count} predictions, skipped {skipped_count} existing"
+         )
      else:
-         click.echo(click.style(f"✓ Copied {copied_count} structure predictions successfully", fg="green"))
+         click.echo(
+             click.style(
+                 f"✓ Copied {copied_count} structure predictions successfully",
+                 fg="green",
+             )
+         )


  def _extract_essential_boltz_files(source_dir: Path, dest_dir: Path, complex_name: str):
      """Extract only essential files from Boltz output.
-
+
      Essential files are:
      - predictions/*/*.cif (structure files)
      - predictions/*/confidence_*.json (confidence metrics)
-
+
      Args:
          source_dir: Source boltz_results_* directory
          dest_dir: Destination directory to create
          complex_name: Name of the complex (for better error messages)
      """
      dest_dir.mkdir(parents=True, exist_ok=True)
-
+
      predictions_dir = source_dir / "predictions"
      if not predictions_dir.exists():
-         click.echo(f" Warning: No predictions directory found for {complex_name}", err=True)
+         click.echo(
+             f" Warning: No predictions directory found for {complex_name}", err=True
+         )
          return
-
+
      # Find all subdirectories in predictions/ (usually just one named after the complex)
      for pred_subdir in predictions_dir.iterdir():
          if not pred_subdir.is_dir():
              continue
-
+
          # Copy CIF files (structures)
          for cif_file in pred_subdir.glob("*.cif"):
              shutil.copy2(cif_file, dest_dir / cif_file.name)
-
+
          # Copy confidence JSON files
          for json_file in pred_subdir.glob("confidence_*.json"):
              shutil.copy2(json_file, dest_dir / json_file.name)
dayhoff_tools/cli/batch/commands/status.py
@@ -264,7 +264,9 @@ def _show_job_details(job_id: str, base_path: str):
          reslice_info = ""
          if retry.reslice_prefix:
              reslice_info = f" (resliced to {retry.reslice_count} chunks)"
-         click.echo(f" - {retry.retry_id}: {len(retry.indices)} indices{reslice_info}")
+         click.echo(
+             f" - {retry.retry_id}: {len(retry.indices)} indices{reslice_info}"
+         )
          click.echo(f" Indices: {retry.indices}")
          if retry.batch_job_id:
              # Show brief status for retry job
@@ -273,7 +275,9 @@ def _show_job_details(job_id: str, base_path: str):
              array_status = client.get_array_job_status(retry.batch_job_id)
              if array_status.is_complete:
                  pct = array_status.success_rate * 100
-                 color = "green" if pct == 100 else "yellow" if pct > 90 else "red"
+                 color = (
+                     "green" if pct == 100 else "yellow" if pct > 90 else "red"
+                 )
                  click.echo(
                      f" Status: Complete - {click.style(f'{pct:.0f}%', fg=color)} "
                      f"({array_status.succeeded}/{array_status.total} succeeded)"
@@ -364,7 +368,9 @@ def _show_retry_details(manifest, retry_id: str):
      click.echo("Retry Config:")
      click.echo(f" Indices: {retry_info.indices}")
      if retry_info.reslice_prefix:
-         click.echo(f" Reslice: {retry_info.reslice_prefix} ({retry_info.reslice_count} chunks)")
+         click.echo(
+             f" Reslice: {retry_info.reslice_prefix} ({retry_info.reslice_count} chunks)"
+         )
      else:
          click.echo(f" Reslice: No (retrying original chunks)")

dayhoff_tools/cli/batch/commands/submit.py
@@ -18,7 +18,6 @@ from ..manifest import (
      save_manifest,
  )

-
  # Default job definition for generic jobs
  DEFAULT_JOB_DEFINITION = "dayhoff-batch-base"
  DEFAULT_QUEUE = "t4-1x-spot"
dayhoff_tools/cli/utility_commands.py
@@ -258,5 +258,3 @@ def build_and_upload_wheel(bump_part: str = "patch"):
              print(f"Warning: Could not find version {new_version} to revert.")
      except Exception as revert_e:
          print(f"Warning: Failed to revert version change: {revert_e}")
-
-
dayhoff_tools/embedders.py
@@ -179,8 +179,8 @@ class H5Reformatter(Processor):
      def embedding_file_to_df(self, file_name: str) -> pd.DataFrame:
          with h5py.File(file_name, "r") as f:
              gene_names = list(f.keys())
-             Xg = [f[key][()] for key in gene_names]  # type:ignore
-             return pd.DataFrame(np.asmatrix(Xg), index=gene_names)  # type:ignore
+             Xg = [f[key][()] for key in gene_names]  # type: ignore
+             return pd.DataFrame(np.asmatrix(Xg), index=gene_names)  # type: ignore

      def write_df_to_h5(self, df: pd.DataFrame, filename: str, description: str) -> None:
          """
dayhoff_tools/fasta.py
@@ -857,14 +857,12 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> None:
      # Create the SQLite database and table
      print("Creating SQLite database...")
      with sqlite3.connect(db_file) as conn:
-         conn.execute(
-             """
+         conn.execute("""
              CREATE TABLE IF NOT EXISTS proteins (
                  protein_id TEXT PRIMARY KEY,
                  sequence TEXT NOT NULL
              )
-             """
-         )
+         """)
      print("Database created successfully.")

      # Estimate number of records for progress bar
dayhoff_tools/intake/kegg.py
@@ -25,9 +25,7 @@ def get_ko2gene_df(db: str, ko: str | list[str] | None = None) -> pd.DataFrame:
              query = (
                  f"SELECT gene,ko FROM gene_to_ko WHERE ko IN ({','.join('?' * len(ko))})"
              )
-             result_df = pd.read_sql_query(
-                 query, conn, params=ko  # type:ignore
-             )
+             result_df = pd.read_sql_query(query, conn, params=ko)  # type: ignore
          else:
              query = f"SELECT gene,ko FROM gene_to_ko"
              result_df = pd.read_sql_query(query, conn)
dayhoff_tools/intake/structure.py
@@ -409,10 +409,10 @@ class PDBFolderProcessor:
      def _get_pdb_files(self) -> list[str]:
          """
          Get a list of PDB files in the specified directory, optionally filtered by ID set.
-         Files are sorted by creation time to ensure consistent processing order.
+         Files are sorted alphabetically to ensure consistent, reproducible processing order.

          Returns:
-             List of PDB file names sorted by creation time.
+             List of PDB file names sorted alphabetically.
          """
          print("Scanning directory for PDB files...")
          pdb_files = [
@@ -424,8 +424,8 @@ class PDBFolderProcessor:
              f for f in pdb_files if self._extract_id_from_filename(f) in self.id_set
          ]

-         # Sort files by creation time
-         pdb_files.sort(key=lambda f: os.path.getctime(os.path.join(self.pdb_dir, f)))
+         # Sort files alphabetically for deterministic, reproducible order
+         pdb_files.sort()

          print(f"Found {len(pdb_files)} PDB files")
          return pdb_files
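
Note on the structure.py change above: swapping os.path.getctime for a plain alphabetical sort is behavioral, not just cosmetic. Creation-time order depends on filesystem state, so the same folder copied to another machine could be processed in a different order. A minimal standalone sketch of the difference (hypothetical file names, not part of the package):

    import os
    import tempfile
    import time

    # Create three files in non-alphabetical order so ctime order and name order differ.
    with tempfile.TemporaryDirectory() as d:
        for name in ["b.pdb", "a.pdb", "c.pdb"]:
            open(os.path.join(d, name), "w").close()
            time.sleep(0.01)  # give each file a distinct creation time

        files = os.listdir(d)
        by_ctime = sorted(files, key=lambda f: os.path.getctime(os.path.join(d, f)))
        by_name = sorted(files)  # what 1.15.1 does, via pdb_files.sort()

        print(by_ctime)  # likely ['b.pdb', 'a.pdb', 'c.pdb'], but machine-dependent
        print(by_name)   # always ['a.pdb', 'b.pdb', 'c.pdb']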
pyproject.toml
@@ -11,7 +11,7 @@ build-backend = "poetry.core.masonry.api"

  [project]
  name = "dayhoff-tools"
- version = "1.15.0"
+ version = "1.15.1"
  description = "Common tools for all the repos at Dayhoff Labs"
  authors = [
      {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}
@@ -43,7 +43,7 @@ full = [
      "sentencepiece>=0.2.0",
      "sqlalchemy>=2.0.40,<3.0.0",
      "tqdm>=4.67.1",
-     "transformers==4.36.2",
+     "transformers>=4.36.2",  # Relaxed: exact pin broke conda envs with huggingface-hub>=1.0
  ]

  # Embedding models (requires torch - user must install separately for their platform)
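
For context on the relaxed transformers constraint: per the comment in the diff, the exact pin became unsatisfiable in conda environments once huggingface-hub>=1.0 was present (transformers 4.36.2 caps huggingface-hub below 1.0, while newer releases lift that cap). A minimal sketch of what the specifier change means for a dependency resolver, using the third-party packaging library (assumed installed here for illustration; it is not a dayhoff-tools dependency):

    from packaging.specifiers import SpecifierSet

    pinned = SpecifierSet("==4.36.2")   # the 1.15.0 constraint
    relaxed = SpecifierSet(">=4.36.2")  # the 1.15.1 constraint

    for candidate in ["4.36.2", "4.45.0"]:
        print(candidate, candidate in pinned, candidate in relaxed)

    # 4.36.2 True True
    # 4.45.0 False True   (only the relaxed spec lets the resolver pick a newer release)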