dayhoff-tools 1.1.39__py3-none-any.whl → 1.1.41__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
@@ -400,425 +400,3 @@ class BoltzPredictor(Processor):
             f"Boltz prediction completed successfully. Output in {expected_output_dir}"
         )
         return expected_output_dir
-
-
-class MMSeqsProfileProcessor(Processor):
-    """Processor for running MMseqs2 profile searches.
-
-    This class wraps the MMseqs2 workflow to perform a profile-based search
-    against a target database using a query FASTA.
-    """
-
-    def __init__(
-        self,
-        query_fasta_path_in_image: str,
-        num_threads: int = 8,
-        mmseqs_args: dict | None = None,
-    ):
-        """Initialize the MMSeqsProfileProcessor.
-
-        Args:
-            query_fasta_path_in_image: Path to the query FASTA file. This path is expected
-                to be accessible within the execution environment (e.g.,
-                packaged in a Docker image).
-            num_threads: Number of threads to use for MMseqs2 commands.
-            mmseqs_args: A dictionary of additional MMseqs2 parameters.
-                Expected keys: "memory_limit_gb", "evalue", "sensitivity",
-                "max_seqs_search", "min_seq_id_cluster", "max_seqs_profile_msa".
-                Defaults are used if not provided.
-        """
-        if not Path(query_fasta_path_in_image).is_file():
-            raise FileNotFoundError(
-                f"Query FASTA file not found at: {query_fasta_path_in_image}"
-            )
-        self.query_fasta_path = query_fasta_path_in_image
-        self.num_threads = str(num_threads)  # MMseqs2 expects string for threads
-
-        default_mmseqs_args = {
-            "memory_limit_gb": "25",
-            "evalue": "10",
-            "sensitivity": "7.5",
-            "max_seqs_search": "300",
-            "min_seq_id_cluster": "0.8",
-            "max_seqs_profile_msa": "1000",
-        }
-        if mmseqs_args:
-            self.mmseqs_args = {**default_mmseqs_args, **mmseqs_args}
-        else:
-            self.mmseqs_args = default_mmseqs_args
-
-        # Log dayhoff-tools version
-        from dayhoff_tools import __version__
-
-        logger.info(f"dayhoff-tools version: {__version__}")
-        logger.info(
-            f"MMSeqsProfileProcessor initialized with query: {self.query_fasta_path}"
-        )
-        logger.info(f"MMSeqs args: {self.mmseqs_args}")
-        logger.info(f"Num threads: {self.num_threads}")
-
-    def _run_mmseqs_command(
-        self, command_parts: list[str], step_description: str, work_dir: Path
-    ):
-        """Runs an MMseqs2 command and logs its execution.
-
-        Args:
-            command_parts: A list of strings representing the command and its arguments.
-            step_description: A human-readable description of the MMseqs2 step.
-            work_dir: The working directory for the command.
-
-        Raises:
-            subprocess.CalledProcessError: If the MMseqs2 command returns a non-zero exit code.
-        """
-        full_command = " ".join(command_parts)
-        logger.info(f"Running MMseqs2 step in {work_dir}: {step_description}")
-        logger.info(f"Command: {full_command}")
-        try:
-            process = subprocess.run(
-                command_parts,
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True,
-                cwd=work_dir,  # Run command in the specified working directory
-            )
-            if process.stdout:
-                logger.info(f"MMseqs2 stdout: {process.stdout.strip()}")
-            if process.stderr:  # MMseqs often outputs informational messages to stderr
-                logger.info(f"MMseqs2 stderr: {process.stderr.strip()}")
-            logger.info(f"MMseqs2 step '{step_description}' completed successfully.")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"MMseqs2 step '{step_description}' failed in {work_dir}.")
-            if e.stdout:
-                logger.error(f"MMseqs2 stdout: {e.stdout.strip()}")
-            if e.stderr:
-                logger.error(f"MMseqs2 stderr: {e.stderr.strip()}")
-            raise
-
-    def run(self, input_file: str) -> str:
-        """Run MMseqs2 profile search.
-
-        The input_file is the target FASTA. The query FASTA is provided
-        during initialization.
-        The method creates an output directory (e.g., {target_stem})
-        which contains the result files, now named meaningfully using the target stem
-        (e.g., {target_stem}_results.m8 and {target_stem}_hits.fasta).
-
-        Args:
-            input_file: Path to the input target FASTA file.
-
-        Returns:
-            Path to the output directory (e.g., {target_stem}) containing
-            the meaningfully named result files.
-
-        Raises:
-            subprocess.CalledProcessError: If any MMseqs2 command fails.
-            FileNotFoundError: If the input_file is not found.
-        """
-        if not Path(input_file).is_file():
-            raise FileNotFoundError(f"Input target FASTA file not found: {input_file}")
-
-        input_file_path = Path(input_file).resolve()  # Ensure absolute path
-        target_fasta_filename = input_file_path.name
-        target_fasta_stem = input_file_path.stem  # Get stem for naming
-
-        # Create a unique base directory for this run's outputs and temp files
-        # This directory will be returned and subsequently uploaded by the Operator
-        run_base_dir_name = f"{target_fasta_stem}"  # Use stem as the dir name
-        run_base_dir = Path(run_base_dir_name).resolve()
-        run_base_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Created run base directory: {run_base_dir}")
-
-        # Define local paths within the run_base_dir
-        local_target_file = run_base_dir / target_fasta_filename
-        # Copy the target file into the run directory to keep inputs and outputs together
-        shutil.copy(input_file_path, local_target_file)
-        logger.info(f"Copied target file {input_file_path} to {local_target_file}")
-
-        # Query file is already specified by self.query_fasta_path (path in image)
-        local_query_file = Path(self.query_fasta_path).resolve()
-
-        # Temporary directory for MMseqs2 intermediate files, created inside run_base_dir
-        mmseqs_temp_dir = run_base_dir / "mmseqs_tmp"
-        mmseqs_temp_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Created MMseqs2 temporary directory: {mmseqs_temp_dir}")
-
-        # Define INTERMEDIATE output file paths within mmseqs_temp_dir
-        intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
-        intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
-
-        # Define FINAL output file paths within run_base_dir, using target stem
-        final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
-        final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
-
-        # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
-        query_db = mmseqs_temp_dir / "queryDB"
-        target_db = mmseqs_temp_dir / "targetDB"
-        # Ensure local_target_file is used for creating targetDB
-        target_db_input_file = local_target_file
-
-        query_db_cluster = mmseqs_temp_dir / "queryDB_cluster"
-        query_db_rep = mmseqs_temp_dir / "queryDB_rep"
-        aln_db = mmseqs_temp_dir / "alnDB"
-        profile_db = mmseqs_temp_dir / "profileDB"
-        result_db = mmseqs_temp_dir / "resultDB"
-
-        try:
-            # 1. Create query database
-            self._run_mmseqs_command(
-                ["mmseqs", "createdb", str(local_query_file), str(query_db)],
-                "Create query DB",
-                run_base_dir,  # Working directory for the command
-            )
-
-            # 2. Create target database
-            self._run_mmseqs_command(
-                ["mmseqs", "createdb", str(target_db_input_file), str(target_db)],
-                "Create target DB",
-                run_base_dir,
-            )
-
-            # 3. Cluster query sequences
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "cluster",
-                    str(query_db),
-                    str(query_db_cluster),
-                    str(
-                        mmseqs_temp_dir / "tmp_cluster"
-                    ),  # MMseqs needs a temp dir for cluster
-                    "--min-seq-id",
-                    self.mmseqs_args["min_seq_id_cluster"],
-                    "--threads",
-                    self.num_threads,
-                ],
-                "Cluster query sequences",
-                run_base_dir,
-            )
-
-            # 4. Create representative set from query clusters
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "createsubdb",
-                    str(query_db_cluster),
-                    str(query_db),
-                    str(query_db_rep),
-                ],
-                "Create representative query set",
-                run_base_dir,
-            )
-
-            # 5. Create MSA for profile generation
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "search",
-                    str(query_db_rep),
-                    str(query_db),  # Search representative against full query DB
-                    str(aln_db),
-                    str(mmseqs_temp_dir / "tmp_search_msa"),  # Temp for this search
-                    "--max-seqs",
-                    self.mmseqs_args["max_seqs_profile_msa"],
-                    "--threads",
-                    self.num_threads,
-                ],
-                "Create MSA for profile",
-                run_base_dir,
-            )
-
-            # 6. Create profile database
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "result2profile",
-                    str(query_db_rep),  # Use query_db_rep as input for profile
-                    str(query_db),  # Full query DB as second arg
-                    str(aln_db),
-                    str(profile_db),
-                    "--threads",  # Added threads option
-                    self.num_threads,
-                ],
-                "Create profile DB",
-                run_base_dir,
-            )
-
-            # 7. Perform profile search
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "search",
-                    str(profile_db),
-                    str(target_db),
-                    str(result_db),
-                    str(mmseqs_temp_dir / "tmp_search_profile"),  # Temp for this search
-                    "--split-memory-limit",
-                    f"{self.mmseqs_args['memory_limit_gb']}G",
-                    "-e",
-                    self.mmseqs_args["evalue"],
-                    "--max-seqs",
-                    self.mmseqs_args["max_seqs_search"],
-                    "--threads",
-                    self.num_threads,
-                    "-s",
-                    self.mmseqs_args["sensitivity"],
-                ],
-                "Perform profile search",
-                run_base_dir,
-            )
-
-            # 8. Convert results to tabular format (M8) -> to intermediate file
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "convertalis",
-                    str(profile_db),  # Query DB used for search (profileDB)
-                    str(target_db),
-                    str(result_db),
-                    str(intermediate_results_m8_file),  # Output M8 file to temp dir
-                    "--threads",
-                    self.num_threads,
-                ],
-                "Convert results to M8",
-                run_base_dir,
-            )
-
-            # 8.5 Convert M8 to CSV with headers
-            logger.info(
-                f"Converting M8 results to CSV: {intermediate_results_m8_file} -> {intermediate_results_as_csv_file}"
-            )
-            csv_headers = [
-                "query_id",
-                "target_id",
-                "percent_identity",
-                "alignment_length",
-                "mismatches",
-                "gap_openings",
-                "query_start",
-                "query_end",
-                "target_start",
-                "target_end",
-                "e_value",
-                "bit_score",
-            ]
-            try:
-                if not intermediate_results_m8_file.exists():
-                    logger.warning(
-                        f"M8 results file {intermediate_results_m8_file} not found. CSV will be empty."
-                    )
-                    # Create an empty CSV with headers if M8 is missing
-                    with open(
-                        intermediate_results_as_csv_file, "w", newline=""
-                    ) as csvfile:
-                        writer = csv.writer(csvfile)
-                        writer.writerow(csv_headers)
-                else:
-                    with (
-                        open(intermediate_results_m8_file, "r") as m8file,
-                        open(
-                            intermediate_results_as_csv_file, "w", newline=""
-                        ) as csvfile,
-                    ):
-                        writer = csv.writer(csvfile)
-                        writer.writerow(csv_headers)
-                        for line in m8file:
-                            writer.writerow(line.strip().split("\t"))
-            except Exception as e:
-                logger.error(f"Error converting M8 to CSV: {e}", exc_info=True)
-                # Ensure an empty csv is created on error to prevent downstream issues
-                if not intermediate_results_as_csv_file.exists():
-                    with open(
-                        intermediate_results_as_csv_file, "w", newline=""
-                    ) as csvfile:
-                        writer = csv.writer(csvfile)
-                        writer.writerow(csv_headers)  # write headers even on error
-
-            # 9. Extract hit sequence IDs from M8 results for the TXT file
-            hit_sequence_ids = set()
-            logger.info(
-                f"Extracting hit IDs from {intermediate_results_m8_file} for TXT output."
-            )
-            try:
-                if intermediate_results_m8_file.exists():
-                    with open(intermediate_results_m8_file, "r") as m8_file:
-                        for line in m8_file:
-                            if line.strip():  # Check if line is not empty
-                                columns = line.strip().split("\t")
-                                if len(columns) >= 2:
-                                    hit_sequence_ids.add(
-                                        columns[1]
-                                    )  # Add target_accession
-                    logger.info(
-                        f"Found {len(hit_sequence_ids)} unique hit IDs in M8 file."
-                    )
-                else:
-                    logger.warning(
-                        f"Intermediate M8 file {intermediate_results_m8_file} not found. Hit TXT file will be empty."
-                    )
-            except Exception as e:
-                logger.error(
-                    f"Error reading M8 file {intermediate_results_m8_file} for hit ID extraction: {e}",
-                    exc_info=True,
-                )
-                # Proceed even if M8 reading fails, TXT will be empty
-
-            # 10. Write the set of hit sequence IDs to the final .txt file
-            logger.info(
-                f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
-            )
-            try:
-                with open(final_hits_txt_file, "w") as txt_out:
-                    # Sort IDs for consistent output
-                    for seq_id in sorted(list(hit_sequence_ids)):
-                        txt_out.write(f"{seq_id}\n")
-                logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
-            except Exception as e:
-                logger.error(
-                    f"Failed to write hit IDs to {final_hits_txt_file}: {e}",
-                    exc_info=True,
-                )
-                # Ensure the file exists even if writing fails
-                if not final_hits_txt_file.exists():
-                    final_hits_txt_file.touch()
-
-            logger.info(
-                f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
-            )
-
-            # Move and rename final output files from mmseqs_temp_dir to run_base_dir
-            if intermediate_results_as_csv_file.exists():
-                shutil.move(
-                    str(intermediate_results_as_csv_file), str(final_results_csv_file)
-                )
-                logger.info(
-                    f"Moved and renamed M8 results to CSV: {final_results_csv_file}"
-                )
-            else:
-                logger.warning(
-                    f"Intermediate CSV file {intermediate_results_as_csv_file} not found. Creating empty target CSV file."
-                )
-                final_results_csv_file.touch()  # Create empty file in run_base_dir if not found
-
-            logger.info(
-                f"MMSeqsProfileProcessor run completed for {input_file}. Output CSV: {final_results_csv_file}"
-            )
-
-        except Exception as e:
-            logger.error(
-                f"MMSeqsProfileProcessor failed for {input_file}: {e}", exc_info=True
-            )
-            raise
-        finally:
-            # --- Cleanup --- #
-            logger.info(f"Cleaning up temporary directory: {mmseqs_temp_dir}")
-            if mmseqs_temp_dir.exists():
-                shutil.rmtree(mmseqs_temp_dir)
-            if local_target_file.exists() and local_target_file != Path(input_file):
-                logger.info(
-                    f"Cleaning up local copy of target file: {local_target_file}"
-                )
-                local_target_file.unlink()
-            logger.info("MMSeqsProfileProcessor cleanup finished.")
-
-        return str(run_base_dir)  # Return the path to the directory containing outputs
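Note: this release deletes MMSeqsProfileProcessor outright; the drop in processors.py from 34599 to 16572 bytes in the RECORD section below matches this hunk. The removed run() docstring also lagged the code: it advertises {target_stem}_results.m8 and {target_stem}_hits.fasta, while the code actually wrote {target_stem}.csv and {target_stem}.txt. For anyone pinned to 1.1.39, here is a minimal usage sketch against the API shown above; the FASTA paths and the evalue override are illustrative assumptions, not values from the package:

    # Minimal usage sketch of the removed class, per the 1.1.39 API above.
    # The file paths and the evalue override are illustrative assumptions.
    from dayhoff_tools.deployment.processors import MMSeqsProfileProcessor

    processor = MMSeqsProfileProcessor(
        query_fasta_path_in_image="/app/data/query.fasta",  # must exist on disk
        num_threads=8,
        mmseqs_args={"evalue": "1e-5"},  # merged over the defaults shown above
    )
    # run() takes the target FASTA and returns the run directory "{target_stem}/",
    # which holds {target_stem}.csv (tabular hits) and {target_stem}.txt (hit IDs).
    output_dir = processor.run("targets.fasta")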
@@ -442,35 +442,38 @@ class Embedder(Processor):
         for seq_id, seq in small_seqs_sorted:
             seq_len = len(seq)
 
-            if current_size + seq_len > self.batch_residue_limit:
-                if current_batch:
-                    small_batch_count += 1
-                    logger.info(
-                        f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
-                    )
-                    batch_results = self.embed_batch(current_batch)
-                    results.update(batch_results)
-                    self.cleanup_memory()
+            # Check if adding this sequence would exceed the limit
+            if current_batch and current_size + seq_len > self.batch_residue_limit:
+                # Process current batch before adding the new sequence
+                small_batch_count += 1
+                logger.info(
+                    f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
+                )
+                batch_results = self.embed_batch(current_batch)
+                results.update(batch_results)
+                self.cleanup_memory()
 
-                    # Update progress
-                    processed_sequences += len(current_batch)
-                    elapsed_time = time.time() - start_time
-                    remaining_sequences = total_sequences - processed_sequences
-                    avg_time_per_seq = (
-                        elapsed_time / processed_sequences
-                        if processed_sequences > 0
-                        else 0
-                    )
-                    estimated_time_left = avg_time_per_seq * remaining_sequences
+                # Update progress
+                processed_sequences += len(current_batch)
+                elapsed_time = time.time() - start_time
+                remaining_sequences = total_sequences - processed_sequences
+                avg_time_per_seq = (
+                    elapsed_time / processed_sequences
+                    if processed_sequences > 0
+                    else 0
+                )
+                estimated_time_left = avg_time_per_seq * remaining_sequences
 
-                    logger.info(
-                        f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
-                        f"Elapsed: {elapsed_time/60:.1f} min | "
-                        f"Est. remaining: {estimated_time_left/60:.1f} min"
-                    )
+                logger.info(
+                    f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
+                    f"Elapsed: {elapsed_time/60:.1f} min | "
+                    f"Est. remaining: {estimated_time_left/60:.1f} min"
+                )
+                # Start new batch
                 current_batch = []
                 current_size = 0
 
+            # Add the current sequence to the batch
             current_batch.append((seq_id, seq, seq_len))
             current_size += seq_len
 
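The Embedder hunk above is a refactor of the small-sequence batching loop: the old code checked the residue limit first and guarded the flush with a nested `if current_batch:`, so an oversized first sequence still fell through to a pointless reset of the already-empty batch; the new code folds the guard into a single condition and flushes a non-empty batch before the incoming sequence is appended. A standalone sketch of this flush-before-add pattern (the function name and signature are mine, not from the package):

    # Standalone sketch of the flush-before-add batching pattern used above.
    # The function name and types are illustrative, not part of dayhoff-tools.
    def batch_by_residues(
        seqs: list[tuple[str, str]], limit: int
    ) -> list[list[tuple[str, str, int]]]:
        """Group (id, seq) pairs so each batch stays within `limit` total residues."""
        batches: list[list[tuple[str, str, int]]] = []
        current: list[tuple[str, str, int]] = []
        size = 0
        for seq_id, seq in seqs:
            n = len(seq)
            # Flush a non-empty batch before adding a sequence that would overflow it.
            if current and size + n > limit:
                batches.append(current)
                current, size = [], 0
            current.append((seq_id, seq, n))
            size += n
        if current:  # keep the final partial batch
            batches.append(current)
        return batches

    # A sequence longer than `limit` still gets its own batch:
    # batch_by_residues([("a", "MKVLTT"), ("b", "MK")], limit=4)
    # -> [[("a", "MKVLTT", 6)], [("b", "MK", 2)]]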
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.39
+Version: 1.1.41
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -11,9 +11,9 @@ dayhoff_tools/deployment/deploy_aws.py,sha256=jQyQ0fbm2793jEHFO84lr5tNqiOpdBg6U0
 dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
-dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
+dayhoff_tools/deployment/processors.py,sha256=f4L52ekx_zYirl8C4WfavxtOioyD-c34TdTJVDoLpWs,16572
 dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
-dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
+dayhoff_tools/embedders.py,sha256=fRkyWjHo8OmbNUBY_FwrgfvyiLqpmrpI57UAb1Szn1Y,36609
 dayhoff_tools/fasta.py,sha256=_kA2Cpiy7JAGbBqLrjElkzbcUD_p-nO2d5Aj1LVmOvc,50509
 dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
 dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.39.dist-info/METADATA,sha256=c-7TpBB15X71z48kz2BUHFFqWddHgywH88BLequz3d0,2843
-dayhoff_tools-1.1.39.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-dayhoff_tools-1.1.39.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
-dayhoff_tools-1.1.39.dist-info/RECORD,,
+dayhoff_tools-1.1.41.dist-info/METADATA,sha256=HgiBVffpoYUtLMGV4uAKXJCyiPVmo39ytRHQ41b6-hg,2843
+dayhoff_tools-1.1.41.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.41.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.41.dist-info/RECORD,,