dayhoff-tools 1.1.22__tar.gz → 1.1.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/PKG-INFO +1 -1
  2. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/utility_commands.py +1 -1
  3. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/processors.py +40 -20
  4. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/fasta.py +128 -56
  5. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/pyproject.toml +1 -1
  6. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/README.md +0 -0
  7. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/__init__.py +0 -0
  8. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/chemistry/standardizer.py +0 -0
  9. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/chemistry/utils.py +0 -0
  10. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/__init__.py +0 -0
  11. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/cloud_commands.py +0 -0
  12. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/main.py +0 -0
  13. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/cli/swarm_commands.py +0 -0
  14. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/base.py +0 -0
  15. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  16. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  17. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  18. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/job_runner.py +0 -0
  19. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/deployment/swarm.py +0 -0
  20. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/embedders.py +0 -0
  21. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/file_ops.py +0 -0
  22. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/h5.py +0 -0
  23. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/gcp.py +0 -0
  24. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/gtdb.py +0 -0
  25. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/kegg.py +0 -0
  26. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/mmseqs.py +0 -0
  27. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/structure.py +0 -0
  28. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/intake/uniprot.py +0 -0
  29. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/logs.py +0 -0
  30. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/sqlite.py +0 -0
  31. {dayhoff_tools-1.1.22 → dayhoff_tools-1.1.24}/dayhoff_tools/warehouse.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: dayhoff-tools
- Version: 1.1.22
+ Version: 1.1.24
  Summary: Common tools for all the repos at Dayhoff Labs
  Author: Daniel Martin-Alarcon
  Author-email: dma@dayhofflabs.com
dayhoff_tools/cli/utility_commands.py
@@ -577,7 +577,7 @@ def update_dependencies(
  "dayhoff-tools"
  ) # Use the actual package name
  pattern = re.compile(
- rf"^(\\s*['\"])({package_name_re})(\\[[^\\]]+\\])?(?:[^'\"[\,\\s]*)?(['\"].*)$",
+ rf"^(\\s*['\"])({package_name_re})(\\[[^\]]+\\])?(?:[^'\"[\,\\s]*)?(['\"].*)$",
  re.MULTILINE,
  )
  # package_name variable is still 'dayhoff-tools'
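For orientation, a minimal, hypothetical sketch of how a pattern of this shape rewrites a pinned dependency line in pyproject.toml; the simplified character classes, the sample line, and the ">=1.1.24" pin are illustrative, not taken from the package source.

import re

# Hypothetical, simplified cousin of the pattern above: capture the opening
# quote, the package name, optional extras, and the closing quote so the
# version constraint in between can be rewritten.
package_name_re = re.escape("dayhoff-tools")
pattern = re.compile(
    rf"^(\s*['\"])({package_name_re})(\[[^\]]+\])?[^'\"]*(['\"].*)$",
    re.MULTILINE,
)

line = '    "dayhoff-tools[full]>=1.1.22",'
updated = pattern.sub(r"\g<1>\g<2>\g<3>>=1.1.24\g<4>", line)
print(updated)  # ->     "dayhoff-tools[full]>=1.1.24",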
dayhoff_tools/deployment/processors.py
@@ -460,35 +460,44 @@ class MMSeqsProfileProcessor(Processor):
  # We might still want to raise e here, depending on desired error handling for CSV conversion failure

  # 9. Extract hit sequences from M8 results using subset_fasta
- logger.info(f"Parsing M8 results from: {intermediate_results_m8_file}")
+ logger.info(
+ f"PROCESSOR: Parsing M8 results from: {intermediate_results_m8_file}"
+ )
  hit_sequence_ids = set()
  try:
  if not intermediate_results_m8_file.exists():
  logger.warning(
- f"M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty."
+ f"PROCESSOR: M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty.",
+ exc_info=True,
  )
  intermediate_hits_fasta_file.touch() # Create empty hits file
  else:
  with open(intermediate_results_m8_file, "r") as m8_file:
  for line in m8_file:
- if line.strip(): # Ensure line is not empty
+ if line.strip():
  columns = line.strip().split("\t")
  if len(columns) >= 2:
- hit_sequence_ids.add(
- columns[1]
- ) # Target ID is the second column
+ hit_sequence_ids.add(columns[1])
  logger.info(
- f"Found {len(hit_sequence_ids)} unique target IDs in M8 results."
+ f"PROCESSOR: Found {len(hit_sequence_ids)} unique target IDs in M8 results."
  )

  if not hit_sequence_ids:
  logger.warning(
- f"No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty."
+ f"PROCESSOR: No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty.",
+ exc_info=True,
  )
- intermediate_hits_fasta_file.touch() # Create empty file
+ intermediate_hits_fasta_file.touch()
  else:
+ logger.info(f"PROCESSOR: === CALLING subset_fasta ===")
+ logger.info(
+ f"PROCESSOR: Input FASTA for subset_fasta: {local_target_file}"
+ )
+ logger.info(
+ f"PROCESSOR: Output FASTA for subset_fasta: {intermediate_hits_fasta_file}"
+ )
  logger.info(
- f"Extracting {len(hit_sequence_ids)} hit sequences from {local_target_file} to {intermediate_hits_fasta_file} using subset_fasta."
+ f"PROCESSOR: Number of target IDs for subset_fasta: {len(hit_sequence_ids)}"
  )
  try:
  subset_fasta(
@@ -499,25 +508,34 @@ class MMSeqsProfileProcessor(Processor):
  return_written_ids=False,
  )
  logger.info(
- f"Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
+ f"PROCESSOR: === RETURNED from subset_fasta ==="
+ )
+ logger.info(
+ f"PROCESSOR: Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
  )
- except FileNotFoundError as e:
+ # More specific error catching can be added if subset_fasta raises custom exceptions
+ except FileNotFoundError as e_fnf:
  logger.error(
- f"subset_fasta FileNotFoundError: {e}. Ensuring {intermediate_hits_fasta_file} exists as empty."
+ f"PROCESSOR: subset_fasta FileNotFoundError: {e_fnf}. Ensuring {intermediate_hits_fasta_file} exists as empty.",
+ exc_info=True,
  )
  if not intermediate_hits_fasta_file.exists():
  intermediate_hits_fasta_file.touch()
  raise
- except Exception as e:
+ except (
+ Exception
+ ) as e_sub: # Catch any other exception from subset_fasta
  logger.error(
- f"subset_fasta failed to create {intermediate_hits_fasta_file}: {e}"
+ f"PROCESSOR: subset_fasta failed to create {intermediate_hits_fasta_file}: {e_sub}",
+ exc_info=True,
  )
  if not intermediate_hits_fasta_file.exists():
  intermediate_hits_fasta_file.touch()
  raise
- except Exception as e:
+ except Exception as e_m8_proc:
  logger.error(
- f"Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e}"
+ f"PROCESSOR: Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e_m8_proc}",
+ exc_info=True,
  )
  if not intermediate_hits_fasta_file.exists():
  intermediate_hits_fasta_file.touch()
@@ -525,7 +543,7 @@ class MMSeqsProfileProcessor(Processor):

  # 10. Write the set of hit sequence IDs to a .txt file
  logger.info(
- f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
+ f"PROCESSOR: Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
  )
  try:
  with open(final_hits_txt_file, "w") as txt_out:
@@ -533,13 +551,15 @@ class MMSeqsProfileProcessor(Processor):
  list(hit_sequence_ids)
  ): # Sort for consistent output
  txt_out.write(f"{seq_id}\n")
- logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
+ logger.info(
+ f"PROCESSOR: Successfully wrote hit IDs to {final_hits_txt_file}"
+ )
  except Exception as e:
  logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
  # The main workflow should still proceed even if this supplementary file fails

  logger.info(
- f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
+ f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
  )

  # Move and rename final output files from mmseqs_temp_dir to run_base_dir
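Taken together, the patched step 9 reads the MMseqs2 tabular (.m8) output and hands the collected target IDs to subset_fasta. A hedged, standalone sketch of that flow follows; the file names are placeholders, and the keyword arguments mirror the parameter names visible in dayhoff_tools/fasta.py, though their defaults may differ.

from pathlib import Path

from dayhoff_tools.fasta import subset_fasta

# Collect target IDs (column 2) from an MMseqs2 .m8 results file.
m8_path = Path("results.m8")  # placeholder path
hit_ids: set[str] = set()
with open(m8_path) as m8_file:
    for line in m8_file:
        columns = line.strip().split("\t")
        if len(columns) >= 2:
            hit_ids.add(columns[1])

# Subset the searched FASTA down to just those hits.
subset_fasta(
    fasta_file="targets.fasta",   # placeholder input
    output_path="hits.fasta",     # placeholder output
    target_ids=hit_ids,
    exclude=False,
    return_written_ids=False,
)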
dayhoff_tools/fasta.py
@@ -8,7 +8,7 @@ import sqlite3
  import time
  from functools import partial
  from pathlib import Path
- from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

  import requests
  from Bio import SeqIO
@@ -604,29 +604,48 @@ def process_chunk(
  ) -> Tuple[List[str], Set[str]]:
  output_sequences = []
  written_ids = set()
- current_id = ""
- current_seq = []
-
- def id_matches(seq_id: str) -> bool:
- return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
-
- for line in chunk:
- line = line.strip()
- if line.startswith(">"):
- if current_id and current_seq:
- if id_matches(current_id) != exclude:
- output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
- written_ids.add(current_id)
- current_id = line[1:]
- current_seq = []
- elif current_id:
- current_seq.append(line)
-
- # Process the last sequence in the chunk
- if current_id and current_seq and id_matches(current_id) != exclude:
- output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
- written_ids.add(current_id)
+ current_id: str = ""
+ current_seq: List[str] = []
+
+ # Get a unique worker ID, could be process ID
+ worker_id = os.getpid()
+ logger.debug(
+ f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} processing a chunk. Target IDs count: {len(target_ids_lower)}, Exclude: {exclude}"
+ )
+ try:
+
+ def id_matches(seq_id: str) -> bool:
+ return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
+
+ for line in chunk:
+ line = line.strip()
+ if line.startswith(">"):
+ if current_id and current_seq:
+ if id_matches(current_id) != exclude:
+ output_sequences.append(
+ f">{current_id}\n{''.join(current_seq)}\n"
+ )
+ written_ids.add(current_id)
+ current_id = line[1:]
+ current_seq = []
+ elif current_id:
+ current_seq.append(line)
+
+ # Process the last sequence in the chunk
+ if current_id and current_seq and id_matches(current_id) != exclude:
+ output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
+ written_ids.add(current_id)

+ except Exception as e:
+ logger.error(
+ f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} encountered error: {e}",
+ exc_info=True,
+ )
+ # Re-raising the exception so the main process's pool error handling can catch it
+ raise
+ logger.debug(
+ f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} finished chunk. Output sequences: {len(output_sequences)}, Written IDs: {len(written_ids)}"
+ )
  return output_sequences, written_ids

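To make the per-chunk contract concrete, here is a hedged illustration: a chunk is a list of raw FASTA lines, and the worker returns the matching records plus the header IDs it wrote. The call order is inferred from the partial() binding shown in the next hunk; the sample records are invented.

from dayhoff_tools.fasta import process_chunk

# Invented two-record chunk; only the first header contains the target ID.
chunk = [
    ">sp|P12345|EXAMPLE some description\n",
    "MKTAYIAKQR\n",
    ">sp|Q99999|OTHER another description\n",
    "MENLYFQGAA\n",
]
sequences, written_ids = process_chunk(
    chunk, target_ids_lower={"p12345"}, exclude=False
)
# sequences holds the full FASTA record for sp|P12345|...; written_ids holds its header line.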
@@ -655,50 +674,98 @@ def subset_fasta(
  Raises:
  FileExistsError: If the output file already exists.
  """
+ logger.info(
+ f"SUBSET_FASTA: Starting for input '{fasta_file}', output '{output_path}'. Target IDs: {len(target_ids)}, Exclude: {exclude}"
+ )
  _check_output_file(output_path)

  target_ids_lower = {id.lower() for id in target_ids}
  total_size = os.path.getsize(fasta_file)
- chunk_size = max(
- 1, total_size // (multiprocessing.cpu_count() * 2)
- ) # Adjust chunk size based on CPU count

- def chunk_reader(file_obj, chunk_size: int):
+ # Determine a reasonable number of processes
+ num_processes = multiprocessing.cpu_count()
+ # Adjust chunk size based on number of processes to balance load vs memory
+ # Aim for at least a few chunks per process if possible, but not too many small chunks.
+ # This is a heuristic and might need tuning.
+ # Let's make chunks reasonably large, e.g., 10-50MB, or ensure at least num_processes chunks.
+ # If total_size is very small, chunk_size could become 0 if not handled.
+ desired_chunk_size_mb = 32
+ chunk_size = max(1, desired_chunk_size_mb * 1024 * 1024)
+ num_chunks = max(1, math.ceil(total_size / chunk_size))
+
+ def chunk_reader(
+ file_obj, cs: int
+ ) -> Iterator[List[str]]: # Explicitly Iterator[List[str]]
  chunk = []
  chunk_bytes = 0
  for line in file_obj:
  chunk.append(line)
  chunk_bytes += len(line)
- if chunk_bytes >= chunk_size and line.startswith(">"):
+ if chunk_bytes >= cs and line.startswith(">"):
  yield chunk
  chunk = [line]
  chunk_bytes = len(line)
  if chunk:
  yield chunk

- open_func = gzip.open if fasta_file.endswith(".gz") else open
- mode = "rt" if fasta_file.endswith(".gz") else "r"
+ mode = "rt" # text mode for both gzip and regular open

- with open_func(fasta_file, mode) as input_file:
- with multiprocessing.Pool() as pool:
- process_func = partial(
- process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+ all_written_ids: Set[str] = set()
+ try:
+ with open(fasta_file, mode) as input_file:
+ logger.info(
+ f"SUBSET_FASTA: Using up to {num_processes} worker processes for {num_chunks} potential chunks."
  )
- results = list(
- tqdm(
- pool.imap(process_func, chunk_reader(input_file, chunk_size)),
- total=total_size // chunk_size,
- desc="Processing FASTA",
+
+ with multiprocessing.Pool(processes=num_processes) as pool:
+ logger.info(
+ f"SUBSET_FASTA: Multiprocessing pool created (intended processes: {num_processes})."
  )
- )

- all_written_ids = set()
- with open(output_path, "w") as output_file:
- for output_sequences, written_ids in results:
- output_file.writelines(output_sequences)
- all_written_ids.update(written_ids)
+ process_func = partial(
+ process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+ )

- print(f"Wrote {len(all_written_ids)} sequences to {output_path}")
+ # Using imap_unordered can sometimes be better for memory with many results,
+ # as results are processed as they complete.
+ # However, for aggregation later, order doesn't strictly matter for building the final set/list of strings.
+ # tqdm will work with imap and imap_unordered.
+
+ # Calculate total for tqdm more robustly
+ actual_num_chunks_for_tqdm = num_chunks # Use the calculated num_chunks
+
+ try:
+ results_buffer = []
+ for result_tuple in tqdm(
+ pool.imap(process_func, chunk_reader(input_file, chunk_size)),
+ total=actual_num_chunks_for_tqdm, # Use calculated number of chunks
+ desc="Processing FASTA (subset_fasta)",
+ ):
+ results_buffer.append(result_tuple)
+ logger.debug("SUBSET_FASTA: pool.imap completed.")
+ except Exception as e_pool:
+ logger.error(
+ f"SUBSET_FASTA: Error during multiprocessing pool.imap: {e_pool}",
+ exc_info=True,
+ )
+ raise
+
+ logger.debug(
+ f"SUBSET_FASTA: Aggregating results from {len(results_buffer)} processed chunks."
+ )
+ with open(output_path, "w") as output_file:
+ for output_sequences, written_ids_chunk in results_buffer:
+ output_file.writelines(output_sequences)
+ all_written_ids.update(written_ids_chunk)
+ except Exception as e_main:
+ logger.error(
+ f"SUBSET_FASTA: Error in main processing logic: {e_main}", exc_info=True
+ )
+ raise
+
+ logger.info(
+ f"SUBSET_FASTA: Wrote {len(all_written_ids)} sequences to {output_path}. Finished."
+ )
  return all_written_ids if return_written_ids else None

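The old code sized chunks by CPU count; the new code targets a fixed ~32 MB chunk and derives the chunk count from the file size. A small worked example of that arithmetic, with an illustrative file size:

import math

total_size = 1_000_000_000  # illustrative ~1 GB FASTA file
desired_chunk_size_mb = 32
chunk_size = max(1, desired_chunk_size_mb * 1024 * 1024)  # 33_554_432 bytes
num_chunks = max(1, math.ceil(total_size / chunk_size))   # 30 chunks
print(chunk_size, num_chunks)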
@@ -779,7 +846,7 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
  batch = []

  for protein_id, sequence in tqdm(
- _protein_generator(fasta_file),
+ _protein_generator(Path(fasta_file)), # Pass as Path object
  total=estimated_records,
  desc="Processing proteins",
  ):
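A hedged usage sketch for the function patched here, using the signature shown in the hunk header; the file names are placeholders.

from dayhoff_tools.fasta import fasta_to_sqlite

# Build a SQLite database of (protein_id, sequence) rows from a FASTA file.
fasta_to_sqlite("proteins.fasta", "proteins.db", batch_size=1000)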
@@ -804,22 +871,27 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
  print(f"Conversion completed. SQLite database saved to {db_file}")


- def _protein_generator(fasta_path: Path) -> Iterator[tuple[str, str]]:
+ def _protein_generator(
+ fasta_path: Path,
+ ) -> Iterator[tuple[str, str]]: # fasta_path is Path
  """
  Generate protein data from a FASTA file.
-
  Args:
  fasta_path (Path): Path to the FASTA file.
-
  Yields:
  tuple[str, str]: A tuple containing protein_id and sequence.
  """
- for record in SeqIO.parse(fasta_path, "fasta"):
- protein_id = record.id.split()[
- 0
- ] # Assumes the first part of the id is the protein_id
- sequence = str(record.seq)
- yield protein_id, sequence
+ # Ensure we use 'rt' for text mode reading, especially if gzipped
+ open_func = gzip.open if str(fasta_path).endswith(".gz") else open
+ mode = "rt"
+
+ with open_func(fasta_path, mode) as handle:
+ for record in SeqIO.parse(handle, "fasta"):
+ protein_id = record.id.split()[
+ 0
+ ] # Assumes the first part of the id is the protein_id
+ sequence = str(record.seq)
+ yield protein_id, sequence


  def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
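The new generator opens gzipped FASTA files in text mode before handing them to Bio.SeqIO. A minimal standalone sketch of the same pattern, assuming Biopython is installed; the function name and input path are hypothetical.

import gzip
from pathlib import Path
from typing import Iterator, Tuple

from Bio import SeqIO

def iter_proteins(fasta_path: Path) -> Iterator[Tuple[str, str]]:
    # Open .gz files with gzip.open and plain files with open, both in text
    # mode, so SeqIO.parse always receives decoded lines.
    open_func = gzip.open if str(fasta_path).endswith(".gz") else open
    with open_func(fasta_path, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            yield record.id.split()[0], str(record.seq)

for protein_id, sequence in iter_proteins(Path("proteins.fasta.gz")):
    print(protein_id, len(sequence))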
pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

  [project]
  name = "dayhoff-tools"
- version = "1.1.22"
+ version = "1.1.24"
  description = "Common tools for all the repos at Dayhoff Labs"
  authors = [
  {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}