dayhoff-tools 1.1.3__tar.gz → 1.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/PKG-INFO +2 -1
  2. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/fasta.py +144 -41
  3. dayhoff_tools-1.1.5/dayhoff_tools/intake/gtdb.py +269 -0
  4. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/pyproject.toml +3 -2
  5. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/README.md +0 -0
  6. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/__init__.py +0 -0
  7. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/chemistry/standardizer.py +0 -0
  8. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/chemistry/utils.py +0 -0
  9. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/cli/__init__.py +0 -0
  10. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/cli/cloud_commands.py +0 -0
  11. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/cli/main.py +0 -0
  12. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/cli/swarm_commands.py +0 -0
  13. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/cli/utility_commands.py +0 -0
  14. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/base.py +0 -0
  15. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  16. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  17. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  18. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/job_runner.py +0 -0
  19. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/processors.py +0 -0
  20. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/deployment/swarm.py +0 -0
  21. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/embedders.py +0 -0
  22. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/file_ops.py +0 -0
  23. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/h5.py +0 -0
  24. {dayhoff_tools-1.1.3/dayhoff_tools → dayhoff_tools-1.1.5/dayhoff_tools/intake}/gcp.py +0 -0
  25. {dayhoff_tools-1.1.3/dayhoff_tools → dayhoff_tools-1.1.5/dayhoff_tools/intake}/kegg.py +0 -0
  26. {dayhoff_tools-1.1.3/dayhoff_tools → dayhoff_tools-1.1.5/dayhoff_tools/intake}/mmseqs.py +0 -0
  27. {dayhoff_tools-1.1.3/dayhoff_tools → dayhoff_tools-1.1.5/dayhoff_tools/intake}/structure.py +0 -0
  28. {dayhoff_tools-1.1.3/dayhoff_tools → dayhoff_tools-1.1.5/dayhoff_tools/intake}/uniprot.py +0 -0
  29. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/logs.py +0 -0
  30. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/sqlite.py +0 -0
  31. {dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/warehouse.py +0 -0
{dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.3
+Version: 1.1.5
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -25,6 +25,7 @@ Requires-Dist: questionary (>=2.0.1)
 Requires-Dist: rdkit-pypi (>=2022.9.5)
 Requires-Dist: requests (>=2.31.0)
 Requires-Dist: requests (>=2.31.0) ; extra == "lite"
+Requires-Dist: sentencepiece (>=0.2.0)
 Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0)
 Requires-Dist: toml (>=0.10)
 Requires-Dist: transformers (==4.36.2)
{dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/dayhoff_tools/fasta.py
@@ -264,65 +264,168 @@ def split_fasta(
     target_folder: str,
     base_name: str,
     sequences_per_file: int = 1000,
-    max_files=None,
+    max_files: int | None = None,
+    show_progress: bool = True,
+    target_chunk_size_bytes: int | None = None,
 ) -> int:
-    """Split a FASTA file into multiple smaller files within a target folder.
+    """Split a FASTA file into multiple smaller files within a target folder,
+    with an overall progress bar. Files can be split based on a target number
+    of sequences or an approximate target file size in bytes.
 
     Args:
         fasta_file (str): Path to the input FASTA file.
         target_folder (str): Path to the folder where output files will be saved.
         base_name (str): Used to make output filenames: eg, basename_1.fasta.
         sequences_per_file (int): Number of sequences per output file.
-        max_files (int, optional): Maximum number of files to create. If None, all sequences are processed.
+            This is used if target_chunk_size_bytes is None.
+        max_files (int, optional): Maximum number of files to create.
+            If None, all sequences are processed.
+        show_progress (bool): If True, display a progress bar based on
+            file size processed. Defaults to True.
+        target_chunk_size_bytes (int, optional): Approximate target size for
+            each output file in bytes. If set, this takes precedence over
+            sequences_per_file. The actual file size may be slightly larger to
+            ensure full FASTA entries. Defaults to None.
+
+    Returns:
+        int: The number of output files created.
     """
     # Ensure the target folder exists
     os.makedirs(target_folder, exist_ok=True)
 
-    # Initialize counters
-    file_count = 1
-    sequence_count = 0
-
-    # Open the large FASTA file for reading
-    with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
-        # Prepare the first output file
-        output_file_path = os.path.join(
-            target_folder, f"{base_name}_{file_count}.fasta"
+    # We create output files lazily (on first sequence) so we don't end up with
+    # spurious empty files. `files_created` tracks the number of *real* files
+    # present on disk when we finish.
+    files_created = 0
+    current_output_file_sequence_count = 0
+    current_output_file_bytes_written = 0
+    pbar: tqdm | None = None
+    output_file = None  # Will be opened when we encounter the first header line
+    output_file_path = ""
+
+    if target_chunk_size_bytes is not None:
+        print(
+            f"Splitting by target chunk size: {target_chunk_size_bytes / (1024*1024):.2f} MB"
         )
-        output_file = open(output_file_path, "w", buffering=1024 * 1024)
-
-        for line in fasta:
-            # Check if we've reached the maximum number of files, if specified
-            if max_files is not None and file_count > max_files:
-                break
+    else:
+        print(f"Splitting by sequences per file: {sequences_per_file}")
 
-            # If line starts with ">", it's the beginning of a new sequence
-            if line.startswith(">"):
-                sequence_count += 1
+    try:
+        # Open the large FASTA file for reading
+        with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
+            if show_progress:
+                total_size = os.path.getsize(fasta_file)
+                pbar = tqdm(
+                    total=total_size,
+                    unit="B",
+                    unit_scale=True,
+                    desc=f"Splitting {os.path.basename(fasta_file)}",
+                )
 
-                # If we reached the limit, start a new file
-                if sequence_count > sequences_per_file:
-                    # Close current file and open a new one
-                    output_file.close()
-                    print(f"File written: {output_file}")
-                    file_count += 1
-                    sequence_count = 1  # Reset sequence count for the new file
+            # We create output files on demand. The very first file is not
+            # opened until we see the first sequence header. This prevents
+            # an empty file from being created when the input FASTA is empty
+            # or when `max_files` is reached before any data are written.
+            def _open_new_output_file():
+                nonlocal output_file, output_file_path, files_created
 
-                    # Check again after incrementing file_count
-                    if max_files is not None and file_count > max_files:
-                        break
+                files_created += 1
+                output_file_path = os.path.join(
+                    target_folder, f"{base_name}_{files_created}.fasta"
+                )
+                output_file = open(output_file_path, "w", buffering=1024 * 1024)
 
-                    output_file_path = os.path.join(
-                        target_folder, f"{base_name}_{file_count}.fasta"
+            # Helper for logging and closing the current file
+            def _close_current_output_file():
+                nonlocal output_file, current_output_file_sequence_count, current_output_file_bytes_written
+                if output_file and not output_file.closed:
+                    output_file.close()
+                    print(
+                        f"File written: {output_file_path} "
+                        f"(Sequences: {current_output_file_sequence_count}, "
+                        f"Bytes: {current_output_file_bytes_written} / {(current_output_file_bytes_written / (1024*1024)):.2f} MB)"
                     )
-                    output_file = open(output_file_path, "w", buffering=1024 * 1024)
-
-            # Write the line to the current output file
-            output_file.write(line)
 
-        # Close the last output file
-        output_file.close()
-
-    return file_count
+            for line in fasta:
+                line_bytes = len(line.encode("utf-8"))
+                if pbar:
+                    pbar.update(line_bytes)
+
+                # Note: we don't enforce `max_files` here; we enforce it only when we
+                # are about to create *another* file (see logic further below). This
+                # ensures we finish writing the current file before stopping.
+
+                # If line starts with ">", it's the beginning of a new sequence
+                if line.startswith(">"):
+                    # Decide whether we need to roll over to a new output file.
+                    needs_new_file = False  # reset each time we encounter a header
+
+                    if (
+                        output_file is not None
+                        and current_output_file_sequence_count > 0
+                    ):
+                        if target_chunk_size_bytes is not None:
+                            # Size-based splitting takes precedence over sequence count.
+                            if (
+                                current_output_file_bytes_written
+                                >= target_chunk_size_bytes
+                            ):
+                                needs_new_file = True
+                        else:
+                            # Fall back to sequence-count-based splitting.
+                            if current_output_file_sequence_count >= sequences_per_file:
+                                needs_new_file = True
+
+                    if needs_new_file:
+                        _close_current_output_file()
+
+                        # Respect `max_files`: do not create another file if the limit is reached
+                        if max_files is not None and files_created >= max_files:
+                            break
+
+                        _open_new_output_file()
+                        current_output_file_sequence_count = 0
+                        current_output_file_bytes_written = 0
+
+                    # Open the first file if it is not already open
+                    if output_file is None:
+                        _open_new_output_file()
+
+                    current_output_file_sequence_count += 1
+
+                # Write the line to the current output file (which should now exist)
+                if output_file is not None:
+                    output_file.write(line)
+                    current_output_file_bytes_written += line_bytes
+
+            # After the loop, ensure the last file is handled
+            _close_current_output_file()
+
+    finally:
+        if pbar:
+            pbar.close()
+        # Ensure the file is closed in case of an exception before the natural end;
+        # the "File written" messages are only printed during normal processing.
+        if output_file and not output_file.closed:
+            output_file.close()
+
+    # If `max_files` was hit exactly when a new file was due, or the input contained
+    # no FASTA entries after the last split point, the very last file may be empty.
+    # Removing it without miscounting `files_created` would require more state than
+    # it is worth, so the returned count may include one trailing empty file.
+    if os.path.exists(output_file_path) and os.path.getsize(output_file_path) == 0:
+        pass
+
+    return files_created
 
 
 def subtract_fasta_files(file1: str, file2: str, output_file: str):
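
For orientation, here is a minimal usage sketch of the new split_fasta signature (assuming the function is importable as dayhoff_tools.fasta.split_fasta; the paths below are hypothetical). When target_chunk_size_bytes is set, size-based splitting takes precedence over sequences_per_file:

    from dayhoff_tools.fasta import split_fasta

    # Hypothetical paths: split into ~100 MB chunks, at most 10 files, with a progress bar.
    n_files = split_fasta(
        fasta_file="data/all_proteins.fasta",
        target_folder="data/chunks",
        base_name="proteins",
        target_chunk_size_bytes=100 * 1024 * 1024,
        max_files=10,
        show_progress=True,
    )
    print(f"Created {n_files} chunk file(s)")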
dayhoff_tools-1.1.5/dayhoff_tools/intake/gtdb.py (new file)
@@ -0,0 +1,269 @@
+import collections.abc
+import csv
+import gzip
+import pathlib
+import re
+
+from tqdm import tqdm
+
+_ACCESSION_REGEX = re.compile(r"(GC[AF]_[0-9]+\.[0-9]+)")
+
+
+def _extract_accession_from_filename(filename_str: str) -> str:
+    """
+    Extracts the genome assembly accession (e.g., GCA_XXXXXXXXX.X or GCF_XXXXXXXXX.X)
+    from a filename.
+
+    Args:
+        filename_str (str): The filename string.
+
+    Returns:
+        str: The extracted accession or "UNKNOWN_ACCESSION" if not found.
+    """
+    match = _ACCESSION_REGEX.search(filename_str)
+    if match:
+        return match.group(1)
+    return "UNKNOWN_ACCESSION"
+
+
+def process_gtdb_files_to_fasta(
+    gtdb_top_folder: str,
+    output_fasta_path: str,
+    chunk_size: int = 10000,
+) -> None:
+    """
+    Processes a top-level GTDB folder containing gzipped FASTA files (.faa.gz)
+    and combines all protein sequences into a single FASTA file.
+
+    Output is written in chunks for efficiency with large datasets.
+    A progress bar is displayed during processing.
+
+    Args:
+        gtdb_top_folder (str): Path to the top-level GTDB directory.
+        output_fasta_path (str): Path to write the combined FASTA file.
+        chunk_size (int, optional): Number of sequences to process before
+            writing a chunk to the output file. Defaults to 10000.
+    """
+    gtdb_path = pathlib.Path(gtdb_top_folder)
+    faa_files = list(gtdb_path.rglob("*.faa.gz"))
+
+    if not faa_files:
+        print(f"No .faa.gz files found in {gtdb_top_folder}")
+        return
+
+    fasta_entries_chunk = []
+    sequences_in_current_chunk = 0
+
+    with open(output_fasta_path, "w") as fasta_out_file:
+        current_header_id = None
+        current_sequence_lines = []
+
+        for faa_file_path in tqdm(faa_files, desc="Processing GTDB files to FASTA"):
+            try:
+                with gzip.open(faa_file_path, "rt") as gz_file:
+                    for line_content in gz_file:
+                        line = line_content.strip()
+                        if not line:  # Skip empty lines
+                            continue
+                        if line.startswith(">"):
+                            if current_header_id and current_sequence_lines:
+                                sequence_string = "".join(current_sequence_lines)
+                                fasta_entries_chunk.append(
+                                    f">{current_header_id}\n{sequence_string}\n"
+                                )
+                                sequences_in_current_chunk += 1
+
+                            # Parse new header
+                            header_content = line[1:]
+                            parts = header_content.split(None, 1)
+                            current_header_id = parts[0]
+                            current_sequence_lines = []
+
+                            if sequences_in_current_chunk >= chunk_size:
+                                if fasta_entries_chunk:
+                                    fasta_out_file.write("".join(fasta_entries_chunk))
+                                fasta_entries_chunk = []
+                                sequences_in_current_chunk = 0
+                        else:
+                            if current_header_id:
+                                current_sequence_lines.append(line)
+
+                if current_header_id and current_sequence_lines:
+                    sequence_string = "".join(current_sequence_lines)
+                    fasta_entries_chunk.append(
+                        f">{current_header_id}\n{sequence_string}\n"
+                    )
+                    sequences_in_current_chunk += 1
+
+                # Reset state for the next file to ensure clean parsing start for that file
+                current_header_id = None
+                current_sequence_lines = []
+
+            except gzip.BadGzipFile:
+                tqdm.write(
+                    f"Warning: Skipping corrupted or non-gzipped file: {faa_file_path}"
+                )
+                current_header_id = None
+                current_sequence_lines = []
+            except Exception as e:
+                tqdm.write(f"Warning: Error processing file {faa_file_path}: {e}")
+                current_header_id = None
+                current_sequence_lines = []
+
+        if fasta_entries_chunk:
+            fasta_out_file.write("".join(fasta_entries_chunk))
+
+    print(f"Processing complete. Output FASTA file created: {output_fasta_path}")
+
+
+def process_gtdb_files_to_csv(
+    gtdb_top_folder: str,
+    output_csv_path: str,
+    chunk_size: int = 10000,
+) -> None:
+    """
+    Processes a top-level GTDB folder containing gzipped FASTA files (.faa.gz)
+    and creates a CSV file with detailed information for each sequence entry.
+
+    The CSV includes the genome assembly accession, original FASTA header ID,
+    and header description for each entry. Output is written in chunks for
+    efficiency with large datasets. A progress bar is displayed during processing.
+
+    Args:
+        gtdb_top_folder (str): Path to the top-level GTDB directory.
+        output_csv_path (str): Path to write the CSV file.
+        chunk_size (int, optional): Number of sequences to process before
+            writing a chunk to the output file. Defaults to 10000.
+    """
+    gtdb_path = pathlib.Path(gtdb_top_folder)
+    faa_files = list(gtdb_path.rglob("*.faa.gz"))
+
+    if not faa_files:
+        print(f"No .faa.gz files found in {gtdb_top_folder}")
+        return
+
+    def _serial_iter(paths):
+        """Yield the same structure as the parallel branch but serially."""
+        for p in paths:
+            row_generator_for_file, file_warnings = _csv_rows_from_single_faa(str(p))
+            yield row_generator_for_file, file_warnings
+
+    # Open output CSV for streaming writes.
+    with open(output_csv_path, "w", newline="") as csv_out_file:
+        csv_writer = csv.writer(csv_out_file)
+        csv_writer.writerow(
+            [
+                "genome_assembly_accession",
+                "original_fasta_header_id",
+                "original_fasta_header_description",
+            ]
+        )
+
+        rows_buffer: list[list[str]] = []
+
+        # Choose the iterator depending on workers.
+        result_iter = _serial_iter(faa_files)
+        progress_iter = tqdm(
+            result_iter, total=len(faa_files), desc="Processing GTDB files to CSV"
+        )
+
+        # Consume iterator and stream rows to disk in chunks.
+        for row_generator_for_file, file_warnings in progress_iter:
+            # Add rows to buffer and flush in chunk-size batches.
+            # This will consume the generator, and in doing so, populate file_warnings if errors occur.
+            for r in row_generator_for_file:
+                rows_buffer.append(r)
+                if len(rows_buffer) >= chunk_size:
+                    csv_writer.writerows(rows_buffer)
+                    rows_buffer.clear()
+
+            # Now that the generator for the file has been processed (or attempted),
+            # emit any warnings that were collected for this specific file.
+            for w in file_warnings:
+                tqdm.write(w)
+
+        # Flush remaining rows.
+        if rows_buffer:
+            csv_writer.writerows(rows_buffer)
+
+    print(f"Processing complete. Output CSV file created: {output_csv_path}")
+
+
+# ---------------------------------------------------------------------------
+# Helper functions (private)
+# ---------------------------------------------------------------------------
+
+
+def _csv_rows_from_single_faa(
+    faa_file_path: str,
+) -> tuple[collections.abc.Iterable[list[str]], list[str]]:
+    """Parse a single gzipped FASTA (`.faa.gz`) file into CSV rows.
+
+    Parameters
+    ----------
+    faa_file_path
+        Path (as ``str``) to the ``.faa.gz`` file.
+
+    Returns
+    -------
+    tuple[collections.abc.Iterable[list[str]], list[str]]
+        * First element – an iterable (generator) of CSV rows ``[accession, header_id, description]``.
+        * Second element – list of warning strings produced while processing
+          the file. The caller is responsible for emitting them.
+    """
+    warnings: list[str] = []  # Outer scope warnings list
+    faa_path = pathlib.Path(faa_file_path)
+    current_file_accession = _extract_accession_from_filename(faa_path.name)
+
+    def _generate_rows_iter_inner() -> (
+        collections.abc.Iterable[list[str]]
+    ):
+        # Local parsing state for the generator
+        current_header_id_gen = None
+        current_header_desc_gen = ""
+        has_sequence_lines_gen = False
+
+        try:
+            with gzip.open(faa_file_path, "rt") as gz_file:
+                for line_content in gz_file:
+                    line = line_content.strip()
+                    if not line:
+                        continue
+                    if line.startswith(">"):
+                        if current_header_id_gen and has_sequence_lines_gen:
+                            yield [
+                                current_file_accession,
+                                current_header_id_gen,
+                                current_header_desc_gen,
+                            ]
+
+                        header_content = line[1:]
+                        parts = header_content.split(None, 1)
+                        current_header_id_gen = parts[0]
+                        current_header_desc_gen = parts[1] if len(parts) > 1 else ""
+                        has_sequence_lines_gen = False
+                    else:
+                        if current_header_id_gen:
+                            has_sequence_lines_gen = True
+
+            # Add the final entry if the file ended after sequence lines.
+            if current_header_id_gen and has_sequence_lines_gen:
+                yield [
+                    current_file_accession,
+                    current_header_id_gen,
+                    current_header_desc_gen,
+                ]
+        except gzip.BadGzipFile:
+            # Exception handled inside the generator.
+            # Append to the outer warnings list and terminate the generator.
+            warnings.append(
+                f"Warning: Skipping corrupted or non-gzipped file: {faa_file_path}"
+            )
+            return  # Stop generation
+        except Exception as exc:
+            warnings.append(f"Warning: Error processing file {faa_file_path}: {exc}")
+            return  # Stop generation
+
+    # Directly return the generator instance and the warnings list.
+    # The warnings list will be populated by the generator if errors occur during its execution.
+    return _generate_rows_iter_inner(), warnings
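
A minimal usage sketch of the new module (assuming it is importable as dayhoff_tools.intake.gtdb, per the file list above; the input and output paths are hypothetical):

    from dayhoff_tools.intake.gtdb import (
        process_gtdb_files_to_csv,
        process_gtdb_files_to_fasta,
    )

    # Recursively collect every .faa.gz under the GTDB folder into one FASTA file.
    process_gtdb_files_to_fasta(
        gtdb_top_folder="gtdb_data/protein_faa_reps",
        output_fasta_path="gtdb_proteins.fasta",
    )

    # Write a per-sequence index (assembly accession, header ID, description) as CSV.
    process_gtdb_files_to_csv(
        gtdb_top_folder="gtdb_data/protein_faa_reps",
        output_csv_path="gtdb_protein_index.csv",
        chunk_size=10000,
    )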
{dayhoff_tools-1.1.3 → dayhoff_tools-1.1.5}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "dayhoff-tools"
-version = "1.1.3"
+version = "1.1.5"
 description = "Common tools for all the repos at Dayhoff Labs"
 authors = [
     {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}
@@ -28,6 +28,7 @@ dependencies = [
     "rdkit-pypi>=2022.9.5",
     "sqlalchemy (>=2.0.40,<3.0.0)",
     "transformers==4.36.2",
+    "sentencepiece>=0.2.0",
 ]
 requires-python = ">=3.10,<4.0"
 
@@ -74,4 +75,4 @@ dev = [
 ]
 
 [project.scripts]
-dh = "dayhoff_tools.cli.main:app"
+dh = "dayhoff_tools.cli.main:app"