smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
The detailed diff below covers `smftools/informatics/converted_BAM_to_adata.py`.

--- a/smftools/informatics/converted_BAM_to_adata.py
+++ b/smftools/informatics/converted_BAM_to_adata.py
@@ -5,22 +5,45 @@ import logging
 import shutil
 import time
 import traceback
+from dataclasses import dataclass
 from multiprocessing import Manager, Pool, current_process
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Iterable, Mapping, Optional, Union
 
 import anndata as ad
 import numpy as np
 import pandas as pd
 
+from smftools.constants import (
+    BAM_SUFFIX,
+    BARCODE,
+    BASE_QUALITY_SCORES,
+    DATASET,
+    DEMUX_TYPE,
+    H5_DIR,
+    MISMATCH_INTEGER_ENCODING,
+    MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
+    MODKIT_EXTRACT_SEQUENCE_BASES,
+    MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
+    MODKIT_EXTRACT_SEQUENCE_PADDING_BASE,
+    READ_MAPPING_DIRECTION,
+    READ_MISMATCH_TREND,
+    READ_SPAN_MASK,
+    REFERENCE,
+    REFERENCE_DATASET_STRAND,
+    REFERENCE_STRAND,
+    SAMPLE,
+    SEQUENCE_INTEGER_DECODING,
+    SEQUENCE_INTEGER_ENCODING,
+    STRAND,
+)
 from smftools.logging_utils import get_logger, setup_logging
 from smftools.optional_imports import require
 
 from ..readwrite import make_dirs
 from .bam_functions import count_aligned_reads, extract_base_identities
 from .binarize_converted_base_identities import binarize_converted_base_identities
-from .fasta_functions import find_conversion_sites
-from .ohe import ohe_batching
+from .fasta_functions import find_conversion_sites, get_native_references
 
 logger = get_logger(__name__)
 
@@ -30,6 +53,67 @@ if TYPE_CHECKING:
 torch = require("torch", extra="torch", purpose="converted BAM processing")
 
 
+@dataclass(frozen=True)
+class RecordFastaInfo:
+    """Structured FASTA metadata for a single converted record.
+
+    Attributes:
+        sequence: Padded top-strand sequence for the record.
+        complement: Padded bottom-strand sequence for the record.
+        chromosome: Canonical chromosome name for the record.
+        unconverted_name: FASTA record name for the unconverted reference.
+        sequence_length: Length of the unpadded reference sequence.
+        padding_length: Number of padded bases applied to reach max length.
+        conversion: Conversion label (e.g., "unconverted", "5mC").
+        strand: Strand label ("top" or "bottom").
+        max_reference_length: Maximum reference length across all records.
+    """
+
+    sequence: str
+    complement: str
+    chromosome: str
+    unconverted_name: str
+    sequence_length: int
+    padding_length: int
+    conversion: str
+    strand: str
+    max_reference_length: int
+
+
+@dataclass(frozen=True)
+class SequenceEncodingConfig:
+    """Configuration for integer sequence encoding.
+
+    Attributes:
+        base_to_int: Mapping of base characters to integer encodings.
+        bases: Valid base characters used for encoding.
+        padding_base: Base token used for padding.
+        batch_size: Number of reads per temporary batch file.
+    """
+
+    base_to_int: Mapping[str, int]
+    bases: tuple[str, ...]
+    padding_base: str
+    batch_size: int = 100000
+
+    @property
+    def padding_value(self) -> int:
+        """Return the integer value used for padding positions."""
+        return self.base_to_int[self.padding_base]
+
+    @property
+    def unknown_value(self) -> int:
+        """Return the integer value used for unknown bases."""
+        return self.base_to_int["N"]
+
+
+SEQUENCE_ENCODING_CONFIG = SequenceEncodingConfig(
+    base_to_int=MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
+    bases=MODKIT_EXTRACT_SEQUENCE_BASES,
+    padding_base=MODKIT_EXTRACT_SEQUENCE_PADDING_BASE,
+)
+
+
 def converted_BAM_to_adata(
     converted_FASTA: str | Path,
     split_dir: Path,
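Note: the `SequenceEncodingConfig` dataclass added above centralizes the base-to-integer mapping used by the rest of this module. Below is a minimal sketch of how its derived properties behave — the integer values here are assumed purely for illustration; the shipped mapping comes from `MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT` in `smftools.constants` and is not shown in this diff:

```python
from dataclasses import dataclass
from typing import Mapping


@dataclass(frozen=True)
class SequenceEncodingConfig:
    base_to_int: Mapping[str, int]
    bases: tuple[str, ...]
    padding_base: str
    batch_size: int = 100000

    @property
    def padding_value(self) -> int:
        # Padding positions reuse the integer assigned to the padding token.
        return self.base_to_int[self.padding_base]

    @property
    def unknown_value(self) -> int:
        # Unknown bases fall back to the "N" encoding.
        return self.base_to_int["N"]


# Hypothetical mapping for illustration only.
demo = SequenceEncodingConfig(
    base_to_int={"A": 0, "C": 1, "G": 2, "T": 3, "N": 4, "PAD": 5},
    bases=("A", "C", "G", "T", "N", "PAD"),
    padding_base="PAD",
)
assert demo.padding_value == 5 and demo.unknown_value == 4
```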
@@ -46,7 +130,7 @@ def converted_BAM_to_adata(
     double_barcoded_path: Path | None = None,
     samtools_backend: str | None = "auto",
 ) -> tuple[ad.AnnData | None, Path]:
-    """Convert BAM files into an AnnData object
+    """Convert converted BAM files into an AnnData object with integer sequence encoding.
 
     Args:
         converted_FASTA: Path to the converted FASTA reference.
@@ -62,9 +146,18 @@
         deaminase_footprinting: Whether the footprinting used direct deamination chemistry.
         delete_intermediates: Whether to remove intermediate files after processing.
         double_barcoded_path: Path to dorado demux summary file of double-ended barcodes.
+        samtools_backend: Samtools backend choice for alignment parsing.
 
     Returns:
         tuple[anndata.AnnData | None, Path]: The AnnData object (if generated) and its path.
+
+    Processing Steps:
+        1. Resolve the best available torch device and create output directories.
+        2. Load converted FASTA records and compute conversion sites.
+        3. Filter BAMs based on mapping thresholds.
+        4. Process each BAM in parallel, building per-sample H5AD files.
+        5. Concatenate per-sample AnnData objects and attach reference metadata.
+        6. Add demultiplexing annotations and clean intermediate artifacts.
     """
     if torch.cuda.is_available():
         device = torch.device("cuda")
@@ -76,7 +169,7 @@
     logger.debug(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-    h5_dir = output_dir /
+    h5_dir = output_dir / H5_DIR
     tmp_dir = output_dir / "tmp"
     final_adata = None
     final_adata_path = h5_dir / f"{experiment_name}.h5ad.gz"
@@ -90,7 +183,7 @@
     bam_files = sorted(
         p
         for p in split_dir.iterdir()
-        if p.is_file() and p.suffix ==
+        if p.is_file() and p.suffix == BAM_SUFFIX and "unclassified" not in p.name
     )
 
     bam_path_list = bam_files
@@ -108,6 +201,16 @@
         bam_path_list, bam_files, mapping_threshold, samtools_backend
     )
 
+    # Get converted record sequences:
+    converted_FASTA_record_seq_map = get_native_references(converted_FASTA)
+    # Pad the record sequences
+    for record, [record_length, seq] in converted_FASTA_record_seq_map.items():
+        if max_reference_length > record_length:
+            pad_number = max_reference_length - record_length
+            record_length += pad_number
+            seq += "N" * pad_number
+            converted_FASTA_record_seq_map[record] = [record_length, seq]
+
     ## Process BAMs in Parallel
     final_adata = process_bams_parallel(
         bam_path_list,
@@ -121,8 +224,15 @@
         device,
         deaminase_footprinting,
         samtools_backend,
+        converted_FASTA_record_seq_map,
     )
 
+    final_adata.uns[f"{SEQUENCE_INTEGER_ENCODING}_map"] = dict(MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT)
+    final_adata.uns[f"{MISMATCH_INTEGER_ENCODING}_map"] = dict(MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT)
+    final_adata.uns[f"{SEQUENCE_INTEGER_DECODING}_map"] = {
+        str(key): value for key, value in MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE.items()
+    }
+
     final_adata.uns["References"] = {}
     for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
         final_adata.var[f"{chromosome}_top_strand_FASTA_base"] = list(seq)
@@ -130,6 +240,11 @@
         final_adata.uns[f"{chromosome}_FASTA_sequence"] = seq
         final_adata.uns["References"][f"{chromosome}_FASTA_sequence"] = seq
 
+    if not deaminase_footprinting:
+        for record, [_length, seq] in converted_FASTA_record_seq_map.items():
+            if "unconverted" not in record:
+                final_adata.var[f"{record}_top_strand_FASTA_base"] = list(seq)
+
     final_adata.obs_names_make_unique()
     cols = final_adata.obs.columns
 
@@ -137,9 +252,33 @@
     for col in cols:
         final_adata.obs[col] = final_adata.obs[col].astype("category")
 
+    consensus_bases = MODKIT_EXTRACT_SEQUENCE_BASES[:4]  # ignore N/PAD for consensus
+    consensus_base_ints = [MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT[base] for base in consensus_bases]
+    for ref_group in final_adata.obs[REFERENCE_DATASET_STRAND].cat.categories:
+        group_subset = final_adata[final_adata.obs[REFERENCE_DATASET_STRAND] == ref_group]
+        encoded_sequences = group_subset.layers[SEQUENCE_INTEGER_ENCODING]
+        layer_counts = [
+            np.sum(encoded_sequences == base_int, axis=0) for base_int in consensus_base_ints
+        ]
+        count_array = np.array(layer_counts)
+        nucleotide_indexes = np.argmax(count_array, axis=0)
+        consensus_sequence_list = [consensus_bases[i] for i in nucleotide_indexes]
+        no_calls_mask = np.sum(count_array, axis=0) == 0
+        if np.any(no_calls_mask):
+            consensus_sequence_list = np.array(consensus_sequence_list, dtype=object)
+            consensus_sequence_list[no_calls_mask] = "N"
+            consensus_sequence_list = consensus_sequence_list.tolist()
+        final_adata.var[f"{ref_group}_consensus_sequence_from_all_samples"] = (
+            consensus_sequence_list
+        )
+
+    from .h5ad_functions import append_reference_strand_quality_stats
+
+    append_reference_strand_quality_stats(final_adata)
+
     if input_already_demuxed:
-        final_adata.obs[
-        final_adata.obs[
+        final_adata.obs[DEMUX_TYPE] = ["already"] * final_adata.shape[0]
+        final_adata.obs[DEMUX_TYPE] = final_adata.obs[DEMUX_TYPE].astype("category")
     else:
         from .h5ad_functions import add_demux_type_annotation
 
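Note: the consensus block added in this hunk is a position-wise majority vote over the integer-encoded reads of each reference group. A self-contained sketch of the same argmax logic on toy data, assuming A=0, C=1, G=2, T=3 for illustration:

```python
import numpy as np

# Three toy reads over five positions; 4 marks an unknown call that does not vote.
encoded = np.array([
    [0, 1, 2, 4, 4],
    [0, 1, 3, 3, 4],
    [0, 4, 3, 3, 4],
])
bases = ["A", "C", "G", "T"]
counts = np.array([np.sum(encoded == i, axis=0) for i in range(4)])  # shape (4, positions)
consensus = np.array(bases, dtype=object)[np.argmax(counts, axis=0)]
consensus[counts.sum(axis=0) == 0] = "N"  # no A/C/G/T calls at this position
print("".join(consensus))  # ACTTN
```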
@@ -156,37 +295,47 @@
 
 
 def process_conversion_sites(
-    converted_FASTA
-
-
-
+    converted_FASTA: str | Path,
+    conversions: list[str] | None = None,
+    deaminase_footprinting: bool = False,
+) -> tuple[int, dict[str, RecordFastaInfo], dict[str, tuple[str, str]]]:
+    """Extract conversion sites and FASTA metadata for converted references.
 
-
-        converted_FASTA
-        conversions
-        deaminase_footprinting
+    Args:
+        converted_FASTA: Path to the converted reference FASTA.
+        conversions: List of modification types (e.g., ["unconverted", "5mC", "6mA"]).
+        deaminase_footprinting: Whether the footprinting was done with direct deamination chemistry.
 
     Returns:
-
-
+        tuple[int, dict[str, RecordFastaInfo], dict[str, tuple[str, str]]]:
+            Maximum reference length, record metadata, and chromosome sequences.
+
+    Processing Steps:
+        1. Parse unconverted FASTA records to determine the max reference length.
+        2. Build record metadata for unconverted and converted strands.
+        3. Cache chromosome-level sequences for downstream annotation.
     """
-
-
-
+    if conversions is None:
+        conversions = ["unconverted", "5mC"]
+    modification_dict: dict[str, dict] = {}
+    record_FASTA_dict: dict[str, RecordFastaInfo] = {}
+    chromosome_FASTA_dict: dict[str, tuple[str, str]] = {}
     max_reference_length = 0
     unconverted = conversions[0]
     conversion_types = conversions[1:]
 
     # Process the unconverted sequence once
+    # modification dict is keyed by mod type (ie unconverted, 5mC, 6mA)
+    # modification_dict[unconverted] points to a dictionary keyed by unconverted record.id keys.
+    # This then maps to [sequence_length, [], [], unconverted sequence, unconverted complement]
     modification_dict[unconverted] = find_conversion_sites(
         converted_FASTA, unconverted, conversions, deaminase_footprinting
     )
-    # Above points to record_dict[record.id] = [sequence_length, [], [], sequence, complement] with only unconverted record.id keys
 
-    # Get
+    # Get max sequence length from unconverted records
     max_reference_length = max(values[0] for values in modification_dict[unconverted].values())
 
-    # Add
+    # Add unconverted records to `record_FASTA_dict`
     for record, values in modification_dict[unconverted].items():
         sequence_length, top_coords, bottom_coords, sequence, complement = values
 
@@ -196,61 +345,91 @@ def process_conversion_sites(
         chromosome = record
 
         # Store **original sequence**
-        record_FASTA_dict[record] =
-            sequence + "N" * (max_reference_length - sequence_length),
-            complement + "N" * (max_reference_length - sequence_length),
-            chromosome,
-            record,
-            sequence_length,
-            max_reference_length - sequence_length,
-            unconverted,
-            "top",
-
+        record_FASTA_dict[record] = RecordFastaInfo(
+            sequence=sequence + "N" * (max_reference_length - sequence_length),
+            complement=complement + "N" * (max_reference_length - sequence_length),
+            chromosome=chromosome,
+            unconverted_name=record,
+            sequence_length=sequence_length,
+            padding_length=max_reference_length - sequence_length,
+            conversion=unconverted,
+            strand="top",
+            max_reference_length=max_reference_length,
+        )
 
         if chromosome not in chromosome_FASTA_dict:
-            chromosome_FASTA_dict[chromosome] =
+            chromosome_FASTA_dict[chromosome] = (
                 sequence + "N" * (max_reference_length - sequence_length),
                 complement + "N" * (max_reference_length - sequence_length),
-
+            )
 
     # Process converted records
+    # For each conversion type (ie 5mC, 6mA), add the conversion type as a key to modification_dict.
+    # This points to a dictionary keyed by the unconverted record id key.
+    # This points to [sequence_length, top_strand_coordinates, bottom_strand_coordinates, unconverted sequence, unconverted complement]
     for conversion in conversion_types:
         modification_dict[conversion] = find_conversion_sites(
             converted_FASTA, conversion, conversions, deaminase_footprinting
         )
-        # Above points to record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement] with only unconverted record.id keys
 
+        # Iterate over the unconverted record ids in mod_dict, as well as the
+        # [sequence_length, top_strand_coordinates, bottom_strand_coordinates, unconverted sequence, unconverted complement] for the conversion type
        for record, values in modification_dict[conversion].items():
             sequence_length, top_coords, bottom_coords, sequence, complement = values
 
             if not deaminase_footprinting:
-
+                # For conversion smf, make the chromosome name the base record name
+                chromosome = record.split(f"_{unconverted}_")[0]
             else:
+                # For deaminase smf, make the chromosome and record name the same
                 chromosome = record
 
-            # Add
+            # Add both strands for converted records
             for strand in ["top", "bottom"]:
+                # Generate converted/unconverted record names that are found in the converted FASTA
                 converted_name = f"{chromosome}_{conversion}_{strand}"
                 unconverted_name = f"{chromosome}_{unconverted}_top"
 
-
-
-
-
-
-
-
-
-
-
-
-
+                # Use the converted FASTA record names as keys to a dict that points to RecordFastaInfo objects.
+                # These objects will contain the unconverted sequence/complement.
+                record_FASTA_dict[converted_name] = RecordFastaInfo(
+                    sequence=sequence + "N" * (max_reference_length - sequence_length),
+                    complement=complement + "N" * (max_reference_length - sequence_length),
+                    chromosome=chromosome,
+                    unconverted_name=unconverted_name,
+                    sequence_length=sequence_length,
+                    padding_length=max_reference_length - sequence_length,
+                    conversion=conversion,
+                    strand=strand,
+                    max_reference_length=max_reference_length,
+                )
+
+    logger.debug("Updated record_FASTA_dict keys: %s", list(record_FASTA_dict.keys()))
     return max_reference_length, record_FASTA_dict, chromosome_FASTA_dict
 
 
-def filter_bams_by_mapping_threshold(
-
-
+def filter_bams_by_mapping_threshold(
+    bam_path_list: list[Path],
+    bam_files: list[Path],
+    mapping_threshold: float,
+    samtools_backend: str | None,
+) -> set[str]:
+    """Filter FASTA records based on per-BAM mapping thresholds.
+
+    Args:
+        bam_path_list: Ordered list of BAM paths to evaluate.
+        bam_files: Matching list of BAM paths used for reporting.
+        mapping_threshold: Minimum percentage of aligned reads to include a record.
+        samtools_backend: Samtools backend choice for alignment parsing.
+
+    Returns:
+        set[str]: FASTA record IDs that pass the mapping threshold.
+
+    Processing Steps:
+        1. Count aligned/unaligned reads and per-record percentages.
+        2. Collect record IDs that meet the mapping threshold.
+    """
+    records_to_analyze: set[str] = set()
 
     for i, bam in enumerate(bam_path_list):
         aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam, samtools_backend)
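Note: every `RecordFastaInfo` is right-padded with `N` to the longest unconverted reference, so `padding_length` always equals `max_reference_length - sequence_length`. The padding invariant in isolation, with made-up record names:

```python
max_reference_length = 10
records = {"locus1": "ACGTACG", "locus2": "ACGTACGTAC"}  # hypothetical references
padded = {
    name: (seq + "N" * (max_reference_length - len(seq)), max_reference_length - len(seq))
    for name, seq in records.items()
}
assert padded["locus1"] == ("ACGTACGNNN", 3)
assert padded["locus2"] == ("ACGTACGTAC", 0)
```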
@@ -265,33 +444,182 @@ def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold
     return records_to_analyze
 
 
+def _encode_sequence_array(
+    read_sequence: np.ndarray,
+    valid_length: int,
+    config: SequenceEncodingConfig,
+) -> np.ndarray:
+    """Encode a base-identity array into integer values with padding.
+
+    Args:
+        read_sequence: Array of base calls (dtype "<U1").
+        valid_length: Number of valid reference positions for this record.
+        config: Integer encoding configuration.
+
+    Returns:
+        np.ndarray: Integer-encoded sequence with padding applied.
+
+    Processing Steps:
+        1. Initialize an array filled with the unknown base encoding.
+        2. Map A/C/G/T/N bases into integer values.
+        3. Mark positions beyond valid_length with the padding value.
+    """
+    read_sequence = np.asarray(read_sequence, dtype="<U1")
+    encoded = np.full(read_sequence.shape, config.unknown_value, dtype=np.int16)
+    for base in config.bases:
+        encoded[read_sequence == base] = config.base_to_int[base]
+    if valid_length < encoded.size:
+        encoded[valid_length:] = config.padding_value
+    return encoded
+
+
+def _write_sequence_batches(
+    base_identities: Mapping[str, np.ndarray],
+    tmp_dir: Path,
+    record: str,
+    prefix: str,
+    valid_length: int,
+    config: SequenceEncodingConfig,
+) -> list[str]:
+    """Encode base identities into integer arrays and write batched H5AD files.
+
+    Args:
+        base_identities: Mapping of read name to base identity arrays.
+        tmp_dir: Directory for temporary H5AD files.
+        record: Reference record identifier.
+        prefix: Prefix used to name batch files.
+        valid_length: Valid reference length for padding determination.
+        config: Integer encoding configuration.
+
+    Returns:
+        list[str]: Paths to written H5AD batch files.
+
+    Processing Steps:
+        1. Encode each read sequence into integers.
+        2. Accumulate encoded reads into batches.
+        3. Persist each batch to an H5AD file with `.uns` storage.
+    """
+    batch_files: list[str] = []
+    batch: dict[str, np.ndarray] = {}
+    batch_number = 0
+
+    for read_name, sequence in base_identities.items():
+        if sequence is None:
+            continue
+        batch[read_name] = _encode_sequence_array(sequence, valid_length, config)
+        if len(batch) >= config.batch_size:
+            save_name = tmp_dir / f"tmp_{prefix}_{record}_{batch_number}.h5ad"
+            ad.AnnData(X=np.zeros((1, 1)), uns=batch).write_h5ad(save_name)
+            batch_files.append(str(save_name))
+            batch = {}
+            batch_number += 1
+
+    if batch:
+        save_name = tmp_dir / f"tmp_{prefix}_{record}_{batch_number}.h5ad"
+        ad.AnnData(X=np.zeros((1, 1)), uns=batch).write_h5ad(save_name)
+        batch_files.append(str(save_name))
+
+    return batch_files
+
+
+def _load_sequence_batches(
+    batch_files: list[Path | str],
+) -> tuple[dict[str, np.ndarray], set[str], set[str]]:
+    """Load integer-encoded sequence batches from H5AD files.
+
+    Args:
+        batch_files: H5AD paths containing encoded sequences in `.uns`.
+
+    Returns:
+        tuple[dict[str, np.ndarray], set[str], set[str]]:
+            Read-to-sequence mapping and sets of forward/reverse mapped reads.
+
+    Processing Steps:
+        1. Read each H5AD file.
+        2. Merge `.uns` dictionaries into a single mapping.
+        3. Track forward/reverse read IDs based on filename markers.
+    """
+    sequences: dict[str, np.ndarray] = {}
+    fwd_reads: set[str] = set()
+    rev_reads: set[str] = set()
+    for batch_file in batch_files:
+        batch_path = Path(batch_file)
+        batch_sequences = ad.read_h5ad(batch_path).uns
+        sequences.update(batch_sequences)
+        if "_fwd_" in batch_path.name:
+            fwd_reads.update(batch_sequences.keys())
+        elif "_rev_" in batch_path.name:
+            rev_reads.update(batch_sequences.keys())
+    return sequences, fwd_reads, rev_reads
+
+
 def process_single_bam(
-    bam_index,
-    bam,
-    records_to_analyze,
-    record_FASTA_dict,
-    chromosome_FASTA_dict,
-    tmp_dir,
-    max_reference_length,
-    device,
-    deaminase_footprinting,
-    samtools_backend,
-
-
-
+    bam_index: int,
+    bam: Path,
+    records_to_analyze: set[str],
+    record_FASTA_dict: dict[str, RecordFastaInfo],
+    chromosome_FASTA_dict: dict[str, tuple[str, str]],
+    tmp_dir: Path,
+    max_reference_length: int,
+    device: torch.device,
+    deaminase_footprinting: bool,
+    samtools_backend: str | None,
+    converted_FASTA_record_seq_map: dict[str, tuple[int, str]],
+) -> ad.AnnData | None:
+    """Process a single BAM file into per-record AnnData objects.
+
+    Args:
+        bam_index: Index of the BAM within the processing batch.
+        bam: Path to the BAM file.
+        records_to_analyze: FASTA record IDs that passed the mapping threshold.
+        record_FASTA_dict: FASTA metadata keyed by record ID.
+        chromosome_FASTA_dict: Chromosome sequences for annotations.
+        tmp_dir: Directory for temporary batch files.
+        max_reference_length: Maximum reference length for padding.
+        device: Torch device used for binarization.
+        deaminase_footprinting: Whether direct deamination chemistry was used.
+        samtools_backend: Samtools backend choice for alignment parsing.
+        converted_FASTA_record_seq_map: record to seq map
+
+    Returns:
+        anndata.AnnData | None: Concatenated AnnData object or None if no data.
+
+    Processing Steps:
+        1. Extract base identities and mismatch profiles for each record.
+        2. Binarize modified base identities into feature matrices.
+        3. Encode read sequences into integer arrays and cache batches.
+        4. Build AnnData layers/obs metadata for each record and concatenate.
+    """
+    adata_list: list[ad.AnnData] = []
 
+    # Iterate over BAM records that passed filtering.
     for record in records_to_analyze:
         sample = bam.stem
-
-
-
-
-
-
-
-
-
-
+        record_info = record_FASTA_dict[record]
+        chromosome = record_info.chromosome
+        current_length = record_info.sequence_length
+        # Note, mod_type and strand are only correctly load for conversion smf and not deaminase
+        # However, these variables are only used for conversion smf and not deaminase, so works.
+        mod_type, strand = record_info.conversion, record_info.strand
+        non_converted_sequence = chromosome_FASTA_dict[chromosome][0]
+        record_sequence = converted_FASTA_record_seq_map[record][1]
+
+        # Extract Base Identities for forward and reverse mapped reads.
+        (
+            fwd_bases,
+            rev_bases,
+            mismatch_counts_per_read,
+            mismatch_trend_per_read,
+            mismatch_base_identities,
+            base_quality_scores,
+            read_span_masks,
+        ) = extract_base_identities(
+            bam,
+            record,
+            range(current_length),
+            max_reference_length,
+            record_sequence,
+            samtools_backend,
         )
         mismatch_trend_series = pd.Series(mismatch_trend_per_read)
 
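Note: `_encode_sequence_array` above maps character base calls to small integers and marks everything past the real reference length as padding. A standalone sketch of that encoding step, again with assumed integer values:

```python
import numpy as np

BASE_TO_INT = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4, "PAD": 5}  # assumed values


def encode(read_sequence, valid_length):
    read_sequence = np.asarray(read_sequence, dtype="<U1")
    # Start from the unknown ("N") encoding so unrecognized characters stay 4.
    encoded = np.full(read_sequence.shape, BASE_TO_INT["N"], dtype=np.int16)
    for base in ("A", "C", "G", "T", "N"):
        encoded[read_sequence == base] = BASE_TO_INT[base]
    if valid_length < encoded.size:
        encoded[valid_length:] = BASE_TO_INT["PAD"]  # beyond the real reference
    return encoded


print(encode(list("ACGTX") + ["A"], valid_length=5))  # [0 1 2 3 4 5]
```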
@@ -305,13 +633,12 @@ def process_single_bam(
         merged_bin = {}
 
         # Binarize the Base Identities if they exist
+        # Note, mod_type is always unconverted and strand is always top currently for deaminase smf. this works for now.
         if fwd_bases:
             fwd_bin = binarize_converted_base_identities(
                 fwd_bases,
                 strand,
                 mod_type,
-                bam,
-                device,
                 deaminase_footprinting,
                 mismatch_trend_per_read,
             )
@@ -322,8 +649,6 @@ def process_single_bam(
                 rev_bases,
                 strand,
                 mod_type,
-                bam,
-                device,
                 deaminase_footprinting,
                 mismatch_trend_per_read,
             )
@@ -343,83 +668,140 @@ def process_single_bam(
         sorted_index = sorted(bin_df.index)
         bin_df = bin_df.reindex(sorted_index)
 
-        #
-
-
+        # Integer-encode reads if there is valid data
+        batch_files: list[str] = []
         if fwd_bases:
-
-
+            batch_files.extend(
+                _write_sequence_batches(
+                    fwd_bases,
+                    tmp_dir,
+                    record,
+                    f"{bam_index}_fwd",
+                    current_length,
+                    SEQUENCE_ENCODING_CONFIG,
+                )
             )
-            for ohe_file in fwd_ohe_files:
-                tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
-                one_hot_reads.update(tmp_ohe_dict)
-                del tmp_ohe_dict
 
         if rev_bases:
-
-
+            batch_files.extend(
+                _write_sequence_batches(
+                    rev_bases,
+                    tmp_dir,
+                    record,
+                    f"{bam_index}_rev",
+                    current_length,
+                    SEQUENCE_ENCODING_CONFIG,
+                )
             )
-            for ohe_file in rev_ohe_files:
-                tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
-                one_hot_reads.update(tmp_ohe_dict)
-                del tmp_ohe_dict
 
-
-        if not one_hot_reads:
+        if not batch_files:
             logger.debug(
-                f"[Worker {current_process().pid}] Skipping {sample} - No valid
+                f"[Worker {current_process().pid}] Skipping {sample} - No valid encoded data for {record}."
             )
             continue
 
         gc.collect()
 
-
-
-        read_names = list(one_hot_reads.keys())
-
-        # Skip if no read names exist
-        if not read_names:
+        encoded_reads, fwd_reads, rev_reads = _load_sequence_batches(batch_files)
+        if not encoded_reads:
             logger.debug(
-                f"[Worker {current_process().pid}] Skipping {sample} - No reads found in
+                f"[Worker {current_process().pid}] Skipping {sample} - No reads found in encoded data for {record}."
             )
             continue
 
-        sequence_length =
-
-
-
+        sequence_length = max_reference_length
+        default_sequence = np.full(
+            sequence_length, SEQUENCE_ENCODING_CONFIG.unknown_value, dtype=np.int16
+        )
+        if current_length < sequence_length:
+            default_sequence[current_length:] = SEQUENCE_ENCODING_CONFIG.padding_value
 
-
-
-
-
-
+        encoded_matrix = np.vstack(
+            [encoded_reads.get(read_name, default_sequence) for read_name in sorted_index]
+        )
+        default_mismatch_sequence = np.full(
+            sequence_length, SEQUENCE_ENCODING_CONFIG.unknown_value, dtype=np.int16
+        )
+        if current_length < sequence_length:
+            default_mismatch_sequence[current_length:] = SEQUENCE_ENCODING_CONFIG.padding_value
+        mismatch_encoded_matrix = np.vstack(
+            [
+                mismatch_base_identities.get(read_name, default_mismatch_sequence)
+                for read_name in sorted_index
+            ]
+        )
+        default_quality_sequence = np.full(sequence_length, -1, dtype=np.int16)
+        quality_matrix = np.vstack(
+            [
+                base_quality_scores.get(read_name, default_quality_sequence)
+                for read_name in sorted_index
+            ]
+        )
+        default_read_span = np.zeros(sequence_length, dtype=np.int16)
+        read_span_matrix = np.vstack(
+            [read_span_masks.get(read_name, default_read_span) for read_name in sorted_index]
+        )
 
         # Convert to AnnData
         X = bin_df.values.astype(np.float32)
         adata = ad.AnnData(X)
         adata.obs_names = bin_df.index.astype(str)
         adata.var_names = bin_df.columns.astype(str)
-        adata.obs[
+        adata.obs[SAMPLE] = [sample] * len(adata)
         try:
             barcode = sample.split("barcode")[1]
         except Exception:
             barcode = np.nan
-        adata.obs[
-        adata.obs[
-        adata.obs[
-        adata.obs[
-        adata.obs[
-        adata.obs[
-
-
-
-
-
-
-
-
-
+        adata.obs[BARCODE] = [int(barcode)] * len(adata)
+        adata.obs[BARCODE] = adata.obs[BARCODE].astype(str)
+        adata.obs[REFERENCE] = [chromosome] * len(adata)
+        adata.obs[STRAND] = [strand] * len(adata)
+        adata.obs[DATASET] = [mod_type] * len(adata)
+        adata.obs[READ_MISMATCH_TREND] = adata.obs_names.map(mismatch_trend_series)
+
+        # Currently, deaminase footprinting uses mismatch trend to define the strand.
+        if deaminase_footprinting:
+            is_ct = adata.obs[READ_MISMATCH_TREND] == "C->T"
+            is_ga = adata.obs[READ_MISMATCH_TREND] == "G->A"
+
+            adata.obs.loc[is_ct, STRAND] = "top"
+            adata.obs.loc[is_ga, STRAND] = "bottom"
+        # Currently, conversion footprinting uses strand to define the mismatch trend.
+        else:
+            is_top = adata.obs[STRAND] == "top"
+            is_bottom = adata.obs[STRAND] == "bottom"
+
+            adata.obs.loc[is_top, READ_MISMATCH_TREND] = "C->T"
+            adata.obs.loc[is_bottom, READ_MISMATCH_TREND] = "G->A"
+
+        adata.obs[REFERENCE_DATASET_STRAND] = (
+            adata.obs[REFERENCE].astype(str)
+            + "_"
+            + adata.obs[DATASET].astype(str)
+            + "_"
+            + adata.obs[STRAND].astype(str)
+        )
+
+        adata.obs[REFERENCE_STRAND] = (
+            adata.obs[REFERENCE].astype(str) + "_" + adata.obs[STRAND].astype(str)
+        )
+
+        read_mapping_direction = []
+        for read_id in adata.obs_names:
+            if read_id in fwd_reads:
+                read_mapping_direction.append("fwd")
+            elif read_id in rev_reads:
+                read_mapping_direction.append("rev")
+            else:
+                read_mapping_direction.append("unk")
+
+        adata.obs[READ_MAPPING_DIRECTION] = read_mapping_direction
+
+        # Attach integer sequence encoding to layers
+        adata.layers[SEQUENCE_INTEGER_ENCODING] = encoded_matrix
+        adata.layers[MISMATCH_INTEGER_ENCODING] = mismatch_encoded_matrix
+        adata.layers[BASE_QUALITY_SCORES] = quality_matrix
+        adata.layers[READ_SPAN_MASK] = read_span_matrix
 
         adata_list.append(adata)
 
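Note: the matrix assembly in this hunk keeps every layer row-aligned to `sorted_index` by substituting a default vector, via `dict.get`, for reads missing from a given per-read mapping. The pattern in isolation:

```python
import numpy as np

sorted_index = ["read1", "read2", "read3"]
per_read = {"read1": np.array([1, 2, 3]), "read3": np.array([7, 8, 9])}
default = np.full(3, -1)  # stand-in row for a missing read

matrix = np.vstack([per_read.get(name, default) for name in sorted_index])
# matrix[1] is all -1; rows stay in sorted_index order.
```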
@@ -427,27 +809,56 @@ def process_single_bam(
 
 
 def timestamp():
-    """
+    """Return a formatted timestamp for logging.
+
+    Returns:
+        str: Timestamp string in the format ``[YYYY-MM-DD HH:MM:SS]``.
+    """
     return time.strftime("[%Y-%m-%d %H:%M:%S]")
 
 
 def worker_function(
-    bam_index,
-    bam,
-    records_to_analyze,
-    shared_record_FASTA_dict,
-    chromosome_FASTA_dict,
-    tmp_dir,
-    h5_dir,
-    max_reference_length,
-    device,
-    deaminase_footprinting,
-    samtools_backend,
+    bam_index: int,
+    bam: Path,
+    records_to_analyze: set[str],
+    shared_record_FASTA_dict: dict[str, RecordFastaInfo],
+    chromosome_FASTA_dict: dict[str, tuple[str, str]],
+    tmp_dir: Path,
+    h5_dir: Path,
+    max_reference_length: int,
+    device: torch.device,
+    deaminase_footprinting: bool,
+    samtools_backend: str | None,
+    converted_FASTA_record_seq_map: dict[str, tuple[int, str]],
     progress_queue,
-    log_level,
-    log_file,
+    log_level: int,
+    log_file: Path | None,
 ):
-    """
+    """Process a single BAM and write the output to an H5AD file.
+
+    Args:
+        bam_index: Index of the BAM within the processing batch.
+        bam: Path to the BAM file.
+        records_to_analyze: FASTA record IDs that passed the mapping threshold.
+        shared_record_FASTA_dict: Shared FASTA metadata keyed by record ID.
+        chromosome_FASTA_dict: Chromosome sequences for annotations.
+        tmp_dir: Directory for temporary batch files.
+        h5_dir: Directory for per-BAM H5AD outputs.
+        max_reference_length: Maximum reference length for padding.
+        device: Torch device used for binarization.
+        deaminase_footprinting: Whether direct deamination chemistry was used.
+        samtools_backend: Samtools backend choice for alignment parsing.
+        converted_FASTA_record_seq_map: record to sequence map
+        progress_queue: Queue used to signal completion.
+        log_level: Logging level to configure in workers.
+        log_file: Optional log file path.
+
+    Processing Steps:
+        1. Skip processing if an output H5AD already exists.
+        2. Filter records to those present in the FASTA metadata.
+        3. Run per-record processing and write AnnData output.
+        4. Signal completion via the progress queue.
+    """
     _ensure_worker_logging(log_level, log_file)
     worker_id = current_process().pid  # Get worker process ID
     sample = bam.stem
@@ -485,6 +896,7 @@ def worker_function(
             device,
             deaminase_footprinting,
             samtools_backend,
+            converted_FASTA_record_seq_map,
         )
 
     if adata is not None:
@@ -505,19 +917,43 @@
 
 
 def process_bams_parallel(
-    bam_path_list,
-    records_to_analyze,
-    record_FASTA_dict,
-    chromosome_FASTA_dict,
-    tmp_dir,
-    h5_dir,
-    num_threads,
-    max_reference_length,
-    device,
-    deaminase_footprinting,
-    samtools_backend,
-
-
+    bam_path_list: list[Path],
+    records_to_analyze: set[str],
+    record_FASTA_dict: dict[str, RecordFastaInfo],
+    chromosome_FASTA_dict: dict[str, tuple[str, str]],
+    tmp_dir: Path,
+    h5_dir: Path,
+    num_threads: int,
+    max_reference_length: int,
+    device: torch.device,
+    deaminase_footprinting: bool,
+    samtools_backend: str | None,
+    converted_FASTA_record_seq_map: dict[str, tuple[int, str]],
+) -> ad.AnnData | None:
+    """Process BAM files in parallel and concatenate the resulting AnnData.
+
+    Args:
+        bam_path_list: List of BAM files to process.
+        records_to_analyze: FASTA record IDs that passed the mapping threshold.
+        record_FASTA_dict: FASTA metadata keyed by record ID.
+        chromosome_FASTA_dict: Chromosome sequences for annotations.
+        tmp_dir: Directory for temporary batch files.
+        h5_dir: Directory for per-BAM H5AD outputs.
+        num_threads: Number of worker processes.
+        max_reference_length: Maximum reference length for padding.
+        device: Torch device used for binarization.
+        deaminase_footprinting: Whether direct deamination chemistry was used.
+        samtools_backend: Samtools backend choice for alignment parsing.
+        converted_FASTA_record_seq_map: map from converted record name to the converted reference length and sequence.
+
+    Returns:
+        anndata.AnnData | None: Concatenated AnnData or None if no H5ADs produced.
+
+    Processing Steps:
+        1. Spawn worker processes to handle each BAM.
+        2. Track completion via a multiprocessing queue.
+        3. Concatenate per-BAM H5AD files into a final AnnData.
+    """
     make_dirs(h5_dir)  # Ensure h5_dir exists
 
     logger.info(f"Starting parallel BAM processing with {num_threads} threads...")
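Note: `process_bams_parallel` tracks worker completion through a managed queue, per its docstring. A minimal sketch of that pattern — not the package's exact implementation:

```python
from multiprocessing import Manager, Pool


def work(i, queue):
    queue.put(i)  # signal completion back to the parent


if __name__ == "__main__":
    with Manager() as manager, Pool(processes=2) as pool:
        queue = manager.Queue()
        results = [pool.apply_async(work, (i, queue)) for i in range(4)]
        finished = [queue.get() for _ in range(4)]  # blocks until all workers report
```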
@@ -543,6 +979,7 @@ def process_bams_parallel(
                 device,
                 deaminase_footprinting,
                 samtools_backend,
+                converted_FASTA_record_seq_map,
                 progress_queue,
                 log_level,
                 log_file,
@@ -583,7 +1020,16 @@
 
 
 def _log_async_result_errors(results, bam_path_list):
-    """Log worker failures captured by multiprocessing AsyncResult objects.
+    """Log worker failures captured by multiprocessing AsyncResult objects.
+
+    Args:
+        results: Iterable of AsyncResult objects from multiprocessing.
+        bam_path_list: List of BAM paths matching the async results.
+
+    Processing Steps:
+        1. Iterate over async results.
+        2. Retrieve results to surface worker exceptions.
+    """
     for bam, result in zip(bam_path_list, results):
         if not result.ready():
             continue
@@ -594,6 +1040,15 @@ def _log_async_result_errors(results, bam_path_list):
 
 
 def _get_logger_config() -> tuple[int, Path | None]:
+    """Return the active smftools logger level and optional file path.
+
+    Returns:
+        tuple[int, Path | None]: Log level and log file path (if configured).
+
+    Processing Steps:
+        1. Inspect the smftools logger for configured handlers.
+        2. Extract log level and file handler path.
+    """
     smftools_logger = logging.getLogger("smftools")
     level = smftools_logger.level
     if level == logging.NOTSET:
@@ -607,6 +1062,16 @@ def _get_logger_config() -> tuple[int, Path | None]:
 
 
 def _ensure_worker_logging(log_level: int, log_file: Path | None) -> None:
+    """Ensure worker processes have logging configured.
+
+    Args:
+        log_level: Logging level to configure.
+        log_file: Optional log file path.
+
+    Processing Steps:
+        1. Check if handlers are already configured.
+        2. Initialize logging with the provided level and file path.
+    """
     smftools_logger = logging.getLogger("smftools")
     if not smftools_logger.handlers:
         setup_logging(level=log_level, log_file=log_file)
@@ -619,21 +1084,17 @@ def delete_intermediate_h5ads_and_tmpdir(
     dry_run: bool = False,
     verbose: bool = True,
 ):
-    """
-
-
-
-
-
-    If
-
-
-
-
-    dry_run : bool
-        If True, print what *would* be removed but do not actually delete.
-    verbose : bool
-        Print progress / warnings.
+    """Delete intermediate .h5ad files and a temporary directory.
+
+    Args:
+        h5_dir: Directory path or iterable of file paths to inspect for `.h5ad` files.
+        tmp_dir: Optional directory to remove recursively.
+        dry_run: If True, log what would be removed without deleting.
+        verbose: If True, log progress and warnings.
+
+    Processing Steps:
+        1. Remove `.h5ad` files (excluding `.gz`) from the provided directory or list.
+        2. Optionally remove the temporary directory tree.
     """
 
     # Helper: remove a single file path (Path-like or string)