smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/fasta_functions.py +251 -89 (rendered hunks; removed lines truncated by the diff renderer are marked with …)

@@ -1,36 +1,127 @@
-from …
+from __future__ import annotations
 
-import …
+import gzip
+import shutil
 import subprocess
+from concurrent.futures import ProcessPoolExecutor
+from importlib.util import find_spec
 from pathlib import Path
-
-from typing import Union, List, Dict, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, Tuple
 
 import numpy as np
-import gzip
-
 from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
 from Bio.Seq import Seq
-from …
-import pysam
+from Bio.SeqRecord import SeqRecord
 
-from …
-from …
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+from ..readwrite import time_string
+
+logger = get_logger(__name__)
+
+if TYPE_CHECKING:
+    import pysam as pysam_module
+
+
+def _require_pysam() -> "pysam_module":
+    if pysam_types is not None:
+        return pysam_types
+    return require("pysam", extra="pysam", purpose="FASTA access")
+
+
+pysam_types = None
+if find_spec("pysam") is not None:
+    pysam_types = require("pysam", extra="pysam", purpose="FASTA access")
+
+
+def _resolve_fasta_backend() -> str:
+    """Resolve the backend to use for FASTA access."""
+    if pysam_types is not None:
+        return "python"
+    if shutil is not None and shutil.which("samtools"):
+        return "cli"
+    raise RuntimeError("FASTA access requires pysam or samtools in PATH.")
 
-
-
+
+def _ensure_fasta_index(fasta: Path) -> None:
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if fai.exists():
+        return
+    if subprocess is None or shutil is None or not shutil.which("samtools"):
+        pysam_mod = _require_pysam()
+        pysam_mod.faidx(str(fasta))
+        return
+    cp = subprocess.run(
+        ["samtools", "faidx", str(fasta)],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+
+
+def _bed_to_faidx_region(chrom: str, start: int, end: int) -> str:
+    """Convert 0-based half-open BED coords to samtools faidx region."""
+    start1 = start + 1
+    end1 = end
+    if start1 > end1:
+        start1, end1 = end1, start1
+    return f"{chrom}:{start1}-{end1}"
+
+
+def _fetch_sequence_with_samtools(fasta: Path, chrom: str, start: int, end: int) -> str:
+    if subprocess is None or shutil is None:
+        raise RuntimeError("samtools backend is unavailable.")
+    if not shutil.which("samtools"):
+        raise RuntimeError("samtools is required but not available in PATH.")
+    region = _bed_to_faidx_region(chrom, start, end)
+    cp = subprocess.run(
+        ["samtools", "faidx", str(fasta), region],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+    lines = [line.strip() for line in cp.stdout.splitlines() if line and not line.startswith(">")]
+    return "".join(lines)
+
+
+def _convert_FASTA_record(
+    record: SeqRecord,
+    modification_type: str,
+    strand: str,
+    unconverted: str,
+) -> SeqRecord:
+    """Convert a FASTA record based on modification type and strand.
+
+    Args:
+        record: Input FASTA record.
+        modification_type: Modification type (e.g., ``5mC`` or ``6mA``).
+        strand: Strand label (``top`` or ``bottom``).
+        unconverted: Label for the unconverted record type.
+
+    Returns:
+        Bio.SeqRecord.SeqRecord: Converted FASTA record.
+
+    Raises:
+        ValueError: If the modification type/strand combination is invalid.
+    """
     conversion_maps = {
-        (…
-        (…
-        (…
-        (…
+        ("5mC", "top"): ("C", "T"),
+        ("5mC", "bottom"): ("G", "A"),
+        ("6mA", "top"): ("A", "G"),
+        ("6mA", "bottom"): ("T", "C"),
     }
 
     sequence = str(record.seq).upper()
 
     if modification_type == unconverted:
-        return SeqRecord(
+        return SeqRecord(
+            Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description
+        )
 
     if (modification_type, strand) not in conversion_maps:
         raise ValueError(f"Invalid combination: {modification_type}, {strand}")
@@ -38,62 +129,80 @@ def _convert_FASTA_record(record, modification_type, strand, unconverted):
     original_base, converted_base = conversion_maps[(modification_type, strand)]
     new_seq = sequence.replace(original_base, converted_base)
 
-    return SeqRecord(
+    return SeqRecord(
+        Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description
+    )
+
+
+def _process_fasta_record(
+    args: tuple[SeqRecord, Iterable[str], Iterable[str], str],
+) -> list[SeqRecord]:
+    """Process a single FASTA record for parallel conversion.
 
-def _process_fasta_record(args):
-    """
-    Processes a single FASTA record for parallel execution.
     Args:
-        args …
+        args: Tuple containing ``(record, modification_types, strands, unconverted)``.
+
     Returns:
-        list …
+        list[Bio.SeqRecord.SeqRecord]: Converted FASTA records.
     """
     record, modification_types, strands, unconverted = args
     modified_records = []
-
+
     for modification_type in modification_types:
         for i, strand in enumerate(strands):
             if i > 0 and modification_type == unconverted:
                 continue  # Ensure unconverted is added only once
 
-            modified_records.append(
+            modified_records.append(
+                _convert_FASTA_record(record, modification_type, strand, unconverted)
+            )
 
     return modified_records
 
-def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
-    """
-    Converts an input FASTA file and writes a new converted FASTA file efficiently.
 
-
-
-
-
-
-
-
+def generate_converted_FASTA(
+    input_fasta: str | Path,
+    modification_types: list[str],
+    strands: list[str],
+    output_fasta: str | Path,
+    num_threads: int = 4,
+    chunk_size: int = 500,
+) -> None:
+    """Convert a FASTA file and write converted records to disk.
 
-
-
+    Args:
+        input_fasta: Path to the unconverted FASTA file.
+        modification_types: List of modification types (``5mC``, ``6mA``, or unconverted).
+        strands: List of strands (``top``, ``bottom``).
+        output_fasta: Path to the converted FASTA output file.
+        num_threads: Number of parallel workers to use.
+        chunk_size: Number of records to process per write batch.
     """
     unconverted = modification_types[0]
     input_fasta = str(input_fasta)
     output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
-    open_func = gzip.open if input_fasta.endswith(…
-    file_mode = …
+    open_func = gzip.open if input_fasta.endswith(".gz") else open
+    file_mode = "rt" if input_fasta.endswith(".gz") else "r"
 
     def _fasta_record_generator():
-        """
+        """Lazily yields FASTA records from file."""
         with open_func(input_fasta, file_mode) as handle:
-            for record in SeqIO.parse(handle, …
+            for record in SeqIO.parse(handle, "fasta"):
                 yield record
 
-    with …
+    with (
+        open(output_fasta, "w") as output_handle,
+        ProcessPoolExecutor(max_workers=num_threads) as executor,
+    ):
         # Process records in parallel using a named function (avoiding lambda)
         results = executor.map(
             _process_fasta_record,
-            (…
+            (
+                (record, modification_types, strands, unconverted)
+                for record in _fasta_record_generator()
+            ),
         )
 
         buffer = []
@@ -102,16 +211,26 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
 
             # Write out in chunks to save memory
             if len(buffer) >= chunk_size:
-                SeqIO.write(buffer, output_handle, …
+                SeqIO.write(buffer, output_handle, "fasta")
                 buffer.clear()
 
         # Write any remaining records
         if buffer:
-            SeqIO.write(buffer, output_handle, …
+            SeqIO.write(buffer, output_handle, "fasta")
+
 
 def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """Index a FASTA file and optionally write chromosome sizes.
+
+    Args:
+        fasta: Path to the FASTA file.
+        write_chrom_sizes: Whether to write a ``.chrom.sizes`` file.
+
+    Returns:
+        Path: Path to the index file or chromosome sizes file.
+    """
     fasta = Path(fasta)
-
+    _require_pysam().faidx(str(fasta))  # creates <fasta>.fai
 
     fai = fasta.with_suffix(fasta.suffix + ".fai")
     if write_chrom_sizes:
@@ -123,9 +242,15 @@ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
         return chrom_sizes
     return fai
 
+
 def get_chromosome_lengths(fasta: str | Path) -> Path:
-    """
-
+    """Create or reuse ``<fasta>.chrom.sizes`` derived from the FASTA index.
+
+    Args:
+        fasta: Path to the FASTA file.
+
+    Returns:
+        Path: Path to the chromosome sizes file.
     """
     fasta = Path(fasta)
     fai = fasta.with_suffix(fasta.suffix + ".fai")
@@ -133,7 +258,7 @@ def get_chromosome_lengths(fasta: str | Path) -> Path:
         index_fasta(fasta, write_chrom_sizes=True)  # will also create .chrom.sizes
     chrom_sizes = fasta.with_suffix(".chrom.sizes")
     if chrom_sizes.exists():
-
+        logger.debug(f"Using existing chrom length file: {chrom_sizes}")
         return chrom_sizes
 
     # Build chrom.sizes from .fai
@@ -143,10 +268,15 @@ def get_chromosome_lengths(fasta: str | Path) -> Path:
             out.write(f"{chrom}\t{size}\n")
     return chrom_sizes
 
+
 def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
-    """
-
-
+    """Return record lengths and sequences from a FASTA file.
+
+    Args:
+        fasta_file: Path to the FASTA file.
+
+    Returns:
+        dict[str, tuple[int, str]]: Mapping of record ID to ``(length, sequence)``.
     """
    fasta_file = Path(fasta_file)
    print(f"{time_string()}: Opening FASTA file {fasta_file}")
@@ -157,28 +287,35 @@ def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
         record_dict[rec.id] = (len(seq), seq)
     return record_dict
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+def find_conversion_sites(
+    fasta_file: str | Path,
+    modification_type: str,
+    conversions: list[str],
+    deaminase_footprinting: bool = False,
+) -> dict[str, list]:
+    """Find genomic coordinates of modified bases in a reference FASTA.
+
+    Args:
+        fasta_file: Path to the converted reference FASTA.
+        modification_type: Modification type (``5mC``, ``6mA``, or ``unconverted``).
+        conversions: List of conversion types (first entry is the unconverted record type).
+        deaminase_footprinting: Whether the footprinting used direct deamination chemistry.
+
+    Returns:
+        dict[str, list]: Mapping of record name to
+        ``[sequence length, top strand coordinates, bottom strand coordinates, sequence, complement]``.
+
+    Raises:
+        ValueError: If the modification type is invalid.
     """
     unconverted = conversions[0]
     record_dict = {}
 
     # Define base mapping based on modification type
     base_mappings = {
-
-
+        "5mC": ("C", "G"),  # Cytosine and Guanine
+        "6mA": ("A", "T"),  # Adenine and Thymine
     }
 
     # Read FASTA file and process records
@@ -200,22 +337,35 @@ def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_
             top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
             bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
 
-            record_dict[record.id] = [
+            record_dict[record.id] = [
+                sequence_length,
+                top_strand_coordinates,
+                bottom_strand_coordinates,
+                sequence,
+                complement,
+            ]
 
         else:
-            raise ValueError(
+            raise ValueError(
+                f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'."
+            )
 
     return record_dict
 
+
 def subsample_fasta_from_bed(
     input_FASTA: str | Path,
     input_bed: str | Path,
     output_directory: str | Path,
-    output_FASTA: str | Path
+    output_FASTA: str | Path,
 ) -> None:
-    """
-
-
+    """Subsample a FASTA using BED coordinates.
+
+    Args:
+        input_FASTA: Genome-wide FASTA path.
+        input_bed: BED file path containing coordinate windows of interest.
+        output_directory: Directory to write the subsampled FASTA.
+        output_FASTA: Output FASTA path.
     """
 
     # Normalize everything to Path
@@ -227,29 +377,41 @@ def subsample_fasta_from_bed(
     # Ensure output directory exists
     output_directory.mkdir(parents=True, exist_ok=True)
 
-
+    backend = _resolve_fasta_backend()
+    _ensure_fasta_index(input_FASTA)
 
-
-
+    fasta_handle = None
+    if backend == "python":
+        pysam_mod = _require_pysam()
+        fasta_handle = pysam_mod.FastaFile(str(input_FASTA))
 
     # Open BED + output FASTA
-    with input_bed.open("r") as bed, …
+    with input_bed.open("r") as bed, output_FASTA.open("w") as out_fasta:
         for line in bed:
             fields = line.strip().split()
             chrom = fields[0]
-            start = int(fields[1])
-            end …
-            desc …
-
-            if …
-
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])  # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if backend == "python":
+                assert fasta_handle is not None
+                if chrom not in fasta_handle.references:
+                    logger.warning(f"{chrom} not found in FASTA")
+                    continue
+                sequence = fasta_handle.fetch(chrom, start, end)
+            else:
+                sequence = _fetch_sequence_with_samtools(input_FASTA, chrom, start, end)
+
+            if not sequence:
+                logger.warning(f"{chrom} not found in FASTA")
                 continue
 
-            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
-            sequence = fasta[chrom][start:end].seq
-
             header = f">{chrom}:{start}-{end}"
             if desc:
                 header += f" {desc}"
 
-            out_fasta.write(f"{header}\n{sequence}\n")
+            out_fasta.write(f"{header}\n{sequence}\n")
+
+    if fasta_handle is not None:
+        fasta_handle.close()
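
For orientation, here is a minimal usage sketch of the reworked FASTA helpers, assembled only from the signatures and docstrings visible in the hunks above. The file paths are hypothetical, the import assumes the functions are used directly from `smftools.informatics.fasta_functions` (whether they are re-exported elsewhere is not visible in this diff), and `index_fasta` needs `pysam` installed because it calls `_require_pysam()`.

```python
from smftools.informatics.fasta_functions import (
    find_conversion_sites,
    generate_converted_FASTA,
    get_chromosome_lengths,
    index_fasta,
    subsample_fasta_from_bed,
)

# The first entry of modification_types / conversions is treated as the
# unconverted record label (see `unconverted = modification_types[0]`).
modification_types = ["unconverted", "5mC", "6mA"]
strands = ["top", "bottom"]

# Write a converted reference with one record per (modification type, strand).
generate_converted_FASTA(
    "reference.fa",             # hypothetical input FASTA
    modification_types,
    strands,
    "reference_converted.fa",   # hypothetical output FASTA
    num_threads=4,
    chunk_size=500,
)

# Index the converted reference (pysam required) and derive <fasta>.chrom.sizes.
index_fasta("reference_converted.fa", write_chrom_sizes=True)
chrom_sizes = get_chromosome_lengths("reference_converted.fa")

# Per-record modified-base coordinates:
# record id -> [length, top-strand coords, bottom-strand coords, sequence, complement]
sites = find_conversion_sites(
    "reference_converted.fa",
    modification_type="5mC",
    conversions=modification_types,
)

# Extract BED-defined windows from a genome-wide FASTA. This entry point
# resolves a backend and can run with either pysam or the samtools CLI.
subsample_fasta_from_bed(
    "genome.fa",
    "regions_of_interest.bed",
    "subsampled",
    "subsampled/regions_of_interest.fa",
)
```

Of the helpers shown above, only `subsample_fasta_from_bed` goes through `_resolve_fasta_backend()` and can fall back to the samtools CLI when `pysam` is absent; `index_fasta` still requires `pysam` via `_require_pysam()`.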