smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/fasta_functions.py (new file)
@@ -0,0 +1,255 @@
+from ..readwrite import make_dirs, time_string
+
+import os
+import subprocess
+from pathlib import Path
+
+from typing import Union, List, Dict, Tuple
+
+import numpy as np
+import gzip
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from pyfaidx import Fasta
+import pysam
+
+from concurrent.futures import ProcessPoolExecutor
+from itertools import chain
+
+def _convert_FASTA_record(record, modification_type, strand, unconverted):
+    """ Converts a FASTA record based on modification type and strand. """
+    conversion_maps = {
+        ('5mC', 'top'): ('C', 'T'),
+        ('5mC', 'bottom'): ('G', 'A'),
+        ('6mA', 'top'): ('A', 'G'),
+        ('6mA', 'bottom'): ('T', 'C')
+    }
+
+    sequence = str(record.seq).upper()
+
+    if modification_type == unconverted:
+        return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)
+
+    if (modification_type, strand) not in conversion_maps:
+        raise ValueError(f"Invalid combination: {modification_type}, {strand}")
+
+    original_base, converted_base = conversion_maps[(modification_type, strand)]
+    new_seq = sequence.replace(original_base, converted_base)
+
+    return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)
+
+def _process_fasta_record(args):
+    """
+    Processes a single FASTA record for parallel execution.
+    Args:
+        args (tuple): (record, modification_types, strands, unconverted)
+    Returns:
+        list of modified SeqRecord objects.
+    """
+    record, modification_types, strands, unconverted = args
+    modified_records = []
+
+    for modification_type in modification_types:
+        for i, strand in enumerate(strands):
+            if i > 0 and modification_type == unconverted:
+                continue  # Ensure unconverted is added only once
+
+            modified_records.append(_convert_FASTA_record(record, modification_type, strand, unconverted))
+
+    return modified_records
+
+def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
+    """
+    Converts an input FASTA file and writes a new converted FASTA file efficiently.
+
+    Parameters:
+        input_fasta (str): Path to the unconverted FASTA file.
+        modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
+        strands (list): List of strands ('top', 'bottom').
+        output_fasta (str): Path to the converted FASTA output file.
+        num_threads (int): Number of parallel threads to use.
+        chunk_size (int): Number of records to process per write batch.
+
+    Returns:
+        None (Writes the converted FASTA file).
+    """
+    unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
+
+    # Detect if input is gzipped
+    open_func = gzip.open if input_fasta.endswith('.gz') else open
+    file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
+
+    def _fasta_record_generator():
+        """ Lazily yields FASTA records from file. """
+        with open_func(input_fasta, file_mode) as handle:
+            for record in SeqIO.parse(handle, 'fasta'):
+                yield record
+
+    with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
+        # Process records in parallel using a named function (avoiding lambda)
+        results = executor.map(
+            _process_fasta_record,
+            ((record, modification_types, strands, unconverted) for record in _fasta_record_generator())
+        )
+
+        buffer = []
+        for modified_records in results:
+            buffer.extend(modified_records)
+
+            # Write out in chunks to save memory
+            if len(buffer) >= chunk_size:
+                SeqIO.write(buffer, output_handle, 'fasta')
+                buffer.clear()
+
+        # Write any remaining records
+        if buffer:
+            SeqIO.write(buffer, output_handle, 'fasta')
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # creates <fasta>.fai
+
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if write_chrom_sizes:
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with fai.open() as f_in, chrom_sizes.open("w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+    return fai
+
+def get_chromosome_lengths(fasta: str | Path) -> Path:
+    """
+    Create (or reuse) <fasta>.chrom.sizes, derived from the FASTA index.
+    """
+    fasta = Path(fasta)
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if not fai.exists():
+        index_fasta(fasta, write_chrom_sizes=True)  # will also create .chrom.sizes
+    chrom_sizes = fasta.with_suffix(".chrom.sizes")
+    if chrom_sizes.exists():
+        print(f"Using existing chrom length file: {chrom_sizes}")
+        return chrom_sizes
+
+    # Build chrom.sizes from .fai
+    with fai.open() as f_in, chrom_sizes.open("w") as out:
+        for line in f_in:
+            chrom, size = line.split()[:2]
+            out.write(f"{chrom}\t{size}\n")
+    return chrom_sizes
+
+def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
+    """
+    Return {record_id: (length, sequence)} from a FASTA.
+    Direct methylation specific
+    """
+    fasta_file = Path(fasta_file)
+    print(f"{time_string()}: Opening FASTA file {fasta_file}")
+    record_dict: Dict[str, Tuple[int, str]] = {}
+    with fasta_file.open("r") as f:
+        for rec in SeqIO.parse(f, "fasta"):
+            seq = str(rec.seq).upper()
+            record_dict[rec.id] = (len(seq), seq)
+    return record_dict
+
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
+    """
+    Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
+
+    Parameters:
+        fasta_file (str): Path to the converted reference FASTA.
+        modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+
+    Returns:
+        dict: Dictionary where keys are **both unconverted & converted record names**.
+              Values contain:
+              [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
+    """
+    unconverted = conversions[0]
+    record_dict = {}
+
+    # Define base mapping based on modification type
+    base_mappings = {
+        '5mC': ('C', 'G'),  # Cytosine and Guanine
+        '6mA': ('A', 'T')   # Adenine and Thymine
+    }
+
+    # Read FASTA file and process records
+    with open(fasta_file, "r") as f:
+        for record in SeqIO.parse(f, "fasta"):
+            if unconverted in record.id or deaminase_footprinting:
+                sequence = str(record.seq).upper()
+                complement = str(record.seq.complement()).upper()
+                sequence_length = len(sequence)
+
+                # Unconverted case: store the full sequence without coordinate filtering
+                if modification_type == unconverted:
+                    record_dict[record.id] = [sequence_length, [], [], sequence, complement]
+
+                # Process converted records: extract modified base positions
+                elif modification_type in base_mappings:
+                    top_base, bottom_base = base_mappings[modification_type]
+                    seq_array = np.array(list(sequence))
+                    top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
+                    bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
+
+                    record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
+
+                else:
+                    raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
+
+    return record_dict
+
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])    # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
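
For orientation, here is a minimal usage sketch of the new FASTA helpers, assuming this hunk is smftools/informatics/fasta_functions.py (the only +255 entry in the file listing above); the paths and argument values are hypothetical, not taken from the package:

    from pathlib import Path

    from smftools.informatics.fasta_functions import (
        generate_converted_FASTA,
        index_fasta,
        find_conversion_sites,
    )

    ref = Path("refs/amplicons.fasta")                  # hypothetical input reference
    converted = Path("refs/amplicons_converted.fasta")  # hypothetical output

    # Write unconverted plus C->T (top) / G->A (bottom) converted records
    # for 5mC footprinting; the first modification type is treated as the
    # unconverted record type.
    generate_converted_FASTA(
        ref,
        modification_types=["unconverted", "5mC"],
        strands=["top", "bottom"],
        output_fasta=converted,
    )

    # Index the converted reference; also writes a .chrom.sizes file.
    chrom_sizes = index_fasta(converted, write_chrom_sizes=True)

    # Locate C positions (top strand) and G positions (bottom strand)
    # on the unconverted records.
    sites = find_conversion_sites(converted, "5mC", conversions=["unconverted", "5mC"])

Note that generate_converted_FASTA and find_conversion_sites both treat the first element of their modification/conversion list as the unconverted type, so the two calls must agree on it.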

smftools/informatics/h5ad_functions.py (new file)
@@ -0,0 +1,197 @@
+from pathlib import Path
+import pandas as pd
+import numpy as np
+import scipy.sparse as sp
+from typing import Optional, List, Dict, Union
+
+def add_demux_type_annotation(
+    adata,
+    double_demux_source,
+    sep: str = "\t",
+    read_id_col: str = "read_id",
+    barcode_col: str = "barcode",
+):
+    """
+    Add adata.obs["demux_type"]:
+      - "double" if read_id appears in the *double demux* TSV
+      - "single" otherwise
+
+    Rows where barcode == "unclassified" in the demux TSV are ignored.
+
+    Parameters
+    ----------
+    adata : AnnData
+        AnnData object whose obs_names are read_ids.
+    double_demux_source : str | Path | list[str]
+        Either:
+          - path to a TSV/TXT of dorado demux results
+          - a list of read_ids
+    """
+
+    # -----------------------------
+    # If it's a file → load TSV
+    # -----------------------------
+    if isinstance(double_demux_source, (str, Path)):
+        file_path = Path(double_demux_source)
+        if not file_path.exists():
+            raise FileNotFoundError(f"File does not exist: {file_path}")
+
+        df = pd.read_csv(file_path, sep=sep, dtype=str)
+
+        # If the file has only one column → treat as a simple read list
+        if df.shape[1] == 1:
+            read_ids = df.iloc[:, 0].tolist()
+        else:
+            # Validate columns
+            if read_id_col not in df.columns:
+                raise ValueError(f"TSV must contain a '{read_id_col}' column.")
+            if barcode_col not in df.columns:
+                raise ValueError(f"TSV must contain a '{barcode_col}' column.")
+
+            # Drop unclassified reads
+            df = df[df[barcode_col].str.lower() != "unclassified"]
+
+            # Extract read_ids
+            read_ids = df[read_id_col].tolist()
+
+    # -----------------------------
+    # If user supplied list-of-ids
+    # -----------------------------
+    else:
+        read_ids = list(double_demux_source)
+
+    # Deduplicate for speed
+    double_set = set(read_ids)
+
+    # Boolean lookup in AnnData
+    is_double = adata.obs_names.isin(double_set)
+
+    adata.obs["demux_type"] = np.where(is_double, "double", "single")
+    adata.obs["demux_type"] = adata.obs["demux_type"].astype("category")
+
+    return adata
+
+def add_read_length_and_mapping_qc(
+    adata,
+    bam_files: Optional[List[str]] = None,
+    read_metrics: Optional[Dict[str, Union[list, tuple]]] = None,
+    uns_flag: str = "add_read_length_and_mapping_qc_performed",
+    extract_read_features_from_bam_callable=None,
+    bypass: bool = False,
+    force_redo: bool = True
+):
+    """
+    Populate adata.obs with read/mapping QC columns.
+
+    Parameters
+    ----------
+    adata
+        AnnData to annotate (modified in-place).
+    bam_files
+        Optional list of BAM files to extract metrics from. Ignored if read_metrics is supplied.
+    read_metrics
+        Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality].
+        If provided, it is used directly and bam_files is ignored.
+    uns_flag
+        Key in adata.uns used to record that QC was performed (the default name is kept for backward compatibility).
+    extract_read_features_from_bam_callable
+        Optional callable(bam_path) -> dict mapping read_name -> list/tuple of metrics.
+        If not provided and bam_files is given, the function will attempt to call
+        `extract_read_features_from_bam` from the global namespace (the package's existing helper).
+
+    Returns
+    -------
+    None (mutates adata in-place)
+    """
+
+    # Only run if not already performed
+    already = bool(adata.uns.get(uns_flag, False))
+    if (already and not force_redo) or bypass:
+        # QC already performed; nothing to do
+        return
+
+    # Build read_metrics dict either from the provided arg or by extracting from BAM files
+    if read_metrics is None:
+        read_metrics = {}
+        if bam_files:
+            extractor = extract_read_features_from_bam_callable or globals().get("extract_read_features_from_bam")
+            if extractor is None:
+                raise ValueError("No `read_metrics` provided and `extract_read_features_from_bam` not found.")
+            for bam in bam_files:
+                bam_read_metrics = extractor(bam)
+                if not isinstance(bam_read_metrics, dict):
+                    raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
+                read_metrics.update(bam_read_metrics)
+
+    # Convert read_metrics dict -> DataFrame (rows = read id)
+    # Values may be lists/tuples or scalars; prefer lists/tuples with 5 entries.
+    if len(read_metrics) == 0:
+        # fill with NaNs
+        n = adata.n_obs
+        adata.obs['read_length'] = np.full(n, np.nan)
+        adata.obs['mapped_length'] = np.full(n, np.nan)
+        adata.obs['reference_length'] = np.full(n, np.nan)
+        adata.obs['read_quality'] = np.full(n, np.nan)
+        adata.obs['mapping_quality'] = np.full(n, np.nan)
+    else:
+        # Build the DataFrame robustly: convert values to lists where possible,
+        # else replicate the scalar across all columns
+        max_cols = 5
+        rows = {}
+        for k, v in read_metrics.items():
+            if isinstance(v, (list, tuple, np.ndarray)):
+                vals = list(v)
+            else:
+                # scalar -> replicate into 5 columns to preserve original behavior
+                vals = [v] * max_cols
+            # Ensure length >= 5
+            if len(vals) < max_cols:
+                vals = vals + [np.nan] * (max_cols - len(vals))
+            rows[k] = vals[:max_cols]
+
+        df = pd.DataFrame.from_dict(rows, orient='index', columns=[
+            'read_length', 'read_quality', 'reference_length', 'mapped_length', 'mapping_quality'
+        ])
+
+        # Reindex to adata.obs_names so order matches adata.
+        # obs_names missing from df become NaN rows.
+        df_reindexed = df.reindex(adata.obs_names).astype(float)
+
+        adata.obs['read_length'] = df_reindexed['read_length'].values
+        adata.obs['mapped_length'] = df_reindexed['mapped_length'].values
+        adata.obs['reference_length'] = df_reindexed['reference_length'].values
+        adata.obs['read_quality'] = df_reindexed['read_quality'].values
+        adata.obs['mapping_quality'] = df_reindexed['mapping_quality'].values
+
+    # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
+    rl = pd.to_numeric(adata.obs['read_length'], errors='coerce').to_numpy(dtype=float)
+    ref_len = pd.to_numeric(adata.obs['reference_length'], errors='coerce').to_numpy(dtype=float)
+    mapped_len = pd.to_numeric(adata.obs['mapped_length'], errors='coerce').to_numpy(dtype=float)
+
+    # safe divisions: use np.where to avoid warnings and replace inf with nan
+    with np.errstate(divide='ignore', invalid='ignore'):
+        rl_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), rl / ref_len, np.nan)
+        mapped_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), mapped_len / ref_len, np.nan)
+        mapped_to_read = np.where((rl != 0) & np.isfinite(rl), mapped_len / rl, np.nan)
+
+    adata.obs['read_length_to_reference_length_ratio'] = rl_to_ref
+    adata.obs['mapped_length_to_reference_length_ratio'] = mapped_to_ref
+    adata.obs['mapped_length_to_read_length_ratio'] = mapped_to_read
+
+    # Add read-level raw modification signal: sum over X rows
+    X = adata.X
+    if sp.issparse(X):
+        # sparse sum returns an (n_obs, 1) matrix; convert to 1-D array
+        raw_sig = np.asarray(X.sum(axis=1)).ravel()
+    else:
+        raw_sig = np.asarray(X.sum(axis=1)).ravel()
+
+    adata.obs['Raw_modification_signal'] = raw_sig
+
+    # mark as done
+    adata.uns[uns_flag] = True
+
+    return None