smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/basecalling.py
@@ -0,0 +1,67 @@
+ import subprocess
+ from pathlib import Path
+
+ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+     """
+     Wrapper function for dorado canonical base calling.
+
+     Parameters:
+         model_dir (str): File path to the dorado basecalling model directory.
+         model (str): The dorado basecalling model.
+         pod5_dir (str): File path to the experiment directory containing the POD5 files.
+         barcode_kit (str): The barcoding kit used in the experiment.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): The device to use. 'auto' (default) detects the device; 'metal', 'cpu', or 'cuda' can also be specified.
+
+     Returns:
+         None
+             Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
+     """
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+     if barcode_both_ends:
+         command.append("--barcode-both-ends")
+     if not trim:
+         command.append("--no-trim")
+     command += [model, pod5_dir]
+     command_string = " ".join(command)
+     print(f"Running {command_string}\n to generate {output}")
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
+
+ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+     """
+     Wrapper function for dorado modified base calling.
+
+     Parameters:
+         model_dir (str): File path to the dorado basecalling model directory.
+         model (str): The dorado basecalling model.
+         pod5_dir (str): File path to the experiment directory containing the POD5 files.
+         barcode_kit (str): The barcoding kit used in the experiment.
+         mod_list (list): A list of modification types to use in the analysis.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): Device to use for basecalling: 'auto', 'metal', 'cpu', or 'cuda'.
+
+     Returns:
+         None
+             Outputs a BAM file holding the modified base calls output by the dorado basecaller.
+     """
+     import subprocess
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+     command += mod_list
+     command += ["--device", device, "--batchsize", "0"]
+     if barcode_both_ends:
+         command.append("--barcode-both-ends")
+     if not trim:
+         command.append("--no-trim")
+     command += [model, pod5_dir]
+     print(f'Running: {" ".join(command)}')
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
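For orientation, a minimal usage sketch of these wrappers (all paths, model, and kit names below are hypothetical; it assumes dorado is on the PATH and that this +67-line hunk is smftools/informatics/basecalling.py, the only new file of that size in the list above):

    from smftools.informatics.basecalling import canoncall

    # Writes /data/run1/calls.bam containing dorado's canonical base calls.
    canoncall(
        model_dir="/models",                          # hypothetical model directory
        model="dna_r10.4.1_e8.2_400bps_hac@v4.2.0",   # example dorado model name
        pod5_dir="/data/run1/pod5",
        barcode_kit="SQK-NBD114-24",                  # example barcoding kit
        bam="/data/run1/calls",
        bam_suffix=".bam",
        device="auto",
    )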
smftools/informatics/bed_functions.py
@@ -0,0 +1,366 @@
+ from pathlib import Path
+ import os
+ import subprocess
+ from typing import List, Optional, Union
+ import pysam
+ import pybedtools
+ import pyBigWig
+
+ import numpy as np
+ import pandas as pd
+ import concurrent.futures
+ from concurrent.futures import ProcessPoolExecutor
+
+ import matplotlib.pyplot as plt
+
+ from ..readwrite import make_dirs
+
+ def _bed_to_bigwig(fasta: str, bed: str) -> str:
+     """
+     BED → bedGraph → bigWig
+     Requires:
+         - FASTA must have a .fai index present
+     """
+
+     bed = Path(bed)
+     fa = Path(fasta)  # path to .fa
+     parent = bed.parent
+     stem = bed.stem
+     fa_stem = fa.stem
+     fai = parent / f"{fa_stem}.fai"
+
+     bedgraph = parent / f"{stem}.bedgraph"
+     bigwig = parent / f"{stem}.bw"
+
+     # 1) Compute coverage → bedGraph
+     print(f"[pybedtools] generating coverage bedgraph from {bed}")
+     bt = pybedtools.BedTool(str(bed))
+     # bedtools genomecov -bg
+     coverage = bt.genome_coverage(bg=True, genome=str(fai))
+     coverage.saveas(str(bedgraph))
+
+     # 2) Convert bedGraph → BigWig via pyBigWig
+     print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+     # read chrom sizes from the FASTA .fai index
+     chrom_sizes = {}
+     with open(fai) as f:
+         for line in f:
+             fields = line.strip().split("\t")
+             chrom = fields[0]
+             size = int(fields[1])
+             chrom_sizes[chrom] = size
+
+     bw = pyBigWig.open(str(bigwig), "w")
+     bw.addHeader(list(chrom_sizes.items()))
+
+     with open(bedgraph) as f:
+         for line in f:
+             chrom, start, end, coverage = line.strip().split()
+             bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+     bw.close()
+
+     print(f"BigWig written: {bigwig}")
+     return str(bigwig)
+
+ def _plot_bed_histograms(
68
+ bed_file,
69
+ plotting_directory,
70
+ fasta,
71
+ *,
72
+ bins=60,
73
+ clip_quantiles=(0.0, 0.995),
74
+ cov_bin_size=1000, # coverage bin size in bp
75
+ rows_per_fig=6, # paginate if many chromosomes
76
+ include_mapq_quality=True, # add MAPQ + avg read quality columns to grid
77
+ coordinate_mode="one_based", # "one_based" (your BED-like) or "zero_based"
78
+ ):
79
+ """
80
+ Plot per-chromosome QC grids from a BED-like file.
81
+
82
+ Expects columns:
83
+ chrom, start, end, read_len, qname, mapq, avg_base_qual
84
+
85
+ For each chromosome:
86
+ - Column 1: Read length histogram
87
+ - Column 2: Coverage across the chromosome (binned)
88
+ - (optional) Column 3: MAPQ histogram
89
+ - (optional) Column 4: Avg base quality histogram
90
+
91
+ The figure is paginated: rows = chromosomes (up to rows_per_fig), columns depend on include_mapq_quality.
92
+ Saves one PNG per page under `plotting_directory`.
93
+
94
+ Parameters
95
+ ----------
96
+ bed_file : str
97
+ plotting_directory : str
98
+ fasta : str
99
+ Reference FASTA (used to get chromosome lengths).
100
+ bins : int
101
+ Histogram bins for read length / MAPQ / quality.
102
+ clip_quantiles : (float, float)
103
+ Clip hist tails for readability (e.g., (0, 0.995)).
104
+ cov_bin_size : int
105
+ Bin size (bp) for coverage plot; bigger = faster/coarser.
106
+ rows_per_fig : int
107
+ Number of chromosomes per page.
108
+ include_mapq_quality : bool
109
+ If True, add MAPQ and avg base quality histograms as extra columns.
110
+ coordinate_mode : {"one_based","zero_based"}
111
+ One-based, inclusive (your file) vs BED-standard zero-based, half-open.
112
+ """
113
+ os.makedirs(plotting_directory, exist_ok=True)
114
+
115
+ bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
116
+ print(f"[plot_bed_histograms] Loading: {bed_file}")
117
+
118
+ # Load BED-like table
119
+ cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
120
+ df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
121
+ 'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
122
+ 'mapq': float, 'avg_q': float
123
+ })
124
+
125
+ # Drop unaligned records (chrom == '*') if present
126
+ df = df[df['chrom'] != '*'].copy()
127
+ if df.empty:
128
+ print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
129
+ return
130
+
131
+ # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
132
+ # Input is typically one_based inclusive (from your writer).
133
+ if coordinate_mode not in {"one_based", "zero_based"}:
134
+ raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
135
+
136
+ if coordinate_mode == "one_based":
137
+ # convert to 0-based half-open [start0, end0)
138
+ start0 = df['start'].to_numpy() - 1
139
+ end0 = df['end'].to_numpy() # inclusive in input -> +1 already handled by not subtracting
140
+ else:
141
+ # already 0-based half-open (assumption)
142
+ start0 = df['start'].to_numpy()
143
+ end0 = df['end'].to_numpy()
144
+
145
+ # Clip helper for hist tails
146
+ def _clip_series(s, q=(0.0, 0.995)):
147
+ if q is None:
148
+ return s.to_numpy()
149
+ lo = s.quantile(q[0]) if q[0] is not None else s.min()
150
+ hi = s.quantile(q[1]) if q[1] is not None else s.max()
151
+ x = s.to_numpy(dtype=float)
152
+ return np.clip(x, lo, hi)
153
+
154
+ # Load chromosome order/lengths from FASTA
155
+ with pysam.FastaFile(fasta) as fa:
156
+ ref_names = list(fa.references)
157
+ ref_lengths = dict(zip(ref_names, fa.lengths))
158
+
159
+ # Keep only chroms present in FASTA and with at least one read
160
+ chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
161
+ # Order chromosomes by FASTA order
162
+ chrom_order = [c for c in ref_names if c in chroms]
163
+
164
+ if not chrom_order:
165
+ print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
166
+ return
167
+
168
+ # Pagination
169
+ def _sanitize(name: str) -> str:
170
+ return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
171
+
172
+ cols_per_fig = 4 if include_mapq_quality else 2
173
+
174
+ for start_idx in range(0, len(chrom_order), rows_per_fig):
175
+ chunk = chrom_order[start_idx:start_idx + rows_per_fig]
176
+ nrows = len(chunk)
177
+ ncols = cols_per_fig
178
+
179
+ fig, axes = plt.subplots(
180
+ nrows=nrows, ncols=ncols,
181
+ figsize=(4.0 * ncols, 2.6 * nrows),
182
+ dpi=160,
183
+ squeeze=False
184
+ )
185
+
186
+ for r, chrom in enumerate(chunk):
187
+ chrom_len = ref_lengths[chrom]
188
+ mask = (df['chrom'].to_numpy() == chrom)
189
+
190
+ # Slice per-chrom arrays for speed
191
+ s0 = start0[mask]
192
+ e0 = end0[mask]
193
+ len_arr = df.loc[mask, 'read_len']
194
+ mapq_arr = df.loc[mask, 'mapq']
195
+ q_arr = df.loc[mask, 'avg_q']
196
+
197
+ # --- Col 1: Read length histogram (clipped) ---
198
+ ax = axes[r, 0]
199
+ ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
200
+ if r == 0:
201
+ ax.set_title("Read length")
202
+ ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
203
+ ax.set_xlabel("bp")
204
+ ax.grid(alpha=0.25)
205
+
206
+ # --- Col 2: Coverage (binned over genome) ---
207
+ ax = axes[r, 1]
208
+ nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
209
+ # Bin edges in 0-based coords
210
+ edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
211
+
212
+ # Compute per-bin "read count coverage": number of reads overlapping each bin.
213
+ # Approximate by incrementing all bins touched by the interval.
214
+ # (Fast and memory-light; for exact base coverage use smaller cov_bin_size.)
215
+ cov = np.zeros(nb, dtype=np.int32)
216
+ # bin indices overlapped by each read (0-based half-open)
217
+ b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
218
+ b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
219
+ # ensure valid ordering
220
+ b_lo = np.minimum(b0, b1)
221
+ b_hi = np.maximum(b0, b1)
222
+
223
+ # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
224
+ for lo, hi in zip(b_lo, b_hi):
225
+ cov[lo:hi + 1] += 1
226
+
227
+ x_mid = (edges[:-1] + edges[1:]) / 2.0
228
+ ax.plot(x_mid, cov)
229
+ if r == 0:
230
+ ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
231
+ ax.set_xlim(0, chrom_len)
232
+ ax.set_xlabel("Position (bp)")
233
+ ax.set_ylabel("") # already show chrom on col 1
234
+ ax.grid(alpha=0.25)
235
+
236
+ if include_mapq_quality:
237
+ # --- Col 3: MAPQ ---
238
+ ax = axes[r, 2]
239
+ # Clip MAPQ upper tail if needed (usually 60)
240
+ ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
241
+ if r == 0:
242
+ ax.set_title("MAPQ")
243
+ ax.set_xlabel("MAPQ")
244
+ ax.grid(alpha=0.25)
245
+
246
+ # --- Col 4: Avg base quality ---
247
+ ax = axes[r, 3]
248
+ ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
249
+ if r == 0:
250
+ ax.set_title("Avg base qual")
251
+ ax.set_xlabel("Phred")
252
+ ax.grid(alpha=0.25)
253
+
254
+ fig.suptitle(
255
+ f"{bed_basename} — per-chromosome QC "
256
+ f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
257
+ y=0.995, fontsize=11
258
+ )
259
+ fig.tight_layout(rect=[0, 0, 1, 0.98])
260
+
261
+ page = start_idx // rows_per_fig + 1
262
+ out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
263
+ plt.savefig(out_png, bbox_inches="tight")
264
+ plt.close(fig)
265
+
266
+ print("[plot_bed_histograms] Done.")
267
+
+ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+     """
+     Takes an aligned BAM as input and writes a BED-like file of reads as output.
+     Columns are: record name, start position, end position, read length, read name, mapping quality, average read base quality.
+
+     Parameters:
+         aligned_BAM (str): Path to an input aligned BAM to extract to a BED file.
+         out_dir (str): Directory to output files.
+         fasta (str): File path to the reference genome.
+         make_bigwigs (bool): Whether to generate bigwig files.
+         threads (int): Number of threads to use.
+
+     Returns:
+         None
+     """
+     threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+     # Create necessary directories
+     plotting_dir = out_dir / "bed_cov_histograms"
+     bed_dir = out_dir / "beds"
+     make_dirs([plotting_dir, bed_dir])
+
+     bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+         for read in bam.fetch(until_eof=True):
+             if read.is_unmapped:
+                 chrom = "*"
+                 start1 = 1
+                 rl = read.query_length or 0
+                 mapq = 0
+             else:
+                 chrom = bam.get_reference_name(read.reference_id)
+                 # pysam reference_start is 0-based → +1 for a 1-based SAM-like start
+                 start1 = int(read.reference_start) + 1
+                 rl = read.query_length or 0
+                 mapq = int(read.mapping_quality)
+
+             # End position in 1-based inclusive coords
+             end1 = start1 + (rl or 0) - 1
+
+             qname = read.query_name
+             quals = read.query_qualities
+             if quals is None or rl == 0:
+                 avg_q = float("nan")
+             else:
+                 avg_q = float(np.mean(quals))
+
+             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+     print(f"BED-like file created: {bed_output}")
+
+     def split_bed(bed):
+         """Splits into aligned and unaligned reads (chrom == '*')."""
+         bed = str(bed)
+         aligned = bed.replace(".bed", "_aligned.bed")
+         unaligned = bed.replace(".bed", "_unaligned.bed")
+         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+             for line in infile:
+                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+         os.remove(bed)
+         return aligned
+
+     print(f"Splitting: {bed_output}")
+     aligned_bed = split_bed(bed_output)
+
+     with ProcessPoolExecutor() as executor:
+         futures = []
+         futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+         if make_bigwigs:
+             futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+         concurrent.futures.wait(futures)
+
+     print("Processing completed successfully.")
+
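A usage sketch for this public entry point (hypothetical paths; aligned_BAM and out_dir are used as Path-like objects in the function body, so pathlib.Path is the safe choice, and the import path is assumed from the file list above):

    from pathlib import Path
    from smftools.informatics.bed_functions import aligned_BAM_to_bed  # assumed import path

    aligned_BAM_to_bed(
        aligned_BAM=Path("/data/run1/aligned/sample.bam"),
        out_dir=Path("/data/run1/qc"),
        fasta="/refs/genome.fa",
        make_bigwigs=False,
        threads=4,
    )
    # Expected outputs: /data/run1/qc/beds/sample_bed_aligned.bed plus QC histogram
    # pages under /data/run1/qc/bed_cov_histograms/.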
+ def extract_read_lengths_from_bed(file_path):
+     """
+     Load a dict mapping read names to read lengths.
+
+     Parameters:
+         file_path (str): File path to a BED file.
+     Returns:
+         read_dict (dict)
+     """
+     columns = ['chrom', 'start', 'end', 'length', 'name']
+     df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+     read_dict = {}
+     for _, row in df.iterrows():
+         read_dict[row['name']] = row['length']
+
+     return read_dict
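And a sketch for this read-length helper (hypothetical path; it expects a five-column, tab-separated file of chrom, start, end, length, name):

    from smftools.informatics.bed_functions import extract_read_lengths_from_bed  # assumed import path

    read_lengths = extract_read_lengths_from_bed("/out/beds/reads.bed")
    longest = max(read_lengths.values())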
smftools/informatics/binarize_converted_base_identities.py
@@ -0,0 +1,172 @@
+ def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu', deaminase_footprinting=False, mismatch_trend_per_read=None, on_missing="nan"):
+     """
+     Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
+
+     Parameters:
+         base_identities (dict): A dictionary returned by extract_base_identities, keyed by read name. Each value is a list of base identities.
+         strand (str): Which strand was converted in the experiment ('top' or 'bottom').
+         modification_type (str): The modification type of interest ('5mC' or '6mA').
+         bam (str): The BAM file path.
+         deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
+         mismatch_trend_per_read (dict): For deaminase footprinting, the conversion type of each read relative to the top-strand reference ('C->T', or 'G->A' if the bottom strand was converted).
+         on_missing (str): How to handle a read with a missing or invalid trend: 'nan' (default) or 'error'.
+
+     Returns:
+         dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
+             If deaminase_footprinting, 1 represents deaminated sites and 0 represents non-deaminated sites.
+     """
+     import numpy as np
+
+     if mismatch_trend_per_read is None:
+         mismatch_trend_per_read = {}
+
+     # Fast path
+     if modification_type == "unconverted" and not deaminase_footprinting:
+         return {k: np.full(len(v), np.nan, dtype=np.float32) for k, v in base_identities.items()}
+
+     out = {}
+
+     if deaminase_footprinting:
+         valid_trends = {"C->T", "G->A"}
+
+         for read_id, bases in base_identities.items():
+             trend_raw = mismatch_trend_per_read.get(read_id, None)
+             if trend_raw is None:
+                 if on_missing == "error":
+                     raise KeyError(f"Missing mismatch trend for read '{read_id}'")
+                 out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
+                 continue
+
+             trend = trend_raw.replace(" ", "").upper()
+             if trend not in valid_trends:
+                 if on_missing == "error":
+                     raise KeyError(
+                         f"Invalid mismatch trend '{trend_raw}' for read '{read_id}'. "
+                         f"Expected one of {sorted(valid_trends)}"
+                     )
+                 out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
+                 continue
+
+             arr = np.asarray(bases, dtype="<U1")
+             res = np.full(arr.shape, np.nan, dtype=np.float32)
+
+             if trend == "C->T":
+                 # C (unconverted) -> 0, T (converted) -> 1
+                 res[arr == "C"] = 0.0
+                 res[arr == "T"] = 1.0
+             else:  # "G->A"
+                 res[arr == "G"] = 0.0
+                 res[arr == "A"] = 1.0
+
+             out[read_id] = res
+
+         return out
+
+     # Non-deaminase mapping (bisulfite-style for 5mC; the 6mA mapping is protocol dependent)
+     bin_maps = {
+         ("top", "5mC"): {"C": 1.0, "T": 0.0},
+         ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
+         ("top", "6mA"): {"A": 1.0, "G": 0.0},
+         ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
+     }
+     key = (strand, modification_type)
+     if key not in bin_maps:
+         raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+     base_map = bin_maps[key]
+
+     for read_id, bases in base_identities.items():
+         arr = np.asarray(bases, dtype="<U1")
+         res = np.full(arr.shape, np.nan, dtype=np.float32)
+         # mask-assign; unknown characters (N, -, etc.) remain NaN
+         for b, v in base_map.items():
+             res[arr == b] = v
+         out[read_id] = res
+
+     return out
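A minimal input/output sketch for the implementation above (read name and bases are hypothetical; the import path is assumed from the file list):

    import numpy as np
    from smftools.informatics.binarize_converted_base_identities import binarize_converted_base_identities  # assumed import path

    base_identities = {"read_1": ["C", "T", "A", "N"]}
    binarized = binarize_converted_base_identities(
        base_identities, strand="top", modification_type="5mC", bam="sample.bam"
    )
    # {'read_1': array([ 1.,  0., nan, nan], dtype=float32)}
    # For ('top', '5mC'): C (protected from conversion) -> 1, T (converted) -> 0, anything else -> NaN.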