krewlyzer-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/fsr.py ADDED
@@ -0,0 +1,225 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import logging
+ import sys
+
+ import pysam
+ import numpy as np
+ import pandas as pd
+ from rich.console import Console
+ from rich.logging import RichHandler
+
+ from .helpers import gc_correct
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("fsr")
+
+ def _calc_fsr(
+     bedgz_input: str | Path,
+     bin_input: str | Path,
+     windows: int,
+     continue_n: int,
+     output_file: str | Path
+ ):
+     """
+     Internal: Calculate fragment size ratio (FSR) for a single .bed.gz file.
+     Optimized with vectorized operations.
+     """
+     try:
+         logger.info(f"Processing {bedgz_input} with bins from {bin_input}")
+
+         # Load bins
+         try:
+             bins_df = pd.read_csv(bin_input, sep='\t', header=None, usecols=[0, 1, 2], names=['chrom', 'start', 'end'], dtype={'chrom': str, 'start': int, 'end': int})
+         except Exception as e:
+             logger.error(f"Could not load bins from {bin_input}: {e}")
+             raise typer.Exit(1)
+
+         try:
+             tbx = pysam.TabixFile(filename=bedgz_input, mode="r")
+         except Exception as e:
+             logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
+             raise typer.Exit(1)
+
+         shorts_ratios = []
+         ultra_shorts_ratios = []
+         inter_ratios = []
+         longs_ratios = []
+
+         # Iterate over bins
+         for _, bin_row in bins_df.iterrows():
+             chrom = bin_row['chrom']
+             start = bin_row['start']
+             end = bin_row['end']
+
+             try:
+                 rows = list(tbx.fetch(chrom, start, end, parser=pysam.asTuple()))
+             except ValueError:
+                 rows = []
+             except Exception as e:
+                 logger.error(f"Error fetching {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+             if not rows:
+                 shorts_ratios.append(0)
+                 ultra_shorts_ratios.append(0)
+                 inter_ratios.append(0)
+                 longs_ratios.append(0)
+                 continue
+
+             try:
+                 # Vectorized parsing
+                 _, starts, ends, _ = zip(*rows)
+                 starts = np.array(starts, dtype=int)
+                 ends = np.array(ends, dtype=int)
+                 lengths = ends - starts
+
+                 # Filter 65-400
+                 mask = (lengths >= 65) & (lengths <= 400)
+                 valid_lengths = lengths[mask]
+
+                 total = len(valid_lengths)
+
+                 if total == 0:
+                     shorts_ratios.append(0)
+                     ultra_shorts_ratios.append(0)
+                     inter_ratios.append(0)
+                     longs_ratios.append(0)
+                 else:
+                     shorts = np.sum((valid_lengths >= 65) & (valid_lengths <= 150))
+                     ultra_shorts = np.sum((valid_lengths >= 65) & (valid_lengths <= 100))
+                     intermediates = np.sum((valid_lengths >= 151) & (valid_lengths <= 260))
+                     longs = np.sum((valid_lengths >= 261) & (valid_lengths <= 400))
+
+                     shorts_ratios.append(shorts / total)
+                     ultra_shorts_ratios.append(ultra_shorts / total)
+                     inter_ratios.append(intermediates / total)
+                     longs_ratios.append(longs / total)
+
+             except Exception as e:
+                 logger.error(f"Error processing data in bin {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+         # Aggregation into windows
+         df = pd.DataFrame({
+             'chrom': bins_df['chrom'],
+             'start': bins_df['start'],
+             'end': bins_df['end'],
+             'short_r': shorts_ratios,
+             'ultra_short_r': ultra_shorts_ratios,
+             'inter_r': inter_ratios,
+             'long_r': longs_ratios
+         })
+
+         results = []
+
+         for chrom, group in df.groupby('chrom', sort=False):
+             n_bins = len(group)
+             n_windows = n_bins // continue_n
+
+             if n_windows == 0:
+                 continue
+
+             trunc_len = n_windows * continue_n
+
+             short_mat = group['short_r'].values[:trunc_len].reshape(n_windows, continue_n)
+             ultra_short_mat = group['ultra_short_r'].values[:trunc_len].reshape(n_windows, continue_n)
+             inter_mat = group['inter_r'].values[:trunc_len].reshape(n_windows, continue_n)
+             long_mat = group['long_r'].values[:trunc_len].reshape(n_windows, continue_n)
+
+             # Mean of ratios
+             mean_short = short_mat.mean(axis=1)
+             mean_ultra_short = ultra_short_mat.mean(axis=1)
+             mean_inter = inter_mat.mean(axis=1)
+             mean_long = long_mat.mean(axis=1)
+
+             window_starts = np.arange(n_windows) * continue_n * windows
+             window_ends = (np.arange(n_windows) + 1) * continue_n * windows - 1
+
+             results.append(pd.DataFrame({
+                 'chrom': chrom,
+                 'start': window_starts,
+                 'end': window_ends,
+                 'short_mean': mean_short,
+                 'ultra_short_mean': mean_ultra_short,
+                 'inter_mean': mean_inter,
+                 'long_mean': mean_long
+             }))
+
+         if not results:
+             logger.warning("No valid windows found.")
+             return
+
+         final_df = pd.concat(results, ignore_index=True)
+
+         # Write output
+         with open(output_file, 'w') as f:
+             f.write("region\tshort-ratio\tultra-short-ratio\tintermediate-ratio\tlong-ratio\n")
+             for _, row in final_df.iterrows():
+                 region = f"{row['chrom']}:{int(row['start'])}-{int(row['end'])}"
+                 f.write(f"{region}\t{row['short_mean']:.4f}\t{row['ultra_short_mean']:.4f}\t{row['inter_mean']:.4f}\t{row['long_mean']:.4f}\n")
+
+         logger.info(f"FSR calculation complete. Results written to {output_file}")
+
+     except typer.Exit:
+         # Re-raise cleanly: typer.Exit subclasses RuntimeError, so the blanket
+         # handler below would otherwise swallow and re-log it.
+         raise
+     except Exception as e:
+         logger.error(f"Fatal error in _calc_fsr: {e}")
+         raise typer.Exit(1)
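
The window aggregation above truncates each chromosome to a whole number of windows and averages consecutive bins via NumPy's reshape. A minimal standalone sketch of the same trick, on toy values:

```python
import numpy as np

# With continue_n = 3, nine per-bin ratios collapse into three window means.
ratios = np.array([0.2, 0.4, 0.6, 0.1, 0.1, 0.1, 0.9, 0.3, 0.3])
continue_n = 3
n_windows = len(ratios) // continue_n          # 3 full windows
trunc = ratios[: n_windows * continue_n]       # drop any trailing partial window
window_means = trunc.reshape(n_windows, continue_n).mean(axis=1)
print(window_means)  # [0.4 0.1 0.5]
```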
+
+ def _run_fsr_file(bedgz_file: Path, bin_input: Path, windows: int, continue_n: int, output_dir: Path) -> str:
+     # Module-level worker so ProcessPoolExecutor can pickle it
+     # (a closure defined inside fsr() cannot be sent to worker processes).
+     output_file = output_dir / (bedgz_file.stem.replace('.bed', '') + '.FSR.txt')
+     _calc_fsr(str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
+     return str(output_file)
+
+ def fsr(
+     bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
+     bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
+     windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
+     continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
+     threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
+ ):
+     """
+     Calculate fragment size ratio (FSR) features for all .bed.gz files in a folder.
+     The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
+     Output files are written to the output directory, one per .bed.gz file.
+     """
+     # Input checks
+     if not bedgz_path.exists():
+         logger.error(f"Input directory not found: {bedgz_path}")
+         raise typer.Exit(1)
+     if bin_input and not bin_input.exists():
+         logger.error(f"Bin input file not found: {bin_input}")
+         raise typer.Exit(1)
+     try:
+         output.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Could not create output directory {output}: {e}")
+         raise typer.Exit(1)
+     bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
+     if not bedgz_files:
+         logger.error("No .bed.gz files found in the specified folder.")
+         raise typer.Exit(1)
+     if bin_input is None:
+         bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
+         logger.info(f"No bin_input specified. Using default: {bin_input}")
+         if not bin_input.exists():
+             logger.error(f"Bin input file does not exist: {bin_input}")
+             raise typer.Exit(1)
+     logger.info(f"Calculating FSR for {len(bedgz_files)} files...")
+     from concurrent.futures import ProcessPoolExecutor, as_completed
+     logger.info(f"Starting parallel FSR calculation using {threads} processes...")
+     with ProcessPoolExecutor(max_workers=threads) as executor:
+         futures = {executor.submit(_run_fsr_file, bedgz_file, bin_input, windows, continue_n, output): bedgz_file for bedgz_file in bedgz_files}
+         for future in as_completed(futures):
+             bedgz_file = futures[future]
+             try:
+                 result = future.result()
+                 logger.info(f"FSR calculated: {result}")
+             except Exception as exc:
+                 logger.error(f"FSR calculation failed for {bedgz_file}: {exc}")
+     logger.info(f"FSR features calculated for {len(bedgz_files)} files.")
krewlyzer/helpers.py ADDED
@@ -0,0 +1,237 @@
+ import pysam
+ import itertools
+ import os
+ import numpy as np
+ import pandas as pd
+ import math
+ from collections import defaultdict
+ from rich.logging import RichHandler
+ import logging
+ from skmisc.loess import loess
+ from pathlib import Path
+
+ logging.basicConfig(level="INFO", handlers=[RichHandler()], format="%(message)s")
+ logger = logging.getLogger("krewlyzer-helpers")
+
+ def gc_correct(coverage: list[int | float], bias: list[float]) -> list[float]:
+     """
+     Perform GC bias correction on coverage values using LOESS regression.
+     Logs errors and returns the input coverage (with NaN-bias positions
+     zeroed) if fitting is not possible or fails.
+     """
+     covl = len(coverage)
+     valid = [True for _ in range(covl)]
+     temp_cov = []
+     temp_bias = []
+     for i in range(covl):
+         if np.isnan(bias[i]):
+             valid[i] = False
+         else:
+             temp_cov.append(coverage[i])
+             temp_bias.append(bias[i])
+
+     if not temp_cov or not temp_bias:
+         logger.warning("No valid coverage/bias values for GC correction. Returning original values.")
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     # Check for sufficient data points and variance for LOESS
+     if len(temp_cov) < 20:
+         logger.warning(f"Too few data points ({len(temp_cov)}) for LOESS GC correction. Returning original values.")
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     if np.std(temp_bias) == 0:
+         logger.warning("No variance in GC bias values. Skipping LOESS correction.")
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     med = np.median(temp_cov)
+     correct_cov = []
+     try:
+         i = np.arange(np.min(temp_bias), np.max(temp_bias), 0.001)
+         coverage_trend = loess(temp_bias, temp_cov, span=0.75)
+         coverage_trend.fit()
+         coverage_model = loess(i, coverage_trend.predict(i, stderror=True).values)
+         coverage_model.fit()
+         coverage_pred = coverage_model.predict(temp_bias, stderror=True)
+         pred = np.array(coverage_pred.values)
+         coverage_corrected = temp_cov - pred + med
+     except Exception as e:
+         logger.error(f"GC correction failed: {e}")
+         # Return original values on failure
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     i, j = 0, 0
+     while i < covl:
+         if valid[i]:
+             if coverage_corrected[j] < 0:
+                 correct_cov.append(0)
+             else:
+                 correct_cov.append(coverage_corrected[j])
+             j += 1
+         else:
+             correct_cov.append(0)
+         i += 1
+     return correct_cov
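
A minimal sketch of calling gc_correct on synthetic data (the values are illustrative; real inputs come from per-bin coverage counts and GC fractions):

```python
import numpy as np
from krewlyzer.helpers import gc_correct

rng = np.random.default_rng(0)
gc = rng.uniform(0.3, 0.7, size=100).tolist()          # per-bin GC fraction
cov = [50 + 40 * g + rng.normal(0, 2) for g in gc]     # coverage with a GC trend
corrected = gc_correct(cov, gc)                        # trend removed, median preserved
```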
+
+ class commonError(Exception):
+     def __init__(self, message):
+         super().__init__(message)
+         logger.error(f"commonError: {message}")
+         self.message = message
+
+ def maxCore(nCore: int | None = None) -> int | None:
+     if nCore and nCore > 16:
+         logger.warning("Requested nCore > 16; capping to 16.")
+         return 16
+     else:
+         return nCore
+
+ # Alias for CLI import consistency
+ max_core = maxCore
+
+ def rmEndString(x: str, y: list[str]) -> str:
+     # Strip a matching suffix only from the end of the string
+     # (str.replace would also remove matches in the middle).
+     for item in y:
+         if x.endswith(item):
+             x = x[: -len(item)]
+     return x
+
+ def isSoftClipped(cigar: list[tuple[int, int]]) -> bool:
+     """
+     cigar information:
+     S BAM_CSOFT_CLIP 4
+     H BAM_CHARD_CLIP 5
+     P BAM_CPAD 6
+     """
+     for (op, count) in cigar:
+         if op in [4, 5, 6]:
+             return True
+     return False
+
+ def GCcontent(seq: str) -> float:
+     try:
+         nA = seq.count("a") + seq.count("A")
+         nT = seq.count("t") + seq.count("T")
+         nG = seq.count("g") + seq.count("G")
+         nC = seq.count("c") + seq.count("C")
+         percent_GC = (nG + nC) / (nA + nT + nG + nC) if (nA + nT + nG + nC) > 0 else 0
+         return percent_GC
+     except Exception as e:
+         logger.error(f"GCcontent calculation failed: {e}")
+         return 0
+
+ def read_pair_generator(bam: pysam.AlignmentFile, region_string: str | None = None):
+     """
+     Generate read pairs in a BAM file or within a region string.
+     Reads are added to read_dict until a pair is found.
+     Reference: https://www.biostars.org/p/306041/
+     """
+     read_dict = defaultdict(lambda: [None, None])
+     try:
+         for read in bam.fetch(region=region_string):
+             if read.is_unmapped or read.is_qcfail or read.is_duplicate:
+                 continue
+             if not read.is_paired or not read.is_proper_pair:
+                 continue
+             if read.is_secondary or read.is_supplementary:
+                 continue
+             if read.mate_is_unmapped:
+                 continue
+             if read.rnext != read.tid:
+                 continue
+             if read.template_length == 0:
+                 continue
+             if isSoftClipped(read.cigar):
+                 continue
+             qname = read.query_name
+             if qname not in read_dict:
+                 if read.is_read1:
+                     read_dict[qname][0] = read
+                 else:
+                     read_dict[qname][1] = read
+             else:
+                 if read.is_read1:
+                     yield read, read_dict[qname][1]
+                 else:
+                     yield read_dict[qname][0], read
+                 del read_dict[qname]
+     except Exception as e:
+         logger.error(f"Error during BAM read pair generation: {e}")
+         return
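
A minimal sketch of iterating proper pairs with read_pair_generator (the BAM path and region name are hypothetical; fetch requires a coordinate-sorted, indexed BAM):

```python
import pysam
from krewlyzer.helpers import read_pair_generator

with pysam.AlignmentFile("sample.bam", "rb") as bam:  # hypothetical file
    for read1, read2 in read_pair_generator(bam, region_string="chr1"):
        # For a proper pair, TLEN on read1 gives the fragment length.
        fragment_length = abs(read1.template_length)
```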
+
+ def reverse_complement(seq: str) -> str:
+     """
+     Return the reverse complement of a DNA sequence.
+     """
+     trans_table = str.maketrans("ATCGatcgNn", "TAGCtagcNn")
+     return seq.translate(trans_table)[::-1]
+
+ def get_End_motif(Emotif: dict[str, int], end5: str, end3: str) -> dict[str, int]:
+     """
+     Update End Motif frequency dictionary.
+     end5: 5' end of the fragment (from Read 1)
+     end3: 3' end of the fragment (from Read 2, already reverse complemented to be on forward strand relative to fragment)
+     """
+     if "N" in end5 or "n" in end5 or "N" in end3 or "n" in end3:
+         return Emotif
+
+     # Note on the logic: cfDNAFE complemented (but did not reverse) seq2 and
+     # passed forward_end3 twice. Treating the fragment as double stranded, the
+     # 5' end of the forward strand is `end5`, and the 5' end of the reverse
+     # strand is the reverse complement of the fragment's 3' end. motif.py is
+     # updated to pass the 5' 4-mer of Read 1 and the 5' 4-mer of Read 2
+     # directly, so here we simply count both motifs.
+
+     if end5 in Emotif:
+         Emotif[end5] += 1
+     if end3 in Emotif:
+         Emotif[end3] += 1
+     return Emotif
+
+ def calc_MDS(inputEndMotifFile: str | Path, outputfile: str | Path) -> None:
+     inputfile = pd.read_table(inputEndMotifFile, header=None, names=['bases', 'frequency'])
+     k_mer = math.log(len(inputfile), 4)
+     frequency = inputfile['frequency'].to_numpy()
+     MDS = np.sum(-frequency * np.log2(frequency) / np.log2(4 ** k_mer))
+     with open(outputfile, 'a') as f:
+         f.write(str(inputEndMotifFile) + '\t' + str(MDS) + '\n')
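
calc_MDS computes the motif diversity score as Shannon entropy normalized to [0, 1], with k recovered from the row count as log4(number of k-mers); this assumes the frequency column holds nonzero relative frequencies summing to 1:

```latex
\mathrm{MDS} \;=\; \sum_{m \in \{A,C,G,T\}^{k}} \frac{-f_m \log_2 f_m}{\log_2 4^{k}},
\qquad \sum_m f_m = 1,\quad f_m > 0
```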
+
+ def get_Breakpoint_motif(Bpmotif: dict[str, int], seq1: str, seq2: str) -> dict[str, int]:
+     """
+     Update Breakpoint Motif frequency dictionary.
+     seq1: Sequence around the 5' end of the fragment.
+     seq2: Sequence around the 3' end of the fragment.
+     """
+     if "N" in seq1 or "n" in seq1 or "N" in seq2 or "n" in seq2:
+         return Bpmotif
+
+     # Similar to End Motif, we just count the motifs at both breakpoints.
+     # The caller should ensure seq1 and seq2 are the correct sequences surrounding the breakpoints.
+
+     if seq1 in Bpmotif:
+         Bpmotif[seq1] += 1
+     if seq2 in Bpmotif:
+         Bpmotif[seq2] += 1
+     return Bpmotif
krewlyzer/mfsd.py ADDED
@@ -0,0 +1,236 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import logging
+ import pysam
+ import pandas as pd
+ import numpy as np
+ from scipy.stats import ks_2samp
+ from rich.console import Console
+ from rich.logging import RichHandler
+ from rich.progress import track
+ import os
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("mfsd")
+
+ def classify_read(read: pysam.AlignedSegment, pos: int, ref: str, alt: str) -> str:
+     """
+     Classify a read as Mutant or Wild-Type at a specific genomic position.
+     Currently supports SNVs.
+     pos: 0-based genomic position.
+     """
+     try:
+         # Check if read covers the position
+         if read.reference_start > pos or read.reference_end <= pos:
+             return "Unknown"
+
+         # Map the reference position to a query position.
+         # get_aligned_pairs returns (query_pos, ref_pos); matches_only=True
+         # keeps only aligned columns (neither side None), which still
+         # includes mismatches, so the variant base remains visible.
+         pairs = read.get_aligned_pairs(matches_only=True)
+         for q_pos, r_pos in pairs:
+             if r_pos == pos:
+                 base = read.query_sequence[q_pos].upper()
+                 if base == alt:
+                     return "Mutant"
+                 if base == ref:
+                     return "WildType"
+                 return "Other"  # Different base
+     except Exception:
+         return "Unknown"
+     return "Unknown"
+
+ def parse_input_file(input_file: Path, input_format: str) -> pd.DataFrame:
+     """
+     Parse VCF or MAF file into a standardized DataFrame.
+     Returns DataFrame with columns: [chrom, pos, ref, alt] (pos is 0-based)
+     """
+     if input_format == "auto":
+         if input_file.suffix.lower() in ['.vcf', '.gz']:  # .vcf.gz
+             input_format = "vcf"
+         elif input_file.suffix.lower() in ['.maf', '.txt', '.tsv']:
+             input_format = "maf"
+         else:
+             raise ValueError(f"Could not determine format for {input_file}. Please specify --format.")
+
+     variants = []
+
+     if input_format == "vcf":
+         try:
+             vcf = pysam.VariantFile(str(input_file))
+             for record in vcf:
+                 # VCF is 1-based; pysam exposes record.pos as 1-based and
+                 # record.start as 0-based. Only the first ALT allele is used
+                 # when several exist; record.alts is None for records without
+                 # an ALT, so guard before indexing.
+                 if record.alts:
+                     variants.append({
+                         'chrom': record.chrom,
+                         'pos': record.start,  # 0-based
+                         'ref': record.ref,
+                         'alt': record.alts[0]
+                     })
+         except Exception as e:
+             logger.error(f"Error parsing VCF: {e}")
+             raise typer.Exit(1)
+
+     elif input_format == "maf":
+         try:
+             # MAF is tab-delimited.
+             # Columns: Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2
+             df = pd.read_csv(input_file, sep='\t', comment='#')
+             # Check required columns (standard MAF names are assumed)
+             required = ['Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2']
+             if not all(col in df.columns for col in required):
+                 raise ValueError(f"MAF file missing required columns: {required}")
+
+             for _, row in df.iterrows():
+                 variants.append({
+                     'chrom': str(row['Chromosome']),
+                     'pos': int(row['Start_Position']) - 1,  # MAF is 1-based
+                     'ref': row['Reference_Allele'],
+                     'alt': row['Tumor_Seq_Allele2']
+                 })
+         except Exception as e:
+             logger.error(f"Error parsing MAF: {e}")
+             raise typer.Exit(1)
+
+     return pd.DataFrame(variants)
+
+ def calc_mfsd(
+     bam_file: Path,
+     input_file: Path,
+     output_file: Path,
+     input_format: str = "auto",
+     map_quality: int = 20
+ ) -> None:
+     """
+     Calculate Mutant Fragment Size Distribution metrics.
+     """
+     try:
+         logger.info(f"Parsing variants from {input_file}...")
+         variants_df = parse_input_file(input_file, input_format)
+         logger.info(f"Found {len(variants_df)} variants.")
+
+         bam = pysam.AlignmentFile(str(bam_file), "rb")
+
+         results = []
+
+         for _, var in track(variants_df.iterrows(), total=len(variants_df), description="Processing variants..."):
+             chrom = var['chrom']
+             pos = var['pos']
+             ref = var['ref']
+             alt = var['alt']
+
+             # classify_read compares single bases, so only SNVs are supported;
+             # skip indels in this version to keep the SNV logic correct.
+             if len(ref) > 1 or len(alt) > 1:
+                 continue
+
+             mutant_lengths = []
+             wt_lengths = []
+
+             try:
+                 # Fetch reads overlapping the variant (pos is 0-based).
+                 for read in bam.fetch(chrom, pos, pos + 1):
+                     if read.mapping_quality < map_quality:
+                         continue
+                     if read.is_duplicate or read.is_unmapped or read.is_secondary:
+                         continue
+
+                     cls = classify_read(read, pos, ref, alt)
+
+                     # template_length (TLEN) is the insert size; 0 means
+                     # single-ended or unavailable, so fall back to read length.
+                     length = abs(read.template_length)
+                     if length == 0:
+                         length = read.query_length
+
+                     if cls == "Mutant":
+                         mutant_lengths.append(length)
+                     elif cls == "WildType":
+                         wt_lengths.append(length)
+
+             except Exception as e:
+                 logger.warning(f"Error fetching reads at {chrom}:{pos}: {e}")
+                 continue
+
+             # Calculate metrics
+             n_mut = len(mutant_lengths)
+             n_wt = len(wt_lengths)
+
+             if n_mut > 0 and n_wt > 0:
+                 mut_mean = np.mean(mutant_lengths)
+                 wt_mean = np.mean(wt_lengths)
+                 delta_size = wt_mean - mut_mean
+
+                 # KS test between mutant and wild-type size distributions
+                 ks_stat, ks_pval = ks_2samp(mutant_lengths, wt_lengths)
+             else:
+                 mut_mean = np.nan
+                 wt_mean = np.nan
+                 delta_size = np.nan
+                 ks_stat = np.nan
+                 ks_pval = np.nan
+
+             results.append({
+                 'Chrom': chrom,
+                 'Pos': pos + 1,  # report 1-based positions, matching VCF/MAF convention
+                 'Ref': ref,
+                 'Alt': alt,
+                 'Mut_Count': n_mut,
+                 'WT_Count': n_wt,
+                 'Mut_MeanSize': mut_mean,
+                 'WT_MeanSize': wt_mean,
+                 'Delta_Size': delta_size,
+                 'KS_Stat': ks_stat,
+                 'KS_Pval': ks_pval
+             })
+
+         # Write output
+         out_df = pd.DataFrame(results)
+         out_df.to_csv(output_file, sep='\t', index=False)
+         logger.info(f"mFSD analysis complete. Results written to {output_file}")
+
+     except typer.Exit:
+         # Re-raise cleanly: typer.Exit subclasses RuntimeError and would
+         # otherwise be caught and re-logged by the handler below.
+         raise
+     except Exception as e:
+         logger.error(f"Fatal error in calc_mfsd: {e}")
+         raise typer.Exit(1)
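
For intuition on the KS statistic reported above, a standalone sketch with synthetic fragment sizes (a shorter mutant distribution, as expected for ctDNA):

```python
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
mutant = rng.normal(145, 20, size=200)     # synthetic mutant fragment sizes
wildtype = rng.normal(167, 20, size=2000)  # synthetic wild-type sizes
stat, pval = ks_2samp(mutant, wildtype)    # large stat / small pval => size shift
```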
+
+ def mfsd(
+     bam_path: Path = typer.Argument(..., help="Input BAM file"),
+     input_file: Path = typer.Option(..., "--input", "-i", help="Input VCF or MAF file containing variants"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output file path (TSV)"),
+     format: str = typer.Option("auto", "--format", "-f", help="Input format: 'auto', 'vcf', or 'maf'"),
+     map_quality: int = typer.Option(20, "--map-quality", "-q", help="Minimum mapping quality")
+ ) -> None:
+     """
+     Calculate Mutant Fragment Size Distribution (mFSD) features.
+     Compares fragment sizes of mutant vs. wild-type reads at variant sites.
+     """
+     if not bam_path.exists():
+         logger.error(f"BAM file not found: {bam_path}")
+         raise typer.Exit(1)
+     if not input_file.exists():
+         logger.error(f"Input variant file not found: {input_file}")
+         raise typer.Exit(1)
+
+     # Create parent dir for output if needed
+     output.parent.mkdir(parents=True, exist_ok=True)
+
+     calc_mfsd(bam_path, input_file, output, format, map_quality)
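
A minimal sketch of calling the analysis directly from Python (file names are hypothetical; the BAM must be coordinate-sorted and indexed):

```python
from pathlib import Path
from krewlyzer.mfsd import calc_mfsd

calc_mfsd(
    bam_file=Path("tumor.bam"),        # hypothetical indexed BAM
    input_file=Path("variants.vcf"),   # SNVs; indels are skipped
    output_file=Path("tumor.mFSD.tsv"),
    input_format="vcf",
    map_quality=20,
)
```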