krewlyzer-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/fsr.py ADDED
@@ -0,0 +1,225 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import logging
+ import sys
+
+ import pysam
+ import numpy as np
+ import pandas as pd
+ from rich.console import Console
+ from rich.logging import RichHandler
+
+ from .helpers import gc_correct
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("fsr")
+
+ def _calc_fsr(
+     bedgz_input: str | Path,
+     bin_input: str | Path,
+     windows: int,
+     continue_n: int,
+     output_file: str | Path
+ ):
+     """
+     Internal: Calculate fragment size ratio (FSR) for a single .bed.gz file.
+     Optimized with vectorized operations.
+     """
+     try:
+         logger.info(f"Processing {bedgz_input} with bins from {bin_input}")
+
+         # Load bins
+         try:
+             bins_df = pd.read_csv(bin_input, sep='\t', header=None, usecols=[0, 1, 2], names=['chrom', 'start', 'end'], dtype={'chrom': str, 'start': int, 'end': int})
+         except Exception as e:
+             logger.error(f"Could not load bins from {bin_input}: {e}")
+             raise typer.Exit(1)
+
+         try:
+             tbx = pysam.TabixFile(filename=bedgz_input, mode="r")
+         except Exception as e:
+             logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
+             raise typer.Exit(1)
+
+         shorts_ratios = []
+         ultra_shorts_ratios = []
+         inter_ratios = []
+         longs_ratios = []
+
+         # Iterate over bins
+         for _, bin_row in bins_df.iterrows():
+             chrom = bin_row['chrom']
+             start = bin_row['start']
+             end = bin_row['end']
+
+             try:
+                 rows = list(tbx.fetch(chrom, start, end, parser=pysam.asTuple()))
+             except ValueError:
+                 rows = []
+             except Exception as e:
+                 logger.error(f"Error fetching {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+             if not rows:
+                 shorts_ratios.append(0)
+                 ultra_shorts_ratios.append(0)
+                 inter_ratios.append(0)
+                 longs_ratios.append(0)
+                 continue
+
+             try:
+                 # Vectorized parsing
+                 _, starts, ends, _ = zip(*rows)
+                 starts = np.array(starts, dtype=int)
+                 ends = np.array(ends, dtype=int)
+                 lengths = ends - starts
+
+                 # Filter 65-400
+                 mask = (lengths >= 65) & (lengths <= 400)
+                 valid_lengths = lengths[mask]
+
+                 total = len(valid_lengths)
+
+                 if total == 0:
+                     shorts_ratios.append(0)
+                     ultra_shorts_ratios.append(0)
+                     inter_ratios.append(0)
+                     longs_ratios.append(0)
+                 else:
+                     shorts = np.sum((valid_lengths >= 65) & (valid_lengths <= 150))
+                     ultra_shorts = np.sum((valid_lengths >= 65) & (valid_lengths <= 100))
+                     intermediates = np.sum((valid_lengths >= 151) & (valid_lengths <= 260))
+                     longs = np.sum((valid_lengths >= 261) & (valid_lengths <= 400))
+
+                     shorts_ratios.append(shorts / total)
+                     ultra_shorts_ratios.append(ultra_shorts / total)
+                     inter_ratios.append(intermediates / total)
+                     longs_ratios.append(longs / total)
+
+             except Exception as e:
+                 logger.error(f"Error processing data in bin {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+         # Aggregation into windows
+         df = pd.DataFrame({
+             'chrom': bins_df['chrom'],
+             'start': bins_df['start'],
+             'end': bins_df['end'],
+             'short_r': shorts_ratios,
+             'ultra_short_r': ultra_shorts_ratios,
+             'inter_r': inter_ratios,
+             'long_r': longs_ratios
+         })
+
+         results = []
+
+         for chrom, group in df.groupby('chrom', sort=False):
+             n_bins = len(group)
+             n_windows = n_bins // continue_n
+
+             if n_windows == 0:
+                 continue
+
+             trunc_len = n_windows * continue_n
+
+             short_mat = group['short_r'].values[:trunc_len].reshape(n_windows, continue_n)
+             ultra_short_mat = group['ultra_short_r'].values[:trunc_len].reshape(n_windows, continue_n)
+             inter_mat = group['inter_r'].values[:trunc_len].reshape(n_windows, continue_n)
+             long_mat = group['long_r'].values[:trunc_len].reshape(n_windows, continue_n)
+
+             # Mean of ratios
+             mean_short = short_mat.mean(axis=1)
+             mean_ultra_short = ultra_short_mat.mean(axis=1)
+             mean_inter = inter_mat.mean(axis=1)
+             mean_long = long_mat.mean(axis=1)
+
+             window_starts = np.arange(n_windows) * continue_n * windows
+             window_ends = (np.arange(n_windows) + 1) * continue_n * windows - 1
+
+             results.append(pd.DataFrame({
+                 'chrom': chrom,
+                 'start': window_starts,
+                 'end': window_ends,
+                 'short_mean': mean_short,
+                 'ultra_short_mean': mean_ultra_short,
+                 'inter_mean': mean_inter,
+                 'long_mean': mean_long
+             }))
+
+         if not results:
+             logger.warning("No valid windows found.")
+             return
+
+         final_df = pd.concat(results, ignore_index=True)
+
+         # Write output
+         with open(output_file, 'w') as f:
+             f.write("region\tshort-ratio\tultra-short-ratio\tintermediate-ratio\tlong-ratio\n")
+             for _, row in final_df.iterrows():
+                 region = f"{row['chrom']}:{int(row['start'])}-{int(row['end'])}"
+                 f.write(f"{region}\t{row['short_mean']:.4f}\t{row['ultra_short_mean']:.4f}\t{row['inter_mean']:.4f}\t{row['long_mean']:.4f}\n")
+
+         logger.info(f"FSR calculation complete. Results written to {output_file}")
+
+     except typer.Exit:
+         # Re-raise cleanly: typer.Exit subclasses RuntimeError, so the blanket
+         # handler below would otherwise swallow and re-log it.
+         raise
+     except Exception as e:
+         logger.error(f"Fatal error in _calc_fsr: {e}")
+         raise typer.Exit(1)
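
The window aggregation above truncates each chromosome to a whole number of windows and averages consecutive bins via NumPy's reshape. A minimal standalone sketch of the same trick, on toy values:

```python
import numpy as np

# With continue_n = 3, nine per-bin ratios collapse into three window means.
ratios = np.array([0.2, 0.4, 0.6, 0.1, 0.1, 0.1, 0.9, 0.3, 0.3])
continue_n = 3
n_windows = len(ratios) // continue_n          # 3 full windows
trunc = ratios[: n_windows * continue_n]       # drop any trailing partial window
window_means = trunc.reshape(n_windows, continue_n).mean(axis=1)
print(window_means)  # [0.4 0.1 0.5]
```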
+
+ def _run_fsr_file(bedgz_file: Path, bin_input: Path, windows: int, continue_n: int, output_dir: Path) -> str:
+     # Module-level worker so ProcessPoolExecutor can pickle it
+     # (a closure defined inside fsr() cannot be sent to worker processes).
+     output_file = output_dir / (bedgz_file.stem.replace('.bed', '') + '.FSR.txt')
+     _calc_fsr(str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
+     return str(output_file)
+
+ def fsr(
+     bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
+     bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
+     windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
+     continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
+     threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
+ ):
+     """
+     Calculate fragment size ratio (FSR) features for all .bed.gz files in a folder.
+     The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
+     Output files are written to the output directory, one per .bed.gz file.
+     """
+     # Input checks
+     if not bedgz_path.exists():
+         logger.error(f"Input directory not found: {bedgz_path}")
+         raise typer.Exit(1)
+     if bin_input and not bin_input.exists():
+         logger.error(f"Bin input file not found: {bin_input}")
+         raise typer.Exit(1)
+     try:
+         output.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Could not create output directory {output}: {e}")
+         raise typer.Exit(1)
+     bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
+     if not bedgz_files:
+         logger.error("No .bed.gz files found in the specified folder.")
+         raise typer.Exit(1)
+     if bin_input is None:
+         bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
+         logger.info(f"No bin_input specified. Using default: {bin_input}")
+         if not bin_input.exists():
+             logger.error(f"Bin input file does not exist: {bin_input}")
+             raise typer.Exit(1)
+     logger.info(f"Calculating FSR for {len(bedgz_files)} files...")
+     from concurrent.futures import ProcessPoolExecutor, as_completed
+     logger.info(f"Starting parallel FSR calculation using {threads} processes...")
+     with ProcessPoolExecutor(max_workers=threads) as executor:
+         futures = {executor.submit(_run_fsr_file, bedgz_file, bin_input, windows, continue_n, output): bedgz_file for bedgz_file in bedgz_files}
+         for future in as_completed(futures):
+             bedgz_file = futures[future]
+             try:
+                 result = future.result()
+                 logger.info(f"FSR calculated: {result}")
+             except Exception as exc:
+                 logger.error(f"FSR calculation failed for {bedgz_file}: {exc}")
+     logger.info(f"FSR features calculated for {len(bedgz_files)} files.")
krewlyzer/helpers.py ADDED
@@ -0,0 +1,237 @@
+ import pysam
+ import itertools
+ import os
+ import numpy as np
+ import pandas as pd
+ import math
+ from collections import defaultdict
+ from rich.logging import RichHandler
+ import logging
+ from skmisc.loess import loess
+ from pathlib import Path
+
+ logging.basicConfig(level="INFO", handlers=[RichHandler()], format="%(message)s")
+ logger = logging.getLogger("krewlyzer-helpers")
+
+ def gc_correct(coverage: list[int | float], bias: list[float]) -> list[float]:
+     """
+     Perform GC bias correction on coverage values using LOESS regression.
+     Logs errors and returns the input coverage (with NaN-bias positions
+     zeroed) if fitting is not possible or fails.
+     """
+     covl = len(coverage)
+     valid = [True for _ in range(covl)]
+     temp_cov = []
+     temp_bias = []
+     for i in range(covl):
+         if np.isnan(bias[i]):
+             valid[i] = False
+         else:
+             temp_cov.append(coverage[i])
+             temp_bias.append(bias[i])
+
+     if not temp_cov or not temp_bias:
+         logger.warning("No valid coverage/bias values for GC correction. Returning original values.")
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     # Check for sufficient data points and variance for LOESS
+     if len(temp_cov) < 20:
+         logger.warning(f"Too few data points ({len(temp_cov)}) for LOESS GC correction. Returning original values.")
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     if np.std(temp_bias) == 0:
+         logger.warning("No variance in GC bias values. Skipping LOESS correction.")
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     med = np.median(temp_cov)
+     correct_cov = []
+     try:
+         i = np.arange(np.min(temp_bias), np.max(temp_bias), 0.001)
+         coverage_trend = loess(temp_bias, temp_cov, span=0.75)
+         coverage_trend.fit()
+         coverage_model = loess(i, coverage_trend.predict(i, stderror=True).values)
+         coverage_model.fit()
+         coverage_pred = coverage_model.predict(temp_bias, stderror=True)
+         pred = np.array(coverage_pred.values)
+         coverage_corrected = temp_cov - pred + med
+     except Exception as e:
+         logger.error(f"GC correction failed: {e}")
+         # Return original values on failure
+         return [0 if np.isnan(b) else c for c, b in zip(coverage, bias)]
+
+     i, j = 0, 0
+     while i < covl:
+         if valid[i]:
+             if coverage_corrected[j] < 0:
+                 correct_cov.append(0)
+             else:
+                 correct_cov.append(coverage_corrected[j])
+             j += 1
+         else:
+             correct_cov.append(0)
+         i += 1
+     return correct_cov
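
A minimal sketch of calling gc_correct on synthetic data (the values are illustrative; real inputs come from per-bin coverage counts and GC fractions):

```python
import numpy as np
from krewlyzer.helpers import gc_correct

rng = np.random.default_rng(0)
gc = rng.uniform(0.3, 0.7, size=100).tolist()          # per-bin GC fraction
cov = [50 + 40 * g + rng.normal(0, 2) for g in gc]     # coverage with a GC trend
corrected = gc_correct(cov, gc)                        # trend removed, median preserved
```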
+
+ class commonError(Exception):
+     def __init__(self, message):
+         super().__init__(message)
+         logger.error(f"commonError: {message}")
+         self.message = message
+
+ def maxCore(nCore: int | None = None) -> int | None:
+     if nCore and nCore > 16:
+         logger.warning("Requested nCore > 16; capping to 16.")
+         return 16
+     else:
+         return nCore
+
+ # Alias for CLI import consistency
+ max_core = maxCore
+
+ def rmEndString(x: str, y: list[str]) -> str:
+     # Strip a matching suffix only from the end of the string
+     # (str.replace would also remove matches in the middle).
+     for item in y:
+         if x.endswith(item):
+             x = x[: -len(item)]
+     return x
+
+ def isSoftClipped(cigar: list[tuple[int, int]]) -> bool:
+     """
+     cigar information:
+     S BAM_CSOFT_CLIP 4
+     H BAM_CHARD_CLIP 5
+     P BAM_CPAD 6
+     """
+     for (op, count) in cigar:
+         if op in [4, 5, 6]:
+             return True
+     return False
+
+ def GCcontent(seq: str) -> float:
+     try:
+         nA = seq.count("a") + seq.count("A")
+         nT = seq.count("t") + seq.count("T")
+         nG = seq.count("g") + seq.count("G")
+         nC = seq.count("c") + seq.count("C")
+         percent_GC = (nG + nC) / (nA + nT + nG + nC) if (nA + nT + nG + nC) > 0 else 0
+         return percent_GC
+     except Exception as e:
+         logger.error(f"GCcontent calculation failed: {e}")
+         return 0
+
+ def read_pair_generator(bam: pysam.AlignmentFile, region_string: str | None = None):
+     """
+     Generate read pairs in a BAM file or within a region string.
+     Reads are added to read_dict until a pair is found.
+     Reference: https://www.biostars.org/p/306041/
+     """
+     read_dict = defaultdict(lambda: [None, None])
+     try:
+         for read in bam.fetch(region=region_string):
+             if read.is_unmapped or read.is_qcfail or read.is_duplicate:
+                 continue
+             if not read.is_paired or not read.is_proper_pair:
+                 continue
+             if read.is_secondary or read.is_supplementary:
+                 continue
+             if read.mate_is_unmapped:
+                 continue
+             if read.rnext != read.tid:
+                 continue
+             if read.template_length == 0:
+                 continue
+             if isSoftClipped(read.cigar):
+                 continue
+             qname = read.query_name
+             if qname not in read_dict:
+                 if read.is_read1:
+                     read_dict[qname][0] = read
+                 else:
+                     read_dict[qname][1] = read
+             else:
+                 if read.is_read1:
+                     yield read, read_dict[qname][1]
+                 else:
+                     yield read_dict[qname][0], read
+                 del read_dict[qname]
+     except Exception as e:
+         logger.error(f"Error during BAM read pair generation: {e}")
+         return
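
A minimal sketch of iterating proper pairs with read_pair_generator (the BAM path and region name are hypothetical; fetch requires a coordinate-sorted, indexed BAM):

```python
import pysam
from krewlyzer.helpers import read_pair_generator

with pysam.AlignmentFile("sample.bam", "rb") as bam:  # hypothetical file
    for read1, read2 in read_pair_generator(bam, region_string="chr1"):
        # For a proper pair, TLEN on read1 gives the fragment length.
        fragment_length = abs(read1.template_length)
```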
+
+ def reverse_complement(seq: str) -> str:
+     """
+     Return the reverse complement of a DNA sequence.
+     """
+     trans_table = str.maketrans("ATCGatcgNn", "TAGCtagcNn")
+     return seq.translate(trans_table)[::-1]
+
+ def get_End_motif(Emotif: dict[str, int], end5: str, end3: str) -> dict[str, int]:
+     """
+     Update End Motif frequency dictionary.
+     end5: 5' end of the fragment (from Read 1)
+     end3: 3' end of the fragment (from Read 2, already reverse complemented to be on forward strand relative to fragment)
+     """
+     if "N" in end5 or "n" in end5 or "N" in end3 or "n" in end3:
+         return Emotif
+
+     # Note on the logic: cfDNAFE complemented (but did not reverse) seq2 and
+     # passed forward_end3 twice. Treating the fragment as double stranded, the
+     # 5' end of the forward strand is `end5`, and the 5' end of the reverse
+     # strand is the reverse complement of the fragment's 3' end. motif.py is
+     # updated to pass the 5' 4-mer of Read 1 and the 5' 4-mer of Read 2
+     # directly, so here we simply count both motifs.
+
+     if end5 in Emotif:
+         Emotif[end5] += 1
+     if end3 in Emotif:
+         Emotif[end3] += 1
+     return Emotif
+
+ def calc_MDS(inputEndMotifFile: str | Path, outputfile: str | Path) -> None:
+     inputfile = pd.read_table(inputEndMotifFile, header=None, names=['bases', 'frequency'])
+     k_mer = math.log(len(inputfile), 4)
+     frequency = inputfile['frequency'].to_numpy()
+     MDS = np.sum(-frequency * np.log2(frequency) / np.log2(4 ** k_mer))
+     with open(outputfile, 'a') as f:
+         f.write(str(inputEndMotifFile) + '\t' + str(MDS) + '\n')
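
calc_MDS computes the motif diversity score as Shannon entropy normalized to [0, 1], with k recovered from the row count as log4(number of k-mers); this assumes the frequency column holds nonzero relative frequencies summing to 1:

```latex
\mathrm{MDS} \;=\; \sum_{m \in \{A,C,G,T\}^{k}} \frac{-f_m \log_2 f_m}{\log_2 4^{k}},
\qquad \sum_m f_m = 1,\quad f_m > 0
```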
+
+ def get_Breakpoint_motif(Bpmotif: dict[str, int], seq1: str, seq2: str) -> dict[str, int]:
+     """
+     Update Breakpoint Motif frequency dictionary.
+     seq1: Sequence around the 5' end of the fragment.
+     seq2: Sequence around the 3' end of the fragment.
+     """
+     if "N" in seq1 or "n" in seq1 or "N" in seq2 or "n" in seq2:
+         return Bpmotif
+
+     # Similar to End Motif, we just count the motifs at both breakpoints.
+     # The caller should ensure seq1 and seq2 are the correct sequences surrounding the breakpoints.
+
+     if seq1 in Bpmotif:
+         Bpmotif[seq1] += 1
+     if seq2 in Bpmotif:
+         Bpmotif[seq2] += 1
+     return Bpmotif
krewlyzer/mfsd.py ADDED
@@ -0,0 +1,236 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import logging
+ import pysam
+ import pandas as pd
+ import numpy as np
+ from scipy.stats import ks_2samp
+ from rich.console import Console
+ from rich.logging import RichHandler
+ from rich.progress import track
+ import os
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("mfsd")
+
+ def classify_read(read: pysam.AlignedSegment, pos: int, ref: str, alt: str) -> str:
+     """
+     Classify a read as Mutant or Wild-Type at a specific genomic position.
+     Currently supports SNVs.
+     pos: 0-based genomic position.
+     """
+     try:
+         # Check if read covers the position
+         if read.reference_start > pos or read.reference_end <= pos:
+             return "Unknown"
+
+         # Map the reference position to a query position.
+         # get_aligned_pairs returns (query_pos, ref_pos); matches_only=True
+         # keeps only aligned columns (neither side None), which still
+         # includes mismatches, so the variant base remains visible.
+         pairs = read.get_aligned_pairs(matches_only=True)
+         for q_pos, r_pos in pairs:
+             if r_pos == pos:
+                 base = read.query_sequence[q_pos].upper()
+                 if base == alt:
+                     return "Mutant"
+                 if base == ref:
+                     return "WildType"
+                 return "Other"  # Different base
+     except Exception:
+         return "Unknown"
+     return "Unknown"
+
+ def parse_input_file(input_file: Path, input_format: str) -> pd.DataFrame:
+     """
+     Parse VCF or MAF file into a standardized DataFrame.
+     Returns DataFrame with columns: [chrom, pos, ref, alt] (pos is 0-based)
+     """
+     if input_format == "auto":
+         if input_file.suffix.lower() in ['.vcf', '.gz']:  # .vcf.gz
+             input_format = "vcf"
+         elif input_file.suffix.lower() in ['.maf', '.txt', '.tsv']:
+             input_format = "maf"
+         else:
+             raise ValueError(f"Could not determine format for {input_file}. Please specify --format.")
+
+     variants = []
+
+     if input_format == "vcf":
+         try:
+             vcf = pysam.VariantFile(str(input_file))
+             for record in vcf:
+                 # VCF is 1-based; pysam exposes record.pos as 1-based and
+                 # record.start as 0-based. Only the first ALT allele is used
+                 # when several exist; record.alts is None for records without
+                 # an ALT, so guard before indexing.
+                 if record.alts:
+                     variants.append({
+                         'chrom': record.chrom,
+                         'pos': record.start,  # 0-based
+                         'ref': record.ref,
+                         'alt': record.alts[0]
+                     })
+         except Exception as e:
+             logger.error(f"Error parsing VCF: {e}")
+             raise typer.Exit(1)
+
+     elif input_format == "maf":
+         try:
+             # MAF is tab-delimited.
+             # Columns: Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2
+             df = pd.read_csv(input_file, sep='\t', comment='#')
+             # Check required columns (standard MAF names are assumed)
+             required = ['Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2']
+             if not all(col in df.columns for col in required):
+                 raise ValueError(f"MAF file missing required columns: {required}")
+
+             for _, row in df.iterrows():
+                 variants.append({
+                     'chrom': str(row['Chromosome']),
+                     'pos': int(row['Start_Position']) - 1,  # MAF is 1-based
+                     'ref': row['Reference_Allele'],
+                     'alt': row['Tumor_Seq_Allele2']
+                 })
+         except Exception as e:
+             logger.error(f"Error parsing MAF: {e}")
+             raise typer.Exit(1)
+
+     return pd.DataFrame(variants)
+
+ def calc_mfsd(
+     bam_file: Path,
+     input_file: Path,
+     output_file: Path,
+     input_format: str = "auto",
+     map_quality: int = 20
+ ) -> None:
+     """
+     Calculate Mutant Fragment Size Distribution metrics.
+     """
+     try:
+         logger.info(f"Parsing variants from {input_file}...")
+         variants_df = parse_input_file(input_file, input_format)
+         logger.info(f"Found {len(variants_df)} variants.")
+
+         bam = pysam.AlignmentFile(str(bam_file), "rb")
+
+         results = []
+
+         for _, var in track(variants_df.iterrows(), total=len(variants_df), description="Processing variants..."):
+             chrom = var['chrom']
+             pos = var['pos']
+             ref = var['ref']
+             alt = var['alt']
+
+             # classify_read compares single bases, so only SNVs are supported;
+             # skip indels in this version to keep the SNV logic correct.
+             if len(ref) > 1 or len(alt) > 1:
+                 continue
+
+             mutant_lengths = []
+             wt_lengths = []
+
+             try:
+                 # Fetch reads overlapping the variant (pos is 0-based).
+                 for read in bam.fetch(chrom, pos, pos + 1):
+                     if read.mapping_quality < map_quality:
+                         continue
+                     if read.is_duplicate or read.is_unmapped or read.is_secondary:
+                         continue
+
+                     cls = classify_read(read, pos, ref, alt)
+
+                     # template_length (TLEN) is the insert size; 0 means
+                     # single-ended or unavailable, so fall back to read length.
+                     length = abs(read.template_length)
+                     if length == 0:
+                         length = read.query_length
+
+                     if cls == "Mutant":
+                         mutant_lengths.append(length)
+                     elif cls == "WildType":
+                         wt_lengths.append(length)
+
+             except Exception as e:
+                 logger.warning(f"Error fetching reads at {chrom}:{pos}: {e}")
+                 continue
+
+             # Calculate metrics
+             n_mut = len(mutant_lengths)
+             n_wt = len(wt_lengths)
+
+             if n_mut > 0 and n_wt > 0:
+                 mut_mean = np.mean(mutant_lengths)
+                 wt_mean = np.mean(wt_lengths)
+                 delta_size = wt_mean - mut_mean
+
+                 # KS test between mutant and wild-type size distributions
+                 ks_stat, ks_pval = ks_2samp(mutant_lengths, wt_lengths)
+             else:
+                 mut_mean = np.nan
+                 wt_mean = np.nan
+                 delta_size = np.nan
+                 ks_stat = np.nan
+                 ks_pval = np.nan
+
+             results.append({
+                 'Chrom': chrom,
+                 'Pos': pos + 1,  # report 1-based positions, matching VCF/MAF convention
+                 'Ref': ref,
+                 'Alt': alt,
+                 'Mut_Count': n_mut,
+                 'WT_Count': n_wt,
+                 'Mut_MeanSize': mut_mean,
+                 'WT_MeanSize': wt_mean,
+                 'Delta_Size': delta_size,
+                 'KS_Stat': ks_stat,
+                 'KS_Pval': ks_pval
+             })
+
+         # Write output
+         out_df = pd.DataFrame(results)
+         out_df.to_csv(output_file, sep='\t', index=False)
+         logger.info(f"mFSD analysis complete. Results written to {output_file}")
+
+     except typer.Exit:
+         # Re-raise cleanly: typer.Exit subclasses RuntimeError and would
+         # otherwise be caught and re-logged by the handler below.
+         raise
+     except Exception as e:
+         logger.error(f"Fatal error in calc_mfsd: {e}")
+         raise typer.Exit(1)
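
For intuition on the KS statistic reported above, a standalone sketch with synthetic fragment sizes (a shorter mutant distribution, as expected for ctDNA):

```python
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
mutant = rng.normal(145, 20, size=200)     # synthetic mutant fragment sizes
wildtype = rng.normal(167, 20, size=2000)  # synthetic wild-type sizes
stat, pval = ks_2samp(mutant, wildtype)    # large stat / small pval => size shift
```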
+
+ def mfsd(
+     bam_path: Path = typer.Argument(..., help="Input BAM file"),
+     input_file: Path = typer.Option(..., "--input", "-i", help="Input VCF or MAF file containing variants"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output file path (TSV)"),
+     format: str = typer.Option("auto", "--format", "-f", help="Input format: 'auto', 'vcf', or 'maf'"),
+     map_quality: int = typer.Option(20, "--map-quality", "-q", help="Minimum mapping quality")
+ ) -> None:
+     """
+     Calculate Mutant Fragment Size Distribution (mFSD) features.
+     Compares fragment sizes of mutant vs. wild-type reads at variant sites.
+     """
+     if not bam_path.exists():
+         logger.error(f"BAM file not found: {bam_path}")
+         raise typer.Exit(1)
+     if not input_file.exists():
+         logger.error(f"Input variant file not found: {input_file}")
+         raise typer.Exit(1)
+
+     # Create parent dir for output if needed
+     output.parent.mkdir(parents=True, exist_ok=True)
+
+     calc_mfsd(bam_path, input_file, output, format, map_quality)
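
A minimal sketch of calling the analysis directly from Python (file names are hypothetical; the BAM must be coordinate-sorted and indexed):

```python
from pathlib import Path
from krewlyzer.mfsd import calc_mfsd

calc_mfsd(
    bam_file=Path("tumor.bam"),        # hypothetical indexed BAM
    input_file=Path("variants.vcf"),   # SNVs; indels are skipped
    output_file=Path("tumor.mFSD.tsv"),
    input_format="vcf",
    map_quality=20,
)
```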