uht-tooling 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2480 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import subprocess
4
+ import sys
5
+ import glob
6
+ import os
7
+ import shutil
8
+ import logging
9
+ import pysam
10
+ import random
11
+ import numpy as np
12
+ from Bio import SeqIO
13
+ from Bio.Seq import Seq
14
+ import matplotlib.pyplot as plt
15
+ import math
16
+ import tempfile
17
+ from pathlib import Path
18
+ from typing import Dict, Iterable, List, Optional, Sequence
19
+
20
+ # Use a built-in Matplotlib style ("ggplot") for consistency
21
+ plt.style.use("ggplot")
22
+
23
+ # Try to import scipy for Z‐test p‐value and KDE
24
+ try:
25
+ from scipy.stats import norm, gaussian_kde, beta, binom
26
+ HAVE_SCIPY = True
27
+ except ImportError:
28
+ HAVE_SCIPY = False
29
+ print("Warning: scipy is not installed. Z-test p-values and KDE may be limited.")
30
+
31
+ def setup_logging(log_dir):
32
+ """
33
+ Configure logging so that INFO (and above) go into run.log inside log_dir.
34
+ Before configuring, remove any existing handlers to avoid carryover.
35
+ """
36
+ # Remove any existing handlers
37
+ for handler in logging.root.handlers[:]:
38
+ logging.root.removeHandler(handler)
39
+
40
+ os.makedirs(log_dir, exist_ok=True)
41
+ log_file = os.path.join(log_dir, "run.log")
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format="%(asctime)s %(levelname)s: %(message)s",
45
+ handlers=[
46
+ logging.FileHandler(log_file),
47
+ # INFO logs go into file only
48
+ ]
49
+ )
50
+
51
+ def run_minimap2(reads_input, ref_fasta, out_prefix, work_dir):
52
+ """
53
+ Runs minimap2 on reads_input (either one FASTQ or a glob pattern).
54
+ Saves SAM to work_dir/out_prefix.sam.
55
+ Returns the full path to the SAM file.
56
+ """
57
+ # Expand glob if pattern contains wildcard; otherwise assume it's a single path
58
+ fastq_files = glob.glob(reads_input) if ("*" in reads_input or "?" in reads_input) else [reads_input]
59
+ if not fastq_files:
60
+ logging.error(f"No FASTQ files found matching {reads_input}")
61
+ raise FileNotFoundError(f"No FASTQ files found matching {reads_input}")
62
+
63
+ sam_output = os.path.join(work_dir, f"{out_prefix}.sam")
64
+ cmd = ["minimap2", "-ax", "map-ont", ref_fasta, *fastq_files]
65
+ logging.info(f"Running minimap2: {' '.join(cmd)} → {sam_output}")
66
+ with open(sam_output, "w") as out_sam:
67
+ subprocess.run(cmd, stdout=out_sam, check=True)
68
+ return sam_output
69
+
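+ # Usage sketch for the aligner wrapper above (illustrative only: the
+ # paths are hypothetical and minimap2 must be on PATH):
+ #
+ #     sam_path = run_minimap2("reads/*.fastq.gz", "plasmid.fasta", "plasmid", "/tmp/work")
+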
70
+ def load_single_sequence(fasta_path):
71
+ """
72
+ Loads exactly one sequence from a FASTA. Raises if not exactly one record.
73
+ Returns (sequence_string, sequence_id).
74
+ """
75
+ records = list(SeqIO.parse(fasta_path, "fasta"))
76
+ if len(records) != 1:
77
+ logging.error(f"Expected exactly 1 record in {fasta_path}, found {len(records)}.")
78
+ raise ValueError(f"Expected exactly 1 record in {fasta_path}, found {len(records)}.")
79
+ seq_id = records[0].id
80
+ seq_str = str(records[0].seq)
81
+ logging.info(f"Loaded sequence {seq_id} from {fasta_path}")
82
+ return seq_str, seq_id
83
+
84
+ def create_multi_fasta(chunks, work_dir, out_fasta_prefix="plasmid_chunks"):
85
+ """
86
+ Writes each chunk as a separate FASTA entry into work_dir/out_fasta_prefix.fasta:
87
+ >chunk_1
88
+ ACTG...
89
+ >chunk_2
90
+ TTAG...
91
+ Returns the path to the created FASTA.
92
+ """
93
+ out_fasta = os.path.join(work_dir, f"{out_fasta_prefix}.fasta")
94
+ logging.info(f"Writing {len(chunks)} chunks to {out_fasta}")
95
+ with open(out_fasta, "w") as f:
96
+ for i, seq in enumerate(chunks):
97
+ f.write(f">chunk_{i+1}\n")
98
+ f.write(str(seq) + "\n")
99
+ return out_fasta
100
+
101
+ def calculate_background_from_plasmid(sam_plasmid, plasmid_seq, target_start, target_length):
102
+ """
103
+ Calculate background mismatch statistics from full plasmid alignment, excluding target region.
104
+
105
+ Args:
106
+ sam_plasmid: Path to SAM file with full plasmid alignment
107
+ plasmid_seq: Full plasmid sequence
108
+ target_start: Start position of target region (0-based)
109
+ target_length: Length of target region
110
+
111
+ Returns:
112
+ tuple: (total_mismatches, total_covered_bases, mapped_reads)
113
+ """
114
+ target_end = target_start + target_length
115
+
116
+ # Initialize counters
117
+ total_mismatches = 0
118
+ total_covered_bases = 0
119
+ mapped_reads = 0
120
+
121
+ samfile = pysam.AlignmentFile(sam_plasmid, "r")
122
+ for read in samfile.fetch():
123
+ if read.is_unmapped or read.query_sequence is None:
124
+ continue
125
+
126
+ mapped_reads += 1
127
+
128
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
129
+ if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
130
+ # Skip positions within the target region
131
+ if target_start <= ref_pos < target_end:
132
+ continue
133
+
134
+ total_covered_bases += 1
135
+ if read.query_sequence[read_pos].upper() != plasmid_seq[ref_pos].upper():
136
+ total_mismatches += 1
137
+
138
+ samfile.close()
139
+
140
+ return total_mismatches, total_covered_bases, mapped_reads
141
+
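+ # Usage sketch (hypothetical values): exclude a 900 bp target starting
+ # at 0-based plasmid position 1200, then derive the background rate:
+ #
+ #     mis, cov, n_reads = calculate_background_from_plasmid(sam, plasmid_seq, 1200, 900)
+ #     bg_rate = mis / cov if cov > 0 else 0.0
+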
142
+ def calculate_rolling_mutation_rate_plasmid(sam_plasmid, plasmid_seq, window_size=20):
143
+ """
144
+ Calculate rolling mutation rate across the entire plasmid with a specified window size.
145
+
146
+ Args:
147
+ sam_plasmid: Path to SAM file with full plasmid alignment
148
+ plasmid_seq: Full plasmid sequence
149
+ window_size: Size of the rolling window (default: 20 bp)
150
+
151
+ Returns:
152
+ tuple: (positions, rolling_rates) for the entire plasmid
153
+ """
154
+ # Calculate per-position mutation rates for the entire plasmid
155
+ plasmid_mismatches = [0] * len(plasmid_seq)
156
+ plasmid_coverage = [0] * len(plasmid_seq)
157
+
158
+ samfile = pysam.AlignmentFile(sam_plasmid, "r")
159
+ for read in samfile.fetch():
160
+ if read.is_unmapped or read.query_sequence is None:
161
+ continue
162
+
163
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
164
+ if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
165
+ plasmid_coverage[ref_pos] += 1
166
+ if read.query_sequence[read_pos].upper() != plasmid_seq[ref_pos].upper():
167
+ plasmid_mismatches[ref_pos] += 1
168
+
169
+ samfile.close()
170
+
171
+ # Calculate per-position mutation rates
172
+ plasmid_rates = []
173
+ for cov, mis in zip(plasmid_coverage, plasmid_mismatches):
174
+ if cov > 0:
175
+ plasmid_rates.append(mis / cov)
176
+ else:
177
+ plasmid_rates.append(0.0)
178
+
179
+ # Calculate rolling average
180
+ rolling_rates = []
181
+ positions = []
182
+
183
+ for i in range(len(plasmid_rates) - window_size + 1):
184
+ window_rates = plasmid_rates[i:i + window_size]
185
+ rolling_rates.append(np.mean(window_rates))
186
+ positions.append(i + window_size // 2 + 1) # Center position of window
187
+
188
+ return positions, rolling_rates
189
+
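+ # The rolling average above is a plain centered moving mean; an
+ # equivalent vectorized form, for reference, is
+ # np.convolve(plasmid_rates, np.ones(w) / w, mode="valid"), which also
+ # yields len(plasmid_rates) - w + 1 window means.
+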
190
+ def compute_mismatch_stats_sam(sam_file, refs_dict):
191
+ """
192
+ For each reference in refs_dict:
193
+ - Count how many reads mapped to it
194
+ - Count per‐position coverage
195
+ - Count per‐position mismatches
196
+ - Compute total mismatches, total covered bases, average mismatch rate
197
+
198
+ Returns a dict keyed by reference name, each containing:
199
+ pos_rates -> list of mismatch rates per position
200
+ cov -> list of coverage per position
201
+ mismatch -> list of mismatch counts per position
202
+ avg_mismatch_rate -> float (raw per‐base fraction)
203
+ total_mismatches -> int
204
+ total_covered_bases -> int
205
+ mapped_reads -> int
206
+ """
207
+ mismatch_data = {
208
+ name: {
209
+ "pos_rates": [0.0] * len(seq),
210
+ "cov": [0] * len(seq),
211
+ "mismatch": [0] * len(seq),
212
+ "avg_mismatch_rate": 0.0,
213
+ "total_mismatches": 0,
214
+ "total_covered_bases": 0,
215
+ "mapped_reads": 0,
216
+ }
217
+ for name, seq in refs_dict.items()
218
+ }
219
+
220
+ logging.info(f"Computing mismatch stats for {sam_file}")
221
+ samfile = pysam.AlignmentFile(sam_file, "r")
222
+ for read in samfile.fetch():
223
+ if read.is_unmapped or read.query_sequence is None:
224
+ continue
225
+ ref_name = samfile.get_reference_name(read.reference_id)
226
+ if ref_name not in mismatch_data:
227
+ continue
228
+
229
+ mismatch_data[ref_name]["mapped_reads"] += 1
230
+ info = mismatch_data[ref_name]
231
+ ref_seq = refs_dict[ref_name]
232
+
233
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
234
+ if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(ref_seq):
235
+ info["cov"][ref_pos] += 1
236
+ if read.query_sequence[read_pos].upper() != ref_seq[ref_pos].upper():
237
+ info["mismatch"][ref_pos] += 1
238
+
239
+ samfile.close()
240
+
241
+ for name, info in mismatch_data.items():
242
+ total_mis = 0
243
+ total_cov = 0
244
+ pos_rates = []
245
+ for cov, mis in zip(info["cov"], info["mismatch"]):
246
+ if cov > 0:
247
+ rate = mis / cov
248
+ pos_rates.append(rate)
249
+ total_mis += mis
250
+ total_cov += cov
251
+ else:
252
+ pos_rates.append(0.0)
253
+
254
+ info["pos_rates"] = pos_rates
255
+ info["total_mismatches"] = total_mis
256
+ info["total_covered_bases"] = total_cov
257
+ info["avg_mismatch_rate"] = (total_mis / total_cov) if total_cov > 0 else 0.0
258
+
259
+ logging.info(
260
+ f"{name}: mapped_reads={info['mapped_reads']}, "
261
+ f"mismatches={total_mis}, covered_bases={total_cov}, "
262
+ f"avg_rate={info['avg_mismatch_rate']:.6f}"
263
+ )
264
+
265
+ return mismatch_data
266
+
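+ # Usage sketch (hypothetical inputs): the keys of refs_dict must match
+ # the reference names in the SAM header:
+ #
+ #     stats = compute_mismatch_stats_sam(sam_path, {hit_id: hit_seq})
+ #     rate = stats[hit_id]["avg_mismatch_rate"]
+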
267
+ def z_test_two_proportions(mis1, cov1, mis2, cov2):
268
+ """
269
+ Performs a two‐proportion Z‐test:
270
+ H0: p1 == p2
271
+ H1: p1 != p2
272
+
273
+ Returns (z_statistic, p_value). If scipy is unavailable, p_value=None.
274
+ """
275
+ if cov1 == 0 or cov2 == 0:
276
+ return 0.0, 1.0
277
+
278
+ p1 = mis1 / cov1
279
+ p2 = mis2 / cov2
280
+ p = (mis1 + mis2) / (cov1 + cov2)
281
+ denom = math.sqrt(p * (1 - p) * (1 / cov1 + 1 / cov2))
282
+ if denom == 0:
283
+ return 0.0, 1.0
284
+
285
+ z_stat = (p1 - p2) / denom
286
+ if HAVE_SCIPY:
287
+ p_val = 2 * (1 - norm.cdf(abs(z_stat)))
288
+ logging.info(f"Z‐test: z={z_stat:.4f}, p‐value={p_val:.4e}")
289
+ else:
290
+ p_val = None
291
+ logging.info(f"Z‐test: z={z_stat:.4f}, p‐value=(scipy unavailable)")
292
+
293
+ return z_stat, p_val
294
+
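+ # Worked example with made-up counts: 12 mismatches over 1,000 covered
+ # bases vs 30 over 5,000 pools to p = 42/6000 = 0.007 and gives
+ # z ≈ 2.08 with a two-sided p ≈ 0.038 when scipy is present:
+ #
+ #     z_stat, p_val = z_test_two_proportions(12, 1000, 30, 5000)
+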
295
+ def run_nanofilt_filtering(input_fastq, quality_threshold, output_fastq):
296
+ """
297
+ Run NanoFilt to filter FASTQ file by quality score threshold and minimum length.
298
+
299
+ Args:
300
+ input_fastq: Path to input FASTQ.gz file
301
+ quality_threshold: Quality score threshold (integer)
302
+ output_fastq: Path to output filtered FASTQ.gz file
303
+
304
+ Returns:
305
+ bool: True if successful, False otherwise
306
+ """
307
+ try:
308
+ # Use gunzip to decompress, pipe to NanoFilt with length filter, then compress output
309
+ cmd = f"gunzip -c {input_fastq} | NanoFilt -q {quality_threshold} -l 30 | gzip > {output_fastq}"
310
+ logging.info(f"Running NanoFilt with quality threshold {quality_threshold} and min length 30bp: {cmd}")
311
+
312
+ result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
313
+ if result.returncode != 0:
314
+ logging.error(f"NanoFilt failed with return code {result.returncode}: {result.stderr}")
315
+ return False
316
+
317
+ # Check if output file was created and has content
318
+ if os.path.exists(output_fastq) and os.path.getsize(output_fastq) > 0:
319
+ logging.info(f"Successfully created filtered FASTQ: {output_fastq}")
320
+ return True
321
+ else:
322
+ logging.error(f"Output file {output_fastq} was not created or is empty")
323
+ return False
324
+
325
+ except Exception as e:
326
+ logging.error(f"Error running NanoFilt: {e}")
327
+ return False
328
+
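+ # Usage sketch (illustrative paths; gunzip, NanoFilt, and gzip must be
+ # on PATH since the command runs through the shell):
+ #
+ #     ok = run_nanofilt_filtering("reads.fastq.gz", 12, "reads_q12.fastq.gz")
+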
329
+ def calculate_mutation_rate_for_quality(fastq_path, quality_threshold, work_dir, ref_hit_fasta, plasmid_fasta):
330
+ """
331
+ Calculate comprehensive AA mutation analysis for a specific quality threshold.
332
+
333
+ Args:
334
+ fastq_path: Path to input FASTQ.gz file
335
+ quality_threshold: Quality score threshold
336
+ work_dir: Working directory for temporary files
337
+ ref_hit_fasta: Path to reference hit FASTA
338
+ plasmid_fasta: Path to plasmid FASTA
339
+
340
+ Returns:
341
+ dict: Comprehensive results with error estimates or None if failed
342
+ """
343
+ try:
344
+ # Create filtered FASTQ file
345
+ filtered_fastq = os.path.join(work_dir, f"filtered_q{quality_threshold}.fastq.gz")
346
+
347
+ if not run_nanofilt_filtering(fastq_path, quality_threshold, filtered_fastq):
348
+ return None
349
+
350
+ # Load sequences
351
+ hit_seq, hit_id = load_single_sequence(ref_hit_fasta)
352
+ plasmid_seq, plasmid_id = load_single_sequence(plasmid_fasta)
353
+
354
+ # Find hit region in plasmid
355
+ idx = plasmid_seq.upper().find(hit_seq.upper())
356
+ if idx == -1:
357
+ logging.error("Gene region not found in plasmid")
358
+ return None
359
+
360
+ # Align filtered reads to hit region
361
+ sam_hit = run_minimap2(filtered_fastq, ref_hit_fasta, f"hit_q{quality_threshold}", work_dir)
362
+
363
+ # Align filtered reads to full plasmid for background calculation
364
+ sam_plasmid = run_minimap2(filtered_fastq, plasmid_fasta, f"plasmid_q{quality_threshold}", work_dir)
365
+
366
+ # Calculate background rate from full plasmid alignment, excluding target region
367
+ bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam_plasmid, plasmid_seq, idx, len(hit_seq))
368
+
369
+ # Calculate hit region mutation rate
370
+ mismatch_hit = compute_mismatch_stats_sam(sam_hit, {hit_id: hit_seq})
371
+ hit_info = mismatch_hit[hit_id]
372
+ hit_mis = hit_info["total_mismatches"]
373
+ hit_cov = hit_info["total_covered_bases"]
374
+
375
+ # Extract Q-score statistics for both hit and background regions
376
+ hit_qscore_stats = extract_qscores_from_sam(sam_hit)
377
+ bg_qscore_stats = extract_qscores_from_sam(sam_plasmid)
378
+
379
+ # Check if it's a protein-coding sequence
380
+ is_protein = True
381
+ seq_upper = hit_seq.upper()
382
+ if len(seq_upper) % 3 != 0:
383
+ is_protein = False
384
+ elif "*" in str(Seq(seq_upper).translate(to_stop=False))[:-1]:
385
+ is_protein = False
386
+
387
+ # Run comprehensive analysis if protein-coding
388
+ if is_protein:
389
+ results = comprehensive_aa_mutation_analysis(
390
+ hit_mis, hit_cov, bg_mis, bg_cov, hit_seq,
391
+ quality_threshold=quality_threshold, n_trials=10000,
392
+ hit_qscore_stats=hit_qscore_stats, bg_qscore_stats=bg_qscore_stats,
393
+ sam_hit=sam_hit, sam_plasmid=sam_plasmid, hit_seq=hit_seq, plasmid_seq=plasmid_seq
394
+ )
395
+ return results
396
+ else:
397
+ # For non-protein sequences, return basic info
398
+ return {
399
+ 'mean_aa_mutations': 0.0,
400
+ 'std_aa_mutations': 0.0,
401
+ 'ci_lower': 0.0,
402
+ 'ci_upper': 0.0,
403
+ 'hit_rate': hit_mis / hit_cov if hit_cov > 0 else 0,
404
+ 'bg_rate': bg_mis / bg_cov if bg_cov > 0 else 0,
405
+ 'net_rate': max((hit_mis / hit_cov) - (bg_mis / bg_cov), 0) if hit_cov > 0 and bg_cov > 0 else 0,
406
+ 'mappable_bases': hit_cov,
407
+ 'quality_threshold': quality_threshold,
408
+ 'is_protein': False
409
+ }
410
+
411
+ except Exception as e:
412
+ logging.error(f"Error calculating mutation rate for quality {quality_threshold}: {e}")
413
+ return None
414
+
415
+ def find_optimal_qscore(qc_results):
416
+ """
417
+ Find the Q-score threshold with the lowest net mutation rate error.
418
+
419
+ Args:
420
+ qc_results: List of comprehensive analysis results
421
+
422
+ Returns:
423
+ tuple: (optimal_qscore, optimal_result, error_comparison)
424
+ """
425
+ logging.info("=== FINDING OPTIMAL Q-SCORE THRESHOLD ===")
426
+
427
+ if not qc_results:
428
+ return None, None, None
429
+
430
+ # Find Q-score with minimum net mutation rate error
431
+ min_error = float('inf')
432
+ optimal_result = None
433
+ optimal_qscore = None
434
+
435
+ error_comparison = []
436
+
437
+ for result in qc_results:
438
+ qscore = result['quality_threshold']
439
+ # Use weighted error for optimal Q-score selection
440
+ net_rate_error = result.get('net_weighted_error', result['net_rate_error'])
441
+ mappable_bases = result['mappable_bases']
442
+
443
+ error_comparison.append({
444
+ 'qscore': qscore,
445
+ 'net_rate_error': net_rate_error,
446
+ 'net_weighted_error': result.get('net_weighted_error', 0.0),
447
+ 'mappable_bases': mappable_bases,
448
+ 'aa_mutations': result['mean_aa_mutations'],
449
+ 'aa_error': result['std_aa_mutations']
450
+ })
451
+
452
+ logging.info(f"Q{qscore}: net_weighted_error={net_rate_error:.6f}, mappable_bases={mappable_bases}")
453
+
454
+ if net_rate_error < min_error:
455
+ min_error = net_rate_error
456
+ optimal_result = result
457
+ optimal_qscore = qscore
458
+
459
+ logging.info(f"OPTIMAL Q-SCORE: Q{optimal_qscore} (lowest net mutation rate error: {min_error:.6f})")
460
+ logging.info(f"Optimal result: AA mutations = {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}")
461
+
462
+ return optimal_qscore, optimal_result, error_comparison
463
+
464
+ def run_qc_analysis(fastq_path, results_dir, ref_hit_fasta, plasmid_fasta):
465
+ """
466
+ Run simple QC analysis using segmentation-based error estimation.
467
+
468
+ Args:
469
+ fastq_path: Path to input FASTQ.gz file
470
+ results_dir: Directory to save QC results
+ ref_hit_fasta: Path to reference hit FASTA
+ plasmid_fasta: Path to plasmid FASTA
+
+ Returns:
+ tuple: (qc_results, optimal_qscore)
+ """
472
+ logging.info("Starting simple QC analysis with segmentation-based error estimation")
473
+
474
+ # Define quality thresholds to test
475
+ quality_thresholds = [10, 12, 14, 16, 18, 20, 22, 24, 26]
476
+
478
+ # Segment the input FASTQ file
479
+ logging.info("Segmenting FASTQ file into 10 parts for error estimation...")
480
+ segment_files = segment_fastq_file(fastq_path, n_segments=10)
481
+
482
+ if not segment_files:
483
+ logging.error("Failed to segment FASTQ file")
484
+ return
485
+
486
+ # Create temporary work directory for QC analysis
487
+ with tempfile.TemporaryDirectory() as qc_work_dir:
488
+ logging.info(f"Using temporary work directory: {qc_work_dir}")
489
+
490
+ # Calculate results for each quality threshold
491
+ qc_results = []
492
+ successful_thresholds = []
493
+
494
+ for q_threshold in quality_thresholds:
495
+ logging.info(f"Processing quality threshold: {q_threshold}")
496
+ result = run_segmented_analysis(
497
+ segment_files, q_threshold, qc_work_dir, ref_hit_fasta, plasmid_fasta
498
+ )
499
+
500
+ if result is not None:
501
+ qc_results.append(result)
502
+ successful_thresholds.append(q_threshold)
503
+ logging.info(f"Quality {q_threshold}: AA mutations = {result['mean_aa_mutations']:.4f} ± {result['std_aa_mutations']:.4f}, "
504
+ f"mappable bases = {result['total_mappable_bases']}")
505
+ else:
506
+ logging.warning(f"Failed to calculate mutation rate for quality threshold {q_threshold}")
507
+
508
+ # Find optimal Q-score threshold (lowest empirical error)
509
+ optimal_qscore, optimal_result = find_optimal_qscore_simple(qc_results)
510
+
511
+ # Create QC plots
512
+ if len(qc_results) >= 2:
513
+ create_simple_qc_plots(successful_thresholds, qc_results, results_dir, optimal_qscore, optimal_result)
514
+ else:
515
+ logging.warning("Insufficient data points for QC plots (need at least 2)")
516
+
517
+ # Save optimal Q-score information
518
+ if optimal_qscore is not None:
519
+ optimal_qscore_path = os.path.join(results_dir, "optimal_qscore_analysis.txt")
520
+ with open(optimal_qscore_path, 'w') as f:
521
+ f.write("=== OPTIMAL Q-SCORE ANALYSIS (PRECISION-WEIGHTED) ===\n")
522
+ f.write(f"Optimal Q-score threshold: {optimal_qscore}\n")
523
+ f.write(f"Precision-weighted score: {(1.0 / optimal_result['std_aa_mutations']) * optimal_qscore:.6f}\n" if optimal_result['std_aa_mutations'] > 0 else "Precision-weighted score: inf (perfect precision)\n")
524
+ f.write(f"Empirical error (std): {optimal_result['std_aa_mutations']:.6f}\n")
525
+ f.write(f"AA mutations per gene: {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}\n")
526
+ f.write(f"95% Confidence Interval: [{optimal_result['ci_lower']:.4f}, {optimal_result['ci_upper']:.4f}]\n")
527
+ f.write(f"Total mappable bases: {optimal_result['total_mappable_bases']}\n")
528
+ f.write(f"Number of segments: {optimal_result['n_segments']}\n")
529
+ f.write("\n=== ALL Q-SCORE COMPARISON ===\n")
530
+ f.write("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases\tAA_Mutations\tCI_Lower\tCI_Upper\n")
531
+ for result in qc_results:
532
+ precision_score = (1.0 / result['std_aa_mutations']) * result['quality_threshold'] if result['std_aa_mutations'] > 0 else float('inf')
533
+ f.write(f"{result['quality_threshold']}\t{result['std_aa_mutations']:.6f}\t{precision_score:.6f}\t{result['total_mappable_bases']}\t{result['mean_aa_mutations']:.4f}\t{result['ci_lower']:.4f}\t{result['ci_upper']:.4f}\n")
534
+
535
+ logging.info(f"Optimal Q-score analysis saved to: {optimal_qscore_path}")
536
+
537
+ # Clean up segment files
539
+ segment_dir = os.path.dirname(segment_files[0])
540
+ if os.path.exists(segment_dir):
541
+ shutil.rmtree(segment_dir)
542
+ logging.info(f"Cleaned up segment directory: {segment_dir}")
543
+
544
+ # Return both QC results and optimal Q-score for use in main analysis
545
+ return qc_results, optimal_qscore
546
+
547
+ def find_optimal_qscore_simple(qc_results):
548
+ """
549
+ Find the Q-score threshold with the highest precision-weighted score.
550
+ Precision-weighted score = (1 / standard_deviation) * q_score
551
+
552
+ Args:
553
+ qc_results: List of segmentation analysis results
554
+
555
+ Returns:
556
+ tuple: (optimal_qscore, optimal_result)
557
+ """
558
+ logging.info("=== FINDING OPTIMAL Q-SCORE THRESHOLD (PRECISION-WEIGHTED) ===")
559
+
560
+ if not qc_results:
561
+ return None, None
562
+
563
+ # Find Q-score with highest precision-weighted score
564
+ max_score = -1
565
+ optimal_result = None
566
+ optimal_qscore = None
567
+
568
+ logging.info("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases")
569
+ logging.info("-" * 60)
570
+
571
+ for result in qc_results:
572
+ qscore = result['quality_threshold']
573
+ empirical_error = result['std_aa_mutations']
574
+ mappable_bases = result['total_mappable_bases']
575
+
576
+ # Calculate precision-weighted score: (1/sd) * q_score
577
+ if empirical_error > 0:
578
+ precision_score = (1.0 / empirical_error) * qscore
579
+ else:
580
+ precision_score = float('inf') # Perfect precision
581
+
582
+ logging.info(f"Q{qscore}\t{empirical_error:.6f}\t{precision_score:.6f}\t{mappable_bases}")
583
+
584
+ if precision_score > max_score:
585
+ max_score = precision_score
586
+ optimal_result = result
587
+ optimal_qscore = qscore
588
+
589
+ logging.info("-" * 60)
590
+ logging.info(f"OPTIMAL Q-SCORE: Q{optimal_qscore} (highest precision-weighted score: {max_score:.6f})")
591
+ logging.info(f"Optimal result: AA mutations = {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}")
592
+
593
+ return optimal_qscore, optimal_result
594
+
595
+ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_qscore=None, optimal_result=None):
596
+ """
597
+ Create simple QC plots with empirical error bars.
598
+
599
+ Args:
600
+ quality_thresholds: List of quality score thresholds
601
+ qc_results: List of segmentation analysis results
602
+ results_dir: Directory to save the plots
603
+ optimal_qscore: Optimal Q-score threshold (optional)
604
+ optimal_result: Optimal result data (optional)
605
+ """
606
+ try:
607
+ # Extract data for plotting
608
+ aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
609
+ aa_errors = [r['std_aa_mutations'] for r in qc_results]
610
+ aa_ci_lower = [r['ci_lower'] for r in qc_results]
611
+ aa_ci_upper = [r['ci_upper'] for r in qc_results]
612
+ mappable_bases = [r['total_mappable_bases'] for r in qc_results]
613
+
614
+ # Create main QC plot
615
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
616
+
617
+ # Top plot: AA mutations per gene with empirical error bars
618
+ color1 = '#2E8B57'
619
+ ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
620
+ fmt='o', capsize=5, capthick=2, markersize=8,
621
+ color=color1, ecolor=color1, alpha=0.8, label='Mean ± Empirical Std')
622
+
623
+ # Add confidence intervals as shaded area
624
+ ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
625
+ alpha=0.3, color=color1, label='95% Confidence Interval')
626
+
627
+ # Highlight optimal Q-score
628
+ if optimal_qscore is not None:
629
+ ax1.axvline(x=optimal_qscore, color='red', linestyle='--', alpha=0.7,
630
+ label=f'Optimal Q{optimal_qscore}')
631
+
632
+ ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
633
+ ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
634
+ ax1.tick_params(axis='y', labelcolor=color1)
635
+ ax1.set_title('AA Mutations per Gene vs Quality Score Filter (Segmentation-Based Error)',
636
+ fontsize=14, fontweight='bold')
637
+ ax1.grid(True, alpha=0.3)
638
+ ax1.legend(frameon=False, fontsize=10)
639
+
640
+ # Add data point labels
641
+ for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
642
+ ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
643
+ (q, aa_mut), xytext=(5, 5),
644
+ textcoords='offset points', fontsize=8, alpha=0.8, color=color1)
645
+
646
+ # Bottom plot: Mappable bases and AA mutations per gene
647
+ color2 = '#FF6B6B'
648
+ color3 = '#4169E1'
649
+
650
+ # Mappable bases (right y-axis, via twinx)
651
+ ax2_twin = ax2.twinx()
652
+ ax2_twin.scatter(quality_thresholds, mappable_bases,
653
+ s=100, alpha=0.7, color=color2, edgecolors='black',
654
+ linewidth=1, marker='s', label='Mappable Bases')
655
+ ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
656
+ ax2_twin.tick_params(axis='y', labelcolor=color2)
657
+
658
+ # AA mutations per gene with error bars (left y-axis)
659
+ ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
660
+ fmt='^', capsize=5, capthick=2, markersize=8,
661
+ color=color3, ecolor=color3, alpha=0.8, label='AA Mutations ± Empirical Error')
662
+ ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
663
+ ax2.tick_params(axis='y', labelcolor=color3)
664
+ ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
665
+ ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
666
+ fontsize=14, fontweight='bold')
667
+ ax2.grid(True, alpha=0.3)
668
+
669
+ # Add legends
670
+ lines1, labels1 = ax2.get_legend_handles_labels()
671
+ lines2, labels2 = ax2_twin.get_legend_handles_labels()
672
+ ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
673
+
674
+ # Add data point labels for mappable bases
675
+ for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
676
+ ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
677
+ textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
678
+
679
+ plt.tight_layout()
680
+
681
+ # Save the plot
682
+ project_name = os.path.basename(results_dir)
683
+ qc_plot_path = os.path.join(results_dir, f"qc_plot_{project_name}.png")
684
+ fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
685
+ plt.close(fig)
686
+
687
+ logging.info(f"QC plot saved to: {qc_plot_path}")
688
+
689
+ # Save data as CSV
690
+ qc_data_path = os.path.join(results_dir, "simple_qc_data.csv")
691
+ with open(qc_data_path, 'w') as f:
692
+ f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
693
+ f.write("total_mappable_bases,n_segments\n")
694
+
695
+ for q, r in zip(quality_thresholds, qc_results):
696
+ f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
697
+ f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
698
+ f.write(f"{r['total_mappable_bases']},{r['n_segments']}\n")
699
+
700
+ logging.info(f"Simple QC data saved to: {qc_data_path}")
701
+
702
+ except Exception as e:
703
+ logging.error(f"Error creating simple QC plots: {e}")
704
+ """
705
+ Create comprehensive QC plots with error bars and uncertainty quantification.
706
+
707
+ Args:
708
+ quality_thresholds: List of quality score thresholds
709
+ qc_results: List of comprehensive analysis results
710
+ results_dir: Directory to save the plots
711
+ optimal_qscore: Optimal Q-score threshold (optional)
712
+ optimal_result: Optimal result data (optional)
713
+ """
714
+ try:
715
+ # Extract data for plotting
716
+ aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
717
+ aa_errors = [r['std_aa_mutations'] for r in qc_results]
718
+ aa_ci_lower = [r['ci_lower'] for r in qc_results]
719
+ aa_ci_upper = [r['ci_upper'] for r in qc_results]
720
+ mappable_bases = [r['mappable_bases'] for r in qc_results]
721
+ net_rates = [r['net_rate'] for r in qc_results]
722
+ net_rate_errors = [r['net_rate_error'] for r in qc_results]
723
+
724
+ # Create main QC plot with error bars
725
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))
726
+
727
+ # Top plot: AA mutations per gene with error bars
728
+ color1 = '#2E8B57'
729
+ ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
730
+ fmt='o', capsize=5, capthick=2, markersize=8,
731
+ color=color1, ecolor=color1, alpha=0.8, label='Mean ± Std')
732
+
733
+ # Add confidence intervals as shaded area
734
+ ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
735
+ alpha=0.3, color=color1, label='95% Confidence Interval')
736
+
737
+ ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
738
+ ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
739
+ ax1.tick_params(axis='y', labelcolor=color1)
740
+ ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
741
+ fontsize=14, fontweight='bold')
742
+ ax1.grid(True, alpha=0.3)
743
+ ax1.legend(frameon=False, fontsize=10)
744
+
745
+ # Add data point labels
746
+ for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
747
+ ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
748
+ (q, aa_mut), xytext=(5, 5),
749
+ textcoords='offset points', fontsize=8, alpha=0.8, color=color1)
750
+
751
+ # Bottom plot: Mappable bases and AA mutations per gene
752
+ color2 = '#FF6B6B'
753
+ color3 = '#4169E1'
754
+
755
+ # Mappable bases (right y-axis, via twinx)
756
+ ax2_twin = ax2.twinx()
757
+ ax2_twin.scatter(quality_thresholds, mappable_bases,
758
+ s=100, alpha=0.7, color=color2, edgecolors='black',
759
+ linewidth=1, marker='s', label='Mappable Bases')
760
+ ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
761
+ ax2_twin.tick_params(axis='y', labelcolor=color2)
762
+
763
+ # AA mutations per gene with error bars (left y-axis)
764
+ ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
765
+ fmt='^', capsize=5, capthick=2, markersize=8,
766
+ color=color3, ecolor=color3, alpha=0.8, label='AA Mutations ± Error')
767
+ ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
768
+ ax2.tick_params(axis='y', labelcolor=color3)
769
+ ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
770
+ ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
771
+ fontsize=14, fontweight='bold')
772
+ ax2.grid(True, alpha=0.3)
773
+
774
+ # Add legends
775
+ lines1, labels1 = ax2.get_legend_handles_labels()
776
+ lines2, labels2 = ax2_twin.get_legend_handles_labels()
777
+ ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
778
+
779
+ # Add data point labels for mappable bases
780
+ for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
781
+ ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
782
+ textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
783
+
784
+ plt.tight_layout()
785
+
786
+ # Save the comprehensive plot
787
+ qc_plot_path = os.path.join(results_dir, "comprehensive_qc_analysis.png")
788
+ fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
789
+ plt.close(fig)
790
+
791
+ logging.info(f"Comprehensive QC plot saved to: {qc_plot_path}")
792
+
793
+ # Create error analysis plot
794
+ create_error_analysis_plot(quality_thresholds, qc_results, results_dir)
795
+
796
+ # Save comprehensive data as CSV
797
+ qc_data_path = os.path.join(results_dir, "comprehensive_qc_data.csv")
798
+ with open(qc_data_path, 'w') as f:
799
+ f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
800
+ f.write("mappable_bases,hit_rate,hit_rate_ci_lower,hit_rate_ci_upper,")
801
+ f.write("bg_rate,bg_rate_ci_lower,bg_rate_ci_upper,net_rate,net_rate_error,")
802
+ f.write("lambda_bp,lambda_error,alignment_error,")
803
+ f.write("hit_qscore_mean,hit_qscore_std,hit_qscore_uncertainty,")
804
+ f.write("bg_qscore_mean,bg_qscore_std,bg_qscore_uncertainty,")
805
+ f.write("hit_weighted_rate,hit_weighted_error,bg_weighted_rate,bg_weighted_error,")
806
+ f.write("net_weighted_rate,net_weighted_error,lambda_bp_weighted,lambda_error_weighted\n")
807
+
808
+ for q, r in zip(quality_thresholds, qc_results):
809
+ f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
810
+ f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
811
+ f.write(f"{r['mappable_bases']},{r['hit_rate']:.6f},")
812
+ f.write(f"{r['hit_rate_ci'][0]:.6f},{r['hit_rate_ci'][1]:.6f},")
813
+ f.write(f"{r['bg_rate']:.6f},{r['bg_rate_ci'][0]:.6f},{r['bg_rate_ci'][1]:.6f},")
814
+ f.write(f"{r['net_rate']:.6f},{r['net_rate_error']:.6f},")
815
+ f.write(f"{r['lambda_bp']:.6f},{r['lambda_error']:.6f},{r['alignment_error']:.6f},")
816
+
817
+ # Q-score information
818
+ hit_qscore_mean = r['hit_qscore_stats']['mean_qscore'] if r['hit_qscore_stats'] else 0.0
819
+ hit_qscore_std = r['hit_qscore_stats']['std_qscore'] if r['hit_qscore_stats'] else 0.0
820
+ bg_qscore_mean = r['bg_qscore_stats']['mean_qscore'] if r['bg_qscore_stats'] else 0.0
821
+ bg_qscore_std = r['bg_qscore_stats']['std_qscore'] if r['bg_qscore_stats'] else 0.0
822
+
823
+ f.write(f"{hit_qscore_mean:.2f},{hit_qscore_std:.2f},{r['hit_qscore_uncertainty']:.6f},")
824
+ f.write(f"{bg_qscore_mean:.2f},{bg_qscore_std:.2f},{r['bg_qscore_uncertainty']:.6f},")
825
+ f.write(f"{r.get('hit_weighted_rate', 0.0):.6f},{r.get('hit_weighted_error', 0.0):.6f},")
826
+ f.write(f"{r.get('bg_weighted_rate', 0.0):.6f},{r.get('bg_weighted_error', 0.0):.6f},")
827
+ f.write(f"{r.get('net_weighted_rate', 0.0):.6f},{r.get('net_weighted_error', 0.0):.6f},")
828
+ f.write(f"{r.get('lambda_bp_weighted', 0.0):.6f},{r.get('lambda_error_weighted', 0.0):.6f}\n")
829
+
830
+ logging.info(f"Comprehensive QC data saved to: {qc_data_path}")
831
+
832
+ except Exception as e:
833
+ logging.error(f"Error creating comprehensive QC plots: {e}")
834
+
835
+ def create_error_analysis_plot(quality_thresholds, qc_results, results_dir):
836
+ """
837
+ Create a detailed error analysis plot showing different sources of uncertainty.
838
+
839
+ Args:
840
+ quality_thresholds: List of quality score thresholds
841
+ qc_results: List of comprehensive analysis results
842
+ results_dir: Directory to save the plot
843
+ """
844
+ try:
845
+ fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
846
+
847
+ # Extract error components
848
+ aa_std = [r['std_aa_mutations'] for r in qc_results]
849
+ net_rate_errors = [r['net_rate_error'] for r in qc_results]
850
+ lambda_errors = [r['lambda_error'] for r in qc_results]
851
+ alignment_errors = [r['alignment_error'] for r in qc_results]
852
+ mappable_bases = [r['mappable_bases'] for r in qc_results]
853
+
854
+ # Plot 1: AA mutation uncertainty vs quality threshold
855
+ ax1.plot(quality_thresholds, aa_std, 'o-', color='#2E8B57', linewidth=2, markersize=6)
856
+ ax1.set_xlabel('Quality Score Threshold')
857
+ ax1.set_ylabel('AA Mutation Standard Deviation')
858
+ ax1.set_title('AA Mutation Uncertainty vs Quality Filter')
859
+ ax1.grid(True, alpha=0.3)
860
+
861
+ # Plot 2: Net rate error vs quality threshold
862
+ ax2.plot(quality_thresholds, net_rate_errors, 's-', color='#FF6B6B', linewidth=2, markersize=6)
863
+ ax2.set_xlabel('Quality Score Threshold')
864
+ ax2.set_ylabel('Net Mutation Rate Error')
865
+ ax2.set_title('Net Rate Error vs Quality Filter')
866
+ ax2.grid(True, alpha=0.3)
867
+
868
+ # Plot 3: Lambda error vs quality threshold
869
+ ax3.plot(quality_thresholds, lambda_errors, '^-', color='#4169E1', linewidth=2, markersize=6)
870
+ ax3.set_xlabel('Quality Score Threshold')
871
+ ax3.set_ylabel('Lambda Error (mutations per copy)')
872
+ ax3.set_title('Lambda Error vs Quality Filter')
873
+ ax3.grid(True, alpha=0.3)
874
+
875
+ # Plot 4: Alignment error vs mappable bases
876
+ ax4.scatter(mappable_bases, alignment_errors, s=100, alpha=0.7, color='#FF8C00')
877
+ ax4.set_xlabel('Mappable Bases')
878
+ ax4.set_ylabel('Alignment Error (1/√reads)')
879
+ ax4.set_title('Alignment Error vs Read Count')
880
+ ax4.grid(True, alpha=0.3)
881
+
882
+ # Add quality threshold labels to scatter plot
883
+ for i, q in enumerate(quality_thresholds):
884
+ ax4.annotate(f'Q{q}', (mappable_bases[i], alignment_errors[i]),
885
+ xytext=(5, 5), textcoords='offset points', fontsize=8)
886
+
887
+ plt.tight_layout()
888
+
889
+ # Save error analysis plot
890
+ error_plot_path = os.path.join(results_dir, "error_analysis.png")
891
+ fig.savefig(error_plot_path, dpi=300, bbox_inches='tight')
892
+ plt.close(fig)
893
+
894
+ logging.info(f"Error analysis plot saved to: {error_plot_path}")
895
+
896
+ except Exception as e:
897
+ logging.error(f"Error creating error analysis plot: {e}")
898
+
899
+ def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir):
900
+ """
901
+ Create a dual-axis plot showing quality score threshold vs AA mutations per gene and mappable bases.
902
+
903
+ Args:
904
+ quality_thresholds: List of quality score thresholds
905
+ aa_mutations: List of corresponding AA mutations per gene
906
+ mappable_bases: List of corresponding mappable bases
907
+ results_dir: Directory to save the plot
908
+ """
909
+ try:
910
+ # Create the plot with dual y-axes
911
+ fig, ax1 = plt.subplots(figsize=(12, 8))
912
+
913
+ # Left y-axis: AA mutations per gene
914
+ color1 = '#2E8B57'
915
+ ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
916
+ ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
917
+ ax1.scatter(quality_thresholds, aa_mutations,
918
+ s=100, alpha=0.7, color=color1, edgecolors='black', linewidth=1, label='AA Mutations per Gene')
919
+ ax1.tick_params(axis='y', labelcolor=color1)
920
+
921
+ # Right y-axis: Mappable bases
922
+ ax2 = ax1.twinx()
923
+ color2 = '#FF6B6B'
924
+ ax2.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
925
+ ax2.scatter(quality_thresholds, mappable_bases,
926
+ s=100, alpha=0.7, color=color2, edgecolors='black', linewidth=1, marker='s', label='Mappable Bases')
927
+ ax2.tick_params(axis='y', labelcolor=color2)
928
+
929
+ # Customize the plot
930
+ ax1.set_title('AA Mutations per Gene and Mappable Bases vs Quality Score Filter', fontsize=14, fontweight='bold')
931
+
932
+ # Add grid for better readability
933
+ ax1.grid(True, alpha=0.3)
934
+
935
+ # Customize ticks and spines
936
+ ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
937
+ ax1.tick_params(axis='both', which='minor', direction='in', length=3)
938
+ ax1.spines['top'].set_visible(False)
939
+ ax1.spines['right'].set_visible(False)
940
+
941
+ # Add data point labels for AA mutations
942
+ for i, (q, aa_mut) in enumerate(zip(quality_thresholds, aa_mutations)):
943
+ ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
944
+ textcoords='offset points', fontsize=9, alpha=0.8, color=color1)
945
+
946
+ # Add data point labels for mappable bases
947
+ for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
948
+ ax2.annotate(f'{bases}', (q, bases), xytext=(5, -15),
949
+ textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
950
+
951
+ # Add legend
952
+ lines1, labels1 = ax1.get_legend_handles_labels()
953
+ lines2, labels2 = ax2.get_legend_handles_labels()
954
+ ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
955
+
956
+ # Save the plot
957
+ qc_plot_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.png")
958
+ fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
959
+ plt.close(fig)
960
+
961
+ logging.info(f"QC plot saved to: {qc_plot_path}")
962
+
963
+ # Also save data as CSV for reference
964
+ qc_data_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.csv")
965
+ with open(qc_data_path, 'w') as f:
966
+ f.write("quality_threshold,aa_mutations_per_gene,mappable_bases\n")
967
+ for q, aa_mut, bases in zip(quality_thresholds, aa_mutations, mappable_bases):
968
+ f.write(f"{q},{aa_mut:.6f},{bases}\n")
969
+
970
+ logging.info(f"QC data saved to: {qc_data_path}")
971
+
972
+ except Exception as e:
973
+ logging.error(f"Error creating QC plot: {e}")
974
+
975
+ def extract_qscores_from_sam(sam_file):
976
+ """
977
+ Extract Q-scores from SAM file and calculate statistics.
978
+
979
+ Args:
980
+ sam_file: Path to SAM file
981
+
982
+ Returns:
983
+ dict: Q-score statistics including mean, std, and per-position averages
984
+ """
985
+ try:
987
+
988
+ qscores = []
989
+ position_qscores = {} # position -> list of qscores
990
+
991
+ with pysam.AlignmentFile(sam_file, "r") as samfile:
992
+ for read in samfile:
993
+ if read.is_unmapped:
994
+ continue
995
+
996
+ # Get Q-scores for this read
997
+ read_qscores = read.query_qualities
998
+ if read_qscores is None:
999
+ continue
1000
+
1001
+ # pysam's query_qualities are already numeric Phred scores (the +33 ASCII offset is removed on parsing)
1002
+ q_values = list(read_qscores)
1003
+ qscores.extend(q_values)
1004
+
1005
+ # Store per-position Q-scores keyed by reference coordinate; use the
+ # aligned pairs so indels and soft-clips do not shift the mapping
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=True):
+ if ref_pos not in position_qscores:
+ position_qscores[ref_pos] = []
+ position_qscores[ref_pos].append(q_values[read_pos])
1011
+
1012
+ if not qscores:
1013
+ return {
1014
+ 'mean_qscore': 0.0,
1015
+ 'std_qscore': 0.0,
1016
+ 'min_qscore': 0.0,
1017
+ 'max_qscore': 0.0,
1018
+ 'position_avg_qscores': {},
1019
+ 'total_bases': 0
1020
+ }
1021
+
1022
+ # Calculate statistics
1023
+ mean_qscore = np.mean(qscores)
1024
+ std_qscore = np.std(qscores)
1025
+ min_qscore = np.min(qscores)
1026
+ max_qscore = np.max(qscores)
1027
+
1028
+ # Calculate per-position average Q-scores
1029
+ position_avg_qscores = {}
1030
+ for pos, pos_qscores in position_qscores.items():
1031
+ position_avg_qscores[pos] = np.mean(pos_qscores)
1032
+
1033
+ return {
1034
+ 'mean_qscore': mean_qscore,
1035
+ 'std_qscore': std_qscore,
1036
+ 'min_qscore': min_qscore,
1037
+ 'max_qscore': max_qscore,
1038
+ 'position_avg_qscores': position_avg_qscores,
1039
+ 'total_bases': len(qscores)
1040
+ }
1041
+
1042
+ except Exception as e:
1043
+ logging.error(f"Error extracting Q-scores from {sam_file}: {e}")
1044
+ return {
1045
+ 'mean_qscore': 0.0,
1046
+ 'std_qscore': 0.0,
1047
+ 'min_qscore': 0.0,
1048
+ 'max_qscore': 0.0,
1049
+ 'position_avg_qscores': {},
1050
+ 'total_bases': 0
1051
+ }
1052
+
1053
+ def qscore_uncertainty_factor(qscore):
1054
+ """
1055
+ Convert Q-score to uncertainty factor.
1056
+
1057
+ Args:
1058
+ qscore: Q-score value (typically 0-40)
1059
+
1060
+ Returns:
1061
+ float: Uncertainty factor (0-1, where 1 = maximum uncertainty)
1062
+ """
1063
+ if qscore <= 0:
1064
+ return 1.0 # Maximum uncertainty
1065
+
1066
+ # Q-score = -10 * log10(P_error)
1067
+ # P_error = 10^(-Q/10)
1068
+ # Uncertainty factor = sqrt(P_error) for error propagation
1069
+ error_probability = 10**(-qscore/10)
1070
+ uncertainty_factor = np.sqrt(error_probability)
1071
+
1072
+ return uncertainty_factor
1073
+
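+ # Worked example: Q20 means P_error = 10**(-20/10) = 0.01, so the factor
+ # is sqrt(0.01) = 0.1; Q10 gives sqrt(0.1) ≈ 0.316, i.e. lower-quality
+ # bases contribute proportionally more uncertainty.
+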
1074
+ def segment_fastq_file(input_fastq, n_segments=10):
1075
+ """
1076
+ Segment a FASTQ file into N parts for error estimation.
1077
+
1078
+ Args:
1079
+ input_fastq: Path to input FASTQ.gz file
1080
+ n_segments: Number of segments to create
1081
+
1082
+ Returns:
1083
+ list: Paths to segmented FASTQ files
1084
+ """
1085
+ try:
1086
+ import gzip
1088
+
1089
+ # Create output directory
1090
+ base_name = os.path.splitext(os.path.basename(input_fastq))[0].replace('.fastq', '')
1091
+ segment_dir = os.path.join(os.path.dirname(input_fastq), f"{base_name}_segments")
1092
+ os.makedirs(segment_dir, exist_ok=True)
1093
+
1094
+ # Open output files
1095
+ segment_files = []
1096
+ file_handles = []
1097
+
1098
+ for i in range(n_segments):
1099
+ segment_path = os.path.join(segment_dir, f"{base_name}_segment_{i+1}.fastq.gz")
1100
+ segment_files.append(segment_path)
1101
+ file_handles.append(gzip.open(segment_path, 'wt'))
1102
+
1103
+ # Read and distribute reads
1104
+ read_count = 0
1105
+ with gzip.open(input_fastq, 'rt') as infile:
1106
+ current_read = []
1107
+
1108
+ for line in infile:
1109
+ current_read.append(line)
1110
+
1111
+ # Complete read (4 lines)
1112
+ if len(current_read) == 4:
1113
+ # Write to current segment
1114
+ segment_idx = read_count % n_segments
1115
+ for line in current_read:
1116
+ file_handles[segment_idx].write(line)
1117
+
1118
+ read_count += 1
1119
+ current_read = []
1120
+
1121
+ # Close all files
1122
+ for fh in file_handles:
1123
+ fh.close()
1124
+
1125
+ logging.info(f"Segmented {read_count} reads into {n_segments} files")
1126
+ return segment_files
1127
+
1128
+ except Exception as e:
1129
+ logging.error(f"Error segmenting FASTQ file: {e}")
1130
+ return []
1131
+
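+ # Reads are dealt round-robin (read i goes to file i % n_segments), so
+ # each segment is an interleaved ~1/n subsample of the run rather than a
+ # contiguous slice. Usage sketch (hypothetical path):
+ #
+ #     parts = segment_fastq_file("reads.fastq.gz", n_segments=10)
+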
1132
+ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_fasta, plasmid_fasta):
1133
+ """
1134
+ Run mutation rate analysis on each segment and calculate empirical error.
1135
+
1136
+ Args:
1137
+ segment_files: List of segmented FASTQ files
1138
+ quality_threshold: Quality score threshold
1139
+ work_dir: Working directory
1140
+ ref_hit_fasta: Path to reference hit FASTA
1141
+ plasmid_fasta: Path to plasmid FASTA
1142
+
1143
+ Returns:
1144
+ dict: Results with empirical error estimates
1145
+ """
1146
+ try:
1147
+ segment_results = []
1148
+
1149
+ for i, segment_file in enumerate(segment_files):
1150
+ logging.info(f"Processing segment {i+1}/{len(segment_files)}")
1151
+
1152
+ # Filter segment with NanoFilt
1153
+ filtered_segment = os.path.join(work_dir, f"segment_{i+1}_q{quality_threshold}.fastq.gz")
1154
+ if not run_nanofilt_filtering(segment_file, quality_threshold, filtered_segment):
1155
+ logging.warning(f"Failed to filter segment {i+1}")
1156
+ continue
1157
+
1158
+ # Load sequences
1159
+ hit_seq, hit_id = load_single_sequence(ref_hit_fasta)
1160
+ plasmid_seq, plasmid_id = load_single_sequence(plasmid_fasta)
1161
+
1162
+ # Find hit region in plasmid
1163
+ idx = plasmid_seq.upper().find(hit_seq.upper())
1164
+ if idx == -1:
1165
+ logging.error(f"Gene region not found in plasmid for segment {i+1}")
1166
+ continue
1167
+
1168
+ # Align filtered reads to hit region
1169
+ sam_hit = run_minimap2(filtered_segment, ref_hit_fasta, f"hit_segment_{i+1}_q{quality_threshold}", work_dir)
1170
+
1171
+ # Align filtered reads to full plasmid for background calculation
1172
+ sam_plasmid = run_minimap2(filtered_segment, plasmid_fasta, f"plasmid_segment_{i+1}_q{quality_threshold}", work_dir)
1173
+
1174
+ # Calculate background rate from full plasmid alignment, excluding target region
1175
+ bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam_plasmid, plasmid_seq, idx, len(hit_seq))
1176
+
1177
+ # Calculate hit region mutation rate
1178
+ mismatch_hit = compute_mismatch_stats_sam(sam_hit, {hit_id: hit_seq})
1179
+ hit_info = mismatch_hit[hit_id]
1180
+ hit_mis = hit_info["total_mismatches"]
1181
+ hit_cov = hit_info["total_covered_bases"]
1182
+
1183
+ # Calculate rates
1184
+ hit_rate = hit_mis / hit_cov if hit_cov > 0 else 0
1185
+ bg_rate = bg_mis / bg_cov if bg_cov > 0 else 0
1186
+ net_rate = max(hit_rate - bg_rate, 0.0)
1187
+
1188
+ # Calculate AA mutations per gene (simplified)
1189
+ lambda_bp = net_rate * len(hit_seq)
1190
+ aa_mutations = lambda_bp / 3.0 # Approximate: 3 bp per AA
1191
+
1192
+ segment_results.append({
1193
+ 'segment': i+1,
1194
+ 'hit_rate': hit_rate,
1195
+ 'bg_rate': bg_rate,
1196
+ 'net_rate': net_rate,
1197
+ 'aa_mutations': aa_mutations,
1198
+ 'mappable_bases': hit_cov,
1199
+ 'hit_mismatches': hit_mis,
1200
+ 'hit_coverage': hit_cov
1201
+ })
1202
+
1203
+ if not segment_results:
1204
+ return None
1205
+
1206
+ # Calculate empirical statistics
1207
+ aa_mutations_list = [r['aa_mutations'] for r in segment_results]
1208
+ net_rates_list = [r['net_rate'] for r in segment_results]
1209
+ mappable_bases_list = [r['mappable_bases'] for r in segment_results]
1210
+
1211
+ mean_aa = np.mean(aa_mutations_list)
1212
+ std_aa = np.std(aa_mutations_list, ddof=1) # Sample standard deviation
1213
+ mean_net_rate = np.mean(net_rates_list)
1214
+ std_net_rate = np.std(net_rates_list, ddof=1)
1215
+ total_mappable_bases = sum(mappable_bases_list)
1216
+
1217
+ # Calculate confidence interval using t-distribution
1218
+ n_segments = len(segment_results)
1219
+ if n_segments > 1:
1220
+ # 95% confidence interval (t quantile when scipy is available,
+ # else a normal approximation)
+ if HAVE_SCIPY:
+ from scipy.stats import t
+ t_val = t.ppf(0.975, n_segments - 1)
+ else:
+ t_val = 1.96
1223
+ se_aa = std_aa / np.sqrt(n_segments)
1224
+ ci_lower = mean_aa - t_val * se_aa
1225
+ ci_upper = mean_aa + t_val * se_aa
1226
+ else:
1227
+ ci_lower = mean_aa
1228
+ ci_upper = mean_aa
1229
+
1230
+ return {
1231
+ 'mean_aa_mutations': mean_aa,
1232
+ 'std_aa_mutations': std_aa,
1233
+ 'ci_lower': ci_lower,
1234
+ 'ci_upper': ci_upper,
1235
+ 'mean_net_rate': mean_net_rate,
1236
+ 'std_net_rate': std_net_rate,
1237
+ 'total_mappable_bases': total_mappable_bases,
1238
+ 'n_segments': n_segments,
1239
+ 'segment_results': segment_results,
1240
+ 'quality_threshold': quality_threshold
1241
+ }
1242
+
1243
+ except Exception as e:
1244
+ logging.error(f"Error in segmented analysis: {e}")
1245
+ return None
1246
+ """
1247
+ Calculate mismatches weighted by Q-score uncertainty with proper sampling error.
1248
+
1249
+ Args:
1250
+ sam_file: Path to SAM file
1251
+ ref_seq: Reference sequence
1252
+ qscore_stats: Q-score statistics from extract_qscores_from_sam
1253
+
1254
+ Returns:
1255
+ tuple: (weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes)
1256
+ """
1257
+ try:
1259
+
1260
+ weighted_mismatches = 0.0
1261
+ total_weighted_coverage = 0.0
1262
+ raw_mismatches = 0
1263
+ raw_coverage = 0
1264
+
1265
+ # Store position-level data for proper sampling error calculation
1266
+ position_weights = []
1267
+ position_outcomes = []
1268
+
1269
+ position_qscores = qscore_stats['position_avg_qscores']
1270
+
1271
+ with pysam.AlignmentFile(sam_file, "r") as samfile:
1272
+ for read in samfile:
1273
+ if read.is_unmapped or read.query_sequence is None:
1274
+ continue
1275
+
1276
+ # Get aligned pairs (read_pos, ref_pos)
1277
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
1278
+ if ref_pos is None or read_pos is None:
1279
+ continue
1280
+
1281
+ if ref_pos >= len(ref_seq):
1282
+ continue
1283
+
1284
+ # Get base calls
1285
+ read_base = read.query_sequence[read_pos].upper()
1286
+ ref_base = ref_seq[ref_pos].upper()
1287
+
1288
+ # Skip if either base is N
1289
+ if read_base == 'N' or ref_base == 'N':
1290
+ continue
1291
+
1292
+ # Get Q-score for this position
1293
+ qscore = position_qscores.get(ref_pos, qscore_stats['mean_qscore'])
1294
+ uncertainty_factor = qscore_uncertainty_factor(qscore)
1295
+
1296
+ # Weight by uncertainty (lower Q-score = higher uncertainty = lower weight)
1297
+ weight = 1.0 - uncertainty_factor
1298
+
1299
+ # Store position-level data
1300
+ position_weights.append(weight)
1301
+ position_outcomes.append(1 if read_base != ref_base else 0)
1302
+
1303
+ # Count coverage
1304
+ total_weighted_coverage += weight
1305
+ raw_coverage += 1
1306
+
1307
+ # Count mismatches
1308
+ if read_base != ref_base:
1309
+ weighted_mismatches += weight
1310
+ raw_mismatches += 1
1311
+
1312
+ return weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes
1313
+
1314
+ except Exception as e:
1315
+ logging.error(f"Error calculating Q-score weighted mismatches: {e}")
1316
+ return 0.0, 0.0, 0, 0, [], []
1317
+
1318
+ def calculate_weighted_sampling_error(position_weights, position_outcomes):
1319
+ """
1320
+ Calculate proper weighted sampling error from position-level data.
1321
+
1322
+ Args:
1323
+ position_weights: List of weights for each position
1324
+ position_outcomes: List of outcomes (0=match, 1=mismatch) for each position
1325
+
1326
+ Returns:
1327
+ tuple: (weighted_rate, weighted_error)
1328
+ """
1329
+ if len(position_weights) == 0:
1330
+ return 0.0, 0.0
1331
+
1332
+ position_weights = np.array(position_weights)
1333
+ position_outcomes = np.array(position_outcomes)
1334
+
1335
+ # Calculate weighted rate
1336
+ weighted_mismatches = np.sum(position_weights * position_outcomes)
1337
+ weighted_coverage = np.sum(position_weights)
1338
+
1339
+ if weighted_coverage == 0:
1340
+ return 0.0, 0.0
1341
+
1342
+ weighted_rate = weighted_mismatches / weighted_coverage
1343
+
1344
+ # Proper weighted sampling error calculation
1345
+ # Var(p̂) = (1/W²) * Σ[w_i² * (y_i - p̂)²]
1346
+ # where W = Σw_i, y_i = outcome, p̂ = weighted rate
1347
+
1348
+ residuals = position_outcomes - weighted_rate
1349
+ weighted_residuals_squared = position_weights**2 * residuals**2
1350
+ weighted_error = np.sqrt(np.sum(weighted_residuals_squared) / (weighted_coverage**2))
1351
+
1352
+ return weighted_rate, weighted_error
1353
+
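+ # Small worked check with made-up data: weights [1, 1, 0.5, 0.5] and
+ # outcomes [1, 0, 1, 0] give W = 3, p̂ = 1.5/3 = 0.5, and an error of
+ # sqrt(0.625 / 9) ≈ 0.264:
+ #
+ #     rate, err = calculate_weighted_sampling_error([1, 1, 0.5, 0.5], [1, 0, 1, 0])
+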
1354
+ def calculate_qscore_weighted_error_propagation(weighted_mismatches, weighted_coverage, qscore_stats):
1355
+ """
1356
+ Calculate error propagation for Q-score weighted mutation rates using proper weighted sampling theory.
1357
+
1358
+ Args:
1359
+ weighted_mismatches: Q-score weighted mismatch count
1360
+ weighted_coverage: Q-score weighted coverage
1361
+ qscore_stats: Q-score statistics
1362
+
1363
+ Returns:
1364
+ tuple: (weighted_rate, weighted_error)
1365
+ """
1366
+ if weighted_coverage == 0:
1367
+ return 0.0, 0.0
1368
+
1369
+ weighted_rate = weighted_mismatches / weighted_coverage
1370
+
1371
+ # Proper weighted sampling error calculation
1372
+ # For weighted binomial: Var(p̂) ≈ (1/n²) * Σ[w_i² * p * (1-p)]
1373
+ # where n = Σw_i (weighted_coverage)
1374
+
1375
+ # Estimate weight variance from Q-score statistics
1376
+ mean_qscore = qscore_stats['mean_qscore']
1377
+ std_qscore = qscore_stats['std_qscore']
1378
+
1379
+ # Convert Q-score statistics to weight statistics
1380
+ mean_weight = 1.0 - qscore_uncertainty_factor(mean_qscore)
1381
+
1382
+ # Approximate weight variance using delta method
1383
+ # If w = 1 - sqrt(10^(-Q/10)), then Var(w) ≈ Var(Q) * (dw/dQ)²
1384
+ # dw/dQ = (ln(10)/20) * 10^(-Q/10) * (1/sqrt(10^(-Q/10)))
1385
+ # = (ln(10)/20) * sqrt(10^(-Q/10))
1386
+
1387
+ if mean_qscore > 0:
1388
+ # Delta method approximation for weight variance
1389
+ error_prob = 10**(-mean_qscore/10)
1390
+ weight_derivative = (np.log(10)/20) * np.sqrt(error_prob)
1391
+ weight_variance = (std_qscore**2) * (weight_derivative**2)
1392
+ else:
1393
+ weight_variance = 0.0
1394
+
1395
+ # Effective sample size for weighted sampling
1396
+ # n_eff = (Σw_i)² / Σ(w_i²) ≈ (Σw_i)² / [n * (E[w]² + Var[w])]
1397
+ n_positions = qscore_stats.get('total_bases', weighted_coverage)
1398
+ if n_positions > 0:
1399
+ expected_w_squared = mean_weight**2 + weight_variance
1400
+ effective_n = (weighted_coverage**2) / (n_positions * expected_w_squared)
1401
+ else:
1402
+ effective_n = weighted_coverage / mean_weight
1403
+
1404
+ # Weighted sampling error
1405
+ # Var(p̂) = p(1-p) / n_eff * [1 + (Var[w]/E[w]²)]
1406
+ weight_cv_squared = weight_variance / (mean_weight**2) if mean_weight > 0 else 0
1407
+ weighted_error = np.sqrt(weighted_rate * (1 - weighted_rate) / effective_n * (1 + weight_cv_squared))
1408
+
1409
+ return weighted_rate, weighted_error
1410
+
1411
+ def binomial_confidence_interval(successes, trials, confidence=0.95):
1412
+ """
1413
+ Calculate confidence interval for binomial proportion using beta distribution.
1414
+
1415
+ Args:
1416
+ successes: Number of successes
1417
+ trials: Number of trials
1418
+ confidence: Confidence level (default 0.95 for 95% CI)
1419
+
1420
+ Returns:
1421
+ tuple: (lower_bound, upper_bound)
1422
+ """
1423
+     if not HAVE_SCIPY:
+         # Normal approximation when scipy is unavailable. norm.ppf itself comes
+         # from scipy, so use statistics.NormalDist for the critical value.
+         from statistics import NormalDist
+         p = successes / trials if trials > 0 else 0
+         se = np.sqrt(p * (1 - p) / trials) if trials > 0 else 0
+         z = NormalDist().inv_cdf(1 - (1 - confidence) / 2)
+         return max(0, p - z * se), min(1, p + z * se)
+
+     # Clopper-Pearson (exact) interval via the beta distribution:
+     # the lower bound is 0 when there are no successes, and the upper
+     # bound is 1 when every trial is a success.
+     alpha = 1 - confidence
+     lower = beta.ppf(alpha / 2, successes, trials - successes + 1) if successes > 0 else 0.0
+     upper = beta.ppf(1 - alpha / 2, successes + 1, trials - successes) if successes < trials else 1.0
+     return lower, upper
1434
+
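+ # Illustrative edge case for the Clopper-Pearson bounds above: with zero
+ # successes in n trials the exact upper bound is 1 - (alpha/2)**(1/n), since
+ # Beta(1, n) has CDF 1 - (1 - x)**n. For 0/10 at 95% that is about 0.308.
+ def _example_zero_success_ci(trials=10, confidence=0.95):
+     lower, upper = binomial_confidence_interval(0, trials, confidence)
+     expected_upper = 1 - ((1 - confidence) / 2) ** (1.0 / trials)
+     # lower is pinned to 0; upper matches the closed form when scipy is present
+     return lower, upper, expected_upper
+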
1435
+ def propagate_mutation_rate_error(hit_mis, hit_cov, bg_mis, bg_cov, hit_qscore_stats=None, bg_qscore_stats=None):
1436
+ """
1437
+ Calculate error propagation for net mutation rate = hit_rate - bg_rate, including Q-score uncertainty.
1438
+
1439
+ Args:
1440
+ hit_mis, hit_cov: Hit region mismatches and coverage
1441
+ bg_mis, bg_cov: Background mismatches and coverage
1442
+ hit_qscore_stats: Q-score statistics for hit region (optional)
1443
+ bg_qscore_stats: Q-score statistics for background region (optional)
1444
+
1445
+ Returns:
1446
+ tuple: (net_rate, net_rate_error)
1447
+ """
1448
+ if hit_cov == 0 or bg_cov == 0:
1449
+ return 0.0, 0.0
1450
+
1451
+ hit_rate = hit_mis / hit_cov
1452
+ bg_rate = bg_mis / bg_cov
1453
+
1454
+ # Binomial standard errors
1455
+ hit_se = np.sqrt(hit_rate * (1 - hit_rate) / hit_cov)
1456
+ bg_se = np.sqrt(bg_rate * (1 - bg_rate) / bg_cov)
1457
+
1458
+ # Add Q-score uncertainty if available
1459
+ if hit_qscore_stats:
1460
+ hit_qscore_uncertainty = qscore_uncertainty_factor(hit_qscore_stats['mean_qscore'])
1461
+ hit_se = np.sqrt(hit_se**2 + hit_qscore_uncertainty**2)
1462
+
1463
+ if bg_qscore_stats:
1464
+ bg_qscore_uncertainty = qscore_uncertainty_factor(bg_qscore_stats['mean_qscore'])
1465
+ bg_se = np.sqrt(bg_se**2 + bg_qscore_uncertainty**2)
1466
+
1467
+ # Net rate and error propagation
1468
+ net_rate = max(hit_rate - bg_rate, 0.0)
1469
+ net_se = np.sqrt(hit_se**2 + bg_se**2)
1470
+
1471
+ return net_rate, net_se
1472
+
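+ # Illustrative arithmetic for the quadrature step above (made-up counts): with
+ # a hit rate of 50/10000 and a background rate of 20/10000, the net rate is
+ # 0.003 and the combined SE is sqrt(se_hit**2 + se_bg**2).
+ def _example_net_rate_error():
+     net_rate, net_se = propagate_mutation_rate_error(50, 10_000, 20, 10_000)
+     assert abs(net_rate - 0.003) < 1e-12
+     se_hit = np.sqrt(0.005 * 0.995 / 10_000)
+     se_bg = np.sqrt(0.002 * 0.998 / 10_000)
+     assert abs(net_se - np.sqrt(se_hit**2 + se_bg**2)) < 1e-12
+     return net_rate, net_se
+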
1473
+ def simulate_aa_distribution_with_error(lambda_bp, lambda_error, cds_seq, n_trials=10000):
1474
+ """
1475
+ Enhanced Monte Carlo simulation that includes uncertainty in lambda_bp.
1476
+
1477
+ Args:
1478
+         lambda_bp: Mean mutations per copy (base pairs)
1479
+ lambda_error: Standard error of lambda_bp
1480
+ cds_seq: Coding sequence
1481
+ n_trials: Number of Monte Carlo trials
1482
+
1483
+ Returns:
1484
+ tuple: (mean_aa_mutations, std_aa_mutations, aa_distribution)
1485
+ """
1486
+ prot_orig = str(Seq(cds_seq).translate(to_stop=False))
1487
+ aa_diffs = []
1488
+
1489
+ for _ in range(n_trials):
1490
+ # Sample lambda from normal distribution with error
1491
+ lambda_sample = np.random.normal(lambda_bp, lambda_error)
1492
+ lambda_sample = max(lambda_sample, 0) # Ensure non-negative
1493
+
1494
+ # Number of base changes in this trial ~ Poisson(lambda_sample)
1495
+ n_bp_mut = np.random.poisson(lambda_sample)
1496
+
1497
+ # Make a mutable copy of the CDS
1498
+ seq_list = list(cds_seq.upper())
1499
+
1500
+         # Introduce exactly n_bp_mut random single-base substitutions
+         for _ in range(n_bp_mut):
+             pos = random.randrange(len(seq_list))
+             orig_base = seq_list[pos]
+             bases = ["A", "T", "C", "G"]
+             # Guard against ambiguity codes (e.g. "N"), which are not in the list
+             if orig_base in bases:
+                 bases.remove(orig_base)
+             seq_list[pos] = random.choice(bases)
1507
+
1508
+ # Translate mutated sequence (no early stop)
1509
+ mutated_prot = str(Seq("".join(seq_list)).translate(to_stop=False))
1510
+
1511
+ # Count how many amino acids differ
1512
+ aa_diff = sum(1 for a, b in zip(prot_orig, mutated_prot) if a != b)
1513
+ aa_diffs.append(aa_diff)
1514
+
1515
+ mean_aa = np.mean(aa_diffs)
1516
+ std_aa = np.std(aa_diffs)
1517
+
1518
+ return mean_aa, std_aa, aa_diffs
1519
+
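+ # Degenerate-input check for the simulation above (illustrative only; the
+ # 6 bp CDS is a made-up sequence): with lambda_bp = 0 and zero error, every
+ # trial draws Poisson(0) = 0 substitutions, so the AA distribution collapses.
+ def _example_zero_lambda_simulation():
+     mean_aa, std_aa, dist = simulate_aa_distribution_with_error(
+         0.0, 0.0, "ATGGCT", n_trials=50
+     )
+     assert mean_aa == 0.0 and std_aa == 0.0 and set(dist) == {0}
+     return dist
+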
1520
+ def bootstrap_aa_mutations(hit_mis, hit_cov, bg_mis, bg_cov, cds_seq, n_bootstrap=1000):
1521
+ """
1522
+ Bootstrap resampling to estimate confidence intervals for AA mutations.
1523
+
1524
+ Args:
1525
+ hit_mis, hit_cov: Hit region mismatches and coverage
1526
+ bg_mis, bg_cov: Background mismatches and coverage
1527
+ cds_seq: Coding sequence
1528
+ n_bootstrap: Number of bootstrap samples
1529
+
1530
+ Returns:
1531
+ tuple: (mean_aa_mutations, ci_lower, ci_upper, bootstrap_distribution)
1532
+ """
1533
+ bootstrap_results = []
1534
+
1535
+ for _ in range(n_bootstrap):
1536
+ # Resample reads with replacement (binomial resampling)
1537
+ hit_mis_boot = np.random.binomial(hit_cov, hit_mis/hit_cov) if hit_cov > 0 else 0
1538
+ bg_mis_boot = np.random.binomial(bg_cov, bg_mis/bg_cov) if bg_cov > 0 else 0
1539
+
1540
+ # Calculate net rate
1541
+ hit_rate_boot = hit_mis_boot / hit_cov if hit_cov > 0 else 0
1542
+ bg_rate_boot = bg_mis_boot / bg_cov if bg_cov > 0 else 0
1543
+ net_rate_boot = max(hit_rate_boot - bg_rate_boot, 0)
1544
+
1545
+ # Calculate AA mutations
1546
+ lambda_bp_boot = net_rate_boot * len(cds_seq)
1547
+
1548
+ # Quick simulation for bootstrap (fewer trials for speed)
1549
+ aa_mut_boot = simulate_aa_distribution(lambda_bp_boot, cds_seq, n_trials=1000)
1550
+ bootstrap_results.append(np.mean(aa_mut_boot))
1551
+
1552
+ mean_aa = np.mean(bootstrap_results)
1553
+ # Use proper percentile calculation for 95% CI
1554
+ ci_lower = np.percentile(bootstrap_results, 2.5)
1555
+ ci_upper = np.percentile(bootstrap_results, 97.5)
1556
+
1557
+ # Additional validation: ensure CI makes sense
1558
+ if ci_lower > mean_aa or ci_upper < mean_aa:
1559
+ logging.warning(f"Bootstrap CI validation failed: mean={mean_aa:.4f}, CI=[{ci_lower:.4f}, {ci_upper:.4f}]")
1560
+ # Use empirical CI if percentile method fails
1561
+ sorted_results = np.sort(bootstrap_results)
1562
+ n = len(sorted_results)
1563
+ ci_lower = sorted_results[int(0.025 * n)]
1564
+ ci_upper = sorted_results[int(0.975 * n)]
1565
+
1566
+ return mean_aa, ci_lower, ci_upper, bootstrap_results
1567
+
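+ # Illustrative only: the percentile method used above, applied to synthetic
+ # draws so the expected interval is known in closed form.
+ def _example_percentile_ci():
+     rng = np.random.default_rng(0)
+     samples = rng.normal(loc=5.0, scale=1.0, size=100_000)
+     ci_lower, ci_upper = np.percentile(samples, [2.5, 97.5])
+     # For N(5, 1) the central 95% interval is approximately [3.04, 6.96]
+     return ci_lower, ci_upper
+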
1568
+ def comprehensive_aa_mutation_analysis(hit_mis, hit_cov, bg_mis, bg_cov, cds_seq,
1569
+ quality_threshold=None, n_trials=10000,
1570
+ hit_qscore_stats=None, bg_qscore_stats=None,
1571
+ sam_hit=None, sam_plasmid=None, hit_seq=None, plasmid_seq=None):
1572
+ """
1573
+ Comprehensive AA mutation analysis with full error propagation including Q-score uncertainty.
1574
+
1575
+ Args:
1576
+ hit_mis, hit_cov: Hit region mismatches and coverage
1577
+ bg_mis, bg_cov: Background mismatches and coverage
1578
+ cds_seq: Coding sequence
1579
+ quality_threshold: Quality threshold for logging
1580
+ n_trials: Number of Monte Carlo trials
1581
+ hit_qscore_stats: Q-score statistics for hit region (optional)
1582
+ bg_qscore_stats: Q-score statistics for background region (optional)
1583
+
1584
+ Returns:
1585
+ dict: Comprehensive results with all error estimates including Q-score effects
1586
+ """
1587
+ logging.info(f"=== COMPREHENSIVE ERROR MODEL ANALYSIS (Q{quality_threshold}) ===")
1588
+
1589
+ # 1. Binomial confidence intervals for mutation rates
1590
+ logging.info("1. Calculating binomial confidence intervals for mutation rates...")
1591
+ hit_rate_ci = binomial_confidence_interval(hit_mis, hit_cov)
1592
+ bg_rate_ci = binomial_confidence_interval(bg_mis, bg_cov)
1593
+ logging.info(f" Hit rate CI: [{hit_rate_ci[0]:.6f}, {hit_rate_ci[1]:.6f}]")
1594
+ logging.info(f" Background rate CI: [{bg_rate_ci[0]:.6f}, {bg_rate_ci[1]:.6f}]")
1595
+
1596
+ # 2. Error propagation for net mutation rate (including Q-score uncertainty)
1597
+ logging.info("2. Propagating errors for net mutation rate (including Q-score uncertainty)...")
1598
+ net_rate, net_rate_error = propagate_mutation_rate_error(
1599
+ hit_mis, hit_cov, bg_mis, bg_cov, hit_qscore_stats, bg_qscore_stats
1600
+ )
1601
+ logging.info(f" Net mutation rate: {net_rate:.6f} ± {net_rate_error:.6f}")
1602
+
1603
+ # 3. Calculate lambda_bp with error
1604
+ logging.info("3. Calculating lambda_bp (mutations per copy) with error propagation...")
1605
+ lambda_bp = net_rate * len(cds_seq)
1606
+ lambda_error = net_rate_error * len(cds_seq)
1607
+ logging.info(f" Lambda_bp: {lambda_bp:.6f} ± {lambda_error:.6f} mutations per copy")
1608
+
1609
+ # 4. Q-score weighted analysis
1610
+ logging.info("4. Calculating Q-score weighted mutation rates...")
1611
+ hit_weighted_mis, hit_weighted_cov, hit_raw_mis, hit_raw_cov, hit_weights, hit_outcomes = calculate_qscore_weighted_mismatches(
1612
+ sam_hit, hit_seq, hit_qscore_stats
1613
+ )
1614
+ bg_weighted_mis, bg_weighted_cov, bg_raw_mis, bg_raw_cov, bg_weights, bg_outcomes = calculate_qscore_weighted_mismatches(
1615
+ sam_plasmid, plasmid_seq, bg_qscore_stats
1616
+ )
1617
+
1618
+ # Calculate proper weighted sampling errors
1619
+ hit_weighted_rate, hit_weighted_error = calculate_weighted_sampling_error(hit_weights, hit_outcomes)
1620
+ bg_weighted_rate, bg_weighted_error = calculate_weighted_sampling_error(bg_weights, bg_outcomes)
1621
+
1622
+ # Net weighted rate
1623
+ net_weighted_rate = max(hit_weighted_rate - bg_weighted_rate, 0.0)
1624
+ net_weighted_error = np.sqrt(hit_weighted_error**2 + bg_weighted_error**2)
1625
+
1626
+ logging.info(f" Hit weighted rate: {hit_weighted_rate:.6f} ± {hit_weighted_error:.6f}")
1627
+ logging.info(f" Background weighted rate: {bg_weighted_rate:.6f} ± {bg_weighted_error:.6f}")
1628
+ logging.info(f" Net weighted rate: {net_weighted_rate:.6f} ± {net_weighted_error:.6f}")
1629
+
1630
+ # 5. Calculate AA mutations per gene (simplified - no Monte Carlo)
1631
+ logging.info("5. Calculating AA mutations per gene from weighted rates...")
1632
+ lambda_bp_weighted = net_weighted_rate * len(cds_seq)
1633
+ lambda_error_weighted = net_weighted_error * len(cds_seq)
1634
+
1635
+ # Simple AA mutation estimate (mean of Poisson distribution)
1636
+ mean_aa = lambda_bp_weighted / 3.0 # Approximate: 3 bp per AA
1637
+     std_aa = np.sqrt(lambda_bp_weighted) / 3.0  # Poisson SD of base changes, scaled by 1/3
1638
+
1639
+ logging.info(f" Lambda_bp (weighted): {lambda_bp_weighted:.6f} ± {lambda_error_weighted:.6f}")
1640
+ logging.info(f" AA mutations per gene: {mean_aa:.4f} ± {std_aa:.4f}")
1641
+
1642
+ # 6. Bootstrap confidence intervals
1643
+ logging.info("6. Calculating bootstrap confidence intervals (1,000 resamples)...")
1644
+ bootstrap_mean, ci_lower, ci_upper, bootstrap_dist = bootstrap_aa_mutations(
1645
+ hit_mis, hit_cov, bg_mis, bg_cov, cds_seq
1646
+ )
1647
+ logging.info(f" Bootstrap 95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
1648
+
1649
+ # 7. Alignment error estimation
1650
+ logging.info("7. Calculating alignment error estimation...")
1651
+ alignment_error = 1.0 / np.sqrt(hit_cov) if hit_cov > 0 else 1.0
1652
+ logging.info(f" Alignment error: {alignment_error:.6f}")
1653
+
1654
+ # 8. Q-score uncertainty factors
1655
+ logging.info("8. Calculating Q-score uncertainty factors...")
1656
+ hit_qscore_uncertainty = qscore_uncertainty_factor(hit_qscore_stats['mean_qscore']) if hit_qscore_stats else 0.0
1657
+ bg_qscore_uncertainty = qscore_uncertainty_factor(bg_qscore_stats['mean_qscore']) if bg_qscore_stats else 0.0
1658
+ logging.info(f" Hit Q-score uncertainty: {hit_qscore_uncertainty:.6f}")
1659
+ logging.info(f" Background Q-score uncertainty: {bg_qscore_uncertainty:.6f}")
1660
+
1661
+ results = {
1662
+ 'mean_aa_mutations': mean_aa,
1663
+ 'std_aa_mutations': std_aa,
1664
+ 'ci_lower': ci_lower,
1665
+ 'ci_upper': ci_upper,
1666
+ 'hit_rate': hit_mis / hit_cov if hit_cov > 0 else 0,
1667
+ 'hit_rate_ci': hit_rate_ci,
1668
+ 'bg_rate': bg_mis / bg_cov if bg_cov > 0 else 0,
1669
+ 'bg_rate_ci': bg_rate_ci,
1670
+ 'net_rate': net_rate,
1671
+ 'net_rate_error': net_rate_error,
1672
+ 'lambda_bp': lambda_bp,
1673
+ 'lambda_error': lambda_error,
1674
+ 'alignment_error': alignment_error,
1675
+ 'hit_qscore_uncertainty': hit_qscore_uncertainty,
1676
+ 'bg_qscore_uncertainty': bg_qscore_uncertainty,
1677
+ 'hit_qscore_stats': hit_qscore_stats,
1678
+ 'bg_qscore_stats': bg_qscore_stats,
1679
+ 'bootstrap_distribution': bootstrap_dist,
1680
+ 'quality_threshold': quality_threshold,
1681
+ 'mappable_bases': hit_cov,
1682
+ # Q-score weighted results
1683
+ 'hit_weighted_rate': hit_weighted_rate,
1684
+ 'hit_weighted_error': hit_weighted_error,
1685
+ 'bg_weighted_rate': bg_weighted_rate,
1686
+ 'bg_weighted_error': bg_weighted_error,
1687
+ 'net_weighted_rate': net_weighted_rate,
1688
+ 'net_weighted_error': net_weighted_error,
1689
+ 'lambda_bp_weighted': lambda_bp_weighted,
1690
+ 'lambda_error_weighted': lambda_error_weighted,
1691
+ 'hit_weighted_mismatches': hit_weighted_mis,
1692
+ 'hit_weighted_coverage': hit_weighted_cov,
1693
+ 'bg_weighted_mismatches': bg_weighted_mis,
1694
+ 'bg_weighted_coverage': bg_weighted_cov
1695
+ }
1696
+
1697
+ if quality_threshold is not None:
1698
+ qscore_info = ""
1699
+ if hit_qscore_stats:
1700
+ qscore_info = f", hit_qscore={hit_qscore_stats['mean_qscore']:.1f}±{hit_qscore_stats['std_qscore']:.1f}"
1701
+ if bg_qscore_stats:
1702
+ qscore_info += f", bg_qscore={bg_qscore_stats['mean_qscore']:.1f}±{bg_qscore_stats['std_qscore']:.1f}"
1703
+
1704
+ logging.info(f"Quality {quality_threshold}: AA mutations = {mean_aa:.4f} ± {std_aa:.4f} "
1705
+ f"(95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]), "
1706
+ f"mappable_bases={hit_cov}, net_rate={net_rate:.6f}±{net_rate_error:.6f}{qscore_info}")
1707
+
1708
+ return results
1709
+
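+ # Back-of-envelope check for the 3-bp-per-codon approximation used in step 5
+ # above (illustrative only): lambda_bp = 3.0 gives ~1.0 expected AA change with
+ # SD sqrt(3)/3 ≈ 0.577, ignoring synonymous substitutions.
+ def _example_poisson_aa_estimate(lambda_bp=3.0):
+     mean_aa = lambda_bp / 3.0
+     std_aa = np.sqrt(lambda_bp) / 3.0
+     return mean_aa, std_aa
+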
1710
+ def simulate_aa_distribution(lambda_bp, cds_seq, n_trials=1000):
1711
+ """
1712
+ Monte Carlo: each trial draws n_bp_mut ~ Poisson(lambda_bp),
1713
+ introduces those random single‐base substitutions, translates,
1714
+ and returns a list of amino acid differences per trial.
1715
+ """
1716
+ prot_orig = str(Seq(cds_seq).translate(to_stop=False))
1717
+ aa_diffs = []
1718
+
1719
+ for _ in range(n_trials):
1720
+ # Number of base changes in this trial ~ Poisson(lambda_bp)
1721
+ n_bp_mut = np.random.poisson(lambda_bp)
1722
+
1723
+ # Make a mutable copy of the CDS
1724
+ seq_list = list(cds_seq.upper())
1725
+
1726
+         # Introduce exactly n_bp_mut random single-base substitutions
+         for _ in range(n_bp_mut):
+             pos = random.randrange(len(seq_list))
+             orig_base = seq_list[pos]
+             bases = ["A", "T", "C", "G"]
+             # Guard against ambiguity codes (e.g. "N"), which are not in the list
+             if orig_base in bases:
+                 bases.remove(orig_base)
+             seq_list[pos] = random.choice(bases)
1733
+
1734
+ # Translate mutated sequence (no early stop)
1735
+ mutated_prot = str(Seq("".join(seq_list)).translate(to_stop=False))
1736
+
1737
+ # Count how many amino acids differ
1738
+ aa_diff = sum(1 for a, b in zip(prot_orig, mutated_prot) if a != b)
1739
+ aa_diffs.append(aa_diff)
1740
+
1741
+ return aa_diffs
1742
+
1743
+ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, work_dir, results_dir,
1744
+ chunks, ref_hit_fasta, plasmid_fasta, hit_seq, hit_id, plasmid_seq, idx):
1745
+ """
1746
+ Run the main mutation rate analysis for a specific Q-score.
1747
+
1748
+ Args:
1749
+ fastq_path: Path to the FASTQ file to analyze
1750
+ qscore: Q-score threshold (None for unfiltered)
1751
+ qscore_desc: Description of the Q-score (e.g., "Q18", "unfiltered")
1752
+ sample_name: Name of the sample
1753
+ work_dir: Working directory for temporary files
1754
+ results_dir: Results directory for output files
1755
+ chunks: List of plasmid chunks
1756
+ ref_hit_fasta: Path to reference hit FASTA
1757
+ plasmid_fasta: Path to plasmid FASTA
1758
+ hit_seq: Hit sequence
1759
+ hit_id: Hit ID
1760
+ plasmid_seq: Plasmid sequence
1761
+ idx: Index of hit in plasmid
1762
+
1763
+ Returns:
1764
+ dict: Analysis results
1765
+ """
1766
+ logging.info(f"Running main analysis for {qscore_desc}...")
1767
+
1768
+ # Ensure work directory exists
1769
+ os.makedirs(work_dir, exist_ok=True)
1770
+
1771
+ # Create subdirectory for this Q-score analysis
1772
+ qscore_results_dir = results_dir
1773
+ if qscore is not None:
1774
+ qscore_results_dir = os.path.join(results_dir, f"q{qscore}_analysis")
1775
+ os.makedirs(qscore_results_dir, exist_ok=True)
1776
+
1777
+ # Write chunks FASTA & align to background‐chunks
1778
+ chunks_fasta = create_multi_fasta(chunks, work_dir)
1779
+ sam_chunks = run_minimap2(fastq_path, chunks_fasta, "plasmid_chunks_alignment", work_dir)
1780
+
1781
+ # Align to hit (target) alone
1782
+ sam_hit = run_minimap2(fastq_path, ref_hit_fasta, "hit_alignment", work_dir)
1783
+
1784
+ # Compute mismatch stats for background chunks (for reference, but not used for background rate)
1785
+ chunk_refs = { f"chunk_{i+1}": seq for i, seq in enumerate(chunks) }
1786
+ mismatch_chunks = compute_mismatch_stats_sam(sam_chunks, chunk_refs)
1787
+
1788
+ # ----------------------------
1789
+ # COMPUTE BASE DISTRIBUTION AT EACH POSITION OF HIT
1790
+ # ----------------------------
1791
+ base_counts = [
1792
+ {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0}
1793
+ for _ in range(len(hit_seq))
1794
+ ]
1795
+ samfile_hit = pysam.AlignmentFile(sam_hit, "r")
1796
+ for read in samfile_hit.fetch():
1797
+ if read.is_unmapped or read.query_sequence is None:
1798
+ continue
1799
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
1800
+ if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(hit_seq):
1801
+ base = read.query_sequence[read_pos].upper()
1802
+ if base not in {"A", "C", "G", "T"}:
1803
+ base = "N"
1804
+ base_counts[ref_pos][base] += 1
1805
+ samfile_hit.close()
1806
+
1807
+ # ----------------------------
1808
+ # ALIGN TO FULL PLASMID TO GET COVERAGE
1809
+ # ----------------------------
1810
+ sam_plasmid = run_minimap2(fastq_path, plasmid_fasta, "plasmid_full_alignment", work_dir)
1811
+
1812
+ # Calculate plasmid coverage
1813
+ plasmid_cov = [0] * len(plasmid_seq)
1814
+ samfile_full = pysam.AlignmentFile(sam_plasmid, "r")
1815
+ for read in samfile_full.fetch():
1816
+ if read.is_unmapped or read.query_sequence is None:
1817
+ continue
1818
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
1819
+ if ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
1820
+ plasmid_cov[ref_pos] += 1
1821
+ samfile_full.close()
1822
+
1823
+ # Calculate background rate from full plasmid alignment, excluding target region
1824
+ # This avoids artificial junction mismatches from concatenated chunks
1825
+ bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam_plasmid, plasmid_seq, idx, len(hit_seq))
1826
+ bg_rate = (bg_mis / bg_cov) if bg_cov else 0.0 # raw per‐base
1827
+ bg_rate_per_kb = bg_rate * 1e3
1828
+
1829
+ logging.info(
1830
+ f"Background (plasmid excluding target): total_mismatches={bg_mis}, "
1831
+ f"covered_bases={bg_cov}, mapped_reads={bg_reads}, "
1832
+ f"rate_per_kb={bg_rate_per_kb:.4f}"
1833
+ )
1834
+
1835
+ # Compute mismatch stats for hit (target)
1836
+ mismatch_hit = compute_mismatch_stats_sam(sam_hit, {hit_id: hit_seq})
1837
+ hit_info = mismatch_hit[hit_id]
1838
+ hit_mis = hit_info["total_mismatches"]
1839
+ hit_cov = hit_info["total_covered_bases"]
1840
+ hit_reads = hit_info["mapped_reads"]
1841
+ hit_rate = hit_info["avg_mismatch_rate"] # raw per‐base
1842
+ hit_rate_per_kb = hit_rate * 1e3
1843
+
1844
+ logging.info(
1845
+ f"Target ({hit_id}): total_mismatches={hit_mis}, "
1846
+ f"covered_bases={hit_cov}, mapped_reads={hit_reads}, "
1847
+ f"rate_per_kb={hit_rate_per_kb:.4f}"
1848
+ )
1849
+
1850
+ # Two‐proportion Z‐test: is target rate > background rate?
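+     # (The standard pooled form is z = (p1 - p2) / sqrt(p*(1-p)*(1/n1 + 1/n2))
+     # with p = (x1 + x2) / (n1 + n2); the exact variant implemented by
+     # z_test_two_proportions is defined earlier in this module.)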
1851
+ z_stat, p_val = z_test_two_proportions(hit_mis, hit_cov, bg_mis, bg_cov)
1852
+
1853
+ # Compute "Estimated mutations per target copy (basepairs)" (float)
1854
+ length_of_target = len(hit_seq)
1855
+ true_diff_rate = hit_rate - bg_rate
1856
+ est_mut_per_copy = max(true_diff_rate * length_of_target, 0.0)
1857
+
1858
+ # Determine if ROI is a valid protein‐coding sequence (updated definition)
1859
+ is_protein = True
1860
+ reasons = []
1861
+ seq_upper = hit_seq.upper()
1862
+ # Must be multiple of 3
1863
+ if len(seq_upper) % 3 != 0:
1864
+ is_protein = False
1865
+ reasons.append(f"length {len(seq_upper)} is not a multiple of 3")
1866
+
1867
+ if is_protein:
1868
+ prot_full = str(Seq(seq_upper).translate(to_stop=False))
1869
+ # Check for premature stop codons (anything except possibly at the end)
1870
+ if "*" in prot_full[:-1]:
1871
+ is_protein = False
1872
+ reasons.append("premature stop codon detected before the last codon")
1873
+ # No requirement to start with ATG or end with stop beyond the last codon
1874
+
1875
+ # If protein, simulate AA distribution per copy using Poisson sampling
1876
+ if is_protein:
1877
+ logging.info(f"Simulating amino acid distribution with λ_bp={est_mut_per_copy:.2f}")
1878
+ aa_diffs = simulate_aa_distribution(est_mut_per_copy, hit_seq, n_trials=1000)
1879
+ avg_aa_mutations = sum(aa_diffs) / len(aa_diffs)
1880
+
1881
+ # Log simulation results for debugging
1882
+ logging.info(f"AA simulation results: min={min(aa_diffs)}, max={max(aa_diffs)}, mean={avg_aa_mutations:.3f}")
1883
+ logging.info(f"AA simulation distribution: {len([x for x in aa_diffs if x == 0])} zeros, {len([x for x in aa_diffs if x > 0])} non-zeros")
1884
+ else:
1885
+ aa_diffs = []
1886
+ avg_aa_mutations = None
1887
+
1888
+ # Update Q-score info for titles
1889
+ qscore_info = f" ({qscore_desc})" if qscore_desc != "unfiltered" else ""
1890
+
1891
+ # ----------------------------
1892
+ # SAVE CSV FOR MUTATION RATES (PANEL 1)
1893
+ # ----------------------------
1894
+ gene_mismatch_csv = os.path.join(qscore_results_dir, "gene_mismatch_rates.csv")
1895
+ with open(gene_mismatch_csv, "w", newline="") as csvfile:
1896
+ csvfile.write(f"# gene_id: {hit_id}\n")
1897
+ csvfile.write(f"# background_rate_per_kb: {bg_rate_per_kb:.6f}\n")
1898
+ csvfile.write("position_1based,mismatch_rate_per_base\n")
1899
+ for pos0, rate in enumerate(hit_info["pos_rates"]):
1900
+ csvfile.write(f"{pos0 + 1},{rate:.6e}\n")
1901
+ logging.info(f"Saved CSV for gene mismatch rates: {gene_mismatch_csv}")
1902
+
1903
+ # ----------------------------
1904
+     # SAVE CSV FOR BASE DISTRIBUTION (per-position read base counts)
1905
+ # ----------------------------
1906
+ base_dist_csv = os.path.join(qscore_results_dir, "base_distribution.csv")
1907
+ with open(base_dist_csv, "w", newline="") as csvfile:
1908
+ csvfile.write(f"# gene_id: {hit_id}\n")
1909
+ csvfile.write("position_1based,ref_base,A_count,C_count,G_count,T_count,N_count\n")
1910
+ for pos0, counts in enumerate(base_counts):
1911
+ ref_base = seq_upper[pos0]
1912
+ csvfile.write(f"{pos0 + 1},{ref_base},{counts['A']},{counts['C']},{counts['G']},{counts['T']},{counts['N']}\n")
1913
+ logging.info(f"Saved CSV for base distribution: {base_dist_csv}")
1914
+
1915
+ # ----------------------------
1916
+     # SAVE CSV FOR AA SUBSTITUTIONS - only if protein
1917
+ # ----------------------------
1918
+ if is_protein:
1919
+ aa_subst_csv = os.path.join(qscore_results_dir, "aa_substitutions.csv")
1920
+ with open(aa_subst_csv, "w", newline="") as csvfile:
1921
+ csvfile.write(f"# gene_id: {hit_id}\n")
1922
+ csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
1923
+ csvfile.write("position_1based,ref_aa,alt_aa,count\n")
1924
+             # Per-substitution tallying is not implemented yet; only the
+             # header row is written so downstream parsers see a valid file
+         logging.info(f"Saved CSV for AA substitutions (header only): {aa_subst_csv}")
1927
+
1928
+ # ----------------------------
1929
+     # SAVE CSV FOR PLASMID COVERAGE (PANEL 3)
1930
+ # ----------------------------
1931
+ plasmid_cov_csv = os.path.join(qscore_results_dir, "plasmid_coverage.csv")
1932
+ with open(plasmid_cov_csv, "w", newline="") as csvfile:
1933
+ csvfile.write("position_1based,coverage\n")
1934
+ for pos0, cov in enumerate(plasmid_cov):
1935
+ csvfile.write(f"{pos0 + 1},{cov}\n")
1936
+ logging.info(f"Saved CSV for plasmid coverage: {plasmid_cov_csv}")
1937
+
1938
+ # ----------------------------
1939
+     # SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 4)
1940
+ # ----------------------------
1941
+ aa_dist_csv = os.path.join(qscore_results_dir, "aa_mutation_distribution.csv")
1942
+ with open(aa_dist_csv, "w", newline="") as csvfile:
1943
+ csvfile.write(f"# gene_id: {hit_id}\n")
1944
+ csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
1945
+ csvfile.write(f"# n_trials: 1000\n")
1946
+ if is_protein:
1947
+ csvfile.write("trial_index,aa_mutations\n")
1948
+ for idx_trial, aa_count in enumerate(aa_diffs, start=1):
1949
+ csvfile.write(f"{idx_trial},{aa_count}\n")
1950
+ else:
1951
+ csvfile.write("# No AA distribution because region is not protein-coding\n")
1952
+ logging.info(f"Saved CSV for AA mutation distribution: {aa_dist_csv}")
1953
+
1954
+ # ----------------------------
1955
+ # PREPARE PANEL FIGURE WITH 4 SUBPLOTS
1956
+ # ----------------------------
1957
+ fig, axes = plt.subplots(2, 2, figsize=(18, 12), constrained_layout=True)
1958
+ # axes[0,0]: Mutation rate over gene of interest
1959
+ # axes[0,1]: Rolling mutation rate across plasmid (20 bp window)
1960
+ # axes[1,0]: Coverage of plasmid with ROI shaded
1961
+ # axes[1,1]: KDE of AA mutations per copy
1962
+
1963
+ # --- Panel 1: Mutation rate over gene of interest ---
1964
+ ax0 = axes[0, 0]
1965
+ positions_gene = np.arange(1, len(hit_info["pos_rates"]) + 1)
1966
+ ax0.axhspan(0, bg_rate, color='gray', alpha=0.3, label="Background rate")
1967
+ ax0.plot(positions_gene, hit_info["pos_rates"],
1968
+ color="#2E86AB", linestyle='-', linewidth=1.5, alpha=0.8,
1969
+ label="Mutation rate")
1970
+ ax0.set_title(f"Mismatch Rate per Position: Gene of Interest{qscore_info}", fontsize=14, fontweight='bold')
1971
+ ax0.set_xlabel("Position in Gene (bp)", fontsize=12)
1972
+ ax0.set_ylabel("Mismatch Rate", fontsize=12)
1973
+ ax0.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
1974
+ ax0.tick_params(axis='both', which='minor', direction='in', length=3)
1975
+ ax0.spines['top'].set_visible(False)
1976
+ ax0.spines['right'].set_visible(False)
1977
+ ax0.legend(loc="upper right", frameon=False, fontsize=10)
1978
+
1979
+ # --- Panel 2: Rolling mutation rate across plasmid ---
1980
+ ax1 = axes[0, 1]
1981
+ # Calculate rolling mutation rate across plasmid
1982
+ window_size = 20
1983
+ rolling_positions = []
1984
+ rolling_rates = []
1985
+
1986
+ # Calculate mismatches per position across the plasmid
1987
+ plasmid_mismatches = [0] * len(plasmid_seq)
1988
+ samfile_rolling = pysam.AlignmentFile(sam_plasmid, "r")
1989
+ for read in samfile_rolling.fetch():
1990
+ if read.is_unmapped or read.query_sequence is None:
1991
+ continue
1992
+ for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
1993
+ if ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
1994
+ if read_pos is not None:
1995
+ read_base = read.query_sequence[read_pos].upper()
1996
+ ref_base = plasmid_seq[ref_pos].upper()
1997
+ if read_base != ref_base and read_base in "ACGT" and ref_base in "ACGT":
1998
+ plasmid_mismatches[ref_pos] += 1
1999
+ samfile_rolling.close()
2000
+
2001
+ # Calculate rolling mutation rate
2002
+ for i in range(len(plasmid_cov) - window_size + 1):
2003
+ window_cov = plasmid_cov[i:i + window_size]
2004
+ window_mismatches = plasmid_mismatches[i:i + window_size]
2005
+
2006
+ total_coverage = sum(window_cov)
2007
+ total_mismatches = sum(window_mismatches)
2008
+
2009
+ if total_coverage > 0: # Only include windows with coverage
2010
+ rolling_positions.append(i + window_size // 2)
2011
+ mutation_rate = total_mismatches / total_coverage
2012
+ rolling_rates.append(mutation_rate)
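+     # Example of the window bookkeeping above: with window_size = 20, the
+     # window starting at i = 0 covers positions 0-19 and is plotted at the
+     # center position 10; zero-coverage windows are skipped, so the rolling
+     # trace may have gaps.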
2013
+
2014
+ ax1.plot(rolling_positions, rolling_rates,
2015
+ color="#FF6B6B", linestyle='-', linewidth=2, alpha=0.8,
2016
+ label="Rolling average (20 bp)")
2017
+ ax1.set_title(f"Rolling Mutation Rate Across Plasmid (20 bp Window){qscore_info}", fontsize=14, fontweight='bold')
2018
+ ax1.set_xlabel("Position on Plasmid (bp)", fontsize=12)
2019
+ ax1.set_ylabel("Mismatch Rate", fontsize=12)
2020
+ ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
2021
+ ax1.tick_params(axis='both', which='minor', direction='in', length=3)
2022
+ ax1.spines['top'].set_visible(False)
2023
+ ax1.spines['right'].set_visible(False)
2024
+     # Shade the ROI region before drawing the legend so its label is included
+     start_roi = idx + 1
+     end_roi = idx + len(hit_seq)
+     ax1.axvspan(start_roi, end_roi, color='gray', alpha=0.3, label=f"ROI: {start_roi}–{end_roi}")
+     ax1.legend(loc="upper right", frameon=False, fontsize=10)
2029
+
2030
+ # --- Panel 3: Coverage of plasmid with ROI shaded ---
2031
+ ax2 = axes[1, 0]
2032
+ plasmid_positions = np.arange(1, len(plasmid_cov) + 1)
2033
+ ax2.plot(plasmid_positions, plasmid_cov,
2034
+ linestyle='-', color='black', linewidth=1.0, alpha=0.8, label="Coverage")
2035
+ ax2.set_title(f"Full Plasmid Coverage with ROI Shaded{qscore_info}", fontsize=14, fontweight='bold')
2036
+ ax2.set_xlabel("Position on Plasmid", fontsize=12)
2037
+ ax2.set_ylabel("Coverage (# reads)", fontsize=12)
2038
+ ax2.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
2039
+ ax2.tick_params(axis='both', which='minor', direction='in', length=3)
2040
+ ax2.spines['top'].set_visible(False)
2041
+ ax2.spines['right'].set_visible(False)
2042
+ start_roi = idx + 1
2043
+ end_roi = idx + len(hit_seq)
2044
+ ax2.axvspan(start_roi, end_roi, color='gray', alpha=0.3, label=f"ROI: {start_roi}–{end_roi}")
2045
+
2046
+ # --- Panel 4: KDE of AA mutations per copy ---
2047
+ ax3 = axes[1, 1]
2048
+ if is_protein and aa_diffs and len(aa_diffs) > 0:
2049
+ x_vals = np.array(aa_diffs)
2050
+ unique_vals = np.unique(x_vals)
2051
+
2052
+ if len(unique_vals) > 1:
2053
+ # Multiple unique values - use KDE or histogram
2054
+ if HAVE_SCIPY:
2055
+ try:
2056
+ kde = gaussian_kde(x_vals)
2057
+ x_grid = np.linspace(0, max(x_vals), 200)
2058
+ kde_values = kde(x_grid)
2059
+ ax3.plot(x_grid, kde_values,
2060
+ color="#C44E52", linewidth=2.0, alpha=0.8, label="KDE")
2061
+ ax3.fill_between(x_grid, kde_values, color="#C44E52", alpha=0.3)
2062
+ ax3.set_ylim(bottom=0)
2063
+ except Exception as e:
2064
+ logging.warning(f"KDE failed: {e}, falling back to histogram")
2065
+ ax3.hist(x_vals, bins=min(20, len(unique_vals)),
2066
+ color="#C44E52", alpha=0.7, density=True, edgecolor='black')
2067
+ else:
2068
+ ax3.hist(x_vals, bins=min(20, len(unique_vals)),
2069
+ color="#C44E52", alpha=0.7, density=True, edgecolor='black')
2070
+ else:
2071
+ # Single unique value - just show a bar
2072
+ ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
2073
+ ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
2074
+ else:
2075
+ # Not protein or no AA differences
2076
+ ax3.text(0.5, 0.5, "Not a protein‐coding region",
2077
+ horizontalalignment='center', verticalalignment='center',
2078
+ fontsize=12, color='gray', transform=ax3.transAxes)
2079
+
2080
+ ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
2081
+ ax3.set_xlabel("Number of AA Mutations", fontsize=12)
2082
+ ax3.set_ylabel("Density", fontsize=12)
2083
+ ax3.spines['top'].set_visible(False)
2084
+ ax3.spines['right'].set_visible(False)
2085
+     # Hide tick marks only for the placeholder panel; keep them when a
+     # distribution has actually been drawn
+     if not (is_protein and aa_diffs):
+         ax3.set_xticks([])
+         ax3.set_yticks([])
2087
+
2088
+ # Save the combined figure as both PNG and PDF
2089
+ panel_path_png = os.path.join(qscore_results_dir, "summary_panels.png")
2090
+ panel_path_pdf = os.path.join(qscore_results_dir, "summary_panels.pdf")
2091
+ fig.savefig(panel_path_png, dpi=150, transparent=False)
2092
+ fig.savefig(panel_path_pdf) # vector format
2093
+ plt.close(fig)
2094
+ logging.info(f"Saved combined panel figure as PNG: {panel_path_png}")
2095
+ logging.info(f"Saved combined panel figure as PDF: {panel_path_pdf}")
2096
+
2097
+ # ----------------------------
2098
+ # COMPUTE MUTATION SPECTRUM FOR ABOVE-BACKGROUND POSITIONS
2099
+ # ----------------------------
2100
+ # Define categories and reference percentages
2101
+ categories = {
2102
+ "A→G, T→C": {"pairs": [("A", "G"), ("T", "C")], "ref": 17.5},
2103
+ "G→A, C→T": {"pairs": [("G", "A"), ("C", "T")], "ref": 25.5},
2104
+ "A→T, T→A": {"pairs": [("A", "T"), ("T", "A")], "ref": 28.5},
2105
+ "A→C, T→G": {"pairs": [("A", "C"), ("T", "G")], "ref": 4.7},
2106
+ "G→C, C→G": {"pairs": [("G", "C"), ("C", "G")], "ref": 4.1},
2107
+ "G→T, C→A": {"pairs": [("G", "T"), ("C", "A")], "ref": 14.1},
2108
+ }
2109
+
2110
+ # Tally observed counts at above-background positions
2111
+ category_counts = {cat: 0 for cat in categories}
2112
+ total_alt_counts = 0
2113
+
2114
+ for pos0, rate in enumerate(hit_info["pos_rates"]):
2115
+ if rate <= bg_rate:
2116
+ continue
2117
+ ref_base = seq_upper[pos0]
2118
+ counts = base_counts[pos0]
2119
+ for alt_base in ("A", "C", "G", "T"):
2120
+ if alt_base == ref_base:
2121
+ continue
2122
+ cnt = counts.get(alt_base, 0)
2123
+ if cnt == 0:
2124
+ continue
2125
+ # Determine which category this (ref→alt) belongs to
2126
+ for cat, info in categories.items():
2127
+ if (ref_base, alt_base) in info["pairs"]:
2128
+ category_counts[cat] += cnt
2129
+ total_alt_counts += cnt
2130
+ break
2131
+
2132
+ # Compute sample percentages
2133
+ sample_percent = {}
2134
+ if total_alt_counts > 0:
2135
+ for cat, cnt in category_counts.items():
2136
+ sample_percent[cat] = 100.0 * cnt / total_alt_counts
2137
+ else:
2138
+ for cat in categories:
2139
+ sample_percent[cat] = 0.0
2140
+
2141
+ # ----------------------------
2142
+ # GENERATE PDF TABLE (MUTATION SPECTRUM)
2143
+ # ----------------------------
2144
+ pdf_path = os.path.join(qscore_results_dir, f"{sample_name}_mutation_spectrum.pdf")
2145
+ # Prepare table data
2146
+ table_rows = []
2147
+ for cat in categories:
2148
+ ref_pct = categories[cat]["ref"]
2149
+ samp_pct = sample_percent[cat]
2150
+ table_rows.append([cat, f"{ref_pct:.1f}%", f"{samp_pct:.1f}%"])
2151
+
2152
+ # Create a matplotlib figure for the table
2153
+ fig, ax = plt.subplots(figsize=(6, 3)) # adjust size as needed
2154
+ ax.axis("off")
2155
+
2156
+ col_labels = ["Mutation Type", "Mutazyme II reference", sample_name]
2157
+ tbl = ax.table(
2158
+ cellText=table_rows,
2159
+ colLabels=col_labels,
2160
+ cellLoc="center",
2161
+ colLoc="center",
2162
+ loc="center"
2163
+ )
2164
+ tbl.auto_set_font_size(False)
2165
+ tbl.set_fontsize(10)
2166
+ tbl.scale(1, 1.5) # stretch rows
2167
+
2168
+ # Add a title
2169
+ ax.set_title("Mutation Spectrum (Above-Background Sites)", fontsize=12, fontweight="bold", pad=20)
2170
+
2171
+ # Save as PDF
2172
+ fig.savefig(pdf_path, format="pdf", bbox_inches="tight")
2173
+ plt.close(fig)
2174
+ logging.info(f"Saved mutation spectrum table as PDF: {pdf_path}")
2175
+
2176
+ # ----------------------------
2177
+ # WRITE PER-SAMPLE SUMMARY TXT
2178
+ # ----------------------------
2179
+ sample_summary_path = os.path.join(qscore_results_dir, "summary.txt")
2180
+ with open(sample_summary_path, "w") as txtf:
2181
+ txtf.write(f"Sample: {sample_name}\n")
2182
+ txtf.write(f"{'=' * (8 + len(sample_name))}\n\n")
2183
+ txtf.write("1) Background (plasmid excluding target):\n")
2184
+ txtf.write(f" • Total mismatches: {bg_mis}\n")
2185
+ txtf.write(f" • Total covered bases: {bg_cov}\n")
2186
+ txtf.write(f" • Mapped reads: {bg_reads}\n")
2187
+ txtf.write(f" • Rate (per base): {bg_rate:.6e}\n")
2188
+ txtf.write(f" • Rate (per kb): {bg_rate_per_kb:.6e}\n\n")
2189
+
2190
+ txtf.write("2) Target (ROI) stats:\n")
2191
+ txtf.write(f" • Gene ID: {hit_id}\n")
2192
+ txtf.write(f" • Total mismatches: {hit_mis}\n")
2193
+         txtf.write(f"   • Total covered bases: {hit_cov}\n")
2194
+ txtf.write(f" • Mapped reads: {hit_reads}\n")
2195
+ txtf.write(f" • Rate (per base): {hit_rate:.6e}\n")
2196
+ txtf.write(f" • Rate (per kb): {hit_rate_per_kb:.6e}\n")
2197
+ txtf.write(f" • Z‐statistic: {z_stat:.4f}\n")
2198
+ txtf.write(f" • p‐value: {p_val if p_val is not None else 'N/A'}\n")
2199
+ txtf.write(f" • Estimated mutations per copy: {est_mut_per_copy:.6e}\n\n")
2200
+
2201
+ txtf.write("3) Protein‐coding evaluation:\n")
2202
+ txtf.write(f" • Is protein: {is_protein}\n")
2203
+ if is_protein:
2204
+ txtf.write(f" • Average AA mutations per copy (simulated): {avg_aa_mutations:.3f}\n")
2205
+ else:
2206
+ txtf.write(f" • Reason(s): {('; '.join(reasons) if reasons else 'N/A')}\n")
2207
+ txtf.write("\n4) Mutation spectrum (above-background sites):\n")
2208
+ for cat in categories:
2209
+ txtf.write(f" • {cat}: {sample_percent[cat]:.1f}% (Ref: {categories[cat]['ref']:.1f}%)\n")
2210
+ txtf.write("\n5) Output files written to:\n")
2211
+ txtf.write(f" • {gene_mismatch_csv}\n")
2212
+ txtf.write(f" • {base_dist_csv}\n")
2213
+ if is_protein:
2214
+ txtf.write(f" • {aa_subst_csv}\n")
2215
+ txtf.write(f" • {plasmid_cov_csv}\n")
2216
+ txtf.write(f" • {aa_dist_csv}\n")
2217
+ txtf.write(f" • {panel_path_png} (figure)\n")
2218
+ txtf.write(f" • {panel_path_pdf} (figure)\n")
2219
+ txtf.write(f" • {pdf_path} (mutation spectrum table)\n")
2220
+
2221
+ logging.info(f"Wrote per-sample summary to: {sample_summary_path}")
2222
+
2223
+ return {
2224
+ 'qscore': qscore,
2225
+ 'qscore_desc': qscore_desc,
2226
+ 'summary_path': sample_summary_path,
2227
+ 'qscore_results_dir': qscore_results_dir,
2228
+ 'bg_mis': bg_mis,
2229
+ 'bg_cov': bg_cov,
2230
+ 'bg_reads': bg_reads,
2231
+ 'bg_rate': bg_rate,
2232
+ 'bg_rate_per_kb': bg_rate_per_kb,
2233
+ 'hit_mis': hit_mis,
2234
+ 'hit_cov': hit_cov,
2235
+ 'hit_reads': hit_reads,
2236
+ 'hit_rate': hit_rate,
2237
+ 'hit_rate_per_kb': hit_rate_per_kb,
2238
+ 'hit_info': hit_info,
2239
+ 'z_stat': z_stat,
2240
+ 'p_val': p_val,
2241
+ 'est_mut_per_copy': est_mut_per_copy,
2242
+ 'is_protein': is_protein,
2243
+ 'reasons': reasons,
2244
+ 'aa_diffs': aa_diffs,
2245
+ 'avg_aa_mutations': avg_aa_mutations,
2246
+ 'base_counts': base_counts,
2247
+ 'qscore_info': qscore_info,
2248
+ 'sam_plasmid': sam_plasmid
2249
+ }
2250
+
2251
+
2252
+
2253
+ # Guard the entry point so importing this module does not trigger a full run.
+ # (Note that this call still precedes the helper definitions below it.)
+ if __name__ == "__main__":
+     main()
2254
+
2255
+ def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
2256
+ paths: List[Path] = []
2257
+ for item in inputs:
2258
+ if any(ch in item for ch in "*?[]"):
2259
+             # glob.glob also accepts absolute patterns, unlike Path().glob
+             paths.extend(Path(match) for match in sorted(glob.glob(item)))
2260
+ else:
2261
+ paths.append(Path(item))
2262
+ unique_paths: List[Path] = []
2263
+ seen = set()
2264
+ for path in paths:
2265
+ resolved = path.resolve()
2266
+ if resolved not in seen:
2267
+ seen.add(resolved)
2268
+ unique_paths.append(path)
2269
+ return unique_paths
2270
+
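+ # Illustrative usage (the paths and pattern are hypothetical): globs and
+ # literal paths can be mixed, and duplicates are dropped while preserving
+ # first-seen order.
+ #
+ #     paths = expand_fastq_inputs(["runs/*.fastq.gz", "runs/sample1.fastq.gz"])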
2271
+
2272
+ def run_ep_library_profile(
2273
+ fastq_paths: Sequence[Path],
2274
+ region_fasta: Path,
2275
+ plasmid_fasta: Path,
2276
+ output_dir: Path,
2277
+ work_dir: Optional[Path] = None,
2278
+ ) -> Dict[str, object]:
2279
+ fastq_paths = [Path(p) for p in fastq_paths]
2280
+ if not fastq_paths:
2281
+ raise ValueError("No FASTQ files provided for analysis.")
2282
+
2283
+ region_fasta = Path(region_fasta)
2284
+ plasmid_fasta = Path(plasmid_fasta)
2285
+ output_dir = Path(output_dir)
2286
+ work_dir = Path(work_dir) if work_dir is not None else output_dir / "tmp"
2287
+
2288
+ output_dir.mkdir(parents=True, exist_ok=True)
2289
+ work_dir.mkdir(parents=True, exist_ok=True)
2290
+
2291
+ master_summary_path = output_dir / "master_summary.txt"
2292
+ header = "\t".join(
2293
+ [
2294
+ "Sample",
2295
+ "Condition",
2296
+ "Background_Rate",
2297
+ "Target_Rate",
2298
+ "Z_stat",
2299
+ "P_value",
2300
+ "Est_Mut_per_Copy",
2301
+ "Is_Protein",
2302
+ ]
2303
+ )
2304
+ master_summary_path.write_text(header + "\n", encoding="utf-8")
2305
+
2306
+ sample_results: List[Dict[str, object]] = []
2307
+ for fastq in fastq_paths:
2308
+ result = process_single_fastq(
2309
+ fastq,
2310
+ region_fasta,
2311
+ plasmid_fasta,
2312
+ work_dir,
2313
+ output_dir,
2314
+ )
2315
+ sample_results.append(result)
2316
+
2317
+ with master_summary_path.open("a", encoding="utf-8") as masterf:
2318
+ for analysis in result.get("analysis_results", []):
2319
+ if not analysis:
2320
+ continue
2321
+ row = [
2322
+ result["sample"],
2323
+ analysis.get("qscore_desc", ""),
2324
+ f"{analysis.get('bg_rate', 0.0):.6e}",
2325
+ f"{analysis.get('hit_rate', 0.0):.6e}",
2326
+ f"{analysis.get('z_stat', 0.0):.4f}",
2327
+ str(analysis.get("p_val", "N/A")),
2328
+ f"{analysis.get('est_mut_per_copy', 0.0):.6e}",
2329
+ "yes" if analysis.get("is_protein") else "no",
2330
+ ]
2331
+ masterf.write("\t".join(row) + "\n")
2332
+
2333
+ return {
2334
+ "master_summary": master_summary_path,
2335
+ "samples": sample_results,
2336
+ }
2337
+
2338
+
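+ # Illustrative invocation of the profile runner (all paths hypothetical):
+ #
+ #     run_ep_library_profile(
+ #         fastq_paths=expand_fastq_inputs(["runs/*.fastq.gz"]),
+ #         region_fasta=Path("refs/gene_of_interest.fasta"),
+ #         plasmid_fasta=Path("refs/plasmid.fasta"),
+ #         output_dir=Path("results"),
+ #     )
+ #
+ # The master_summary.txt written above is tab-separated, with one row per
+ # sample and Q-score condition.
+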
2339
+ def process_single_fastq(
2340
+ fastq_path,
2341
+ region_fasta,
2342
+ plasmid_fasta,
2343
+ base_work_dir,
2344
+ base_results_dir,
2345
+ ):
2346
+ """Run the mutation-rate analysis pipeline for a single FASTQ file."""
2347
+ fastq_path = Path(fastq_path)
2348
+ region_fasta = Path(region_fasta)
2349
+ plasmid_fasta = Path(plasmid_fasta)
2350
+ base_work_dir = Path(base_work_dir)
2351
+ base_results_dir = Path(base_results_dir)
2352
+
2353
+ sample_name = fastq_path.name
2354
+ if sample_name.endswith('.fastq.gz'):
2355
+ sample_name = sample_name[:-9]
2356
+ elif sample_name.endswith('.fastq'):
2357
+ sample_name = sample_name[:-6]
2358
+
2359
+ work_dir = base_work_dir / sample_name
2360
+ results_dir = base_results_dir / sample_name
2361
+
2362
+ if work_dir.exists():
2363
+ shutil.rmtree(work_dir)
2364
+ if results_dir.exists():
2365
+ shutil.rmtree(results_dir)
2366
+
2367
+ work_dir.mkdir(parents=True, exist_ok=True)
2368
+ results_dir.mkdir(parents=True, exist_ok=True)
2369
+
2370
+ setup_logging(str(results_dir))
2371
+ logging.info("--- Starting analysis for sample: %s ---", sample_name)
2372
+
2373
+ hit_seq, hit_id = load_single_sequence(str(region_fasta))
2374
+ plasmid_seq, plasmid_id = load_single_sequence(str(plasmid_fasta))
2375
+
2376
+ logging.info("Plasmid length: %s bp", len(plasmid_seq))
2377
+ logging.info("Gene of interest length: %s bp", len(hit_seq))
2378
+
2379
+ idx = plasmid_seq.upper().find(hit_seq.upper())
2380
+ if idx == -1:
2381
+ logging.error("Gene region not found in plasmid")
2382
+ return {
2383
+ "sample": sample_name,
2384
+ "results_dir": results_dir,
2385
+ "analysis_results": [],
2386
+ }
2387
+ plasmid_no_gene = plasmid_seq[:idx] + plasmid_seq[idx + len(hit_seq):]
2388
+
2389
+ logging.info("Gene found at position %s-%s (1-based)", idx + 1, idx + len(hit_seq))
2390
+ logging.info("Background region length: %s bp", len(plasmid_no_gene))
2391
+
2392
+ n_chunks = 10
2393
+ length = len(plasmid_no_gene)
2394
+ size = length // n_chunks
2395
+ min_chunk_size = 50
2396
+ if size < min_chunk_size:
2397
+ logging.warning(
2398
+ "Background region (%s bp) would create chunks smaller than %s bp. Adjusting chunk count.",
2399
+ length,
2400
+ min_chunk_size,
2401
+ )
2402
+ n_chunks = max(1, length // min_chunk_size)
2403
+ size = length // n_chunks
2404
+ logging.info("Adjusted to %s chunks of approximately %s bp each", n_chunks, size)
2405
+
2406
+ chunks = [
2407
+ plasmid_no_gene[i * size : (length if i == n_chunks - 1 else (i + 1) * size)]
2408
+ for i in range(n_chunks)
2409
+ ]
2410
+ logging.info("Chunk sizes: %s bp", [len(chunk) for chunk in chunks])
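+     # Worked example of the chunking above (illustrative numbers): a 3,007 bp
+     # background with n_chunks = 10 gives size = 300, i.e. nine 300 bp chunks
+     # and a final 307 bp chunk that absorbs the remainder.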
2411
+
2412
+ logging.info("Running QC analysis to get Q-score results...")
2413
+ qc_results = None
2414
+ try:
2415
+ qc_results, optimal_qscore = run_qc_analysis(
2416
+ str(fastq_path),
2417
+ str(results_dir),
2418
+ str(region_fasta),
2419
+ str(plasmid_fasta),
2420
+ )
2421
+ if qc_results is not None:
2422
+ logging.info("QC analysis completed successfully. Found %s Q-score results.", len(qc_results))
2423
+ if optimal_qscore is not None:
2424
+ logging.info("Optimal Q-score determined: %s", optimal_qscore)
2425
+ else:
2426
+ logging.warning("QC analysis completed but no Q-score results found.")
2427
+ except Exception as exc:
2428
+ logging.error("QC analysis failed: %s", exc)
2429
+ logging.warning("Proceeding with unfiltered data only.")
2430
+
2431
+ qscores_to_analyze: List[tuple[Optional[int], str, str]] = []
2432
+ qscores_to_analyze.append((None, str(fastq_path), 'unfiltered'))
2433
+
2434
+ if qc_results is not None:
2435
+ for result in qc_results:
2436
+ qscore = result['quality_threshold']
2437
+ filtered_fastq_path = work_dir / f"{sample_name}_q{qscore}.fastq.gz"
2438
+ if run_nanofilt_filtering(str(fastq_path), qscore, str(filtered_fastq_path)):
2439
+ qscores_to_analyze.append((qscore, str(filtered_fastq_path), f"Q{qscore}"))
2440
+ logging.info("Successfully created Q%s filtered data for analysis.", qscore)
2441
+ else:
2442
+ logging.warning("Failed to create Q%s filtered data.", qscore)
2443
+
2444
+ logging.info(
2445
+ "Will run main analysis for %s conditions: %s",
2446
+ len(qscores_to_analyze),
2447
+ [desc for _, _, desc in qscores_to_analyze],
2448
+ )
2449
+
2450
+ analysis_results = []
2451
+ for qscore, analysis_fastq_path, qscore_desc in qscores_to_analyze:
2452
+ result = run_main_analysis_for_qscore(
2453
+ analysis_fastq_path,
2454
+ qscore,
2455
+ qscore_desc,
2456
+ sample_name,
2457
+ str(work_dir),
2458
+ str(results_dir),
2459
+ chunks,
2460
+ str(region_fasta),
2461
+ str(plasmid_fasta),
2462
+ hit_seq,
2463
+ hit_id,
2464
+ plasmid_seq,
2465
+ idx,
2466
+ )
2467
+ analysis_results.append(result)
2468
+
2469
+ if work_dir.exists():
2470
+ shutil.rmtree(work_dir)
2471
+ logging.info("Removed temporary work directory: %s", work_dir)
2472
+
2473
+ logging.info("--- Finished analysis for sample: %s ---", sample_name)
2474
+
2475
+ return {
2476
+ "sample": sample_name,
2477
+ "results_dir": results_dir,
2478
+ "analysis_results": analysis_results,
2479
+ }
2480
+