uht-tooling 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uht_tooling/__init__.py +10 -0
- uht_tooling/cli.py +368 -0
- uht_tooling/models/__init__.py +0 -0
- uht_tooling/workflows/__init__.py +0 -0
- uht_tooling/workflows/design_gibson.py +368 -0
- uht_tooling/workflows/design_slim.py +402 -0
- uht_tooling/workflows/gui.py +595 -0
- uht_tooling/workflows/mut_rate.py +2480 -0
- uht_tooling/workflows/mutation_caller.py +432 -0
- uht_tooling/workflows/nextera_designer.py +199 -0
- uht_tooling/workflows/profile_inserts.py +441 -0
- uht_tooling/workflows/umi_hunter.py +412 -0
- uht_tooling-0.1.2.dist-info/METADATA +271 -0
- uht_tooling-0.1.2.dist-info/RECORD +17 -0
- uht_tooling-0.1.2.dist-info/WHEEL +5 -0
- uht_tooling-0.1.2.dist-info/entry_points.txt +2 -0
- uht_tooling-0.1.2.dist-info/top_level.txt +1 -0
uht_tooling/workflows/mut_rate.py
@@ -0,0 +1,2480 @@
#!/usr/bin/env python3

import subprocess
import sys
import glob
import os
import shutil
import logging
import pysam
import random
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import math
import tempfile
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence

# Use a built-in Matplotlib style ("ggplot") for consistency
plt.style.use("ggplot")

# Try to import scipy for Z-test p-values and KDE
try:
    from scipy.stats import norm, gaussian_kde, beta, binom
    HAVE_SCIPY = True
except ImportError:
    HAVE_SCIPY = False
    print("Warning: scipy is not installed. Z-test p-values and KDE may be limited.")

def setup_logging(log_dir):
    """
    Configure logging so that INFO (and above) go into run.log inside log_dir.
    Before configuring, remove any existing handlers to avoid carryover.
    """
    # Remove any existing handlers
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "run.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s: %(message)s",
        handlers=[
            logging.FileHandler(log_file),
            # INFO logs go into file only
        ]
    )

def run_minimap2(reads_input, ref_fasta, out_prefix, work_dir):
    """
    Runs minimap2 on reads_input (either one FASTQ or a glob pattern).
    Saves SAM to work_dir/out_prefix.sam.
    Returns the full path to the SAM file.
    """
    # Expand glob if pattern contains wildcard; otherwise assume it's a single path
    fastq_files = glob.glob(reads_input) if ("*" in reads_input or "?" in reads_input) else [reads_input]
    if not fastq_files:
        logging.error(f"No FASTQ files found matching {reads_input}")
        raise FileNotFoundError(f"No FASTQ files found matching {reads_input}")

    sam_output = os.path.join(work_dir, f"{out_prefix}.sam")
    cmd = ["minimap2", "-ax", "map-ont", ref_fasta, *fastq_files]
    logging.info(f"Running minimap2: {' '.join(cmd)} → {sam_output}")
    with open(sam_output, "w") as out_sam:
        subprocess.run(cmd, stdout=out_sam, check=True)
    return sam_output

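# Illustrative usage (annotation added for this diff view, not part of the
# package source): minimap2 must be available on PATH; paths are hypothetical.
#   sam_path = run_minimap2("reads/*.fastq.gz", "plasmid.fasta", "plasmid", "work/")
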
def load_single_sequence(fasta_path):
    """
    Loads exactly one sequence from a FASTA. Raises if not exactly one record.
    Returns (sequence_string, sequence_id).
    """
    records = list(SeqIO.parse(fasta_path, "fasta"))
    if len(records) != 1:
        logging.error(f"Expected exactly 1 record in {fasta_path}, found {len(records)}.")
        raise ValueError(f"Expected exactly 1 record in {fasta_path}, found {len(records)}.")
    seq_id = records[0].id
    seq_str = str(records[0].seq)
    logging.info(f"Loaded sequence {seq_id} from {fasta_path}")
    return seq_str, seq_id

def create_multi_fasta(chunks, work_dir, out_fasta_prefix="plasmid_chunks"):
    """
    Writes each chunk as a separate FASTA entry into work_dir/out_fasta_prefix.fasta:
        >chunk_1
        ACTG...
        >chunk_2
        TTAG...
    Returns the path to the created FASTA.
    """
    out_fasta = os.path.join(work_dir, f"{out_fasta_prefix}.fasta")
    logging.info(f"Writing {len(chunks)} chunks to {out_fasta}")
    with open(out_fasta, "w") as f:
        for i, seq in enumerate(chunks):
            f.write(f">chunk_{i+1}\n")
            f.write(str(seq) + "\n")
    return out_fasta

def calculate_background_from_plasmid(sam_plasmid, plasmid_seq, target_start, target_length):
    """
    Calculate background mismatch statistics from full plasmid alignment, excluding target region.

    Args:
        sam_plasmid: Path to SAM file with full plasmid alignment
        plasmid_seq: Full plasmid sequence
        target_start: Start position of target region (0-based)
        target_length: Length of target region

    Returns:
        tuple: (total_mismatches, total_covered_bases, mapped_reads)
    """
    target_end = target_start + target_length

    # Initialize counters
    total_mismatches = 0
    total_covered_bases = 0
    mapped_reads = 0

    samfile = pysam.AlignmentFile(sam_plasmid, "r")
    for read in samfile.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue

        mapped_reads += 1

        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
            if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
                # Skip positions within the target region
                if target_start <= ref_pos < target_end:
                    continue

                total_covered_bases += 1
                if read.query_sequence[read_pos].upper() != plasmid_seq[ref_pos].upper():
                    total_mismatches += 1

    samfile.close()

    return total_mismatches, total_covered_bases, mapped_reads

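# Illustrative usage (annotation added for this diff view, not part of the
# package source): the background rate is the returned mismatch count divided
# by the covered bases.
#   bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam, plasmid_seq, idx, len(hit_seq))
#   bg_rate = bg_mis / bg_cov if bg_cov > 0 else 0.0
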
def calculate_rolling_mutation_rate_plasmid(sam_plasmid, plasmid_seq, window_size=20):
    """
    Calculate rolling mutation rate across the entire plasmid with a specified window size.

    Args:
        sam_plasmid: Path to SAM file with full plasmid alignment
        plasmid_seq: Full plasmid sequence
        window_size: Size of the rolling window (default: 20 bp)

    Returns:
        tuple: (positions, rolling_rates) for the entire plasmid
    """
    # Calculate per-position mutation rates for the entire plasmid
    plasmid_mismatches = [0] * len(plasmid_seq)
    plasmid_coverage = [0] * len(plasmid_seq)

    samfile = pysam.AlignmentFile(sam_plasmid, "r")
    for read in samfile.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue

        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
            if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
                plasmid_coverage[ref_pos] += 1
                if read.query_sequence[read_pos].upper() != plasmid_seq[ref_pos].upper():
                    plasmid_mismatches[ref_pos] += 1

    samfile.close()

    # Calculate per-position mutation rates
    plasmid_rates = []
    for cov, mis in zip(plasmid_coverage, plasmid_mismatches):
        if cov > 0:
            plasmid_rates.append(mis / cov)
        else:
            plasmid_rates.append(0.0)

    # Calculate rolling average
    rolling_rates = []
    positions = []

    for i in range(len(plasmid_rates) - window_size + 1):
        window_rates = plasmid_rates[i:i + window_size]
        rolling_rates.append(np.mean(window_rates))
        positions.append(i + window_size // 2 + 1)  # Center position of window

    return positions, rolling_rates

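# Worked example of the window centering above (annotation added for this diff
# view): with window_size=20, the first window covers 0-based positions 0..19
# and is reported at 1-based position 0 + 20 // 2 + 1 = 11, the window center.
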
def compute_mismatch_stats_sam(sam_file, refs_dict):
    """
    For each reference in refs_dict:
      - Count how many reads mapped to it
      - Count per-position coverage
      - Count per-position mismatches
      - Compute total mismatches, total covered bases, average mismatch rate

    Returns a dict keyed by reference name, each containing:
        pos_rates           -> list of mismatch rates per position
        cov                 -> list of coverage per position
        mismatch            -> list of mismatch counts per position
        avg_mismatch_rate   -> float (raw per-base fraction)
        total_mismatches    -> int
        total_covered_bases -> int
        mapped_reads        -> int
    """
    mismatch_data = {
        name: {
            "pos_rates": [0.0] * len(seq),
            "cov": [0] * len(seq),
            "mismatch": [0] * len(seq),
            "avg_mismatch_rate": 0.0,
            "total_mismatches": 0,
            "total_covered_bases": 0,
            "mapped_reads": 0,
        }
        for name, seq in refs_dict.items()
    }

    logging.info(f"Computing mismatch stats for {sam_file}")
    samfile = pysam.AlignmentFile(sam_file, "r")
    for read in samfile.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue
        ref_name = samfile.get_reference_name(read.reference_id)
        if ref_name not in mismatch_data:
            continue

        mismatch_data[ref_name]["mapped_reads"] += 1
        info = mismatch_data[ref_name]
        ref_seq = refs_dict[ref_name]

        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
            if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(ref_seq):
                info["cov"][ref_pos] += 1
                if read.query_sequence[read_pos].upper() != ref_seq[ref_pos].upper():
                    info["mismatch"][ref_pos] += 1

    samfile.close()

    for name, info in mismatch_data.items():
        total_mis = 0
        total_cov = 0
        pos_rates = []
        for cov, mis in zip(info["cov"], info["mismatch"]):
            if cov > 0:
                rate = mis / cov
                pos_rates.append(rate)
                total_mis += mis
                total_cov += cov
            else:
                pos_rates.append(0.0)

        info["pos_rates"] = pos_rates
        info["total_mismatches"] = total_mis
        info["total_covered_bases"] = total_cov
        info["avg_mismatch_rate"] = (total_mis / total_cov) if total_cov > 0 else 0.0

        logging.info(
            f"{name}: mapped_reads={info['mapped_reads']}, "
            f"mismatches={total_mis}, covered_bases={total_cov}, "
            f"avg_rate={info['avg_mismatch_rate']:.6f}"
        )

    return mismatch_data

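# Illustrative usage (annotation added for this diff view, not part of the
# package source):
#   stats = compute_mismatch_stats_sam("hit.sam", {"gene": gene_seq})
#   rate = stats["gene"]["avg_mismatch_rate"]  # raw per-base mismatch fraction
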
def z_test_two_proportions(mis1, cov1, mis2, cov2):
    """
    Performs a two-proportion Z-test:
        H0: p1 == p2
        H1: p1 != p2

    Returns (z_statistic, p_value). If scipy is unavailable, p_value=None.
    """
    if cov1 == 0 or cov2 == 0:
        return 0.0, 1.0

    p1 = mis1 / cov1
    p2 = mis2 / cov2
    p = (mis1 + mis2) / (cov1 + cov2)
    denom = math.sqrt(p * (1 - p) * (1 / cov1 + 1 / cov2))
    if denom == 0:
        return 0.0, 1.0

    z_stat = (p1 - p2) / denom
    if HAVE_SCIPY:
        p_val = 2 * (1 - norm.cdf(abs(z_stat)))
        logging.info(f"Z-test: z={z_stat:.4f}, p-value={p_val:.4e}")
    else:
        p_val = None
        logging.info(f"Z-test: z={z_stat:.4f}, p-value=(scipy unavailable)")

    return z_stat, p_val

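# Worked example with hypothetical counts (annotation added for this diff
# view): 120 mismatches over 1,000,000 covered bases vs 80 over 1,200,000
# gives p1 = 1.2e-4, p2 ≈ 6.7e-5, pooled p ≈ 9.1e-5, and
# z = (p1 - p2) / sqrt(p * (1 - p) * (1/cov1 + 1/cov2)) ≈ 4.1,
# a two-sided p-value well below 0.001.
#   z, p = z_test_two_proportions(120, 1_000_000, 80, 1_200_000)
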
def run_nanofilt_filtering(input_fastq, quality_threshold, output_fastq):
    """
    Run NanoFilt to filter FASTQ file by quality score threshold and minimum length.

    Args:
        input_fastq: Path to input FASTQ.gz file
        quality_threshold: Quality score threshold (integer)
        output_fastq: Path to output filtered FASTQ.gz file

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Use gunzip to decompress, pipe to NanoFilt with length filter, then compress output
        cmd = f"gunzip -c {input_fastq} | NanoFilt -q {quality_threshold} -l 30 | gzip > {output_fastq}"
        logging.info(f"Running NanoFilt with quality threshold {quality_threshold} and min length 30bp: {cmd}")

        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        if result.returncode != 0:
            logging.error(f"NanoFilt failed with return code {result.returncode}: {result.stderr}")
            return False

        # Check if output file was created and has content
        if os.path.exists(output_fastq) and os.path.getsize(output_fastq) > 0:
            logging.info(f"Successfully created filtered FASTQ: {output_fastq}")
            return True
        else:
            logging.error(f"Output file {output_fastq} was not created or is empty")
            return False

    except Exception as e:
        logging.error(f"Error running NanoFilt: {e}")
        return False

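# Illustrative expansion of the shell pipeline above for -q 14 (annotation
# added for this diff view; file names hypothetical):
#   gunzip -c reads.fastq.gz | NanoFilt -q 14 -l 30 | gzip > filtered_q14.fastq.gz
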
def calculate_mutation_rate_for_quality(fastq_path, quality_threshold, work_dir, ref_hit_fasta, plasmid_fasta):
    """
    Calculate comprehensive AA mutation analysis for a specific quality threshold.

    Args:
        fastq_path: Path to input FASTQ.gz file
        quality_threshold: Quality score threshold
        work_dir: Working directory for temporary files
        ref_hit_fasta: Path to reference hit FASTA
        plasmid_fasta: Path to plasmid FASTA

    Returns:
        dict: Comprehensive results with error estimates or None if failed
    """
    try:
        # Create filtered FASTQ file
        filtered_fastq = os.path.join(work_dir, f"filtered_q{quality_threshold}.fastq.gz")

        if not run_nanofilt_filtering(fastq_path, quality_threshold, filtered_fastq):
            return None

        # Load sequences
        hit_seq, hit_id = load_single_sequence(ref_hit_fasta)
        plasmid_seq, plasmid_id = load_single_sequence(plasmid_fasta)

        # Find hit region in plasmid
        idx = plasmid_seq.upper().find(hit_seq.upper())
        if idx == -1:
            logging.error("Gene region not found in plasmid")
            return None

        # Align filtered reads to hit region
        sam_hit = run_minimap2(filtered_fastq, ref_hit_fasta, f"hit_q{quality_threshold}", work_dir)

        # Align filtered reads to full plasmid for background calculation
        sam_plasmid = run_minimap2(filtered_fastq, plasmid_fasta, f"plasmid_q{quality_threshold}", work_dir)

        # Calculate background rate from full plasmid alignment, excluding target region
        bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam_plasmid, plasmid_seq, idx, len(hit_seq))

        # Calculate hit region mutation rate
        mismatch_hit = compute_mismatch_stats_sam(sam_hit, {hit_id: hit_seq})
        hit_info = mismatch_hit[hit_id]
        hit_mis = hit_info["total_mismatches"]
        hit_cov = hit_info["total_covered_bases"]

        # Extract Q-score statistics for both hit and background regions
        hit_qscore_stats = extract_qscores_from_sam(sam_hit)
        bg_qscore_stats = extract_qscores_from_sam(sam_plasmid)

        # Check if it's a protein-coding sequence
        is_protein = True
        seq_upper = hit_seq.upper()
        if len(seq_upper) % 3 != 0:
            is_protein = False
        elif "*" in str(Seq(seq_upper).translate(to_stop=False))[:-1]:
            is_protein = False

        # Run comprehensive analysis if protein-coding
        if is_protein:
            results = comprehensive_aa_mutation_analysis(
                hit_mis, hit_cov, bg_mis, bg_cov, hit_seq,
                quality_threshold=quality_threshold, n_trials=10000,
                hit_qscore_stats=hit_qscore_stats, bg_qscore_stats=bg_qscore_stats,
                sam_hit=sam_hit, sam_plasmid=sam_plasmid, hit_seq=hit_seq, plasmid_seq=plasmid_seq
            )
            return results
        else:
            # For non-protein sequences, return basic info
            return {
                'mean_aa_mutations': 0.0,
                'std_aa_mutations': 0.0,
                'ci_lower': 0.0,
                'ci_upper': 0.0,
                'hit_rate': hit_mis / hit_cov if hit_cov > 0 else 0,
                'bg_rate': bg_mis / bg_cov if bg_cov > 0 else 0,
                'net_rate': max((hit_mis / hit_cov) - (bg_mis / bg_cov), 0) if hit_cov > 0 and bg_cov > 0 else 0,
                'mappable_bases': hit_cov,
                'quality_threshold': quality_threshold,
                'is_protein': False
            }

    except Exception as e:
        logging.error(f"Error calculating mutation rate for quality {quality_threshold}: {e}")
        return None

def find_optimal_qscore(qc_results):
    """
    Find the Q-score threshold with the lowest net mutation rate error.

    Args:
        qc_results: List of comprehensive analysis results

    Returns:
        tuple: (optimal_qscore, optimal_result, error_comparison)
    """
    logging.info("=== FINDING OPTIMAL Q-SCORE THRESHOLD ===")

    if not qc_results:
        return None, None, None

    # Find Q-score with minimum net mutation rate error
    min_error = float('inf')
    optimal_result = None
    optimal_qscore = None

    error_comparison = []

    for result in qc_results:
        qscore = result['quality_threshold']
        # Use weighted error for optimal Q-score selection
        net_rate_error = result.get('net_weighted_error', result['net_rate_error'])
        mappable_bases = result['mappable_bases']

        error_comparison.append({
            'qscore': qscore,
            'net_rate_error': net_rate_error,
            'net_weighted_error': result.get('net_weighted_error', 0.0),
            'mappable_bases': mappable_bases,
            'aa_mutations': result['mean_aa_mutations'],
            'aa_error': result['std_aa_mutations']
        })

        logging.info(f"Q{qscore}: net_weighted_error={net_rate_error:.6f}, mappable_bases={mappable_bases}")

        if net_rate_error < min_error:
            min_error = net_rate_error
            optimal_result = result
            optimal_qscore = qscore

    logging.info(f"OPTIMAL Q-SCORE: Q{optimal_qscore} (lowest net mutation rate error: {min_error:.6f})")
    logging.info(f"Optimal result: AA mutations = {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}")

    return optimal_qscore, optimal_result, error_comparison

def run_qc_analysis(fastq_path, results_dir, ref_hit_fasta, plasmid_fasta):
    """
    Run simple QC analysis using segmentation-based error estimation.

    Args:
        fastq_path: Path to input FASTQ.gz file
        results_dir: Directory to save QC results
    """
    logging.info("Starting simple QC analysis with segmentation-based error estimation")

    # Define quality thresholds to test
    quality_thresholds = [10, 12, 14, 16, 18, 20, 22, 24, 26]

    # Segment the input FASTQ file
    logging.info("Segmenting FASTQ file into 10 parts for error estimation...")
    segment_files = segment_fastq_file(fastq_path, n_segments=10)

    if not segment_files:
        logging.error("Failed to segment FASTQ file")
        return

    # Create temporary work directory for QC analysis
    with tempfile.TemporaryDirectory() as qc_work_dir:
        logging.info(f"Using temporary work directory: {qc_work_dir}")

        # Calculate results for each quality threshold
        qc_results = []
        successful_thresholds = []

        for q_threshold in quality_thresholds:
            logging.info(f"Processing quality threshold: {q_threshold}")
            result = run_segmented_analysis(
                segment_files, q_threshold, qc_work_dir, ref_hit_fasta, plasmid_fasta
            )

            if result is not None:
                qc_results.append(result)
                successful_thresholds.append(q_threshold)
                logging.info(f"Quality {q_threshold}: AA mutations = {result['mean_aa_mutations']:.4f} ± {result['std_aa_mutations']:.4f}, "
                             f"mappable bases = {result['total_mappable_bases']}")
            else:
                logging.warning(f"Failed to calculate mutation rate for quality threshold {q_threshold}")

        # Find optimal Q-score threshold (lowest empirical error)
        optimal_qscore, optimal_result = find_optimal_qscore_simple(qc_results)

        # Create QC plots
        if len(qc_results) >= 2:
            create_simple_qc_plots(successful_thresholds, qc_results, results_dir, optimal_qscore, optimal_result)
        else:
            logging.warning("Insufficient data points for QC plots (need at least 2)")

        # Save optimal Q-score information
        if optimal_qscore is not None:
            optimal_qscore_path = os.path.join(results_dir, "optimal_qscore_analysis.txt")
            with open(optimal_qscore_path, 'w') as f:
                f.write("=== OPTIMAL Q-SCORE ANALYSIS (PRECISION-WEIGHTED) ===\n")
                f.write(f"Optimal Q-score threshold: {optimal_qscore}\n")
                f.write(f"Precision-weighted score: {(1.0 / optimal_result['std_aa_mutations']) * optimal_qscore:.6f}\n" if optimal_result['std_aa_mutations'] > 0 else "Precision-weighted score: inf (perfect precision)\n")
                f.write(f"Empirical error (std): {optimal_result['std_aa_mutations']:.6f}\n")
                f.write(f"AA mutations per gene: {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}\n")
                f.write(f"95% Confidence Interval: [{optimal_result['ci_lower']:.4f}, {optimal_result['ci_upper']:.4f}]\n")
                f.write(f"Total mappable bases: {optimal_result['total_mappable_bases']}\n")
                f.write(f"Number of segments: {optimal_result['n_segments']}\n")
                f.write("\n=== ALL Q-SCORE COMPARISON ===\n")
                f.write("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases\tAA_Mutations\tCI_Lower\tCI_Upper\n")
                for result in qc_results:
                    precision_score = (1.0 / result['std_aa_mutations']) * result['quality_threshold'] if result['std_aa_mutations'] > 0 else float('inf')
                    f.write(f"{result['quality_threshold']}\t{result['std_aa_mutations']:.6f}\t{precision_score:.6f}\t{result['total_mappable_bases']}\t{result['mean_aa_mutations']:.4f}\t{result['ci_lower']:.4f}\t{result['ci_upper']:.4f}\n")

            logging.info(f"Optimal Q-score analysis saved to: {optimal_qscore_path}")

        # Clean up segment files (shutil is already imported at module level)
        segment_dir = os.path.dirname(segment_files[0])
        if os.path.exists(segment_dir):
            shutil.rmtree(segment_dir)
            logging.info(f"Cleaned up segment directory: {segment_dir}")

        # Return both QC results and optimal Q-score for use in main analysis
        return qc_results, optimal_qscore

def find_optimal_qscore_simple(qc_results):
    """
    Find the Q-score threshold with the highest precision-weighted score.
    Precision-weighted score = (1 / standard_deviation) * q_score

    Args:
        qc_results: List of segmentation analysis results

    Returns:
        tuple: (optimal_qscore, optimal_result)
    """
    logging.info("=== FINDING OPTIMAL Q-SCORE THRESHOLD (PRECISION-WEIGHTED) ===")

    if not qc_results:
        return None, None

    # Find Q-score with highest precision-weighted score
    max_score = -1
    optimal_result = None
    optimal_qscore = None

    logging.info("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases")
    logging.info("-" * 60)

    for result in qc_results:
        qscore = result['quality_threshold']
        empirical_error = result['std_aa_mutations']
        mappable_bases = result['total_mappable_bases']

        # Calculate precision-weighted score: (1/sd) * q_score
        if empirical_error > 0:
            precision_score = (1.0 / empirical_error) * qscore
        else:
            precision_score = float('inf')  # Perfect precision

        logging.info(f"Q{qscore}\t{empirical_error:.6f}\t{precision_score:.6f}\t{mappable_bases}")

        if precision_score > max_score:
            max_score = precision_score
            optimal_result = result
            optimal_qscore = qscore

    logging.info("-" * 60)
    logging.info(f"OPTIMAL Q-SCORE: Q{optimal_qscore} (highest precision-weighted score: {max_score:.6f})")
    logging.info(f"Optimal result: AA mutations = {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}")

    return optimal_qscore, optimal_result

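# Worked example of the precision-weighted score (annotation added for this
# diff view; numbers hypothetical): at Q16 with empirical std 0.05 the score
# is (1 / 0.05) * 16 = 320; at Q26 with std 0.10 it is (1 / 0.10) * 26 = 260,
# so Q16 wins despite the higher nominal quality of Q26.
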
def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_qscore=None, optimal_result=None):
    """
    Create simple QC plots with empirical error bars.

    Args:
        quality_thresholds: List of quality score thresholds
        qc_results: List of segmentation analysis results
        results_dir: Directory to save the plots
        optimal_qscore: Optimal Q-score threshold (optional)
        optimal_result: Optimal result data (optional)
    """
    try:
        # Extract data for plotting
        aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
        aa_errors = [r['std_aa_mutations'] for r in qc_results]
        aa_ci_lower = [r['ci_lower'] for r in qc_results]
        aa_ci_upper = [r['ci_upper'] for r in qc_results]
        mappable_bases = [r['total_mappable_bases'] for r in qc_results]

        # Create main QC plot
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        # Top plot: AA mutations per gene with empirical error bars
        color1 = '#2E8B57'
        ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
                     fmt='o', capsize=5, capthick=2, markersize=8,
                     color=color1, ecolor=color1, alpha=0.8, label='Mean ± Empirical Std')

        # Add confidence intervals as shaded area
        ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
                         alpha=0.3, color=color1, label='95% Confidence Interval')

        # Highlight optimal Q-score
        if optimal_qscore is not None:
            ax1.axvline(x=optimal_qscore, color='red', linestyle='--', alpha=0.7,
                        label=f'Optimal Q{optimal_qscore}')

        ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
        ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
        ax1.tick_params(axis='y', labelcolor=color1)
        ax1.set_title('AA Mutations per Gene vs Quality Score Filter (Segmentation-Based Error)',
                      fontsize=14, fontweight='bold')
        ax1.grid(True, alpha=0.3)
        ax1.legend(frameon=False, fontsize=10)

        # Add data point labels
        for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
            ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
                         (q, aa_mut), xytext=(5, 5),
                         textcoords='offset points', fontsize=8, alpha=0.8, color=color1)

        # Bottom plot: Mappable bases and AA mutations per gene
        color2 = '#FF6B6B'
        color3 = '#4169E1'

        # Mappable bases (right y-axis, on the twin axis)
        ax2_twin = ax2.twinx()
        ax2_twin.scatter(quality_thresholds, mappable_bases,
                         s=100, alpha=0.7, color=color2, edgecolors='black',
                         linewidth=1, marker='s', label='Mappable Bases')
        ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
        ax2_twin.tick_params(axis='y', labelcolor=color2)

        # AA mutations per gene with error bars (left y-axis)
        ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
                     fmt='^', capsize=5, capthick=2, markersize=8,
                     color=color3, ecolor=color3, alpha=0.8, label='AA Mutations ± Empirical Error')
        ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
        ax2.tick_params(axis='y', labelcolor=color3)
        ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
        ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
                      fontsize=14, fontweight='bold')
        ax2.grid(True, alpha=0.3)

        # Add legends
        lines1, labels1 = ax2.get_legend_handles_labels()
        lines2, labels2 = ax2_twin.get_legend_handles_labels()
        ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)

        # Add data point labels for mappable bases
        for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
            ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
                              textcoords='offset points', fontsize=8, alpha=0.8, color=color2)

        plt.tight_layout()

        # Save the plot
        project_name = os.path.basename(results_dir)
        qc_plot_path = os.path.join(results_dir, f"qc_plot_{project_name}.png")
        fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

        logging.info(f"QC plot saved to: {qc_plot_path}")

        # Save data as CSV
        qc_data_path = os.path.join(results_dir, "simple_qc_data.csv")
        with open(qc_data_path, 'w') as f:
            f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
            f.write("total_mappable_bases,n_segments\n")

            for q, r in zip(quality_thresholds, qc_results):
                f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
                f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
                f.write(f"{r['total_mappable_bases']},{r['n_segments']}\n")

        logging.info(f"Simple QC data saved to: {qc_data_path}")

    except Exception as e:
        logging.error(f"Error creating simple QC plots: {e}")

# The published file is missing a "def" line here; the signature below is
# inferred from the docstring and the keys it reads (the name is an assumption).
def create_comprehensive_qc_plots(quality_thresholds, qc_results, results_dir, optimal_qscore=None, optimal_result=None):
    """
    Create comprehensive QC plots with error bars and uncertainty quantification.

    Args:
        quality_thresholds: List of quality score thresholds
        qc_results: List of comprehensive analysis results
        results_dir: Directory to save the plots
        optimal_qscore: Optimal Q-score threshold (optional)
        optimal_result: Optimal result data (optional)
    """
    try:
        # Extract data for plotting
        aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
        aa_errors = [r['std_aa_mutations'] for r in qc_results]
        aa_ci_lower = [r['ci_lower'] for r in qc_results]
        aa_ci_upper = [r['ci_upper'] for r in qc_results]
        mappable_bases = [r['mappable_bases'] for r in qc_results]
        net_rates = [r['net_rate'] for r in qc_results]
        net_rate_errors = [r['net_rate_error'] for r in qc_results]

        # Create main QC plot with error bars
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))

        # Top plot: AA mutations per gene with error bars
        color1 = '#2E8B57'
        ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
                     fmt='o', capsize=5, capthick=2, markersize=8,
                     color=color1, ecolor=color1, alpha=0.8, label='Mean ± Std')

        # Add confidence intervals as shaded area
        ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
                         alpha=0.3, color=color1, label='95% Confidence Interval')

        ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
        ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
        ax1.tick_params(axis='y', labelcolor=color1)
        ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
                      fontsize=14, fontweight='bold')
        ax1.grid(True, alpha=0.3)
        ax1.legend(frameon=False, fontsize=10)

        # Add data point labels
        for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
            ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
                         (q, aa_mut), xytext=(5, 5),
                         textcoords='offset points', fontsize=8, alpha=0.8, color=color1)

        # Bottom plot: Mappable bases and AA mutations per gene
        color2 = '#FF6B6B'
        color3 = '#4169E1'

        # Mappable bases (right y-axis, on the twin axis)
        ax2_twin = ax2.twinx()
        ax2_twin.scatter(quality_thresholds, mappable_bases,
                         s=100, alpha=0.7, color=color2, edgecolors='black',
                         linewidth=1, marker='s', label='Mappable Bases')
        ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
        ax2_twin.tick_params(axis='y', labelcolor=color2)

        # AA mutations per gene with error bars (left y-axis)
        ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
                     fmt='^', capsize=5, capthick=2, markersize=8,
                     color=color3, ecolor=color3, alpha=0.8, label='AA Mutations ± Error')
        ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
        ax2.tick_params(axis='y', labelcolor=color3)
        ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
        ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
                      fontsize=14, fontweight='bold')
        ax2.grid(True, alpha=0.3)

        # Add legends
        lines1, labels1 = ax2.get_legend_handles_labels()
        lines2, labels2 = ax2_twin.get_legend_handles_labels()
        ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)

        # Add data point labels for mappable bases
        for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
            ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
                              textcoords='offset points', fontsize=8, alpha=0.8, color=color2)

        plt.tight_layout()

        # Save the comprehensive plot
        qc_plot_path = os.path.join(results_dir, "comprehensive_qc_analysis.png")
        fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

        logging.info(f"Comprehensive QC plot saved to: {qc_plot_path}")

        # Create error analysis plot
        create_error_analysis_plot(quality_thresholds, qc_results, results_dir)

        # Save comprehensive data as CSV
        qc_data_path = os.path.join(results_dir, "comprehensive_qc_data.csv")
        with open(qc_data_path, 'w') as f:
            f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
            f.write("mappable_bases,hit_rate,hit_rate_ci_lower,hit_rate_ci_upper,")
            f.write("bg_rate,bg_rate_ci_lower,bg_rate_ci_upper,net_rate,net_rate_error,")
            f.write("lambda_bp,lambda_error,alignment_error,")
            f.write("hit_qscore_mean,hit_qscore_std,hit_qscore_uncertainty,")
            f.write("bg_qscore_mean,bg_qscore_std,bg_qscore_uncertainty,")
            f.write("hit_weighted_rate,hit_weighted_error,bg_weighted_rate,bg_weighted_error,")
            f.write("net_weighted_rate,net_weighted_error,lambda_bp_weighted,lambda_error_weighted\n")

            for q, r in zip(quality_thresholds, qc_results):
                f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
                f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
                f.write(f"{r['mappable_bases']},{r['hit_rate']:.6f},")
                f.write(f"{r['hit_rate_ci'][0]:.6f},{r['hit_rate_ci'][1]:.6f},")
                f.write(f"{r['bg_rate']:.6f},{r['bg_rate_ci'][0]:.6f},{r['bg_rate_ci'][1]:.6f},")
                f.write(f"{r['net_rate']:.6f},{r['net_rate_error']:.6f},")
                f.write(f"{r['lambda_bp']:.6f},{r['lambda_error']:.6f},{r['alignment_error']:.6f},")

                # Q-score information
                hit_qscore_mean = r['hit_qscore_stats']['mean_qscore'] if r['hit_qscore_stats'] else 0.0
                hit_qscore_std = r['hit_qscore_stats']['std_qscore'] if r['hit_qscore_stats'] else 0.0
                bg_qscore_mean = r['bg_qscore_stats']['mean_qscore'] if r['bg_qscore_stats'] else 0.0
                bg_qscore_std = r['bg_qscore_stats']['std_qscore'] if r['bg_qscore_stats'] else 0.0

                f.write(f"{hit_qscore_mean:.2f},{hit_qscore_std:.2f},{r['hit_qscore_uncertainty']:.6f},")
                f.write(f"{bg_qscore_mean:.2f},{bg_qscore_std:.2f},{r['bg_qscore_uncertainty']:.6f},")
                f.write(f"{r.get('hit_weighted_rate', 0.0):.6f},{r.get('hit_weighted_error', 0.0):.6f},")
                f.write(f"{r.get('bg_weighted_rate', 0.0):.6f},{r.get('bg_weighted_error', 0.0):.6f},")
                f.write(f"{r.get('net_weighted_rate', 0.0):.6f},{r.get('net_weighted_error', 0.0):.6f},")
                f.write(f"{r.get('lambda_bp_weighted', 0.0):.6f},{r.get('lambda_error_weighted', 0.0):.6f}\n")

        logging.info(f"Comprehensive QC data saved to: {qc_data_path}")

    except Exception as e:
        logging.error(f"Error creating comprehensive QC plots: {e}")

def create_error_analysis_plot(quality_thresholds, qc_results, results_dir):
    """
    Create a detailed error analysis plot showing different sources of uncertainty.

    Args:
        quality_thresholds: List of quality score thresholds
        qc_results: List of comprehensive analysis results
        results_dir: Directory to save the plot
    """
    try:
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

        # Extract error components
        aa_std = [r['std_aa_mutations'] for r in qc_results]
        net_rate_errors = [r['net_rate_error'] for r in qc_results]
        lambda_errors = [r['lambda_error'] for r in qc_results]
        alignment_errors = [r['alignment_error'] for r in qc_results]
        mappable_bases = [r['mappable_bases'] for r in qc_results]

        # Plot 1: AA mutation uncertainty vs quality threshold
        ax1.plot(quality_thresholds, aa_std, 'o-', color='#2E8B57', linewidth=2, markersize=6)
        ax1.set_xlabel('Quality Score Threshold')
        ax1.set_ylabel('AA Mutation Standard Deviation')
        ax1.set_title('AA Mutation Uncertainty vs Quality Filter')
        ax1.grid(True, alpha=0.3)

        # Plot 2: Net rate error vs quality threshold
        ax2.plot(quality_thresholds, net_rate_errors, 's-', color='#FF6B6B', linewidth=2, markersize=6)
        ax2.set_xlabel('Quality Score Threshold')
        ax2.set_ylabel('Net Mutation Rate Error')
        ax2.set_title('Net Rate Error vs Quality Filter')
        ax2.grid(True, alpha=0.3)

        # Plot 3: Lambda error vs quality threshold
        ax3.plot(quality_thresholds, lambda_errors, '^-', color='#4169E1', linewidth=2, markersize=6)
        ax3.set_xlabel('Quality Score Threshold')
        ax3.set_ylabel('Lambda Error (mutations per copy)')
        ax3.set_title('Lambda Error vs Quality Filter')
        ax3.grid(True, alpha=0.3)

        # Plot 4: Alignment error vs mappable bases
        ax4.scatter(mappable_bases, alignment_errors, s=100, alpha=0.7, color='#FF8C00')
        ax4.set_xlabel('Mappable Bases')
        ax4.set_ylabel('Alignment Error (1/√reads)')
        ax4.set_title('Alignment Error vs Read Count')
        ax4.grid(True, alpha=0.3)

        # Add quality threshold labels to scatter plot
        for i, q in enumerate(quality_thresholds):
            ax4.annotate(f'Q{q}', (mappable_bases[i], alignment_errors[i]),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)

        plt.tight_layout()

        # Save error analysis plot
        error_plot_path = os.path.join(results_dir, "error_analysis.png")
        fig.savefig(error_plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

        logging.info(f"Error analysis plot saved to: {error_plot_path}")

    except Exception as e:
        logging.error(f"Error creating error analysis plot: {e}")

def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir):
    """
    Create a dual-axis plot showing quality score threshold vs AA mutations per gene and mappable bases.

    Args:
        quality_thresholds: List of quality score thresholds
        aa_mutations: List of corresponding AA mutations per gene
        mappable_bases: List of corresponding mappable bases
        results_dir: Directory to save the plot
    """
    try:
        # Create the plot with dual y-axes
        fig, ax1 = plt.subplots(figsize=(12, 8))

        # Left y-axis: AA mutations per gene
        color1 = '#2E8B57'
        ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
        ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
        ax1.scatter(quality_thresholds, aa_mutations,
                    s=100, alpha=0.7, color=color1, edgecolors='black', linewidth=1, label='AA Mutations per Gene')
        ax1.tick_params(axis='y', labelcolor=color1)

        # Right y-axis: Mappable bases
        ax2 = ax1.twinx()
        color2 = '#FF6B6B'
        ax2.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
        ax2.scatter(quality_thresholds, mappable_bases,
                    s=100, alpha=0.7, color=color2, edgecolors='black', linewidth=1, marker='s', label='Mappable Bases')
        ax2.tick_params(axis='y', labelcolor=color2)

        # Customize the plot
        ax1.set_title('AA Mutations per Gene and Mappable Bases vs Quality Score Filter', fontsize=14, fontweight='bold')

        # Add grid for better readability
        ax1.grid(True, alpha=0.3)

        # Customize ticks and spines
        ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
        ax1.tick_params(axis='both', which='minor', direction='in', length=3)
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)

        # Add data point labels for AA mutations
        for i, (q, aa_mut) in enumerate(zip(quality_thresholds, aa_mutations)):
            ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
                         textcoords='offset points', fontsize=9, alpha=0.8, color=color1)

        # Add data point labels for mappable bases
        for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
            ax2.annotate(f'{bases}', (q, bases), xytext=(5, -15),
                         textcoords='offset points', fontsize=8, alpha=0.8, color=color2)

        # Add legend
        lines1, labels1 = ax1.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)

        # Save the plot
        qc_plot_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.png")
        fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

        logging.info(f"QC plot saved to: {qc_plot_path}")

        # Also save data as CSV for reference
        qc_data_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.csv")
        with open(qc_data_path, 'w') as f:
            f.write("quality_threshold,aa_mutations_per_gene,mappable_bases\n")
            for q, aa_mut, bases in zip(quality_thresholds, aa_mutations, mappable_bases):
                f.write(f"{q},{aa_mut:.6f},{bases}\n")

        logging.info(f"QC data saved to: {qc_data_path}")

    except Exception as e:
        logging.error(f"Error creating QC plot: {e}")

def extract_qscores_from_sam(sam_file):
    """
    Extract Q-scores from SAM file and calculate statistics.

    Args:
        sam_file: Path to SAM file

    Returns:
        dict: Q-score statistics including mean, std, and per-position averages
    """
    try:
        qscores = []
        position_qscores = {}  # position -> list of qscores

        with pysam.AlignmentFile(sam_file, "r") as samfile:
            for read in samfile:
                if read.is_unmapped:
                    continue

                # Get Q-scores for this read
                read_qscores = read.query_qualities
                if read_qscores is None:
                    continue

                # pysam's query_qualities already have the Phred+33 ASCII
                # offset removed, so the values are usable as Q-scores directly
                q_values = list(read_qscores)
                qscores.extend(q_values)

                # Store per-position Q-scores
                for i, q_val in enumerate(q_values):
                    pos = read.reference_start + i
                    if pos not in position_qscores:
                        position_qscores[pos] = []
                    position_qscores[pos].append(q_val)

        if not qscores:
            return {
                'mean_qscore': 0.0,
                'std_qscore': 0.0,
                'min_qscore': 0.0,
                'max_qscore': 0.0,
                'position_avg_qscores': {},
                'total_bases': 0
            }

        # Calculate statistics
        mean_qscore = np.mean(qscores)
        std_qscore = np.std(qscores)
        min_qscore = np.min(qscores)
        max_qscore = np.max(qscores)

        # Calculate per-position average Q-scores
        position_avg_qscores = {}
        for pos, pos_qscores in position_qscores.items():
            position_avg_qscores[pos] = np.mean(pos_qscores)

        return {
            'mean_qscore': mean_qscore,
            'std_qscore': std_qscore,
            'min_qscore': min_qscore,
            'max_qscore': max_qscore,
            'position_avg_qscores': position_avg_qscores,
            'total_bases': len(qscores)
        }

    except Exception as e:
        logging.error(f"Error extracting Q-scores from {sam_file}: {e}")
        return {
            'mean_qscore': 0.0,
            'std_qscore': 0.0,
            'min_qscore': 0.0,
            'max_qscore': 0.0,
            'position_avg_qscores': {},
            'total_bases': 0
        }

def qscore_uncertainty_factor(qscore):
    """
    Convert Q-score to uncertainty factor.

    Args:
        qscore: Q-score value (typically 0-40)

    Returns:
        float: Uncertainty factor (0-1, where 1 = maximum uncertainty)
    """
    if qscore <= 0:
        return 1.0  # Maximum uncertainty

    # Q-score = -10 * log10(P_error)
    # P_error = 10^(-Q/10)
    # Uncertainty factor = sqrt(P_error) for error propagation
    error_probability = 10**(-qscore/10)
    uncertainty_factor = np.sqrt(error_probability)

    return uncertainty_factor

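# Worked example of the conversion above (annotation added for this diff
# view): Q = 20 gives P_error = 10**(-20/10) = 0.01 and an uncertainty factor
# of sqrt(0.01) = 0.1; Q = 30 gives P_error = 0.001 and a factor of ~0.032.
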
def segment_fastq_file(input_fastq, n_segments=10):
    """
    Segment a FASTQ file into N parts for error estimation.

    Args:
        input_fastq: Path to input FASTQ.gz file
        n_segments: Number of segments to create

    Returns:
        list: Paths to segmented FASTQ files
    """
    try:
        import gzip

        # Create output directory
        base_name = os.path.splitext(os.path.basename(input_fastq))[0].replace('.fastq', '')
        segment_dir = os.path.join(os.path.dirname(input_fastq), f"{base_name}_segments")
        os.makedirs(segment_dir, exist_ok=True)

        # Open output files
        segment_files = []
        file_handles = []

        for i in range(n_segments):
            segment_path = os.path.join(segment_dir, f"{base_name}_segment_{i+1}.fastq.gz")
            segment_files.append(segment_path)
            file_handles.append(gzip.open(segment_path, 'wt'))

        # Read and distribute reads
        read_count = 0
        with gzip.open(input_fastq, 'rt') as infile:
            current_read = []

            for line in infile:
                current_read.append(line)

                # Complete read (4 lines)
                if len(current_read) == 4:
                    # Write to current segment
                    segment_idx = read_count % n_segments
                    for line in current_read:
                        file_handles[segment_idx].write(line)

                    read_count += 1
                    current_read = []

        # Close all files
        for fh in file_handles:
            fh.close()

        logging.info(f"Segmented {read_count} reads into {n_segments} files")
        return segment_files

    except Exception as e:
        logging.error(f"Error segmenting FASTQ file: {e}")
        return []

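# Illustrative behavior (annotation added for this diff view): reads are dealt
# round-robin, so with n_segments=10 reads 0, 10, 20, ... land in segment 1,
# reads 1, 11, 21, ... in segment 2, and so on, giving near-equal splits.
#   segments = segment_fastq_file("reads.fastq.gz", n_segments=10)
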
def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_fasta, plasmid_fasta):
    """
    Run mutation rate analysis on each segment and calculate empirical error.

    Args:
        segment_files: List of segmented FASTQ files
        quality_threshold: Quality score threshold
        work_dir: Working directory
        ref_hit_fasta: Path to reference hit FASTA
        plasmid_fasta: Path to plasmid FASTA

    Returns:
        dict: Results with empirical error estimates
    """
    try:
        segment_results = []

        for i, segment_file in enumerate(segment_files):
            logging.info(f"Processing segment {i+1}/{len(segment_files)}")

            # Filter segment with NanoFilt
            filtered_segment = os.path.join(work_dir, f"segment_{i+1}_q{quality_threshold}.fastq.gz")
            if not run_nanofilt_filtering(segment_file, quality_threshold, filtered_segment):
                logging.warning(f"Failed to filter segment {i+1}")
                continue

            # Load sequences
            hit_seq, hit_id = load_single_sequence(ref_hit_fasta)
            plasmid_seq, plasmid_id = load_single_sequence(plasmid_fasta)

            # Find hit region in plasmid
            idx = plasmid_seq.upper().find(hit_seq.upper())
            if idx == -1:
                logging.error(f"Gene region not found in plasmid for segment {i+1}")
                continue

            # Align filtered reads to hit region
            sam_hit = run_minimap2(filtered_segment, ref_hit_fasta, f"hit_segment_{i+1}_q{quality_threshold}", work_dir)

            # Align filtered reads to full plasmid for background calculation
            sam_plasmid = run_minimap2(filtered_segment, plasmid_fasta, f"plasmid_segment_{i+1}_q{quality_threshold}", work_dir)

            # Calculate background rate from full plasmid alignment, excluding target region
            bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam_plasmid, plasmid_seq, idx, len(hit_seq))

            # Calculate hit region mutation rate
            mismatch_hit = compute_mismatch_stats_sam(sam_hit, {hit_id: hit_seq})
            hit_info = mismatch_hit[hit_id]
            hit_mis = hit_info["total_mismatches"]
            hit_cov = hit_info["total_covered_bases"]

            # Calculate rates
            hit_rate = hit_mis / hit_cov if hit_cov > 0 else 0
            bg_rate = bg_mis / bg_cov if bg_cov > 0 else 0
            net_rate = max(hit_rate - bg_rate, 0.0)

            # Calculate AA mutations per gene (simplified)
            lambda_bp = net_rate * len(hit_seq)
            aa_mutations = lambda_bp / 3.0  # Approximate: 3 bp per AA

            segment_results.append({
                'segment': i+1,
                'hit_rate': hit_rate,
                'bg_rate': bg_rate,
                'net_rate': net_rate,
                'aa_mutations': aa_mutations,
                'mappable_bases': hit_cov,
                'hit_mismatches': hit_mis,
                'hit_coverage': hit_cov
            })

        if not segment_results:
            return None

        # Calculate empirical statistics
        aa_mutations_list = [r['aa_mutations'] for r in segment_results]
        net_rates_list = [r['net_rate'] for r in segment_results]
        mappable_bases_list = [r['mappable_bases'] for r in segment_results]

        mean_aa = np.mean(aa_mutations_list)
        std_aa = np.std(aa_mutations_list, ddof=1)  # Sample standard deviation
        mean_net_rate = np.mean(net_rates_list)
        std_net_rate = np.std(net_rates_list, ddof=1)
        total_mappable_bases = sum(mappable_bases_list)

        # Calculate confidence interval using t-distribution
        n_segments = len(segment_results)
        if n_segments > 1:
            # 95% confidence interval
            from scipy.stats import t
            t_val = t.ppf(0.975, n_segments - 1)
            se_aa = std_aa / np.sqrt(n_segments)
            ci_lower = mean_aa - t_val * se_aa
            ci_upper = mean_aa + t_val * se_aa
        else:
            ci_lower = mean_aa
            ci_upper = mean_aa

        return {
            'mean_aa_mutations': mean_aa,
            'std_aa_mutations': std_aa,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'mean_net_rate': mean_net_rate,
            'std_net_rate': std_net_rate,
            'total_mappable_bases': total_mappable_bases,
            'n_segments': n_segments,
            'segment_results': segment_results,
            'quality_threshold': quality_threshold
        }

    except Exception as e:
        logging.error(f"Error in segmented analysis: {e}")
        return None

1246
|
+
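# Illustrative sketch (added for this write-up, not part of the pipeline): the
# t-based 95% CI used by run_segmented_analysis above, on invented per-segment
# AA-mutation counts. Requires scipy, mirroring the import in the function.
def _demo_segment_ci():
    from scipy.stats import t
    vals = [2.1, 2.4, 1.9, 2.6, 2.2]  # hypothetical AA mutations per segment
    mean, sd, n = np.mean(vals), np.std(vals, ddof=1), len(vals)
    half_width = t.ppf(0.975, n - 1) * sd / np.sqrt(n)
    print(f"{mean:.2f} ± {half_width:.2f} (95% CI)")
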
"""
|
|
1247
|
+
Calculate mismatches weighted by Q-score uncertainty with proper sampling error.
|
|
1248
|
+
|
|
1249
|
+
Args:
|
|
1250
|
+
sam_file: Path to SAM file
|
|
1251
|
+
ref_seq: Reference sequence
|
|
1252
|
+
qscore_stats: Q-score statistics from extract_qscores_from_sam
|
|
1253
|
+
|
|
1254
|
+
Returns:
|
|
1255
|
+
tuple: (weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes)
|
|
1256
|
+
"""
|
|
1257
|
+
try:
|
|
1258
|
+
import pysam
|
|
1259
|
+
|
|
1260
|
+
weighted_mismatches = 0.0
|
|
1261
|
+
total_weighted_coverage = 0.0
|
|
1262
|
+
raw_mismatches = 0
|
|
1263
|
+
raw_coverage = 0
|
|
1264
|
+
|
|
1265
|
+
# Store position-level data for proper sampling error calculation
|
|
1266
|
+
position_weights = []
|
|
1267
|
+
position_outcomes = []
|
|
1268
|
+
|
|
1269
|
+
position_qscores = qscore_stats['position_avg_qscores']
|
|
1270
|
+
|
|
1271
|
+
with pysam.AlignmentFile(sam_file, "r") as samfile:
|
|
1272
|
+
for read in samfile:
|
|
1273
|
+
if read.is_unmapped:
|
|
1274
|
+
continue
|
|
1275
|
+
|
|
1276
|
+
# Get aligned pairs (read_pos, ref_pos)
|
|
1277
|
+
for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
|
|
1278
|
+
if ref_pos is None or read_pos is None:
|
|
1279
|
+
continue
|
|
1280
|
+
|
|
1281
|
+
if ref_pos >= len(ref_seq):
|
|
1282
|
+
continue
|
|
1283
|
+
|
|
1284
|
+
# Get base calls
|
|
1285
|
+
read_base = read.query_sequence[read_pos].upper()
|
|
1286
|
+
ref_base = ref_seq[ref_pos].upper()
|
|
1287
|
+
|
|
1288
|
+
# Skip if either base is N
|
|
1289
|
+
if read_base == 'N' or ref_base == 'N':
|
|
1290
|
+
continue
|
|
1291
|
+
|
|
1292
|
+
# Get Q-score for this position
|
|
1293
|
+
qscore = position_qscores.get(ref_pos, qscore_stats['mean_qscore'])
|
|
1294
|
+
uncertainty_factor = qscore_uncertainty_factor(qscore)
|
|
1295
|
+
|
|
1296
|
+
# Weight by uncertainty (lower Q-score = higher uncertainty = lower weight)
|
|
1297
|
+
weight = 1.0 - uncertainty_factor
|
|
1298
|
+
|
|
1299
|
+
# Store position-level data
|
|
1300
|
+
position_weights.append(weight)
|
|
1301
|
+
position_outcomes.append(1 if read_base != ref_base else 0)
|
|
1302
|
+
|
|
1303
|
+
# Count coverage
|
|
1304
|
+
total_weighted_coverage += weight
|
|
1305
|
+
raw_coverage += 1
|
|
1306
|
+
|
|
1307
|
+
# Count mismatches
|
|
1308
|
+
if read_base != ref_base:
|
|
1309
|
+
weighted_mismatches += weight
|
|
1310
|
+
raw_mismatches += 1
|
|
1311
|
+
|
|
1312
|
+
return weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes
|
|
1313
|
+
|
|
1314
|
+
except Exception as e:
|
|
1315
|
+
logging.error(f"Error calculating Q-score weighted mismatches: {e}")
|
|
1316
|
+
return 0.0, 0.0, 0, 0, [], []
|
|
1317
|
+
|
|
1318
|
+
def calculate_weighted_sampling_error(position_weights, position_outcomes):
    """
    Calculate proper weighted sampling error from position-level data.

    Args:
        position_weights: List of weights for each position
        position_outcomes: List of outcomes (0=match, 1=mismatch) for each position

    Returns:
        tuple: (weighted_rate, weighted_error)
    """
    if not position_weights:
        return 0.0, 0.0

    position_weights = np.array(position_weights)
    position_outcomes = np.array(position_outcomes)

    # Calculate weighted rate
    weighted_mismatches = np.sum(position_weights * position_outcomes)
    weighted_coverage = np.sum(position_weights)

    if weighted_coverage == 0:
        return 0.0, 0.0

    weighted_rate = weighted_mismatches / weighted_coverage

    # Proper weighted sampling error calculation:
    # Var(p̂) = (1/W²) * Σ[w_i² * (y_i - p̂)²]
    # where W = Σw_i, y_i = outcome, p̂ = weighted rate
    residuals = position_outcomes - weighted_rate
    weighted_residuals_squared = position_weights**2 * residuals**2
    weighted_error = np.sqrt(np.sum(weighted_residuals_squared) / (weighted_coverage**2))

    return weighted_rate, weighted_error

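# Illustrative sketch (not part of the pipeline): a tiny worked example of the
# weighted-rate variance Var(p̂) = (1/W²)·Σ w_i²(y_i - p̂)² computed by
# calculate_weighted_sampling_error above. All numbers are invented.
def _demo_weighted_sampling_error():
    weights = [0.9, 0.9, 0.5, 0.99]  # higher Q-score -> weight nearer 1
    outcomes = [0, 1, 0, 0]          # a single mismatch
    rate, err = calculate_weighted_sampling_error(weights, outcomes)
    # rate = 0.9 / (0.9 + 0.9 + 0.5 + 0.99) ≈ 0.274
    print(f"weighted rate = {rate:.4f} ± {err:.4f}")
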
def calculate_qscore_weighted_error_propagation(weighted_mismatches, weighted_coverage, qscore_stats):
    """
    Calculate error propagation for Q-score weighted mutation rates using proper weighted sampling theory.

    Args:
        weighted_mismatches: Q-score weighted mismatch count
        weighted_coverage: Q-score weighted coverage
        qscore_stats: Q-score statistics

    Returns:
        tuple: (weighted_rate, weighted_error)
    """
    if weighted_coverage == 0:
        return 0.0, 0.0

    weighted_rate = weighted_mismatches / weighted_coverage

    # Proper weighted sampling error calculation.
    # For a weighted binomial: Var(p̂) ≈ (1/n²) * Σ[w_i² * p * (1-p)]
    # where n = Σw_i (weighted_coverage)

    # Estimate weight variance from Q-score statistics
    mean_qscore = qscore_stats['mean_qscore']
    std_qscore = qscore_stats['std_qscore']

    # Convert Q-score statistics to weight statistics
    mean_weight = 1.0 - qscore_uncertainty_factor(mean_qscore)

    # Approximate weight variance using the delta method:
    # if w = 1 - sqrt(10^(-Q/10)), then Var(w) ≈ Var(Q) * (dw/dQ)²
    # with dw/dQ = (ln(10)/20) * sqrt(10^(-Q/10))
    if mean_qscore > 0:
        # Delta method approximation for weight variance
        error_prob = 10**(-mean_qscore/10)
        weight_derivative = (np.log(10)/20) * np.sqrt(error_prob)
        weight_variance = (std_qscore**2) * (weight_derivative**2)
    else:
        weight_variance = 0.0

    # Effective sample size for weighted sampling:
    # n_eff = (Σw_i)² / Σ(w_i²) ≈ (Σw_i)² / [n * (E[w]² + Var[w])]
    n_positions = qscore_stats.get('total_bases', weighted_coverage)
    if n_positions > 0:
        expected_w_squared = mean_weight**2 + weight_variance
        effective_n = (weighted_coverage**2) / (n_positions * expected_w_squared)
    else:
        effective_n = weighted_coverage / mean_weight

    # Weighted sampling error:
    # Var(p̂) = p(1-p) / n_eff * [1 + (Var[w]/E[w]²)]
    weight_cv_squared = weight_variance / (mean_weight**2) if mean_weight > 0 else 0
    weighted_error = np.sqrt(weighted_rate * (1 - weighted_rate) / effective_n * (1 + weight_cv_squared))

    return weighted_rate, weighted_error

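# Hedged numeric check (illustration only): for w = 1 - sqrt(10^(-Q/10)) the delta
# method gives Var(w) ≈ Var(Q)·((ln10/20)·sqrt(10^(-Q/10)))², the same derivative
# used in calculate_qscore_weighted_error_propagation above.
def _demo_weight_variance(mean_q=20.0, std_q=3.0):
    error_prob = 10 ** (-mean_q / 10)                # 0.01 at Q20
    dw_dq = (np.log(10) / 20) * np.sqrt(error_prob)  # ≈ 0.0115 at Q20
    return (std_q ** 2) * (dw_dq ** 2)               # ≈ 1.2e-3
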
def binomial_confidence_interval(successes, trials, confidence=0.95):
    """
    Calculate a confidence interval for a binomial proportion using the beta distribution.

    Args:
        successes: Number of successes
        trials: Number of trials
        confidence: Confidence level (default 0.95 for a 95% CI)

    Returns:
        tuple: (lower_bound, upper_bound)
    """
    if not HAVE_SCIPY:
        # Normal approximation if scipy is not available (scipy's norm.ppf cannot
        # be used in this branch, so take the z-quantile from statistics.NormalDist)
        from statistics import NormalDist
        p = successes / trials if trials > 0 else 0
        se = np.sqrt(p * (1 - p) / trials) if trials > 0 else 0
        z = NormalDist().inv_cdf(1 - (1 - confidence) / 2)
        return max(0, p - z * se), min(1, p + z * se)

    alpha = 1 - confidence
    # Clopper-Pearson (exact) interval; guard the degenerate edges successes == 0
    # (lower bound is 0) and successes == trials (upper bound is 1), where the
    # beta parameters would otherwise be invalid.
    lower = beta.ppf(alpha / 2, successes, trials - successes + 1) if successes > 0 else 0.0
    upper = beta.ppf(1 - alpha / 2, successes + 1, trials - successes) if successes < trials else 1.0
    return lower, upper

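# Usage sketch (counts invented): a 95% Clopper-Pearson interval for 7 mismatches
# over 10,000 covered bases. Falls back to the normal approximation without scipy.
def _demo_binomial_ci():
    lo, hi = binomial_confidence_interval(7, 10_000, confidence=0.95)
    print(f"95% CI for the mismatch rate: [{lo:.6f}, {hi:.6f}]")
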
def propagate_mutation_rate_error(hit_mis, hit_cov, bg_mis, bg_cov, hit_qscore_stats=None, bg_qscore_stats=None):
    """
    Calculate error propagation for net mutation rate = hit_rate - bg_rate, including Q-score uncertainty.

    Args:
        hit_mis, hit_cov: Hit region mismatches and coverage
        bg_mis, bg_cov: Background mismatches and coverage
        hit_qscore_stats: Q-score statistics for hit region (optional)
        bg_qscore_stats: Q-score statistics for background region (optional)

    Returns:
        tuple: (net_rate, net_rate_error)
    """
    if hit_cov == 0 or bg_cov == 0:
        return 0.0, 0.0

    hit_rate = hit_mis / hit_cov
    bg_rate = bg_mis / bg_cov

    # Binomial standard errors
    hit_se = np.sqrt(hit_rate * (1 - hit_rate) / hit_cov)
    bg_se = np.sqrt(bg_rate * (1 - bg_rate) / bg_cov)

    # Add Q-score uncertainty if available
    if hit_qscore_stats:
        hit_qscore_uncertainty = qscore_uncertainty_factor(hit_qscore_stats['mean_qscore'])
        hit_se = np.sqrt(hit_se**2 + hit_qscore_uncertainty**2)

    if bg_qscore_stats:
        bg_qscore_uncertainty = qscore_uncertainty_factor(bg_qscore_stats['mean_qscore'])
        bg_se = np.sqrt(bg_se**2 + bg_qscore_uncertainty**2)

    # Net rate and error propagation (independent errors add in quadrature)
    net_rate = max(hit_rate - bg_rate, 0.0)
    net_se = np.sqrt(hit_se**2 + bg_se**2)

    return net_rate, net_se

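# Worked example (counts invented): independent binomial errors add in quadrature,
# se(net) = sqrt(se_hit² + se_bg²), exactly as propagate_mutation_rate_error does
# when no Q-score statistics are supplied.
def _demo_error_propagation():
    net, se = propagate_mutation_rate_error(hit_mis=120, hit_cov=50_000,
                                            bg_mis=40, bg_cov=100_000)
    # hit_rate = 2.4e-3, bg_rate = 4.0e-4, so net = 2.0e-3
    print(f"net rate = {net:.6f} ± {se:.6f}")
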
def simulate_aa_distribution_with_error(lambda_bp, lambda_error, cds_seq, n_trials=10000):
    """
    Enhanced Monte Carlo simulation that includes uncertainty in lambda_bp.

    Args:
        lambda_bp: Mean mutations per copy (basepairs)
        lambda_error: Standard error of lambda_bp
        cds_seq: Coding sequence
        n_trials: Number of Monte Carlo trials

    Returns:
        tuple: (mean_aa_mutations, std_aa_mutations, aa_distribution)
    """
    prot_orig = str(Seq(cds_seq).translate(to_stop=False))
    aa_diffs = []

    for _ in range(n_trials):
        # Sample lambda from a normal distribution with the given error
        lambda_sample = np.random.normal(lambda_bp, lambda_error)
        lambda_sample = max(lambda_sample, 0)  # Ensure non-negative

        # Number of base changes in this trial ~ Poisson(lambda_sample)
        n_bp_mut = np.random.poisson(lambda_sample)

        # Make a mutable copy of the CDS
        seq_list = list(cds_seq.upper())

        # Introduce exactly n_bp_mut random single-base substitutions
        for _ in range(n_bp_mut):
            pos = random.randrange(len(seq_list))
            orig_base = seq_list[pos]
            bases = ["A", "T", "C", "G"]
            if orig_base in bases:
                bases.remove(orig_base)  # guard: tolerate ambiguous bases such as N
            seq_list[pos] = random.choice(bases)

        # Translate mutated sequence (no early stop)
        mutated_prot = str(Seq("".join(seq_list)).translate(to_stop=False))

        # Count how many amino acids differ
        aa_diff = sum(1 for a, b in zip(prot_orig, mutated_prot) if a != b)
        aa_diffs.append(aa_diff)

    mean_aa = np.mean(aa_diffs)
    std_aa = np.std(aa_diffs)

    return mean_aa, std_aa, aa_diffs

def bootstrap_aa_mutations(hit_mis, hit_cov, bg_mis, bg_cov, cds_seq, n_bootstrap=1000):
    """
    Bootstrap resampling to estimate confidence intervals for AA mutations.

    Args:
        hit_mis, hit_cov: Hit region mismatches and coverage
        bg_mis, bg_cov: Background mismatches and coverage
        cds_seq: Coding sequence
        n_bootstrap: Number of bootstrap samples

    Returns:
        tuple: (mean_aa_mutations, ci_lower, ci_upper, bootstrap_distribution)
    """
    bootstrap_results = []

    for _ in range(n_bootstrap):
        # Resample reads with replacement (binomial resampling)
        hit_mis_boot = np.random.binomial(hit_cov, hit_mis/hit_cov) if hit_cov > 0 else 0
        bg_mis_boot = np.random.binomial(bg_cov, bg_mis/bg_cov) if bg_cov > 0 else 0

        # Calculate net rate
        hit_rate_boot = hit_mis_boot / hit_cov if hit_cov > 0 else 0
        bg_rate_boot = bg_mis_boot / bg_cov if bg_cov > 0 else 0
        net_rate_boot = max(hit_rate_boot - bg_rate_boot, 0)

        # Calculate AA mutations
        lambda_bp_boot = net_rate_boot * len(cds_seq)

        # Quick simulation for each bootstrap resample
        aa_mut_boot = simulate_aa_distribution(lambda_bp_boot, cds_seq, n_trials=1000)
        bootstrap_results.append(np.mean(aa_mut_boot))

    mean_aa = np.mean(bootstrap_results)
    # Percentile method for the 95% CI
    ci_lower = np.percentile(bootstrap_results, 2.5)
    ci_upper = np.percentile(bootstrap_results, 97.5)

    # Additional validation: ensure the CI brackets the mean
    if ci_lower > mean_aa or ci_upper < mean_aa:
        logging.warning(f"Bootstrap CI validation failed: mean={mean_aa:.4f}, CI=[{ci_lower:.4f}, {ci_upper:.4f}]")
        # Use empirical order statistics if the percentile method fails
        sorted_results = np.sort(bootstrap_results)
        n = len(sorted_results)
        ci_lower = sorted_results[int(0.025 * n)]
        ci_upper = sorted_results[int(0.975 * n)]

    return mean_aa, ci_lower, ci_upper, bootstrap_results

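# Usage sketch (toy CDS and counts, invented; n_bootstrap reduced so the demo runs
# quickly — each resample itself runs a 1,000-trial simulation).
def _demo_bootstrap_ci():
    cds = "ATG" + "GCT" * 99 + "TAA"  # 303 bp toy coding sequence
    mean_aa, lo, hi, dist = bootstrap_aa_mutations(
        120, 50_000, 40, 100_000, cds, n_bootstrap=100
    )
    print(f"AA mutations/gene ≈ {mean_aa:.2f} (95% CI [{lo:.2f}, {hi:.2f}])")
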
def comprehensive_aa_mutation_analysis(hit_mis, hit_cov, bg_mis, bg_cov, cds_seq,
                                       quality_threshold=None, n_trials=10000,
                                       hit_qscore_stats=None, bg_qscore_stats=None,
                                       sam_hit=None, sam_plasmid=None, hit_seq=None, plasmid_seq=None):
    """
    Comprehensive AA mutation analysis with full error propagation including Q-score uncertainty.

    Args:
        hit_mis, hit_cov: Hit region mismatches and coverage
        bg_mis, bg_cov: Background mismatches and coverage
        cds_seq: Coding sequence
        quality_threshold: Quality threshold for logging
        n_trials: Number of Monte Carlo trials
        hit_qscore_stats: Q-score statistics for hit region (optional)
        bg_qscore_stats: Q-score statistics for background region (optional)
        sam_hit, sam_plasmid: SAM files for the hit and full-plasmid alignments
        hit_seq, plasmid_seq: Reference sequences matching those alignments

    Returns:
        dict: Comprehensive results with all error estimates including Q-score effects
    """
    logging.info(f"=== COMPREHENSIVE ERROR MODEL ANALYSIS (Q{quality_threshold}) ===")

    # 1. Binomial confidence intervals for mutation rates
    logging.info("1. Calculating binomial confidence intervals for mutation rates...")
    hit_rate_ci = binomial_confidence_interval(hit_mis, hit_cov)
    bg_rate_ci = binomial_confidence_interval(bg_mis, bg_cov)
    logging.info(f"   Hit rate CI: [{hit_rate_ci[0]:.6f}, {hit_rate_ci[1]:.6f}]")
    logging.info(f"   Background rate CI: [{bg_rate_ci[0]:.6f}, {bg_rate_ci[1]:.6f}]")

    # 2. Error propagation for net mutation rate (including Q-score uncertainty)
    logging.info("2. Propagating errors for net mutation rate (including Q-score uncertainty)...")
    net_rate, net_rate_error = propagate_mutation_rate_error(
        hit_mis, hit_cov, bg_mis, bg_cov, hit_qscore_stats, bg_qscore_stats
    )
    logging.info(f"   Net mutation rate: {net_rate:.6f} ± {net_rate_error:.6f}")

    # 3. Calculate lambda_bp with error
    logging.info("3. Calculating lambda_bp (mutations per copy) with error propagation...")
    lambda_bp = net_rate * len(cds_seq)
    lambda_error = net_rate_error * len(cds_seq)
    logging.info(f"   Lambda_bp: {lambda_bp:.6f} ± {lambda_error:.6f} mutations per copy")

    # 4. Q-score weighted analysis
    logging.info("4. Calculating Q-score weighted mutation rates...")
    hit_weighted_mis, hit_weighted_cov, hit_raw_mis, hit_raw_cov, hit_weights, hit_outcomes = calculate_qscore_weighted_mismatches(
        sam_hit, hit_seq, hit_qscore_stats
    )
    bg_weighted_mis, bg_weighted_cov, bg_raw_mis, bg_raw_cov, bg_weights, bg_outcomes = calculate_qscore_weighted_mismatches(
        sam_plasmid, plasmid_seq, bg_qscore_stats
    )

    # Calculate proper weighted sampling errors
    hit_weighted_rate, hit_weighted_error = calculate_weighted_sampling_error(hit_weights, hit_outcomes)
    bg_weighted_rate, bg_weighted_error = calculate_weighted_sampling_error(bg_weights, bg_outcomes)

    # Net weighted rate
    net_weighted_rate = max(hit_weighted_rate - bg_weighted_rate, 0.0)
    net_weighted_error = np.sqrt(hit_weighted_error**2 + bg_weighted_error**2)

    logging.info(f"   Hit weighted rate: {hit_weighted_rate:.6f} ± {hit_weighted_error:.6f}")
    logging.info(f"   Background weighted rate: {bg_weighted_rate:.6f} ± {bg_weighted_error:.6f}")
    logging.info(f"   Net weighted rate: {net_weighted_rate:.6f} ± {net_weighted_error:.6f}")

    # 5. Calculate AA mutations per gene (simplified - no Monte Carlo)
    logging.info("5. Calculating AA mutations per gene from weighted rates...")
    lambda_bp_weighted = net_weighted_rate * len(cds_seq)
    lambda_error_weighted = net_weighted_error * len(cds_seq)

    # Simple AA mutation estimate (mean of the Poisson distribution)
    mean_aa = lambda_bp_weighted / 3.0  # Approximate: 3 bp per AA
    std_aa = np.sqrt(lambda_bp_weighted) / 3.0  # Standard deviation of Poisson

    logging.info(f"   Lambda_bp (weighted): {lambda_bp_weighted:.6f} ± {lambda_error_weighted:.6f}")
    logging.info(f"   AA mutations per gene: {mean_aa:.4f} ± {std_aa:.4f}")

    # 6. Bootstrap confidence intervals
    logging.info("6. Calculating bootstrap confidence intervals (1,000 resamples)...")
    bootstrap_mean, ci_lower, ci_upper, bootstrap_dist = bootstrap_aa_mutations(
        hit_mis, hit_cov, bg_mis, bg_cov, cds_seq
    )
    logging.info(f"   Bootstrap 95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")

    # 7. Alignment error estimation
    logging.info("7. Calculating alignment error estimation...")
    alignment_error = 1.0 / np.sqrt(hit_cov) if hit_cov > 0 else 1.0
    logging.info(f"   Alignment error: {alignment_error:.6f}")

    # 8. Q-score uncertainty factors
    logging.info("8. Calculating Q-score uncertainty factors...")
    hit_qscore_uncertainty = qscore_uncertainty_factor(hit_qscore_stats['mean_qscore']) if hit_qscore_stats else 0.0
    bg_qscore_uncertainty = qscore_uncertainty_factor(bg_qscore_stats['mean_qscore']) if bg_qscore_stats else 0.0
    logging.info(f"   Hit Q-score uncertainty: {hit_qscore_uncertainty:.6f}")
    logging.info(f"   Background Q-score uncertainty: {bg_qscore_uncertainty:.6f}")

    results = {
        'mean_aa_mutations': mean_aa,
        'std_aa_mutations': std_aa,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'hit_rate': hit_mis / hit_cov if hit_cov > 0 else 0,
        'hit_rate_ci': hit_rate_ci,
        'bg_rate': bg_mis / bg_cov if bg_cov > 0 else 0,
        'bg_rate_ci': bg_rate_ci,
        'net_rate': net_rate,
        'net_rate_error': net_rate_error,
        'lambda_bp': lambda_bp,
        'lambda_error': lambda_error,
        'alignment_error': alignment_error,
        'hit_qscore_uncertainty': hit_qscore_uncertainty,
        'bg_qscore_uncertainty': bg_qscore_uncertainty,
        'hit_qscore_stats': hit_qscore_stats,
        'bg_qscore_stats': bg_qscore_stats,
        'bootstrap_distribution': bootstrap_dist,
        'quality_threshold': quality_threshold,
        'mappable_bases': hit_cov,
        # Q-score weighted results
        'hit_weighted_rate': hit_weighted_rate,
        'hit_weighted_error': hit_weighted_error,
        'bg_weighted_rate': bg_weighted_rate,
        'bg_weighted_error': bg_weighted_error,
        'net_weighted_rate': net_weighted_rate,
        'net_weighted_error': net_weighted_error,
        'lambda_bp_weighted': lambda_bp_weighted,
        'lambda_error_weighted': lambda_error_weighted,
        'hit_weighted_mismatches': hit_weighted_mis,
        'hit_weighted_coverage': hit_weighted_cov,
        'bg_weighted_mismatches': bg_weighted_mis,
        'bg_weighted_coverage': bg_weighted_cov
    }

    if quality_threshold is not None:
        qscore_info = ""
        if hit_qscore_stats:
            qscore_info = f", hit_qscore={hit_qscore_stats['mean_qscore']:.1f}±{hit_qscore_stats['std_qscore']:.1f}"
        if bg_qscore_stats:
            qscore_info += f", bg_qscore={bg_qscore_stats['mean_qscore']:.1f}±{bg_qscore_stats['std_qscore']:.1f}"

        logging.info(f"Quality {quality_threshold}: AA mutations = {mean_aa:.4f} ± {std_aa:.4f} "
                     f"(95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]), "
                     f"mappable_bases={hit_cov}, net_rate={net_rate:.6f}±{net_rate_error:.6f}{qscore_info}")

    return results

def simulate_aa_distribution(lambda_bp, cds_seq, n_trials=1000):
    """
    Monte Carlo: each trial draws n_bp_mut ~ Poisson(lambda_bp),
    introduces that many random single-base substitutions, translates,
    and returns a list of amino acid differences per trial.
    """
    prot_orig = str(Seq(cds_seq).translate(to_stop=False))
    aa_diffs = []

    for _ in range(n_trials):
        # Number of base changes in this trial ~ Poisson(lambda_bp)
        n_bp_mut = np.random.poisson(lambda_bp)

        # Make a mutable copy of the CDS
        seq_list = list(cds_seq.upper())

        # Introduce exactly n_bp_mut random single-base substitutions
        for _ in range(n_bp_mut):
            pos = random.randrange(len(seq_list))
            orig_base = seq_list[pos]
            bases = ["A", "T", "C", "G"]
            if orig_base in bases:
                bases.remove(orig_base)  # guard: tolerate ambiguous bases such as N
            seq_list[pos] = random.choice(bases)

        # Translate mutated sequence (no early stop)
        mutated_prot = str(Seq("".join(seq_list)).translate(to_stop=False))

        # Count how many amino acids differ
        aa_diff = sum(1 for a, b in zip(prot_orig, mutated_prot) if a != b)
        aa_diffs.append(aa_diff)

    return aa_diffs

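# Usage sketch (toy 300-codon CDS, λ_bp invented): the returned per-trial counts
# can be averaged or histogrammed, as the panel figure below does.
def _demo_simulate_aa():
    cds = "ATG" + "GGT" * 298 + "TGA"  # 900 bp toy coding sequence
    diffs = simulate_aa_distribution(4.5, cds, n_trials=1000)
    print(f"mean AA mutations per copy ≈ {np.mean(diffs):.2f}")
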
def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, work_dir, results_dir,
                                 chunks, ref_hit_fasta, plasmid_fasta, hit_seq, hit_id, plasmid_seq, idx):
    """
    Run the main mutation rate analysis for a specific Q-score.

    Args:
        fastq_path: Path to the FASTQ file to analyze
        qscore: Q-score threshold (None for unfiltered)
        qscore_desc: Description of the Q-score (e.g., "Q18", "unfiltered")
        sample_name: Name of the sample
        work_dir: Working directory for temporary files
        results_dir: Results directory for output files
        chunks: List of plasmid chunks
        ref_hit_fasta: Path to reference hit FASTA
        plasmid_fasta: Path to plasmid FASTA
        hit_seq: Hit sequence
        hit_id: Hit ID
        plasmid_seq: Plasmid sequence
        idx: Index of hit in plasmid

    Returns:
        dict: Analysis results
    """
    logging.info(f"Running main analysis for {qscore_desc}...")

    # Ensure work directory exists
    os.makedirs(work_dir, exist_ok=True)

    # Create subdirectory for this Q-score analysis
    qscore_results_dir = results_dir
    if qscore is not None:
        qscore_results_dir = os.path.join(results_dir, f"q{qscore}_analysis")
        os.makedirs(qscore_results_dir, exist_ok=True)

    # Write chunks FASTA & align to background chunks
    chunks_fasta = create_multi_fasta(chunks, work_dir)
    sam_chunks = run_minimap2(fastq_path, chunks_fasta, "plasmid_chunks_alignment", work_dir)

    # Align to hit (target) alone
    sam_hit = run_minimap2(fastq_path, ref_hit_fasta, "hit_alignment", work_dir)

    # Compute mismatch stats for background chunks (kept for reference, but not used for the background rate)
    chunk_refs = {f"chunk_{i+1}": seq for i, seq in enumerate(chunks)}
    mismatch_chunks = compute_mismatch_stats_sam(sam_chunks, chunk_refs)

    # ----------------------------
    # COMPUTE BASE DISTRIBUTION AT EACH POSITION OF HIT
    # ----------------------------
    base_counts = [
        {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0}
        for _ in range(len(hit_seq))
    ]
    samfile_hit = pysam.AlignmentFile(sam_hit, "r")
    for read in samfile_hit.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue
        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
            if read_pos is not None and ref_pos is not None and 0 <= ref_pos < len(hit_seq):
                base = read.query_sequence[read_pos].upper()
                if base not in {"A", "C", "G", "T"}:
                    base = "N"
                base_counts[ref_pos][base] += 1
    samfile_hit.close()

    # ----------------------------
    # ALIGN TO FULL PLASMID TO GET COVERAGE
    # ----------------------------
    sam_plasmid = run_minimap2(fastq_path, plasmid_fasta, "plasmid_full_alignment", work_dir)

    # Calculate plasmid coverage
    plasmid_cov = [0] * len(plasmid_seq)
    samfile_full = pysam.AlignmentFile(sam_plasmid, "r")
    for read in samfile_full.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue
        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
            if ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
                plasmid_cov[ref_pos] += 1
    samfile_full.close()

    # Calculate background rate from full plasmid alignment, excluding target region.
    # This avoids artificial junction mismatches from concatenated chunks.
    bg_mis, bg_cov, bg_reads = calculate_background_from_plasmid(sam_plasmid, plasmid_seq, idx, len(hit_seq))
    bg_rate = (bg_mis / bg_cov) if bg_cov else 0.0  # raw per-base
    bg_rate_per_kb = bg_rate * 1e3

    logging.info(
        f"Background (plasmid excluding target): total_mismatches={bg_mis}, "
        f"covered_bases={bg_cov}, mapped_reads={bg_reads}, "
        f"rate_per_kb={bg_rate_per_kb:.4f}"
    )

    # Compute mismatch stats for hit (target)
    mismatch_hit = compute_mismatch_stats_sam(sam_hit, {hit_id: hit_seq})
    hit_info = mismatch_hit[hit_id]
    hit_mis = hit_info["total_mismatches"]
    hit_cov = hit_info["total_covered_bases"]
    hit_reads = hit_info["mapped_reads"]
    hit_rate = hit_info["avg_mismatch_rate"]  # raw per-base
    hit_rate_per_kb = hit_rate * 1e3

    logging.info(
        f"Target ({hit_id}): total_mismatches={hit_mis}, "
        f"covered_bases={hit_cov}, mapped_reads={hit_reads}, "
        f"rate_per_kb={hit_rate_per_kb:.4f}"
    )

    # Two-proportion Z-test: is the target rate > the background rate?
    z_stat, p_val = z_test_two_proportions(hit_mis, hit_cov, bg_mis, bg_cov)

    # Compute "Estimated mutations per target copy (basepairs)" (float)
    length_of_target = len(hit_seq)
    true_diff_rate = hit_rate - bg_rate
    est_mut_per_copy = max(true_diff_rate * length_of_target, 0.0)

    # Determine if ROI is a valid protein-coding sequence (updated definition)
    is_protein = True
    reasons = []
    seq_upper = hit_seq.upper()
    # Must be a multiple of 3
    if len(seq_upper) % 3 != 0:
        is_protein = False
        reasons.append(f"length {len(seq_upper)} is not a multiple of 3")

    if is_protein:
        prot_full = str(Seq(seq_upper).translate(to_stop=False))
        # Check for premature stop codons (anything except possibly at the end)
        if "*" in prot_full[:-1]:
            is_protein = False
            reasons.append("premature stop codon detected before the last codon")
    # No requirement to start with ATG or to end with a stop beyond the last codon

    # If protein, simulate the AA distribution per copy using Poisson sampling
    if is_protein:
        logging.info(f"Simulating amino acid distribution with λ_bp={est_mut_per_copy:.2f}")
        aa_diffs = simulate_aa_distribution(est_mut_per_copy, hit_seq, n_trials=1000)
        avg_aa_mutations = sum(aa_diffs) / len(aa_diffs)

        # Log simulation results for debugging
        logging.info(f"AA simulation results: min={min(aa_diffs)}, max={max(aa_diffs)}, mean={avg_aa_mutations:.3f}")
        logging.info(f"AA simulation distribution: {len([x for x in aa_diffs if x == 0])} zeros, {len([x for x in aa_diffs if x > 0])} non-zeros")
    else:
        aa_diffs = []
        avg_aa_mutations = None

    # Update Q-score info for titles
    qscore_info = f" ({qscore_desc})" if qscore_desc != "unfiltered" else ""

    # ----------------------------
    # SAVE CSV FOR MUTATION RATES (PANEL 1)
    # ----------------------------
    gene_mismatch_csv = os.path.join(qscore_results_dir, "gene_mismatch_rates.csv")
    with open(gene_mismatch_csv, "w", newline="") as csvfile:
        csvfile.write(f"# gene_id: {hit_id}\n")
        csvfile.write(f"# background_rate_per_kb: {bg_rate_per_kb:.6f}\n")
        csvfile.write("position_1based,mismatch_rate_per_base\n")
        for pos0, rate in enumerate(hit_info["pos_rates"]):
            csvfile.write(f"{pos0 + 1},{rate:.6e}\n")
    logging.info(f"Saved CSV for gene mismatch rates: {gene_mismatch_csv}")

    # ----------------------------
    # SAVE CSV FOR BASE DISTRIBUTION (PANEL 2)
    # ----------------------------
    base_dist_csv = os.path.join(qscore_results_dir, "base_distribution.csv")
    with open(base_dist_csv, "w", newline="") as csvfile:
        csvfile.write(f"# gene_id: {hit_id}\n")
        csvfile.write("position_1based,ref_base,A_count,C_count,G_count,T_count,N_count\n")
        for pos0, counts in enumerate(base_counts):
            ref_base = seq_upper[pos0]
            csvfile.write(f"{pos0 + 1},{ref_base},{counts['A']},{counts['C']},{counts['G']},{counts['T']},{counts['N']}\n")
    logging.info(f"Saved CSV for base distribution: {base_dist_csv}")

    # ----------------------------
    # SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - only if protein
    # ----------------------------
    if is_protein:
        aa_subst_csv = os.path.join(qscore_results_dir, "aa_substitutions.csv")
        with open(aa_subst_csv, "w", newline="") as csvfile:
            csvfile.write(f"# gene_id: {hit_id}\n")
            csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
            csvfile.write("position_1based,ref_aa,alt_aa,count\n")
            # Per-substitution tallies are not implemented yet; only the header is written
        logging.info(f"Saved CSV for AA substitutions: {aa_subst_csv}")

    # ----------------------------
    # SAVE CSV FOR PLASMID COVERAGE (PANEL 4)
    # ----------------------------
    plasmid_cov_csv = os.path.join(qscore_results_dir, "plasmid_coverage.csv")
    with open(plasmid_cov_csv, "w", newline="") as csvfile:
        csvfile.write("position_1based,coverage\n")
        for pos0, cov in enumerate(plasmid_cov):
            csvfile.write(f"{pos0 + 1},{cov}\n")
    logging.info(f"Saved CSV for plasmid coverage: {plasmid_cov_csv}")

    # ----------------------------
    # SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3)
    # ----------------------------
    aa_dist_csv = os.path.join(qscore_results_dir, "aa_mutation_distribution.csv")
    with open(aa_dist_csv, "w", newline="") as csvfile:
        csvfile.write(f"# gene_id: {hit_id}\n")
        csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
        csvfile.write(f"# n_trials: 1000\n")
        if is_protein:
            csvfile.write("trial_index,aa_mutations\n")
            for idx_trial, aa_count in enumerate(aa_diffs, start=1):
                csvfile.write(f"{idx_trial},{aa_count}\n")
        else:
            csvfile.write("# No AA distribution because region is not protein-coding\n")
    logging.info(f"Saved CSV for AA mutation distribution: {aa_dist_csv}")

    # ----------------------------
    # PREPARE PANEL FIGURE WITH 4 SUBPLOTS
    # ----------------------------
    fig, axes = plt.subplots(2, 2, figsize=(18, 12), constrained_layout=True)
    # axes[0,0]: Mutation rate over gene of interest
    # axes[0,1]: Rolling mutation rate across plasmid (20 bp window)
    # axes[1,0]: Coverage of plasmid with ROI shaded
    # axes[1,1]: KDE of AA mutations per copy

    # --- Panel 1: Mutation rate over gene of interest ---
    ax0 = axes[0, 0]
    positions_gene = np.arange(1, len(hit_info["pos_rates"]) + 1)
    ax0.axhspan(0, bg_rate, color='gray', alpha=0.3, label="Background rate")
    ax0.plot(positions_gene, hit_info["pos_rates"],
             color="#2E86AB", linestyle='-', linewidth=1.5, alpha=0.8,
             label="Mutation rate")
    ax0.set_title(f"Mismatch Rate per Position: Gene of Interest{qscore_info}", fontsize=14, fontweight='bold')
    ax0.set_xlabel("Position in Gene (bp)", fontsize=12)
    ax0.set_ylabel("Mismatch Rate", fontsize=12)
    ax0.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
    ax0.tick_params(axis='both', which='minor', direction='in', length=3)
    ax0.spines['top'].set_visible(False)
    ax0.spines['right'].set_visible(False)
    ax0.legend(loc="upper right", frameon=False, fontsize=10)

    # --- Panel 2: Rolling mutation rate across plasmid ---
    ax1 = axes[0, 1]
    # Calculate rolling mutation rate across plasmid
    window_size = 20
    rolling_positions = []
    rolling_rates = []

    # Calculate mismatches per position across the plasmid
    plasmid_mismatches = [0] * len(plasmid_seq)
    samfile_rolling = pysam.AlignmentFile(sam_plasmid, "r")
    for read in samfile_rolling.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue
        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
            if ref_pos is not None and 0 <= ref_pos < len(plasmid_seq):
                if read_pos is not None:
                    read_base = read.query_sequence[read_pos].upper()
                    ref_base = plasmid_seq[ref_pos].upper()
                    if read_base != ref_base and read_base in "ACGT" and ref_base in "ACGT":
                        plasmid_mismatches[ref_pos] += 1
    samfile_rolling.close()

    # Calculate rolling mutation rate
    for i in range(len(plasmid_cov) - window_size + 1):
        window_cov = plasmid_cov[i:i + window_size]
        window_mismatches = plasmid_mismatches[i:i + window_size]

        total_coverage = sum(window_cov)
        total_mismatches = sum(window_mismatches)

        if total_coverage > 0:  # Only include windows with coverage
            rolling_positions.append(i + window_size // 2)
            mutation_rate = total_mismatches / total_coverage
            rolling_rates.append(mutation_rate)

    ax1.plot(rolling_positions, rolling_rates,
             color="#FF6B6B", linestyle='-', linewidth=2, alpha=0.8,
             label="Rolling average (20 bp)")
    ax1.set_title(f"Rolling Mutation Rate Across Plasmid (20 bp Window){qscore_info}", fontsize=14, fontweight='bold')
    ax1.set_xlabel("Position on Plasmid (bp)", fontsize=12)
    ax1.set_ylabel("Mismatch Rate", fontsize=12)
    ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
    ax1.tick_params(axis='both', which='minor', direction='in', length=3)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.legend(loc="upper right", frameon=False, fontsize=10)
    # Shade the ROI region
    start_roi = idx + 1
    end_roi = idx + len(hit_seq)
    ax1.axvspan(start_roi, end_roi, color='gray', alpha=0.3, label=f"ROI: {start_roi}–{end_roi}")

    # --- Panel 3: Coverage of plasmid with ROI shaded ---
    ax2 = axes[1, 0]
    plasmid_positions = np.arange(1, len(plasmid_cov) + 1)
    ax2.plot(plasmid_positions, plasmid_cov,
             linestyle='-', color='black', linewidth=1.0, alpha=0.8, label="Coverage")
    ax2.set_title(f"Full Plasmid Coverage with ROI Shaded{qscore_info}", fontsize=14, fontweight='bold')
    ax2.set_xlabel("Position on Plasmid", fontsize=12)
    ax2.set_ylabel("Coverage (# reads)", fontsize=12)
    ax2.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
    ax2.tick_params(axis='both', which='minor', direction='in', length=3)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    start_roi = idx + 1
    end_roi = idx + len(hit_seq)
    ax2.axvspan(start_roi, end_roi, color='gray', alpha=0.3, label=f"ROI: {start_roi}–{end_roi}")

    # --- Panel 4: KDE of AA mutations per copy ---
    ax3 = axes[1, 1]
    if is_protein and aa_diffs and len(aa_diffs) > 0:
        x_vals = np.array(aa_diffs)
        unique_vals = np.unique(x_vals)

        if len(unique_vals) > 1:
            # Multiple unique values - use KDE or histogram
            if HAVE_SCIPY:
                try:
                    kde = gaussian_kde(x_vals)
                    x_grid = np.linspace(0, max(x_vals), 200)
                    kde_values = kde(x_grid)
                    ax3.plot(x_grid, kde_values,
                             color="#C44E52", linewidth=2.0, alpha=0.8, label="KDE")
                    ax3.fill_between(x_grid, kde_values, color="#C44E52", alpha=0.3)
                    ax3.set_ylim(bottom=0)
                except Exception as e:
                    logging.warning(f"KDE failed: {e}, falling back to histogram")
                    ax3.hist(x_vals, bins=min(20, len(unique_vals)),
                             color="#C44E52", alpha=0.7, density=True, edgecolor='black')
            else:
                ax3.hist(x_vals, bins=min(20, len(unique_vals)),
                         color="#C44E52", alpha=0.7, density=True, edgecolor='black')
        else:
            # Single unique value - just show a bar
            ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
            ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
    else:
        # Not protein or no AA differences
        ax3.text(0.5, 0.5, "Not a protein-coding region",
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=12, color='gray', transform=ax3.transAxes)

    ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
    ax3.set_xlabel("Number of AA Mutations", fontsize=12)
    ax3.set_ylabel("Density", fontsize=12)
    ax3.spines['top'].set_visible(False)
    ax3.spines['right'].set_visible(False)
    ax3.set_xticks([])
    ax3.set_yticks([])

    # Save the combined figure as both PNG and PDF
    panel_path_png = os.path.join(qscore_results_dir, "summary_panels.png")
    panel_path_pdf = os.path.join(qscore_results_dir, "summary_panels.pdf")
    fig.savefig(panel_path_png, dpi=150, transparent=False)
    fig.savefig(panel_path_pdf)  # vector format
    plt.close(fig)
    logging.info(f"Saved combined panel figure as PNG: {panel_path_png}")
    logging.info(f"Saved combined panel figure as PDF: {panel_path_pdf}")

    # ----------------------------
    # COMPUTE MUTATION SPECTRUM FOR ABOVE-BACKGROUND POSITIONS
    # ----------------------------
    # Define categories and reference percentages
    categories = {
        "A→G, T→C": {"pairs": [("A", "G"), ("T", "C")], "ref": 17.5},
        "G→A, C→T": {"pairs": [("G", "A"), ("C", "T")], "ref": 25.5},
        "A→T, T→A": {"pairs": [("A", "T"), ("T", "A")], "ref": 28.5},
        "A→C, T→G": {"pairs": [("A", "C"), ("T", "G")], "ref": 4.7},
        "G→C, C→G": {"pairs": [("G", "C"), ("C", "G")], "ref": 4.1},
        "G→T, C→A": {"pairs": [("G", "T"), ("C", "A")], "ref": 14.1},
    }

    # Tally observed counts at above-background positions
    category_counts = {cat: 0 for cat in categories}
    total_alt_counts = 0

    for pos0, rate in enumerate(hit_info["pos_rates"]):
        if rate <= bg_rate:
            continue
        ref_base = seq_upper[pos0]
        counts = base_counts[pos0]
        for alt_base in ("A", "C", "G", "T"):
            if alt_base == ref_base:
                continue
            cnt = counts.get(alt_base, 0)
            if cnt == 0:
                continue
            # Determine which category this (ref→alt) belongs to
            for cat, info in categories.items():
                if (ref_base, alt_base) in info["pairs"]:
                    category_counts[cat] += cnt
                    total_alt_counts += cnt
                    break

    # Compute sample percentages
    sample_percent = {}
    if total_alt_counts > 0:
        for cat, cnt in category_counts.items():
            sample_percent[cat] = 100.0 * cnt / total_alt_counts
    else:
        for cat in categories:
            sample_percent[cat] = 0.0

    # ----------------------------
    # GENERATE PDF TABLE (MUTATION SPECTRUM)
    # ----------------------------
    pdf_path = os.path.join(qscore_results_dir, f"{sample_name}_mutation_spectrum.pdf")
    # Prepare table data
    table_rows = []
    for cat in categories:
        ref_pct = categories[cat]["ref"]
        samp_pct = sample_percent[cat]
        table_rows.append([cat, f"{ref_pct:.1f}%", f"{samp_pct:.1f}%"])

    # Create a matplotlib figure for the table
    fig, ax = plt.subplots(figsize=(6, 3))  # adjust size as needed
    ax.axis("off")

    col_labels = ["Mutation Type", "Mutazyme II reference", sample_name]
    tbl = ax.table(
        cellText=table_rows,
        colLabels=col_labels,
        cellLoc="center",
        colLoc="center",
        loc="center"
    )
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1, 1.5)  # stretch rows

    # Add a title
    ax.set_title("Mutation Spectrum (Above-Background Sites)", fontsize=12, fontweight="bold", pad=20)

    # Save as PDF
    fig.savefig(pdf_path, format="pdf", bbox_inches="tight")
    plt.close(fig)
    logging.info(f"Saved mutation spectrum table as PDF: {pdf_path}")

    # ----------------------------
    # WRITE PER-SAMPLE SUMMARY TXT
    # ----------------------------
    sample_summary_path = os.path.join(qscore_results_dir, "summary.txt")
    with open(sample_summary_path, "w") as txtf:
        txtf.write(f"Sample: {sample_name}\n")
        txtf.write(f"{'=' * (8 + len(sample_name))}\n\n")
        txtf.write("1) Background (plasmid excluding target):\n")
        txtf.write(f"   • Total mismatches: {bg_mis}\n")
        txtf.write(f"   • Total covered bases: {bg_cov}\n")
        txtf.write(f"   • Mapped reads: {bg_reads}\n")
        txtf.write(f"   • Rate (per base): {bg_rate:.6e}\n")
        txtf.write(f"   • Rate (per kb): {bg_rate_per_kb:.6e}\n\n")

        txtf.write("2) Target (ROI) stats:\n")
        txtf.write(f"   • Gene ID: {hit_id}\n")
        txtf.write(f"   • Total mismatches: {hit_mis}\n")
        txtf.write(f"   • Total covered bases: {hit_cov}\n")
        txtf.write(f"   • Mapped reads: {hit_reads}\n")
        txtf.write(f"   • Rate (per base): {hit_rate:.6e}\n")
        txtf.write(f"   • Rate (per kb): {hit_rate_per_kb:.6e}\n")
        txtf.write(f"   • Z-statistic: {z_stat:.4f}\n")
        txtf.write(f"   • p-value: {p_val if p_val is not None else 'N/A'}\n")
        txtf.write(f"   • Estimated mutations per copy: {est_mut_per_copy:.6e}\n\n")

        txtf.write("3) Protein-coding evaluation:\n")
        txtf.write(f"   • Is protein: {is_protein}\n")
        if is_protein:
            txtf.write(f"   • Average AA mutations per copy (simulated): {avg_aa_mutations:.3f}\n")
        else:
            txtf.write(f"   • Reason(s): {('; '.join(reasons) if reasons else 'N/A')}\n")
        txtf.write("\n4) Mutation spectrum (above-background sites):\n")
        for cat in categories:
            txtf.write(f"   • {cat}: {sample_percent[cat]:.1f}% (Ref: {categories[cat]['ref']:.1f}%)\n")
        txtf.write("\n5) Output files written to:\n")
        txtf.write(f"   • {gene_mismatch_csv}\n")
        txtf.write(f"   • {base_dist_csv}\n")
        if is_protein:
            txtf.write(f"   • {aa_subst_csv}\n")
        txtf.write(f"   • {plasmid_cov_csv}\n")
        txtf.write(f"   • {aa_dist_csv}\n")
        txtf.write(f"   • {panel_path_png} (figure)\n")
        txtf.write(f"   • {panel_path_pdf} (figure)\n")
        txtf.write(f"   • {pdf_path} (mutation spectrum table)\n")

    logging.info(f"Wrote per-sample summary to: {sample_summary_path}")

    return {
        'qscore': qscore,
        'qscore_desc': qscore_desc,
        'summary_path': sample_summary_path,
        'qscore_results_dir': qscore_results_dir,
        'bg_mis': bg_mis,
        'bg_cov': bg_cov,
        'bg_reads': bg_reads,
        'bg_rate': bg_rate,
        'bg_rate_per_kb': bg_rate_per_kb,
        'hit_mis': hit_mis,
        'hit_cov': hit_cov,
        'hit_reads': hit_reads,
        'hit_rate': hit_rate,
        'hit_rate_per_kb': hit_rate_per_kb,
        'hit_info': hit_info,
        'z_stat': z_stat,
        'p_val': p_val,
        'est_mut_per_copy': est_mut_per_copy,
        'is_protein': is_protein,
        'reasons': reasons,
        'aa_diffs': aa_diffs,
        'avg_aa_mutations': avg_aa_mutations,
        'base_counts': base_counts,
        'qscore_info': qscore_info,
        'sam_plasmid': sam_plasmid
    }


if __name__ == "__main__":
    main()

def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
    paths: List[Path] = []
    for item in inputs:
        if any(ch in item for ch in "*?[]"):
            # glob.glob handles absolute patterns too (Path().glob rejects them)
            paths.extend(Path(match) for match in glob.glob(item))
        else:
            paths.append(Path(item))
    unique_paths: List[Path] = []
    seen = set()
    for path in paths:
        resolved = path.resolve()
        if resolved not in seen:
            seen.add(resolved)
            unique_paths.append(path)
    return unique_paths

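# Usage sketch (hypothetical paths): glob patterns are expanded and duplicates are
# dropped while preserving first-seen order.
def _demo_expand_inputs():
    for p in expand_fastq_inputs(["runs/*.fastq.gz", "runs/sample1.fastq.gz"]):
        print(p)
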
def run_ep_library_profile(
    fastq_paths: Sequence[Path],
    region_fasta: Path,
    plasmid_fasta: Path,
    output_dir: Path,
    work_dir: Optional[Path] = None,
) -> Dict[str, object]:
    fastq_paths = [Path(p) for p in fastq_paths]
    if not fastq_paths:
        raise ValueError("No FASTQ files provided for analysis.")

    region_fasta = Path(region_fasta)
    plasmid_fasta = Path(plasmid_fasta)
    output_dir = Path(output_dir)
    work_dir = Path(work_dir) if work_dir is not None else output_dir / "tmp"

    output_dir.mkdir(parents=True, exist_ok=True)
    work_dir.mkdir(parents=True, exist_ok=True)

    master_summary_path = output_dir / "master_summary.txt"
    header = "\t".join(
        [
            "Sample",
            "Condition",
            "Background_Rate",
            "Target_Rate",
            "Z_stat",
            "P_value",
            "Est_Mut_per_Copy",
            "Is_Protein",
        ]
    )
    master_summary_path.write_text(header + "\n", encoding="utf-8")

    sample_results: List[Dict[str, object]] = []
    for fastq in fastq_paths:
        result = process_single_fastq(
            fastq,
            region_fasta,
            plasmid_fasta,
            work_dir,
            output_dir,
        )
        sample_results.append(result)

        with master_summary_path.open("a", encoding="utf-8") as masterf:
            for analysis in result.get("analysis_results", []):
                if not analysis:
                    continue
                row = [
                    result["sample"],
                    analysis.get("qscore_desc", ""),
                    f"{analysis.get('bg_rate', 0.0):.6e}",
                    f"{analysis.get('hit_rate', 0.0):.6e}",
                    f"{analysis.get('z_stat', 0.0):.4f}",
                    str(analysis.get("p_val", "N/A")),
                    f"{analysis.get('est_mut_per_copy', 0.0):.6e}",
                    "yes" if analysis.get("is_protein") else "no",
                ]
                masterf.write("\t".join(row) + "\n")

    return {
        "master_summary": master_summary_path,
        "samples": sample_results,
    }

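# Usage sketch (hypothetical file names): profile one FASTQ against a region and
# its parent plasmid; master_summary.txt is written under output_dir.
def _demo_run_profile():
    summary = run_ep_library_profile(
        fastq_paths=[Path("reads/sample1.fastq.gz")],
        region_fasta=Path("refs/region.fasta"),
        plasmid_fasta=Path("refs/plasmid.fasta"),
        output_dir=Path("results"),
    )
    print(summary["master_summary"])
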
def process_single_fastq(
|
|
2340
|
+
fastq_path,
|
|
2341
|
+
region_fasta,
|
|
2342
|
+
plasmid_fasta,
|
|
2343
|
+
base_work_dir,
|
|
2344
|
+
base_results_dir,
|
|
2345
|
+
):
|
|
2346
|
+
"""Run the mutation-rate analysis pipeline for a single FASTQ file."""
|
|
2347
|
+
fastq_path = Path(fastq_path)
|
|
2348
|
+
region_fasta = Path(region_fasta)
|
|
2349
|
+
plasmid_fasta = Path(plasmid_fasta)
|
|
2350
|
+
base_work_dir = Path(base_work_dir)
|
|
2351
|
+
base_results_dir = Path(base_results_dir)
|
|
2352
|
+
|
|
2353
|
+
sample_name = fastq_path.name
|
|
2354
|
+
if sample_name.endswith('.fastq.gz'):
|
|
2355
|
+
sample_name = sample_name[:-9]
|
|
2356
|
+
elif sample_name.endswith('.fastq'):
|
|
2357
|
+
sample_name = sample_name[:-6]
|
|
2358
|
+
|
|
2359
|
+
work_dir = base_work_dir / sample_name
|
|
2360
|
+
results_dir = base_results_dir / sample_name
|
|
2361
|
+
|
|
2362
|
+
if work_dir.exists():
|
|
2363
|
+
shutil.rmtree(work_dir)
|
|
2364
|
+
if results_dir.exists():
|
|
2365
|
+
shutil.rmtree(results_dir)
|
|
2366
|
+
|
|
2367
|
+
work_dir.mkdir(parents=True, exist_ok=True)
|
|
2368
|
+
results_dir.mkdir(parents=True, exist_ok=True)
|
|
2369
|
+
|
|
2370
|
+
setup_logging(str(results_dir))
|
|
2371
|
+
logging.info("--- Starting analysis for sample: %s ---", sample_name)
|
|
2372
|
+
|
|
2373
|
+
    hit_seq, hit_id = load_single_sequence(str(region_fasta))
    plasmid_seq, plasmid_id = load_single_sequence(str(plasmid_fasta))

    logging.info("Plasmid length: %s bp", len(plasmid_seq))
    logging.info("Gene of interest length: %s bp", len(hit_seq))

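    # load_single_sequence (presumably defined earlier in this module) is
    # expected to return the sequence string and record ID of a single-record
    # FASTA. A minimal sketch of such a helper via Bio.SeqIO:
    #
    #     from Bio import SeqIO
    #
    #     def load_single_sequence(fasta_path):
    #         record = next(SeqIO.parse(fasta_path, "fasta"))
    #         return str(record.seq), record.id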
    idx = plasmid_seq.upper().find(hit_seq.upper())
    if idx == -1:
        logging.error("Gene region not found in plasmid")
        return {
            "sample": sample_name,
            "results_dir": results_dir,
            "analysis_results": [],
        }
    plasmid_no_gene = plasmid_seq[:idx] + plasmid_seq[idx + len(hit_seq):]

    logging.info("Gene found at position %s-%s (1-based)", idx + 1, idx + len(hit_seq))
    logging.info("Background region length: %s bp", len(plasmid_no_gene))

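    # The substring search above assumes the gene does not span the numerical
    # origin of the circular plasmid record. A sketch of a wrap-around-tolerant
    # lookup (the background excision would need the same circular treatment):
    #
    #     doubled = (plasmid_seq + plasmid_seq).upper()
    #     idx = doubled.find(hit_seq.upper())
    #     # the first match, if any, always starts at idx < len(plasmid_seq)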
    n_chunks = 10
    length = len(plasmid_no_gene)
    size = length // n_chunks
    min_chunk_size = 50
    if size < min_chunk_size:
        logging.warning(
            "Background region (%s bp) would create chunks smaller than %s bp. Adjusting chunk count.",
            length,
            min_chunk_size,
        )
        n_chunks = max(1, length // min_chunk_size)
        size = length // n_chunks
        logging.info("Adjusted to %s chunks of approximately %s bp each", n_chunks, size)

    chunks = [
        plasmid_no_gene[i * size : (length if i == n_chunks - 1 else (i + 1) * size)]
        for i in range(n_chunks)
    ]
    logging.info("Chunk sizes: %s bp", [len(chunk) for chunk in chunks])

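    # Worked example of the chunking arithmetic above: for a 1,003 bp
    # background and n_chunks = 10, size = 1003 // 10 = 100, so the first
    # nine chunks are 100 bp and the last chunk absorbs the remainder:
    #
    #     >>> length, n_chunks = 1003, 10
    #     >>> size = length // n_chunks
    #     >>> [(length if i == n_chunks - 1 else (i + 1) * size) - i * size
    #     ...  for i in range(n_chunks)]
    #     [100, 100, 100, 100, 100, 100, 100, 100, 100, 103]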
logging.info("Running QC analysis to get Q-score results...")
|
|
2413
|
+
qc_results = None
|
|
2414
|
+
try:
|
|
2415
|
+
qc_results, optimal_qscore = run_qc_analysis(
|
|
2416
|
+
str(fastq_path),
|
|
2417
|
+
str(results_dir),
|
|
2418
|
+
str(region_fasta),
|
|
2419
|
+
str(plasmid_fasta),
|
|
2420
|
+
)
|
|
2421
|
+
if qc_results is not None:
|
|
2422
|
+
logging.info("QC analysis completed successfully. Found %s Q-score results.", len(qc_results))
|
|
2423
|
+
if optimal_qscore is not None:
|
|
2424
|
+
logging.info("Optimal Q-score determined: %s", optimal_qscore)
|
|
2425
|
+
else:
|
|
2426
|
+
logging.warning("QC analysis completed but no Q-score results found.")
|
|
2427
|
+
except Exception as exc:
|
|
2428
|
+
logging.error("QC analysis failed: %s", exc)
|
|
2429
|
+
logging.warning("Proceeding with unfiltered data only.")
|
|
2430
|
+
|
|
2431
|
+
qscores_to_analyze: List[tuple[Optional[int], str, str]] = []
|
|
2432
|
+
qscores_to_analyze.append((None, str(fastq_path), 'unfiltered'))
|
|
2433
|
+
|
|
2434
|
+
if qc_results is not None:
|
|
2435
|
+
for result in qc_results:
|
|
2436
|
+
qscore = result['quality_threshold']
|
|
2437
|
+
filtered_fastq_path = work_dir / f"{sample_name}_q{qscore}.fastq.gz"
|
|
2438
|
+
if run_nanofilt_filtering(str(fastq_path), qscore, str(filtered_fastq_path)):
|
|
2439
|
+
qscores_to_analyze.append((qscore, str(filtered_fastq_path), f"Q{qscore}"))
|
|
2440
|
+
logging.info("Successfully created Q%s filtered data for analysis.", qscore)
|
|
2441
|
+
else:
|
|
2442
|
+
logging.warning("Failed to create Q%s filtered data.", qscore)
|
|
2443
|
+
|
|
2444
|
+
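    # A Phred score Q corresponds to a per-base error probability of
    # 10 ** (-Q / 10), so Q10 means ~90% and Q20 ~99% per-base accuracy.
    # run_nanofilt_filtering is presumably a thin wrapper around the NanoFilt
    # CLI, which filters reads by mean Q-score. A minimal sketch of such a
    # pipeline, assuming NanoFilt and gzip are on PATH (filter_by_qscore is
    # an illustrative name, not part of this module):
    #
    #     import subprocess
    #
    #     def filter_by_qscore(in_fastq_gz, qscore, out_fastq_gz):
    #         cmd = (f"gunzip -c {in_fastq_gz} | NanoFilt -q {qscore} "
    #                f"| gzip > {out_fastq_gz}")
    #         return subprocess.run(cmd, shell=True).returncode == 0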
    logging.info(
        "Will run main analysis for %s conditions: %s",
        len(qscores_to_analyze),
        [desc for _, _, desc in qscores_to_analyze],
    )

    analysis_results = []
    for qscore, analysis_fastq_path, qscore_desc in qscores_to_analyze:
        result = run_main_analysis_for_qscore(
            analysis_fastq_path,
            qscore,
            qscore_desc,
            sample_name,
            str(work_dir),
            str(results_dir),
            chunks,
            str(region_fasta),
            str(plasmid_fasta),
            hit_seq,
            hit_id,
            plasmid_seq,
            idx,
        )
        analysis_results.append(result)

    if work_dir.exists():
        shutil.rmtree(work_dir)
        logging.info("Removed temporary work directory: %s", work_dir)

    logging.info("--- Finished analysis for sample: %s ---", sample_name)

    return {
        "sample": sample_name,
        "results_dir": results_dir,
        "analysis_results": analysis_results,
    }
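

# A minimal usage sketch for one sample (paths are illustrative):
#
#     result = process_single_fastq(
#         "reads/sample1.fastq.gz",
#         "refs/gene_region.fasta",
#         "refs/plasmid.fasta",
#         "work",
#         "results",
#     )
#     for analysis in result["analysis_results"]:
#         print(analysis.get("qscore_desc"), analysis.get("est_mut_per_copy"))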