uht-tooling 0.1.2 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uht_tooling/__init__.py +10 -0
- uht_tooling/cli.py +368 -0
- uht_tooling/models/__init__.py +0 -0
- uht_tooling/workflows/__init__.py +0 -0
- uht_tooling/workflows/design_gibson.py +368 -0
- uht_tooling/workflows/design_slim.py +402 -0
- uht_tooling/workflows/gui.py +595 -0
- uht_tooling/workflows/mut_rate.py +2480 -0
- uht_tooling/workflows/mutation_caller.py +432 -0
- uht_tooling/workflows/nextera_designer.py +199 -0
- uht_tooling/workflows/profile_inserts.py +441 -0
- uht_tooling/workflows/umi_hunter.py +412 -0
- uht_tooling-0.1.2.dist-info/METADATA +271 -0
- uht_tooling-0.1.2.dist-info/RECORD +17 -0
- uht_tooling-0.1.2.dist-info/WHEEL +5 -0
- uht_tooling-0.1.2.dist-info/entry_points.txt +2 -0
- uht_tooling-0.1.2.dist-info/top_level.txt +1 -0
uht_tooling/workflows/mutation_caller.py
@@ -0,0 +1,432 @@
import argparse
import gzip
import logging
import os
import re
import subprocess
import tempfile
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import AlignIO, SeqIO
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from scipy.stats import fisher_exact, gaussian_kde


def reverse_complement(seq: str) -> str:
    return seq.translate(str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]


def build_flank_pattern(flanks_csv: Path) -> tuple[re.Pattern, int, int]:
    df = pd.read_csv(flanks_csv)
    gene_start = df.loc[0, "gene_flanks"]
    gene_end = df.loc[1, "gene_flanks"]
    gene_min = int(df.loc[0, "gene_min_max"])
    gene_max = int(df.loc[1, "gene_min_max"])
    pattern = re.compile(
        rf"{gene_start}([ACGTNacgtn]{{{gene_min},{gene_max}}}){gene_end}",
        re.IGNORECASE,
    )
    return pattern, gene_min, gene_max
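How build_flank_pattern behaves on concrete input, as a standalone sketch; the column names come from the code above, while the flank sequences and length bounds are fabricated for illustration:

```python
import re

# flanks.csv (column names from the code above; sequences/bounds fabricated):
#
#   gene_flanks,gene_min_max
#   ATGGTGAGCA,690    <- row 0: upstream flank, minimum insert length
#   TAAGGATCCG,720    <- row 1: downstream flank, maximum insert length
#
# build_flank_pattern would compile the equivalent of:
pattern = re.compile(r"ATGGTGAGCA([ACGTNacgtn]{690,720})TAAGGATCCG", re.IGNORECASE)

read = "ATGGTGAGCA" + "ACGT" * 175 + "TAAGGATCCG"  # 700 nt insert
match = pattern.search(read)
assert match is not None and len(match.group(1)) == 700
```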
def extract_gene(seq: str, pattern: re.Pattern, gene_min: int, gene_max: int) -> Optional[str]:
    match = pattern.search(seq)
    if match:
        gene = match.group(1)
        if gene_min <= len(gene) <= gene_max:
            return gene
    match = pattern.search(reverse_complement(seq))
    if match:
        gene = match.group(1)
        if gene_min <= len(gene) <= gene_max:
            return gene
    return None


def process_fastq(file_path: Path, pattern: re.Pattern, gene_min: int, gene_max: int) -> Dict[str, str]:
    gene_reads: Dict[str, str] = {}
    # Open transparently: the CLI accepts plain FASTQ as well as FASTQ.gz.
    opener = gzip.open if file_path.suffix == ".gz" else open
    with opener(file_path, "rt") as handle:
        while True:
            header = handle.readline()
            if not header:
                break
            seq = handle.readline().strip()
            handle.readline()  # '+' separator
            handle.readline()  # quality string
            gene = extract_gene(seq, pattern, gene_min, gene_max)
            if gene:
                read_id = header.strip()[1:]
                gene_reads[read_id] = gene
    return gene_reads
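A minimal end-to-end check of process_fastq, reusing the fabricated pattern from the previous sketch; the FASTQ record is made up:

```python
import gzip
from pathlib import Path

# One fabricated read whose insert sits between the fabricated flanks above.
demo = Path("demo.fastq.gz")
insert = "ACGT" * 175  # 700 nt
with gzip.open(demo, "wt") as fh:
    fh.write("@read1\n")
    fh.write(f"ATGGTGAGCA{insert}TAAGGATCCG\n")
    fh.write("+\n")
    fh.write("I" * 720 + "\n")

genes = process_fastq(demo, pattern, 690, 720)
assert genes == {"read1": insert}
```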
def align_to_reference(gene_seqs: Dict[str, str], reference: str) -> tuple[str, Dict[str, str]]:
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta") as tmp_in:
        SeqIO.write(SeqRecord(Seq(reference), id="REF", description=""), tmp_in, "fasta")
        for rid, seq in gene_seqs.items():
            SeqIO.write(SeqRecord(Seq(seq), id=rid, description=""), tmp_in, "fasta")
        tmp_in_path = tmp_in.name

    mafft = MafftCommandline(input=tmp_in_path)
    proc = subprocess.Popen(
        str(mafft),
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stdout, stderr = proc.communicate()

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta") as tmp_out:
        tmp_out.write(stdout)
        tmp_out_path = tmp_out.name

    if proc.returncode != 0:
        raise RuntimeError(f"MAFFT failed with exit code {proc.returncode}:\n{stderr}")

    alignment = AlignIO.read(tmp_out_path, "fasta")

    aligned_ref = None
    aligned_reads: Dict[str, str] = {}
    for record in alignment:
        if record.id == "REF":
            aligned_ref = str(record.seq)
        else:
            aligned_reads[record.id] = str(record.seq)

    os.remove(tmp_in_path)
    os.remove(tmp_out_path)

    if aligned_ref is None:
        raise RuntimeError("Reference sequence missing from alignment output.")

    return aligned_ref, aligned_reads
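align_to_reference shells out to an external MAFFT binary via Biopython's MafftCommandline wrapper (deprecated in recent Biopython releases), so MAFFT must be installed and on PATH. A hedged pre-flight check, not part of the package:

```python
import shutil
import subprocess

# Fail early if the MAFFT binary is missing (MAFFT typically prints its
# version banner to stderr, so fall back to stdout just in case).
mafft_path = shutil.which("mafft")
if mafft_path is None:
    raise RuntimeError("MAFFT not found on PATH; install it (e.g., conda install -c bioconda mafft).")
version = subprocess.run([mafft_path, "--version"], capture_output=True, text=True)
print(version.stderr.strip() or version.stdout.strip())
```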
def identify_substitutions(ref: str, aligned_reads: Dict[str, str]) -> Dict[str, List[str]]:
    subs_by_read: Dict[str, List[str]] = defaultdict(list)

    aln2ref: Dict[int, Optional[int]] = {}
    ref_clean: List[str] = []
    ref_index = 0
    for aln_idx, base in enumerate(ref):
        if base != "-":
            aln2ref[aln_idx] = ref_index
            ref_clean.append(base)
            ref_index += 1
        else:
            aln2ref[aln_idx] = None
    ref_clean_seq = "".join(ref_clean)

    ref2aln: Dict[int, int] = {}
    for aln_idx, ref_idx in aln2ref.items():
        if ref_idx is not None and ref_idx not in ref2aln:
            ref2aln[ref_idx] = aln_idx

    codon_count = len(ref_clean_seq) // 3
    for read_id, seq in aligned_reads.items():
        for codon_i in range(codon_count):
            start_r = codon_i * 3
            codon_ref = ref_clean_seq[start_r : start_r + 3]
            codon_read: List[str] = []
            valid = True
            diff = False
            for offset in range(3):
                ref_pos = start_r + offset
                aln_idx = ref2aln.get(ref_pos)
                if aln_idx is None:
                    valid = False
                    break
                base_q = seq[aln_idx]
                if base_q == "-":
                    valid = False
                    break
                codon_read.append(base_q)
                if base_q != codon_ref[offset]:
                    diff = True
            if not valid or not diff:
                continue
            try:
                aa_from = str(Seq(codon_ref).translate())
                aa_to = str(Seq("".join(codon_read)).translate())
            except Exception:
                aa_from, aa_to = "?", "?"
            aa_mut = f"{aa_from}{codon_i + 1}{aa_to}"
            nt_mut = f"{codon_ref}->{''.join(codon_read)}"
            subs_by_read[read_id].append(f"{nt_mut} ({aa_mut})")

    return subs_by_read
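identify_substitutions numbers codons from 1 and emits one label per mutated codon, pairing the nucleotide change with the amino-acid change. A tiny worked example on an ungapped toy alignment (sequences fabricated):

```python
# Reference codons: ATG GCT (Met, Ala). The read changes codon 2 GCT->GAT (A2D).
ref_aligned = "ATGGCT"                      # no gaps in this toy alignment
reads = {"read1": "ATGGAT"}
subs = identify_substitutions(ref_aligned, reads)
assert dict(subs) == {"read1": ["GCT->GAT (A2D)"]}
```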
def find_cooccurring_aa(
    subs_by_read_aa: Dict[str, List[str]],
    frequent_aa: set[str],
    output_dir: Path,
    sample_name: str,
) -> tuple[Path, Path]:
    aa_list = sorted(frequent_aa)
    aa_idx = {aa: i for i, aa in enumerate(aa_list)}

    matrix: List[List[int]] = []
    for calls in subs_by_read_aa.values():
        row = [0] * len(aa_list)
        any_selected = False
        for aa in calls:
            if aa in aa_idx:
                row[aa_idx[aa]] = 1
                any_selected = True
        if any_selected:
            matrix.append(row)

    baseline_path = output_dir / f"{sample_name}_cooccurring_AA_baseline.csv"
    fisher_path = output_dir / f"{sample_name}_cooccurring_AA_fisher.csv"

    if not matrix:
        pd.DataFrame(columns=["AA1", "AA2", "Both_Count", "AA1_Count", "AA2_Count"]).to_csv(
            baseline_path, index=False
        )
        pd.DataFrame(columns=["AA1", "AA2", "p-value"]).to_csv(fisher_path, index=False)
        return baseline_path, fisher_path

    df = pd.DataFrame(matrix, columns=aa_list)
    simple: List[tuple[str, str, int, int, int]] = []
    fisher_rows: List[tuple[str, str, float]] = []

    for i in range(len(aa_list)):
        for j in range(i + 1, len(aa_list)):
            col_a, col_b = df.iloc[:, i], df.iloc[:, j]
            both = int(((col_a == 1) & (col_b == 1)).sum())
            a_tot = int((col_a == 1).sum())
            b_tot = int((col_b == 1).sum())
            if (both >= 2) or (both > 0 and both == a_tot == b_tot):
                simple.append((aa_list[i], aa_list[j], both, a_tot, b_tot))
            table = [
                [both, int(((col_a == 1) & (col_b == 0)).sum())],
                [int(((col_a == 0) & (col_b == 1)).sum()), int(((col_a == 0) & (col_b == 0)).sum())],
            ]
            try:
                _, p_value = fisher_exact(table)
                if p_value < 0.05:
                    fisher_rows.append((aa_list[i], aa_list[j], p_value))
            except Exception:
                continue

    pd.DataFrame(simple, columns=["AA1", "AA2", "Both_Count", "AA1_Count", "AA2_Count"]).to_csv(
        baseline_path, index=False
    )
    pd.DataFrame(fisher_rows, columns=["AA1", "AA2", "p-value"]).to_csv(fisher_path, index=False)
    return baseline_path, fisher_path
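Each pair of frequent substitutions is tested with Fisher's exact test on a 2x2 table: reads carrying both, only the first, only the second, or neither. A standalone check of that layout with made-up counts:

```python
from scipy.stats import fisher_exact

# Rows: first substitution present/absent; columns: second present/absent.
table = [[8, 2],    # 8 reads carry both, 2 carry only the first
         [3, 87]]   # 3 carry only the second, 87 carry neither
odds_ratio, p_value = fisher_exact(table)
print(f"odds ratio {odds_ratio:.1f}, p = {p_value:.2e}")  # would pass the p < 0.05 filter
```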
def run_mutation_caller(
    template_fasta: Path,
    flanks_csv: Path,
    fastq_files: Sequence[Path],
    output_dir: Path,
    threshold: int,
    log_path: Optional[Path] = None,
    logger: Optional[logging.Logger] = None,
) -> List[Dict[str, Path]]:
    template_fasta = Path(template_fasta)
    flanks_csv = Path(flanks_csv)
    fastq_files = [Path(fq) for fq in fastq_files]
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    managed_logger = logger is None
    if logger is None:
        logger = logging.getLogger("uht_tooling.mutation_caller")
        logger.setLevel(logging.INFO)
        handler: logging.Handler
        if log_path:
            handler = logging.FileHandler(log_path, mode="w")
        else:
            handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
        logger.handlers = []
        logger.addHandler(handler)
        logger.propagate = False

    try:
        if not fastq_files:
            raise ValueError("No FASTQ files provided.")

        pattern, gene_min, gene_max = build_flank_pattern(flanks_csv)
        template_record = next(SeqIO.parse(str(template_fasta), "fasta"))
        full_ref = str(template_record.seq)
        logger.info("Loaded template sequence of length %s.", len(full_ref))

        df = pd.read_csv(flanks_csv)
        gene_start = df.loc[0, "gene_flanks"]
        gene_end = df.loc[1, "gene_flanks"]
        if full_ref.startswith(gene_start) and full_ref.endswith(gene_end):
            reference = full_ref[len(gene_start) : len(full_ref) - len(gene_end)]
            logger.info("Trimmed flanking regions from template.")
        else:
            reference = full_ref

        results: List[Dict[str, Path]] = []

        for fastq in fastq_files:
            if not fastq.exists():
                logger.warning("FASTQ file %s not found; skipping.", fastq)
                continue
            sample_base = fastq.stem.replace(".fastq", "")
            sample_dir = output_dir / sample_base
            sample_dir.mkdir(parents=True, exist_ok=True)

            logger.info("Processing sample %s", sample_base)
            gene_reads = process_fastq(fastq, pattern, gene_min, gene_max)
            if not gene_reads:
                logger.warning("No valid gene reads for %s; skipping.", sample_base)
                continue

            aligned_ref, aligned_reads = align_to_reference(gene_reads, reference)
            substitutions = identify_substitutions(aligned_ref, aligned_reads)
            subs_aa = {
                rid: [item.split()[1][1:-1] for item in items if "(" in item and item.endswith(")")]
                for rid, items in substitutions.items()
            }
            counts = Counter(aa for aas in subs_aa.values() for aa in aas)
            if not counts:
                logger.warning("No amino-acid substitutions detected for %s; skipping.", sample_base)
                continue

            keys = list(counts.keys())
            values = np.array([counts[k] for k in keys], dtype=float)
            idx = np.arange(len(keys))
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
            ax1.bar(idx, values)
            ax1.set_xticks(idx)
            ax1.set_xticklabels(keys, rotation=90, fontsize=8)
            ax1.set_ylabel("Count")
            ax1.set_title("Amino-Acid Substitution Frequencies")

            try:
                kde = gaussian_kde(values)
                xmin, xmax = max(1.0, values.min()), values.max()
                xs = np.logspace(np.log10(xmin), np.log10(xmax), 200)
                ax2.plot(xs, kde(xs), linewidth=2)
            except Exception:
                ax2.hist(values, bins="auto", density=True, alpha=0.6)
            ax2.set_xscale("log")
            ax2.set_xlabel("Substitution Count (log scale)")
            ax2.set_ylabel("Density")
            ax2.set_title("KDE of AA Substitution Frequencies")

            plt.tight_layout()
            plot_path = sample_dir / f"{sample_base}_aa_substitution_frequency.png"
            fig.savefig(plot_path)
            plt.close(fig)
            logger.info("Saved substitution frequency plot to %s", plot_path)

            frequent = {aa for aa, count in counts.items() if count >= threshold}
            freq_csv = sample_dir / f"{sample_base}_frequent_aa_counts.csv"
            pd.DataFrame(
                sorted(((aa, counts[aa]) for aa in frequent), key=lambda x: x[0]),
                columns=["AA", "Count"],
            ).to_csv(freq_csv, index=False)

            baseline_path, fisher_path = find_cooccurring_aa(subs_aa, frequent, sample_dir, sample_base)

            report_path = sample_dir / f"{sample_base}_report.txt"
            with report_path.open("w") as report:
                report.write(f"Sample: {sample_base}\n")
                report.write(f"Valid gene reads: {len(gene_reads)}\n")
                report.write(f"Unique AA substitutions: {len(counts)}\n")
                report.write(f"Threshold: {threshold}\n")
                report.write(f"Frequent AA substitutions (≥ {threshold}): {len(frequent)}\n\n")
                report.write("Frequent AA counts:\n")
                report.write("AA\tCount\n")
                for aa in sorted(frequent):
                    report.write(f"{aa}\t{counts[aa]}\n")
                report.write("\nGenerated files:\n")
                report.write(f"- Plot: {plot_path.name}\n")
                report.write(f"- Frequent counts: {freq_csv.name}\n")
                report.write(f"- Co-occurrence baseline: {baseline_path.name}\n")
                report.write(f"- Co-occurrence fisher: {fisher_path.name}\n")

            results.append(
                {
                    "sample": sample_base,
                    "directory": sample_dir,
                    "plot": plot_path,
                    "frequent_counts": freq_csv,
                    "baseline": baseline_path,
                    "fisher": fisher_path,
                    "report": report_path,
                }
            )

        if not results:
            logger.warning("No outputs generated; check inputs and thresholds.")
        return results
    finally:
        if managed_logger and logger:
            for handler in list(logger.handlers):
                handler.close()
                logger.removeHandler(handler)
            logger.propagate = True
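run_mutation_caller is importable as a library function as well as reachable from the CLI; a minimal sketch with hypothetical paths:

```python
from pathlib import Path
from uht_tooling.workflows.mutation_caller import run_mutation_caller

# All paths below are hypothetical placeholders.
results = run_mutation_caller(
    template_fasta=Path("template.fasta"),
    flanks_csv=Path("flanks.csv"),
    fastq_files=[Path("sample1.fastq.gz"), Path("sample2.fastq.gz")],
    output_dir=Path("results"),
    threshold=10,
    log_path=Path("results/mutation_caller.log"),
)
for entry in results:
    print(entry["sample"], "->", entry["report"])
```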
def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
    paths: List[Path] = []
    for item in inputs:
        if any(ch in item for ch in "*?[]"):
            paths.extend(Path().glob(item))
        else:
            paths.append(Path(item))
    unique_paths = []
    seen = set()
    for path in paths:
        resolved = path.resolve()
        if resolved not in seen:
            seen.add(resolved)
            unique_paths.append(path)
    return unique_paths
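Note that expand_fastq_inputs resolves glob patterns relative to the current working directory (Path().glob) and de-duplicates by resolved path; for example:

```python
# Hypothetical inputs: a literal path plus a glob that also matches it.
paths = expand_fastq_inputs(["data/s1.fastq.gz", "data/*.fastq.gz"])
# The duplicate collapses: data/s1.fastq.gz appears once in the result.
print([str(p) for p in paths])
```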
def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Identify mutations from long-read sequencing without UMIs.")
    parser.add_argument("--template-fasta", required=True, type=Path, help="FASTA file containing the gene template.")
    parser.add_argument("--flanks-csv", required=True, type=Path, help="CSV describing gene flanks and length bounds.")
    parser.add_argument(
        "--fastq",
        required=True,
        nargs="+",
        help="One or more FASTQ(.gz) paths or glob patterns (e.g., data/*.fastq.gz).",
    )
    parser.add_argument("--output-dir", required=True, type=Path, help="Directory to place sample outputs.")
    parser.add_argument(
        "--threshold",
        type=int,
        default=10,
        help="Minimum AA substitution count to include in the frequent-substitution report (default: 10).",
    )
    parser.add_argument("--log-path", default=None, type=Path, help="Optional log file path.")
    return parser


def main(argv: Optional[Sequence[str]] = None):
    parser = build_parser()
    args = parser.parse_args(argv)
    fastq_files = expand_fastq_inputs(args.fastq)
    run_mutation_caller(
        template_fasta=args.template_fasta,
        flanks_csv=args.flanks_csv,
        fastq_files=fastq_files,
        output_dir=args.output_dir,
        threshold=args.threshold,
        log_path=args.log_path,
    )


if __name__ == "__main__":
    main()
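The argparse entry point can also be driven programmatically, which is handy for testing; all paths below are hypothetical:

```python
from uht_tooling.workflows.mutation_caller import main

# Hypothetical paths; the glob is expanded by expand_fastq_inputs, not the shell.
main([
    "--template-fasta", "template.fasta",
    "--flanks-csv", "flanks.csv",
    "--fastq", "data/*.fastq.gz",
    "--output-dir", "results",
    "--threshold", "10",
])
```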
uht_tooling/workflows/nextera_designer.py
@@ -0,0 +1,199 @@
import argparse
import csv
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
import yaml

I7_INDEXES: Dict[str, str] = {
    "701": "TCGCCTTA",
    "702": "CTAGTACG",
    "703": "TTCTGCCT",
    "704": "GCTCAGGA",
    "705": "AGGAGTCC",
    "706": "CATGCCTA",
    "707": "GTAGAGAG",
    "708": "CCTCTCTG",
    "709": "AGCGTAGC",
    "710": "CAGCCTCG",
    "711": "TGCCTCTT",
    "712": "TCCTCTAC",
}

I5_INDEXES: Dict[str, str] = {
    "501": "TAGATCGC",
    "502": "CTCTCTAT",
    "503": "TATCCTCT",
    "504": "AGAGTAGA",
    "505": "GTAAGGAG",
    "506": "ACTGCATA",
    "507": "AAGGAGTA",
    "508": "CTAAGCCT",
    "510": "CGTCTAAT",
    "511": "TCTCTCCG",
    "513": "TCGACTAG",
    "515": "TTCTAGCT",
}

I7_PREFIX = "CAAGCAGAAGACGGCATACGAGAT"
I7_SUFFIX = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG"
I5_PREFIX = "AATGATACGGCGACCACCGAGATCTACAC"
I5_SUFFIX = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
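Every primer emitted below is a plain concatenation of four parts: Illumina adapter prefix, 8-nt index, sequencing-primer suffix, then the template-binding region. Sketched for the i7 side with a fabricated binding sequence:

```python
# i7_701 layout: adapter prefix + index + sequencing-primer suffix + binding region.
template_binding_i7 = "GTGACCTATGAACTCAGGAG"  # fabricated, not from the package
primer = I7_PREFIX + I7_INDEXES["701"] + I7_SUFFIX + template_binding_i7
assert primer.startswith(I7_PREFIX + "TCGCCTTA")
print(len(primer))  # 24 + 8 + 34 + 20 = 86
```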
def load_binding_sequences(csv_path: Path) -> Tuple[str, str]:
    df = pd.read_csv(csv_path)
    if "binding_region" not in df.columns:
        raise ValueError("CSV must contain a 'binding_region' column")
    if len(df) < 2:
        raise ValueError("CSV must contain at least two rows for i7 and i5 binding regions")
    return df["binding_region"].iloc[0], df["binding_region"].iloc[1]


def load_config(config_path: Optional[Path]) -> Dict[str, object]:
    # The config may hold both index maps (dicts) and adapter strings.
    if not config_path:
        return {}
    with open(config_path, "r", encoding="utf-8") as handle:
        config = yaml.safe_load(handle) or {}
    return config


def generate_primers(
    template_binding_i7: str,
    template_binding_i5: str,
    i7_indexes: Optional[Dict[str, str]] = None,
    i5_indexes: Optional[Dict[str, str]] = None,
    i7_prefix: str = I7_PREFIX,
    i7_suffix: str = I7_SUFFIX,
    i5_prefix: str = I5_PREFIX,
    i5_suffix: str = I5_SUFFIX,
) -> List[Tuple[str, str]]:
    primers: List[Tuple[str, str]] = []
    i7_map = i7_indexes or I7_INDEXES
    i5_map = i5_indexes or I5_INDEXES

    for idx, seq in i7_map.items():
        name = f"i7_{idx}"
        full_seq = f"{i7_prefix}{seq}{i7_suffix}{template_binding_i7}"
        primers.append((name, full_seq))

    for idx, seq in i5_map.items():
        name = f"i5_{idx}"
        full_seq = f"{i5_prefix}{seq}{i5_suffix}{template_binding_i5}"
        primers.append((name, full_seq))

    return primers
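With the default index maps, generate_primers yields 24 primers (12 i7 + 12 i5), i7 entries first in insertion order; a quick check with fabricated binding regions:

```python
primers = generate_primers(
    template_binding_i7="GTGACCTATGAACTCAGGAG",  # fabricated
    template_binding_i5="CATTCGGACTGATCCTGAAG",  # fabricated
)
assert len(primers) == len(I7_INDEXES) + len(I5_INDEXES)  # 24 with defaults
print(primers[0][0], primers[0][1][:32], "...")
# i7_701 CAAGCAGAAGACGGCATACGAGATTCGCCTTA ...
```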
def run_nextera_primer_design(
    binding_csv: Path,
    output_csv: Path,
    log_path: Optional[Path] = None,
    config_path: Optional[Path] = None,
    logger: Optional[logging.Logger] = None,
) -> Path:
    binding_csv = Path(binding_csv)
    output_csv = Path(output_csv)
    output_csv.parent.mkdir(parents=True, exist_ok=True)

    managed_logger = logger is None
    if logger is None:
        logger = logging.getLogger("uht_tooling.nextera")
        logger.setLevel(logging.INFO)
        handler: logging.Handler
        if log_path:
            handler = logging.FileHandler(log_path, mode="w")
        else:
            handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
        logger.handlers = []
        logger.addHandler(handler)
        logger.propagate = False

    try:
        logger.info("Loading binding sequences from %s", binding_csv)
        template_i7, template_i5 = load_binding_sequences(binding_csv)
        logger.info("Loaded binding regions (i7 len=%s, i5 len=%s)", len(template_i7), len(template_i5))

        config = load_config(config_path)
        if config:
            logger.info("Loaded configuration overrides from %s", config_path)

        i7_indexes = config.get("i7_indexes") if config else None
        i5_indexes = config.get("i5_indexes") if config else None
        i7_prefix = config.get("i7_prefix", I7_PREFIX) if config else I7_PREFIX
        i7_suffix = config.get("i7_suffix", I7_SUFFIX) if config else I7_SUFFIX
        i5_prefix = config.get("i5_prefix", I5_PREFIX) if config else I5_PREFIX
        i5_suffix = config.get("i5_suffix", I5_SUFFIX) if config else I5_SUFFIX

        primers = generate_primers(
            template_binding_i7=template_i7,
            template_binding_i5=template_i5,
            i7_indexes=i7_indexes,
            i5_indexes=i5_indexes,
            i7_prefix=i7_prefix,
            i7_suffix=i7_suffix,
            i5_prefix=i5_prefix,
            i5_suffix=i5_suffix,
        )
        logger.info("Generated %s primers", len(primers))

        with output_csv.open("w", newline="") as file:
            writer = csv.writer(file)
            writer.writerow(["primer_name", "sequence"])
            writer.writerows(primers)

        logger.info("Wrote primers to %s", output_csv)
        return output_csv
    finally:
        if managed_logger and logger:
            for handler in list(logger.handlers):
                handler.close()
                logger.removeHandler(handler)
            logger.propagate = True
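The optional YAML config can override the index maps and adapter constants; a sketch of the keys the code reads, with fabricated values. Note that supplying i7_indexes or i5_indexes replaces the whole default map rather than merging into it:

```python
import yaml

# Keys consumed by run_nextera_primer_design; every one is optional.
config_text = """
i7_indexes:
  "714": "GCTCATGA"      # fabricated replacement map with a single index
i5_indexes:
  "517": "GCGTAAGA"      # fabricated
i7_prefix: "CAAGCAGAAGACGGCATACGAGAT"
"""
config = yaml.safe_load(config_text)
assert config["i7_indexes"] == {"714": "GCTCATGA"}
```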
def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Generate Nextera XT primers from binding region CSV input.")
    parser.add_argument(
        "--binding-csv",
        required=True,
        type=Path,
        help="CSV file with a 'binding_region' column; first row is i7, second row is i5.",
    )
    parser.add_argument(
        "--output-csv",
        required=True,
        type=Path,
        help="Path to write the generated primer CSV.",
    )
    parser.add_argument(
        "--log-path",
        default=None,
        type=Path,
        help="Optional path to write a log file.",
    )
    parser.add_argument(
        "--config",
        default=None,
        type=Path,
        help="Optional YAML file providing overrides for indexes/prefixes/suffixes.",
    )
    return parser


def main(argv: Optional[List[str]] = None):
    parser = build_parser()
    args = parser.parse_args(argv)
    run_nextera_primer_design(
        binding_csv=args.binding_csv,
        output_csv=args.output_csv,
        log_path=args.log_path,
        config_path=args.config,
    )


if __name__ == "__main__":
    main()
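As with the mutation caller, the designer's CLI can be exercised programmatically; paths are hypothetical:

```python
from uht_tooling.workflows.nextera_designer import main

# Hypothetical paths; --config is optional and omitted here.
main([
    "--binding-csv", "binding_regions.csv",
    "--output-csv", "nextera_primers.csv",
    "--log-path", "nextera.log",
])
```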