uht-tooling 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uht_tooling/__init__.py +10 -0
- uht_tooling/cli.py +368 -0
- uht_tooling/models/__init__.py +0 -0
- uht_tooling/workflows/__init__.py +0 -0
- uht_tooling/workflows/design_gibson.py +368 -0
- uht_tooling/workflows/design_slim.py +402 -0
- uht_tooling/workflows/gui.py +595 -0
- uht_tooling/workflows/mut_rate.py +2480 -0
- uht_tooling/workflows/mutation_caller.py +432 -0
- uht_tooling/workflows/nextera_designer.py +199 -0
- uht_tooling/workflows/profile_inserts.py +441 -0
- uht_tooling/workflows/umi_hunter.py +412 -0
- uht_tooling-0.1.2.dist-info/METADATA +271 -0
- uht_tooling-0.1.2.dist-info/RECORD +17 -0
- uht_tooling-0.1.2.dist-info/WHEEL +5 -0
- uht_tooling-0.1.2.dist-info/entry_points.txt +2 -0
- uht_tooling-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import csv
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from math import floor
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from Bio import SeqIO
|
|
12
|
+
from Bio.Seq import Seq
|
|
13
|
+
from Bio.SeqUtils import MeltingTemp as mt
|
|
14
|
+
|
|
15
|
+
# Total length (nt) of the primer overhang: the mutated sequence plus the
# wild-type flanks that pad it out on both sides.
OVERHANG_LEN = 20
# Length (nt) of the template-annealing segment taken for each primer.
ANNEAL_LEN = 20
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def codon_table() -> Dict[str, str]:
    """Return the standard genetic code as a codon -> amino-acid mapping.

    Keys are upper-case DNA codons and values are one-letter amino-acid
    codes, with ``"*"`` marking the three stop codons. A fresh dictionary is
    built on every call. Insertion order follows the T, C, A, G ordering at
    each codon position (TTT, TTC, TTA, TTG, TCT, ...).
    """
    bases = "TCAG"
    # One amino acid per codon, listed in the same T/C/A/G nested order that
    # the comprehension below generates; one row per first base.
    amino_acids = (
        "FFLLSSSSYY**CC*W"  # T..
        "LLLLPPPPHHQQRRRR"  # C..
        "IIIMTTTTNNKKSSRR"  # A..
        "VVVVAAAADDEEGGGG"  # G..
    )
    codons = [
        first + second + third
        for first in bases
        for second in bases
        for third in bases
    ]
    return dict(zip(codons, amino_acids))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def translate_codon(cd: str) -> str:
    """Translate a single codon to its one-letter amino-acid code.

    The lookup is case-insensitive. Anything that is not a key of
    ``codon_table()`` translates to ``"?"``.
    """
    table = codon_table()
    key = cd.upper()
    if key in table:
        return table[key]
    return "?"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def pick_mutant_codon(wt_codon: str, target_aa: str) -> Optional[str]:
    """Pick the codon for ``target_aa`` that is closest to ``wt_codon``.

    Args:
        wt_codon: Wild-type codon used as the reference (case-insensitive).
        target_aa: One-letter amino-acid code (or ``"*"``) to encode; it is
            compared against the table values as given.

    Returns:
        The upper-case codon encoding ``target_aa`` with the fewest
        position-wise differences from ``wt_codon``. Ties go to the codon
        that comes first in ``codon_table()`` order. Returns ``None`` when
        no codon encodes ``target_aa``.
    """
    reference = wt_codon.upper()
    candidates = [codon for codon, aa in codon_table().items() if aa == target_aa]
    if not candidates:
        return None

    def mismatches(codon: str) -> int:
        # zip stops at the shorter string, so only overlapping positions count.
        return sum(a != b for a, b in zip(codon.upper(), reference))

    # min() returns the first minimal element, matching a stable sort.
    return min(candidates, key=mismatches)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def get_subseq_circ(seq: str, start: int, length: int) -> str:
    """Return ``length`` characters of ``seq`` starting at ``start``, treating it as circular.

    Args:
        seq: The circular sequence.
        start: Start index; any integer is accepted and wrapped modulo
            ``len(seq)`` (so negative values count back from the origin).
        length: Number of characters to return; must satisfy
            ``0 <= length < len(seq)``.

    Returns:
        The requested slice, wrapping past the end of ``seq`` when needed.

    Raises:
        ValueError: If ``length`` is negative or not smaller than
            ``len(seq)`` (this also covers an empty ``seq``).
    """
    N = len(seq)
    if length < 0:
        # A negative length would otherwise fall through to a plain slice
        # with a negative end index and return unrelated characters.
        raise ValueError(f"Requested length {length} must be non-negative")
    if length >= N:
        raise ValueError(f"Requested length {length} ≥ sequence length {N}")
    s_mod = start % N
    end = s_mod + length
    if end <= N:
        return seq[s_mod:end]
    # The slice runs past the end: take the tail, then wrap to the head.
    return seq[s_mod:] + seq[: end - N]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def design_gibson_primers(full_seq: str, region_start: int, old_len: int, new_seq: str):
    """Design a forward/reverse Gibson primer pair that swaps in ``new_seq``.

    The shared overhang is ``OVERHANG_LEN`` nt long: ``new_seq`` padded with
    wild-type sequence taken immediately before ``region_start`` and
    immediately after the replaced region. Each primer is that overhang
    (reverse-complemented for the reverse primer) followed by an
    ``ANNEAL_LEN`` nt annealing segment taken from ``full_seq``, which is
    treated as circular.

    Args:
        full_seq: Circular template sequence.
        region_start: Index in ``full_seq`` of the first replaced base.
        old_len: Number of template bases being replaced (0 for a pure
            insertion).
        new_seq: Replacement sequence (empty for a pure deletion).

    Returns:
        ``(gibson_fwd, gibson_rev, fwd_start, rev_start)``: the two
        upper-case primer sequences and the start indices (modulo
        ``len(full_seq)``) of the forward and reverse annealing segments.

    Raises:
        ValueError: If ``new_seq`` is longer than ``OVERHANG_LEN``, if
            ``old_len`` is negative, or if ``full_seq`` is too short for the
            requested flank/anneal lengths.
    """
    m = len(new_seq)
    if OVERHANG_LEN < m:
        raise ValueError(f"Mutation length {m} > OVERHANG_LEN ({OVERHANG_LEN})")
    if old_len < 0:
        # A negative span would place the right flank upstream of the
        # mutation and silently yield primers for the wrong region.
        raise ValueError(f"Replaced region length must be non-negative, got {old_len}")

    N = len(full_seq)
    # Split the remaining overhang budget between the two flanks; the right
    # flank takes the extra base when the budget is odd.
    flank_left = (OVERHANG_LEN - m) // 2
    flank_right = OVERHANG_LEN - m - flank_left

    if flank_left >= N or flank_right >= N or ANNEAL_LEN >= N:
        raise ValueError("Sequence too short for requested OVERHANG/ANNEAL lengths")

    region_end = region_start + old_len
    oh_left = get_subseq_circ(full_seq, region_start - flank_left, flank_left)
    oh_right = get_subseq_circ(full_seq, region_end, flank_right)
    overhang = oh_left + new_seq + oh_right

    # Forward anneal starts right after the right flank; reverse anneal is
    # the ANNEAL_LEN bases that end right before the left flank.
    fwd_start = (region_end + flank_right) % N
    rev_start = (region_start - flank_left - ANNEAL_LEN) % N

    fwd_anneal = get_subseq_circ(full_seq, fwd_start, ANNEAL_LEN)
    rev_anneal = get_subseq_circ(full_seq, rev_start, ANNEAL_LEN)

    gibson_fwd = (overhang + fwd_anneal).upper()
    overhang_rc = str(Seq(overhang).reverse_complement()).upper()
    rev_anneal_rc = str(Seq(rev_anneal).reverse_complement()).upper()
    gibson_rev = overhang_rc + rev_anneal_rc

    return gibson_fwd, gibson_rev, fwd_start, rev_start
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Sub-mutation token grammars, compiled once instead of per loop iteration.
_GIBSON_DEL_RE = re.compile(r"^([A-Z])(\d+)Del$")
_GIBSON_INDEL_RE = re.compile(r"^([A-Z])(\d+)InDel([A-Z])(\d+)([A-Z]+)$")
_GIBSON_SUB_RE = re.compile(r"^([A-Z])(\d+)([A-Z])$")
_GIBSON_INS_RE = re.compile(r"^([A-Z])(\d+)([A-Z]{2,})$")


def _read_first_sequence(fasta_path: Path) -> str:
    """Return the first record of a FASTA file as an upper-case string."""
    record = next(SeqIO.parse(str(fasta_path), "fasta"), None)
    if record is None:
        # Without the default above, an empty file would leak StopIteration.
        raise ValueError(f"No FASTA records found in {fasta_path}")
    return str(record.seq).upper()


def _resolve_submutation(sub: str, full_seq: str, gene_offset: int):
    """Turn one sub-mutation token into ``(region_start, old_len, new_seq)``."""
    m_del = _GIBSON_DEL_RE.match(sub)
    if m_del:
        pos1 = int(m_del.group(2))
        return gene_offset + (pos1 - 1) * 3, 3, ""

    m_indel = _GIBSON_INDEL_RE.match(sub)
    if m_indel:
        wt1, pos1_s, _, pos2_s, ins_aa = m_indel.groups()
        pos1, pos2 = int(pos1_s), int(pos2_s)
        if pos2 < pos1:
            # Would otherwise produce a negative replaced-region length.
            raise ValueError(f"For {sub}: end position {pos2} precedes start position {pos1}")
        region_start = gene_offset + (pos1 - 1) * 3
        old_len = (pos2 - pos1 + 1) * 3
        wt_codon = get_subseq_circ(full_seq, region_start, 3)
        codons = []
        for aa in ins_aa:
            codon = pick_mutant_codon(wt_codon, aa)
            if not codon:
                raise ValueError(f"No codon found for {wt1}->{ins_aa}")
            codons.append(codon)
        return region_start, old_len, "".join(codons)

    m_ins = _GIBSON_INS_RE.match(sub)
    if m_ins:
        wt_aa, pos1_s, ins_str = m_ins.groups()
        codon_start_old = gene_offset + (int(pos1_s) - 1) * 3
        wt_codon = get_subseq_circ(full_seq, codon_start_old, 3)
        if ins_str[0] == wt_aa:
            # Wild-type residue is kept: insert the rest after its codon.
            inserted_aas = ins_str[1:]
            region_start = codon_start_old + 3
            old_len = 0
        else:
            # Wild-type residue is replaced by the whole inserted string.
            inserted_aas = ins_str
            region_start = codon_start_old
            old_len = 3
        codons = []
        for aa in inserted_aas:
            codon = pick_mutant_codon(wt_codon, aa)
            if not codon:
                raise ValueError(f"No codon for insertion amino acid {aa}")
            codons.append(codon)
        return region_start, old_len, "".join(codons)

    m_sub = _GIBSON_SUB_RE.match(sub)
    if m_sub:
        wt_aa, pos1_s, mut_aa = m_sub.groups()
        region_start = gene_offset + (int(pos1_s) - 1) * 3
        wt_codon = get_subseq_circ(full_seq, region_start, 3)
        translated = translate_codon(wt_codon)
        if translated != wt_aa:
            raise ValueError(
                f"For {sub}: expected {wt_aa}, found {translated} at {wt_codon}"
            )
        new_seq = pick_mutant_codon(wt_codon, mut_aa)
        if not new_seq:
            raise ValueError(f"No minimal-change codon for {wt_aa}->{mut_aa}")
        return region_start, 3, new_seq

    raise ValueError(f"Unknown mutation format: {sub}")


def _write_assembly_plan(plan_csv: Path, group_entries, seq_len: int) -> None:
    """Write the per-group PCR pairing plan (primer pair, Tm, amplicon size)."""
    with plan_csv.open("w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            [
                "Group",
                "Submutation",
                "PCR_Primer_Forward",
                "PCR_Primer_Reverse",
                "Tm (celsius)",
                "Amplicon Size (bp)",
            ]
        )

        for group_name, members in group_entries.items():
            sorted_forwards = sorted(members, key=lambda e: e["fwd_pos"])
            sorted_reverses = sorted(members, key=lambda e: e["rev_pos"])
            count = len(sorted_forwards)
            for i, f_entry in enumerate(sorted_forwards):
                # Pair each forward primer with the reverse primer at the
                # next index in rev_pos order, wrapping around the group.
                r_entry = sorted_reverses[(i + 1) % count]

                tm_pair = min(mt.Tm_NN(f_entry["fwd_seq"]), mt.Tm_NN(r_entry["rev_seq"]))

                fwd_start = f_entry["fwd_pos"]
                rev_end = (r_entry["rev_pos"] + ANNEAL_LEN - 1) % seq_len
                if rev_end >= fwd_start:
                    amp_size = rev_end - fwd_start + 1
                else:
                    # Amplicon crosses the origin of the circular sequence.
                    amp_size = (seq_len - fwd_start) + (rev_end + 1)

                writer.writerow(
                    [
                        group_name,
                        f_entry["sub"],
                        f_entry["fwd_name"],
                        r_entry["rev_name"],
                        f"{tm_pair:.1f}",
                        amp_size,
                    ]
                )


def run_design_gibson(
    gene_fasta: Path,
    context_fasta: Path,
    mutations_csv: Path,
    output_dir: Path,
    log_path: Optional[Path] = None,
    logger: Optional[logging.Logger] = None,
) -> Dict[str, Path]:
    """Design Gibson primers for every mutation listed in a CSV file.

    Args:
        gene_fasta: FASTA file whose first record is the gene sequence.
        context_fasta: FASTA file whose first record is the circular
            sequence that contains the gene.
        mutations_csv: CSV with a ``mutations`` column; each cell holds one
            or more sub-mutations joined by ``+``.
        output_dir: Directory for the result files (created if missing).
        log_path: Log file used when no ``logger`` is supplied; when this is
            also omitted, a stream handler is used.
        logger: Logger to use. When given, its handlers are left untouched.

    Returns:
        ``{"primers_csv": ..., "plan_csv": ...}`` with the paths of
        ``Gibson_primers.csv`` and ``Gibson_assembly_plan.csv``.

    Raises:
        ValueError: If a FASTA file has no records, the CSV lacks a
            ``mutations`` column, the gene is not found in the context, or a
            sub-mutation is malformed or inconsistent with the sequence.
    """
    gene_fasta = Path(gene_fasta)
    context_fasta = Path(context_fasta)
    mutations_csv = Path(mutations_csv)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    managed_logger = logger is None
    if logger is None:
        logger = logging.getLogger("uht_tooling.design_gibson")
        logger.setLevel(logging.INFO)
        handler: logging.Handler
        if log_path:
            handler = logging.FileHandler(log_path, mode="w")
        else:
            handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
        logger.handlers = []
        logger.addHandler(handler)
        logger.propagate = False

    try:
        gene_seq = _read_first_sequence(gene_fasta)
        context_seq = _read_first_sequence(context_fasta)
        logger.info("Loaded gene (%s nt) and context (%s nt).", len(gene_seq), len(context_seq))

        df = pd.read_csv(mutations_csv)
        if "mutations" not in df.columns:
            raise ValueError("Mutations CSV must contain a 'mutations' column.")
        entries = df["mutations"].dropna().tolist()
        logger.info("Loaded %s mutation entries.", len(entries))

        # Search the doubled context so a gene spanning the origin is found.
        double_seq = context_seq + context_seq
        idx = double_seq.find(gene_seq)
        if idx == -1 or idx >= len(context_seq):
            raise ValueError("Could not align gene within circular context.")
        gene_offset = idx % len(context_seq)
        logger.info("Gene aligned at offset %s within context.", gene_offset)
        full_seq = context_seq

        primers_csv = output_dir / "Gibson_primers.csv"
        plan_csv = output_dir / "Gibson_assembly_plan.csv"
        group_entries: Dict[str, List[Dict[str, object]]] = defaultdict(list)

        with primers_csv.open("w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Group", "Submutation", "Primer Name", "Sequence"])

            for entry in entries:
                # Tolerate non-string cells and stray whitespace around
                # tokens; clean tokens give the same group name as before.
                submuts = [token.strip() for token in str(entry).split("+")]
                group_name = "_".join(submuts)
                logger.info("Processing group: %s with submutations: %s", group_name, submuts)

                for sub in submuts:
                    region_start, old_len, new_seq = _resolve_submutation(
                        sub, full_seq, gene_offset
                    )
                    gibson_fwd, gibson_rev, fwd_start, rev_start = design_gibson_primers(
                        full_seq, region_start, old_len, new_seq
                    )
                    primer_fwd_name = f"{group_name}__{sub}_Gibson_F"
                    primer_rev_name = f"{group_name}__{sub}_Gibson_R"

                    writer.writerow([group_name, sub, primer_fwd_name, gibson_fwd])
                    writer.writerow([group_name, sub, primer_rev_name, gibson_rev])

                    group_entries[group_name].append(
                        {
                            "sub": sub,
                            "fwd_name": primer_fwd_name,
                            "rev_name": primer_rev_name,
                            "fwd_pos": fwd_start % len(full_seq),
                            "rev_pos": rev_start % len(full_seq),
                            "fwd_seq": gibson_fwd,
                            "rev_seq": gibson_rev,
                        }
                    )

        _write_assembly_plan(plan_csv, group_entries, len(full_seq))

        logger.info("Wrote Gibson outputs to %s and %s", primers_csv, plan_csv)
        return {"primers_csv": primers_csv, "plan_csv": plan_csv}
    finally:
        # Only tear down handlers this function installed itself.
        if managed_logger:
            for handler in list(logger.handlers):
                handler.close()
                logger.removeHandler(handler)
            logger.propagate = True
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Gibson primer design workflow.

    Four required path options are registered (``--gene-fasta``,
    ``--context-fasta``, ``--mutations-csv``, ``--output-dir``), followed by
    the optional ``--log-path``. Every value is converted to a ``Path``.
    """
    cli = argparse.ArgumentParser(description="Design Gibson assembly primers from user-provided inputs.")

    # (flag, help text) for each required option, in registration order.
    required_options = (
        ("--gene-fasta", "Path to gene FASTA file."),
        ("--context-fasta", "Path to circular context FASTA file."),
        ("--mutations-csv", "CSV with a 'mutations' column (use '+' to chain sub-mutations)."),
        ("--output-dir", "Directory to write result CSV files."),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, type=Path, help=help_text)

    cli.add_argument("--log-path", default=None, type=Path, help="Optional log file path.")
    return cli
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def main(argv: Optional[List[str]] = None):
    """Parse command-line arguments and run the Gibson design workflow.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.
    """
    options = build_parser().parse_args(argv)
    workflow_kwargs = {
        "gene_fasta": options.gene_fasta,
        "context_fasta": options.context_fasta,
        "mutations_csv": options.mutations_csv,
        "output_dir": options.output_dir,
        "log_path": options.log_path,
    }
    run_design_gibson(**workflow_kwargs)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    main()
|