uht-tooling 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,368 @@
1
+ import argparse
2
+ import csv
3
+ import logging
4
+ import re
5
+ from collections import defaultdict
6
+ from math import floor
7
+ from pathlib import Path
8
+ from typing import Dict, List, Optional
9
+
10
+ import pandas as pd
11
+ from Bio import SeqIO
12
+ from Bio.Seq import Seq
13
+ from Bio.SeqUtils import MeltingTemp as mt
14
+
15
# Total length (nt) of the shared overhang placed at the 5' end of each primer:
# left flank + mutated bases + right flank always add up to this value.
OVERHANG_LEN = 20
# Length (nt) of the template-matching segment appended after the overhang.
ANNEAL_LEN = 20
17
+
18
+
19
def codon_table() -> Dict[str, str]:
    """Return a fresh codon -> one-letter amino acid mapping (stop codons map to "*").

    Codons are enumerated with each position cycling through T, C, A, G
    (first base slowest, third base fastest), so the dictionary's insertion
    order runs TTT, TTC, TTA, TTG, TCT, ... GGG. Callers that iterate the
    table see that order.
    """
    bases = "TCAG"
    # One residue per codon, listed in the enumeration order described above.
    residues = (
        "FFLLSSSSYY**CC*W"
        "LLLLPPPPHHQQRRRR"
        "IIIMTTTTNNKKSSRR"
        "VVVVAAAADDEEGGGG"
    )
    codons = (first + second + third for first in bases for second in bases for third in bases)
    return dict(zip(codons, residues))
86
+
87
+
88
def translate_codon(cd: str) -> str:
    """Translate a single codon (case-insensitive) to its one-letter amino acid.

    Returns "?" when the upper-cased codon is not a key of ``codon_table()``.
    """
    table = codon_table()
    key = cd.upper()
    if key in table:
        return table[key]
    return "?"
90
+
91
+
92
def pick_mutant_codon(wt_codon: str, target_aa: str) -> Optional[str]:
    """Choose the codon for ``target_aa`` that differs least from ``wt_codon``.

    Args:
        wt_codon: Wild-type codon used as the reference (compared case-insensitively).
        target_aa: One-letter amino acid code to encode ("*" selects a stop codon).

    Returns:
        The upper-case codon encoding ``target_aa`` with the fewest positional
        mismatches against ``wt_codon``; ties go to the codon that appears first
        in ``codon_table()``. Returns ``None`` when no codon encodes ``target_aa``.
    """
    reference = wt_codon.upper()
    candidates = [codon.upper() for codon, aa in codon_table().items() if aa == target_aa]
    if not candidates:
        return None

    def mismatches(codon: str) -> int:
        return sum(a != b for a, b in zip(codon, reference))

    # min() returns the first minimal element, matching a stable sort's head.
    return min(candidates, key=mismatches)
102
+
103
+
104
def get_subseq_circ(seq: str, start: int, length: int) -> str:
    """Slice ``length`` characters from ``seq`` treated as a circular string.

    Args:
        seq: The circular sequence.
        start: Start index; reduced modulo ``len(seq)``, so negative or
            out-of-range values wrap around.
        length: Number of characters to return; must be smaller than ``len(seq)``.

    Returns:
        The requested substring, continuing from the beginning of ``seq`` when
        the slice runs past its end.

    Raises:
        ValueError: If ``length`` is greater than or equal to ``len(seq)``.
    """
    total = len(seq)
    if not length < total:
        raise ValueError(f"Requested length {length} ≥ sequence length {total}")
    begin = start % total
    stop = begin + length
    # Number of characters that spill past the end of the linear string.
    overflow = stop - total
    if overflow > 0:
        return seq[begin:] + seq[:overflow]
    return seq[begin:stop]
113
+
114
+
115
def design_gibson_primers(full_seq: str, region_start: int, old_len: int, new_seq: str):
    """Build a forward/reverse Gibson primer pair for one edit on a circular template.

    The shared overhang is ``new_seq`` padded on both sides with template
    sequence so that it is exactly ``OVERHANG_LEN`` long. Each primer is that
    overhang (or its reverse complement) followed by an ``ANNEAL_LEN`` stretch
    of template lying just outside the overhang.

    Args:
        full_seq: Circular template sequence.
        region_start: Index in ``full_seq`` of the first base being replaced.
        old_len: Number of template bases replaced (0 for a pure insertion).
        new_seq: Replacement bases (empty for a deletion).

    Returns:
        ``(gibson_fwd, gibson_rev, fwd_start, rev_start)``: both primers in
        upper case, plus the template indices (modulo ``len(full_seq)``) where
        the forward and reverse annealing segments begin.

    Raises:
        ValueError: If ``new_seq`` is longer than ``OVERHANG_LEN`` or the
            template is too short for the flank/anneal lengths.
    """
    insert_len = len(new_seq)
    if insert_len > OVERHANG_LEN:
        raise ValueError(f"Mutation length {insert_len} > OVERHANG_LEN ({OVERHANG_LEN})")

    seq_len = len(full_seq)
    # Split whatever overhang length the insert leaves over; the right flank
    # takes the extra base when the remainder is odd.
    flank_total = OVERHANG_LEN - insert_len
    left_len = flank_total // 2
    right_len = flank_total - left_len

    if max(left_len, right_len, ANNEAL_LEN) >= seq_len:
        raise ValueError("Sequence too short for requested OVERHANG/ANNEAL lengths")

    region_end = region_start + old_len
    overhang = (
        get_subseq_circ(full_seq, region_start - left_len, left_len)
        + new_seq
        + get_subseq_circ(full_seq, region_end, right_len)
    )

    # Annealing segments sit immediately downstream / upstream of the overhang.
    fwd_start = (region_end + right_len) % seq_len
    rev_start = (region_start - left_len - ANNEAL_LEN) % seq_len
    fwd_anneal = get_subseq_circ(full_seq, fwd_start, ANNEAL_LEN)
    rev_anneal = get_subseq_circ(full_seq, rev_start, ANNEAL_LEN)

    def revcomp(fragment: str) -> str:
        return str(Seq(fragment).reverse_complement())

    gibson_fwd = (overhang + fwd_anneal).upper()
    gibson_rev = (revcomp(overhang) + revcomp(rev_anneal)).upper()
    return gibson_fwd, gibson_rev, fwd_start, rev_start
143
+
144
+
145
# Sub-mutation grammars, compiled once instead of on every loop iteration.
_DEL_RE = re.compile(r"^([A-Z])(\d+)Del$")
_INDEL_RE = re.compile(r"^([A-Z])(\d+)InDel([A-Z])(\d+)([A-Z]+)$")
_SUB_RE = re.compile(r"^([A-Z])(\d+)([A-Z])$")
_INS_RE = re.compile(r"^([A-Z])(\d+)([A-Z]{2,})$")


def _read_first_sequence(fasta_path: Path) -> str:
    """Return the first FASTA record of ``fasta_path`` as an upper-case string."""
    record = next(SeqIO.parse(str(fasta_path), "fasta"), None)
    if record is None:
        # A bare StopIteration from next() would give the user no hint.
        raise ValueError(f"No FASTA records found in {fasta_path}")
    return str(record.seq).upper()


def _codons_for(residues: str, wt_codon: str, error_for) -> str:
    """Concatenate one codon per residue, raising ``error_for(aa)`` when none exists."""
    codons = []
    for aa in residues:
        codon = pick_mutant_codon(wt_codon, aa)
        if not codon:
            raise error_for(aa)
        codons.append(codon)
    return "".join(codons)


def _resolve_submutation(sub: str, full_seq: str, gene_offset: int) -> "tuple[int, int, str]":
    """Turn one sub-mutation token into ``(region_start, old_len, new_seq)``.

    Recognised forms, tried in this order:
      * ``A12Del``           - delete codon 12.
      * ``A12InDelB14XYZ``   - replace codons 12..14 with codons for X, Y, Z.
      * ``A12XYZ``           - two or more letters after the position; when the
                               first letter equals the stated wild-type residue
                               the remaining residues are inserted after codon
                               12, otherwise codon 12 is replaced by all of them.
      * ``A12G``             - substitution; the only form whose wild-type
                               residue is checked against the template.

    Raises:
        ValueError: On an unrecognised token, a wild-type mismatch in a
            substitution, or a residue with no codon.
    """
    m_del = _DEL_RE.match(sub)
    if m_del:
        _, pos1_s = m_del.groups()
        return gene_offset + (int(pos1_s) - 1) * 3, 3, ""

    m_indel = _INDEL_RE.match(sub)
    if m_indel:
        wt1, pos1_s, _, pos2_s, ins_aa = m_indel.groups()
        pos1, pos2 = int(pos1_s), int(pos2_s)
        region_start = gene_offset + (pos1 - 1) * 3
        old_len = (pos2 - pos1 + 1) * 3
        wt_codon = get_subseq_circ(full_seq, region_start, 3)
        new_seq = _codons_for(
            ins_aa, wt_codon, lambda aa: ValueError(f"No codon found for {wt1}->{ins_aa}")
        )
        return region_start, old_len, new_seq

    m_ins = _INS_RE.match(sub)
    if m_ins:
        wt_aa, pos1_s, ins_str = m_ins.groups()
        codon_start_old = gene_offset + (int(pos1_s) - 1) * 3
        wt_codon = get_subseq_circ(full_seq, codon_start_old, 3)
        if ins_str[0] == wt_aa:
            # Wild-type residue is kept: insert the rest right after its codon.
            inserted_aas = ins_str[1:]
            region_start = codon_start_old + 3
            old_len = 0
        else:
            inserted_aas = ins_str
            region_start = codon_start_old
            old_len = 3
        new_seq = _codons_for(
            inserted_aas, wt_codon, lambda aa: ValueError(f"No codon for insertion amino acid {aa}")
        )
        return region_start, old_len, new_seq

    m_sub = _SUB_RE.match(sub)
    if m_sub:
        wt_aa, pos1_s, mut_aa = m_sub.groups()
        region_start = gene_offset + (int(pos1_s) - 1) * 3
        wt_codon = get_subseq_circ(full_seq, region_start, 3)
        translated = translate_codon(wt_codon)
        if translated != wt_aa:
            raise ValueError(f"For {sub}: expected {wt_aa}, found {translated} at {wt_codon}")
        new_seq = pick_mutant_codon(wt_codon, mut_aa)
        if not new_seq:
            raise ValueError(f"No minimal-change codon for {wt_aa}->{mut_aa}")
        return region_start, 3, new_seq

    raise ValueError(f"Unknown mutation format: {sub}")


def run_design_gibson(
    gene_fasta: Path,
    context_fasta: Path,
    mutations_csv: Path,
    output_dir: Path,
    log_path: Optional[Path] = None,
    logger: Optional[logging.Logger] = None,
) -> Dict[str, Path]:
    """Design Gibson primers for every mutation entry and write two CSV reports.

    Args:
        gene_fasta: FASTA whose first record is the gene (coding frame starts at base 1).
        context_fasta: FASTA whose first record is the circular construct containing the gene.
        mutations_csv: CSV with a ``mutations`` column; ``+`` chains sub-mutations
            into one group. Whitespace around entries and tokens is ignored.
        output_dir: Directory for the outputs (created if missing).
        log_path: Log file used when ``logger`` is not supplied; otherwise
            messages go to a stream handler.
        logger: Caller-owned logger. When given, its handlers are left untouched.

    Returns:
        ``{"primers_csv": <Gibson_primers.csv>, "plan_csv": <Gibson_assembly_plan.csv>}``.

    Raises:
        ValueError: On an empty FASTA, a missing ``mutations`` column, a gene
            that is not found in the circular context, or an invalid mutation.
    """
    gene_fasta = Path(gene_fasta)
    context_fasta = Path(context_fasta)
    mutations_csv = Path(mutations_csv)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Only a logger created here is reconfigured and torn down afterwards.
    managed_logger = logger is None
    if logger is None:
        logger = logging.getLogger("uht_tooling.design_gibson")
        logger.setLevel(logging.INFO)
        handler: logging.Handler
        if log_path:
            handler = logging.FileHandler(log_path, mode="w")
        else:
            handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
        logger.handlers = []
        logger.addHandler(handler)
        logger.propagate = False

    try:
        gene_seq = _read_first_sequence(gene_fasta)
        context_seq = _read_first_sequence(context_fasta)
        logger.info("Loaded gene (%s nt) and context (%s nt).", len(gene_seq), len(context_seq))

        df = pd.read_csv(mutations_csv)
        if "mutations" not in df.columns:
            raise ValueError("Mutations CSV must contain a 'mutations' column.")
        entries = df["mutations"].dropna().tolist()
        logger.info("Loaded %s mutation entries.", len(entries))

        # Searching the doubled context finds a gene that spans the origin.
        double_seq = context_seq + context_seq
        idx = double_seq.find(gene_seq)
        if idx == -1 or idx >= len(context_seq):
            raise ValueError("Could not align gene within circular context.")
        gene_offset = idx % len(context_seq)
        logger.info("Gene aligned at offset %s within context.", gene_offset)
        full_seq = context_seq
        seq_len = len(full_seq)

        primers_csv = output_dir / "Gibson_primers.csv"
        plan_csv = output_dir / "Gibson_assembly_plan.csv"
        group_entries: Dict[str, List[Dict[str, object]]] = defaultdict(list)

        with primers_csv.open("w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Group", "Submutation", "Primer Name", "Sequence"])

            for entry in entries:
                # str() tolerates non-string cells; strip() tolerates "A1G + B2C".
                submuts = [token.strip() for token in str(entry).split("+")]
                group_name = "_".join(submuts)
                logger.info("Processing group: %s with submutations: %s", group_name, submuts)

                for sub in submuts:
                    region_start, old_len, new_seq = _resolve_submutation(sub, full_seq, gene_offset)
                    gibson_fwd, gibson_rev, fwd_start, rev_start = design_gibson_primers(
                        full_seq, region_start, old_len, new_seq
                    )
                    primer_fwd_name = f"{group_name}__{sub}_Gibson_F"
                    primer_rev_name = f"{group_name}__{sub}_Gibson_R"

                    writer.writerow([group_name, sub, primer_fwd_name, gibson_fwd])
                    writer.writerow([group_name, sub, primer_rev_name, gibson_rev])

                    group_entries[group_name].append(
                        {
                            "sub": sub,
                            "fwd_name": primer_fwd_name,
                            "rev_name": primer_rev_name,
                            "fwd_pos": fwd_start % seq_len,
                            "rev_pos": rev_start % seq_len,
                            "fwd_seq": gibson_fwd,
                            "rev_seq": gibson_rev,
                        }
                    )

        with plan_csv.open("w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                [
                    "Group",
                    "Submutation",
                    "PCR_Primer_Forward",
                    "PCR_Primer_Reverse",
                    "Tm (celsius)",
                    "Amplicon Size (bp)",
                ]
            )

            for group_name, members in group_entries.items():
                sorted_forwards = sorted(members, key=lambda e: e["fwd_pos"])
                sorted_reverses = sorted(members, key=lambda e: e["rev_pos"])
                n = len(sorted_forwards)
                for i, f_entry in enumerate(sorted_forwards):
                    # Each forward primer is paired with the next reverse primer
                    # around the circle (with itself when the group has one edit).
                    r_entry = sorted_reverses[(i + 1) % n]

                    Tm_fwd = mt.Tm_NN(f_entry["fwd_seq"])
                    Tm_rev = mt.Tm_NN(r_entry["rev_seq"])
                    Tm_pair = min(Tm_fwd, Tm_rev)

                    fwd_start = f_entry["fwd_pos"]
                    rev_end = (r_entry["rev_pos"] + ANNEAL_LEN - 1) % seq_len
                    # Inclusive distance from fwd_start to rev_end, wrapping past
                    # the origin when the reverse site lies "before" the forward one.
                    amp_size = (rev_end - fwd_start) % seq_len + 1

                    writer.writerow(
                        [
                            group_name,
                            f_entry["sub"],
                            f_entry["fwd_name"],
                            r_entry["rev_name"],
                            f"{Tm_pair:.1f}",
                            amp_size,
                        ]
                    )

        logger.info("Wrote Gibson outputs to %s and %s", primers_csv, plan_csv)
        return {"primers_csv": primers_csv, "plan_csv": plan_csv}
    finally:
        if managed_logger:
            for handler in list(logger.handlers):
                handler.close()
                logger.removeHandler(handler)
            logger.propagate = True
338
+
339
+
340
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Gibson primer design tool.

    Four required path options (gene FASTA, context FASTA, mutations CSV,
    output directory) are registered first, followed by the optional
    ``--log-path``. Every option value is converted to ``pathlib.Path``.
    """
    cli = argparse.ArgumentParser(description="Design Gibson assembly primers from user-provided inputs.")

    required_options = (
        ("--gene-fasta", "Path to gene FASTA file."),
        ("--context-fasta", "Path to circular context FASTA file."),
        ("--mutations-csv", "CSV with a 'mutations' column (use '+' to chain sub-mutations)."),
        ("--output-dir", "Directory to write result CSV files."),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, type=Path, help=help_text)

    cli.add_argument("--log-path", default=None, type=Path, help="Optional log file path.")
    return cli
353
+
354
+
355
def main(argv: Optional[List[str]] = None):
    """Command-line entry point: parse ``argv`` and run the primer design.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
    """
    options = build_parser().parse_args(argv)
    forwarded = ("gene_fasta", "context_fasta", "mutations_csv", "output_dir", "log_path")
    run_design_gibson(**{name: getattr(options, name) for name in forwarded})
365
+
366
+
367
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()