uht_tooling-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uht_tooling/__init__.py +10 -0
- uht_tooling/cli.py +368 -0
- uht_tooling/models/__init__.py +0 -0
- uht_tooling/workflows/__init__.py +0 -0
- uht_tooling/workflows/design_gibson.py +368 -0
- uht_tooling/workflows/design_slim.py +402 -0
- uht_tooling/workflows/gui.py +595 -0
- uht_tooling/workflows/mut_rate.py +2480 -0
- uht_tooling/workflows/mutation_caller.py +432 -0
- uht_tooling/workflows/nextera_designer.py +199 -0
- uht_tooling/workflows/profile_inserts.py +441 -0
- uht_tooling/workflows/umi_hunter.py +412 -0
- uht_tooling-0.1.2.dist-info/METADATA +271 -0
- uht_tooling-0.1.2.dist-info/RECORD +17 -0
- uht_tooling-0.1.2.dist-info/WHEEL +5 -0
- uht_tooling-0.1.2.dist-info/entry_points.txt +2 -0
- uht_tooling-0.1.2.dist-info/top_level.txt +1 -0

@@ -0,0 +1,412 @@
import argparse
import csv
import gzip
import logging
import os
import re
import subprocess
import tempfile
from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

import pandas as pd
from Bio import AlignIO, SeqIO
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm


def reverse_complement(seq: str) -> str:
    return seq.translate(str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]


def load_flank_config(config_csv: Path) -> dict:
    df = pd.read_csv(config_csv)
    return {
        "umi_start": df.loc[0, "umi_flanks"],
        "umi_end": df.loc[1, "umi_flanks"],
        "umi_min": int(df.loc[0, "umi_min_max"]),
        "umi_max": int(df.loc[1, "umi_min_max"]),
        "gene_start": df.loc[0, "gene_flanks"],
        "gene_end": df.loc[1, "gene_flanks"],
    }


def build_patterns(cfg: dict) -> tuple[re.Pattern, re.Pattern]:
    pattern_umi = re.compile(
        rf"{cfg['umi_start']}([ACGT]{{{cfg['umi_min']},{cfg['umi_max']}}}){cfg['umi_end']}",
        re.IGNORECASE,
    )
    pattern_gene = re.compile(rf"{cfg['gene_start']}(.*?){cfg['gene_end']}", re.IGNORECASE)
    return pattern_umi, pattern_gene


def extract_read_info(
    seq: str,
    pattern_umi: re.Pattern,
    pattern_gene: re.Pattern,
    logger: logging.Logger,
) -> tuple[Optional[str], Optional[str]]:
    umi_match = pattern_umi.search(seq)
    gene_match = pattern_gene.search(seq)
    if umi_match and gene_match:
        return umi_match.group(1), gene_match.group(1)
    rev_seq = reverse_complement(seq)
    umi_match = pattern_umi.search(rev_seq)
    gene_match = pattern_gene.search(rev_seq)
    if umi_match and gene_match:
        return umi_match.group(1), gene_match.group(1)
    logger.debug("Failed to extract UMI/gene from read")
    return None, None


def process_fastq(
    file_path: Path,
    pattern_umi: re.Pattern,
    pattern_gene: re.Pattern,
    logger: logging.Logger,
) -> tuple[int, Dict[str, List[str]]]:
    read_count = 0
    umi_info: Dict[str, List[str]] = {}
    extracted = 0
    with gzip.open(file_path, "rt") as handle:
        while True:
            header = handle.readline()
            if not header:
                break
            seq = handle.readline().strip()
            handle.readline()
            handle.readline()
            read_count += 1

            umi, gene = extract_read_info(seq, pattern_umi, pattern_gene, logger)
            if umi and gene:
                umi_info.setdefault(umi, []).append(gene)
                extracted += 1
            if read_count % 100000 == 0:
                logger.info("Processed %s reads so far in %s", read_count, file_path.name)
    logger.info(
        "Finished reading %s: total reads=%s, extracted pairs=%s",
        file_path,
        read_count,
        extracted,
    )
    return read_count, umi_info


def levenshtein(s1: str, s2: str) -> int:
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


def percent_identity(seq1: str, seq2: str) -> float:
    max_len = max(len(seq1), len(seq2))
    if max_len == 0:
        return 1.0
    dist = levenshtein(seq1, seq2)
    return (max_len - dist) / max_len


def cluster_umis(
    umi_info: Dict[str, List[str]],
    threshold: float,
    logger: logging.Logger,
) -> List[dict]:
    logger.info("Clustering %s unique UMIs with threshold %.2f", len(umi_info), threshold)
    sorted_umis = sorted(umi_info.items(), key=lambda item: len(item[1]), reverse=True)
    clusters: List[dict] = []
    for umi, gene_list in sorted_umis:
        count = len(gene_list)
        for cluster in clusters:
            if percent_identity(umi, cluster["rep"]) >= threshold:
                cluster["total_count"] += count
                cluster["members"][umi] = count
                cluster["gene_seqs"].extend(gene_list)
                break
        else:
            clusters.append(
                {
                    "rep": umi,
                    "total_count": count,
                    "members": {umi: count},
                    "gene_seqs": list(gene_list),
                }
            )
    logger.info("Formed %s UMI clusters", len(clusters))
    return clusters


def run_mafft_alignment(reference: SeqRecord, gene_seqs: List[str]) -> AlignIO.MultipleSeqAlignment:
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_in:
        fasta_in = tmp_in.name
        reference.id = "REF_template"
        reference.description = ""
        SeqIO.write(reference, tmp_in, "fasta")
        for i, seq in enumerate(gene_seqs):
            record = SeqRecord(Seq(seq), id=f"seq{i}", description="")
            SeqIO.write(record, tmp_in, "fasta")

    mafft_cline = MafftCommandline(input=fasta_in)
    proc = subprocess.Popen(
        str(mafft_cline),
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        os.remove(fasta_in)
        raise RuntimeError(f"MAFFT failed with exit code {proc.returncode}:\n{stderr}")

    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_out:
        fasta_out = tmp_out.name
        tmp_out.write(stdout)

    alignment = AlignIO.read(fasta_out, "fasta")
    os.remove(fasta_in)
    os.remove(fasta_out)
    return alignment


def generate_consensus(
    reference_record: SeqRecord,
    gene_seqs: List[str],
    mutation_threshold: float,
    logger: logging.Logger,
) -> str:
    if not gene_seqs:
        return ""
    alignment = run_mafft_alignment(reference_record, gene_seqs)
    ref_record = None
    other_records: List[SeqRecord] = []
    for record in alignment:
        if record.id == "REF_template":
            ref_record = record
        else:
            other_records.append(record)
    if ref_record is None or not other_records:
        logger.warning("Reference or read sequences missing from alignment output")
        return ""

    consensus_chars: List[str] = []
    num_reads = len(other_records)
    length = alignment.get_alignment_length()
    for idx in range(length):
        ref_base = ref_record.seq[idx]
        col_bases = [record.seq[idx] for record in other_records]
        counts = Counter(col_bases)
        most_common, count = counts.most_common(1)[0]
        freq = count / num_reads
        if most_common != ref_base and freq >= mutation_threshold:
            consensus_chars.append(most_common)
        else:
            consensus_chars.append(ref_base)
    return "".join(consensus_chars).replace("-", "")


def write_umi_csv(output_file: Path, clusters: List[dict]):
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(["Cluster Representative", "Total Count", "Members"])
        for cluster in clusters:
            members_str = "; ".join(f"{umi}:{count}" for umi, count in cluster["members"].items())
            writer.writerow([cluster["rep"], cluster["total_count"], members_str])


def write_gene_csv(
    output_file: Path,
    clusters: List[dict],
    reference_record: SeqRecord,
    mutation_threshold: float,
    logger: logging.Logger,
) -> List[SeqRecord]:
    output_file.parent.mkdir(parents=True, exist_ok=True)
    ungapped_ref_length = len(str(reference_record.seq).replace("-", ""))
    consensus_records: List[SeqRecord] = []
    with output_file.open("w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(
            ["Cluster Representative", "Total Count", "Consensus Gene", "Length Difference", "Members"]
        )
        clusters_to_align = [cluster for cluster in clusters if cluster["total_count"] > 0]
        for idx, cluster in enumerate(tqdm(clusters_to_align, desc="Processing UMI clusters", unit="cluster")):
            consensus = generate_consensus(reference_record, cluster["gene_seqs"], mutation_threshold, logger)
            length_diff = len(consensus) - ungapped_ref_length
            members_str = "; ".join(f"{umi}:{count}" for umi, count in cluster["members"].items())
            writer.writerow([cluster["rep"], cluster["total_count"], consensus, length_diff, members_str])
            record_id = f"{cluster['rep']}_cluster{idx + 1}"
            consensus_records.append(
                SeqRecord(Seq(consensus), id=record_id, description=f"Length diff: {length_diff}")
            )
    return consensus_records


def run_umi_hunter(
    template_fasta: Path,
    config_csv: Path,
    fastq_files: Sequence[Path],
    output_dir: Path,
    umi_identity_threshold: float = 0.9,
    consensus_mutation_threshold: float = 0.7,
    log_path: Optional[Path] = None,
    logger: Optional[logging.Logger] = None,
) -> List[Dict[str, Path]]:
    template_fasta = Path(template_fasta)
    config_csv = Path(config_csv)
    fastq_files = [Path(fq) for fq in fastq_files]
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    managed_logger = logger is None
    if logger is None:
        logger = logging.getLogger("uht_tooling.umi_hunter")
        logger.setLevel(logging.INFO)
        handler: logging.Handler
        if log_path:
            handler = logging.FileHandler(log_path, mode="w")
        else:
            handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
        logger.handlers = []
        logger.addHandler(handler)
        logger.propagate = False

    try:
        if not fastq_files:
            raise ValueError("No FASTQ files provided.")

        cfg = load_flank_config(config_csv)
        pattern_umi, pattern_gene = build_patterns(cfg)
        reference_record = next(SeqIO.parse(str(template_fasta), "fasta"))

        results: List[Dict[str, Path]] = []

        for fastq in fastq_files:
            if not fastq.exists():
                logger.warning("FASTQ file %s not found; skipping.", fastq)
                continue
            sample_base = fastq.stem.replace(".fastq", "")
            sample_dir = output_dir / sample_base
            sample_dir.mkdir(parents=True, exist_ok=True)

            read_count, umi_info = process_fastq(fastq, pattern_umi, pattern_gene, logger)
            if not umi_info:
                logger.warning("No UMIs extracted for %s; skipping.", fastq)
                continue

            clusters = cluster_umis(umi_info, umi_identity_threshold, logger)
            umi_csv = sample_dir / f"{sample_base}_UMI_clusters.csv"
            write_umi_csv(umi_csv, clusters)

            gene_csv = sample_dir / f"{sample_base}_gene_consensus.csv"
            consensus_records = write_gene_csv(
                gene_csv,
                clusters,
                reference_record,
                consensus_mutation_threshold,
                logger,
            )

            fasta_out = sample_dir / f"{sample_base}_consensuses.fasta"
            SeqIO.write(consensus_records, fasta_out, "fasta")

            results.append(
                {
                    "sample": sample_base,
                    "directory": sample_dir,
                    "umi_csv": umi_csv,
                    "gene_csv": gene_csv,
                    "fasta": fasta_out,
                    "reads": read_count,
                    "clusters": len(clusters),
                }
            )

        if not results:
            logger.warning("No UMI hunter outputs generated.")
        return results
    finally:
        if managed_logger and logger:
            for handler in list(logger.handlers):
                handler.close()
                logger.removeHandler(handler)
            logger.propagate = True


def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
    paths: List[Path] = []
    for item in inputs:
        if any(ch in item for ch in "*?[]"):
            paths.extend(Path().glob(item))
        else:
            paths.append(Path(item))
    unique_paths: List[Path] = []
    seen = set()
    for path in paths:
        resolved = path.resolve()
        if resolved not in seen:
            seen.add(resolved)
            unique_paths.append(path)
    return unique_paths


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Cluster UMIs and generate consensus sequences from long-read data.")
    parser.add_argument("--template-fasta", required=True, type=Path, help="Template FASTA file.")
    parser.add_argument("--config-csv", required=True, type=Path, help="CSV describing UMI and gene flanks.")
    parser.add_argument(
        "--fastq",
        required=True,
        nargs="+",
        help="One or more FASTQ(.gz) paths or glob patterns.",
    )
    parser.add_argument("--output-dir", required=True, type=Path, help="Directory for per-sample outputs.")
    parser.add_argument(
        "--umi-identity-threshold",
        type=float,
        default=0.9,
        help="UMI clustering identity threshold (default: 0.9).",
    )
    parser.add_argument(
        "--consensus-mutation-threshold",
        type=float,
        default=0.7,
        help="Consensus mutation threshold for MAFFT-derived consensus (default: 0.7).",
    )
    parser.add_argument("--log-path", default=None, type=Path, help="Optional log file path.")
    return parser


def main(argv: Optional[Sequence[str]] = None):
    parser = build_parser()
    args = parser.parse_args(argv)
    fastq_files = expand_fastq_inputs(args.fastq)
    run_umi_hunter(
        template_fasta=args.template_fasta,
        config_csv=args.config_csv,
        fastq_files=fastq_files,
        output_dir=args.output_dir,
        umi_identity_threshold=args.umi_identity_threshold,
        consensus_mutation_threshold=args.consensus_mutation_threshold,
        log_path=args.log_path,
    )


if __name__ == "__main__":
    main()

@@ -0,0 +1,271 @@
Metadata-Version: 2.4
Name: uht-tooling
Version: 0.1.2
Summary: Tooling for ultra-high throughput screening workflows.
Author: Matt115A
License: MIT
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: biopython==1.85
Requires-Dist: fuzzywuzzy==0.18.0
Requires-Dist: matplotlib==3.10.7
Requires-Dist: pandas==2.3.3
Requires-Dist: python-Levenshtein==0.27.3
Requires-Dist: pyyaml==6.0.3
Requires-Dist: pysam==0.23.3
Requires-Dist: scipy==1.15.3
Requires-Dist: seaborn==0.13.2
Requires-Dist: tabulate==0.9.0
Requires-Dist: tqdm==4.67.1
Requires-Dist: typer==0.20.0
Provides-Extra: gui
Requires-Dist: gradio==5.49.1; extra == "gui"
Provides-Extra: dev
Requires-Dist: pytest==9.0.0; extra == "dev"
Requires-Dist: black==25.9.0; extra == "dev"
Requires-Dist: ruff==0.14.4; extra == "dev"

# uht-tooling

Automation helpers for ultra-high-throughput molecular biology workflows. The package ships both a Typer-based CLI and an optional Gradio GUI that wrap the same workflow code paths.

---

## Installation

### Quick install (recommended, easiest to maintain)
```bash
python -m pip install "uht-tooling[gui]"
```

This installs the core workflows plus the optional GUI dependency (Gradio). Omit the `[gui]` extra if you only need the CLI:

```bash
python -m pip install uht-tooling
```

### Development install
```bash
git clone https://github.com/Matt115A/uht-tooling.git
cd uht-tooling
python -m pip install -e ".[gui,dev]"
```

The editable install exposes the latest sources, while the `dev` extras add linting and test tooling.

---

## Directory layout

- Reference inputs live under `data/<workflow>/`.
- Outputs (CSV, FASTA, plots, logs) are written to `results/<workflow>/`.
- All workflows log to `results/<workflow>/run.log` for reproducibility and debugging.

---

## Command-line interface

The CLI is exposed as the `uht-tooling` executable. List the available commands:

```bash
uht-tooling --help
```

Each command mirrors a workflow module. Common entry points:

| Command | Purpose |
| --- | --- |
| `uht-tooling nextera-primers` | Generate Nextera XT primer pairs from a binding-region CSV. |
| `uht-tooling design-slim` | Design SLIM mutagenesis primers from FASTA/CSV inputs. |
| `uht-tooling design-gibson` | Produce Gibson mutagenesis primers and assembly plans. |
| `uht-tooling mutation-caller` | Summarise amino-acid substitutions from long-read FASTQ files. |
| `uht-tooling umi-hunter` | Cluster UMIs and call consensus alleles. |
| `uht-tooling ep-library-profile` | Measure mutation rates without UMIs. |
| `uht-tooling profile-inserts` | Extract inserts defined by probe pairs. |

Each command provides detailed help, including option descriptions and expected file formats:

```bash
uht-tooling mutation-caller --help
```

You can pass multiple FASTQ paths using repeated `--fastq` options or glob patterns. An optional `--log-path` flag redirects logs if you prefer a location outside the default results directory.
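
For example, a hypothetical `umi-hunter` run that passes two explicit FASTQ files and a custom log location could look like the sketch below; the file names are illustrative, and the flags are the ones documented in the workflow sections further down:

```bash
uht-tooling umi-hunter \
    --template-fasta data/umi_hunter/template.fasta \
    --config-csv data/umi_hunter/umi_hunter.csv \
    --fastq data/umi_hunter/sample_A.fastq.gz \
    --fastq data/umi_hunter/sample_B.fastq.gz \
    --output-dir results/umi_hunter/ \
    --log-path results/umi_hunter/run.log
```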

---

## Workflow reference

### Nextera XT primer design

1. Prepare `data/nextera_designer/nextera_designer.csv` with a `binding_region` column. Row 1 should contain the forward binding region and row 2 the reverse binding region, both in 5'→3' orientation (see the example CSV below the list).
2. Optional: supply a YAML overrides file for index lists/prefixes via `--config`.
3. Run:
   ```bash
   uht-tooling nextera-primers \
       --binding-csv data/nextera_designer/nextera_designer.csv \
       --output-csv results/nextera_designer/nextera_xt_primers.csv
   ```
4. Primer CSVs will be written to `results/nextera_designer/`, accompanied by a log file.
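
A minimal `nextera_designer.csv` sketch is shown below. The `binding_region` header comes from step 1; the two sequences are placeholders standing in for your own forward and reverse binding regions:

```csv
binding_region
ATGGCTAGCAAAGGTGAAGAAC
TTAGTGATGGTGATGGTGATGC
```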

The helper is preloaded with twelve i5 and twelve i7 indices, enabling up to 144 unique amplicons. Downstream lab workflow suggestions (qPCR monitoring, SPRIselect cleanup) remain unchanged from earlier releases.

#### Wet-lab workflow notes

- Perform the initial amplification with an i5/i7 primer pair and monitor a small aliquot by qPCR. Cap thermocycling early so you only generate ~10% of the theoretical yield; this minimizes amplification bias.
- Purify the product with SPRIselect beads at approximately a 0.65:1 bead:DNA volume ratio to remove residual primers and short fragments.
- Confirm primer removal using electrophoresis (e.g., BioAnalyzer DNA chip) before moving to sequencing prep.

### SLIM primer design

- Inputs:
  - `data/design_slim/slim_template_gene.fasta`
  - `data/design_slim/slim_context.fasta`
  - `data/design_slim/slim_target_mutations.csv` (single `mutations` column)
- Run:
  ```bash
  uht-tooling design-slim \
      --gene-fasta data/design_slim/slim_template_gene.fasta \
      --context-fasta data/design_slim/slim_context.fasta \
      --mutations-csv data/design_slim/slim_target_mutations.csv \
      --output-dir results/design_slim/
  ```
- Output: `results/design_slim/SLIM_primers.csv` plus logs.

Mutation nomenclature examples:
- `A123G` (substitution)
- `T241Del` (deletion)
- `T241TS` (insert Ser after Thr241)
- `L46GP` (replace Leu46 with Gly-Pro)
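
Tying the nomenclature together, a `slim_target_mutations.csv` built from the example mutations above would hold a single `mutations` column with one entry per row:

```csv
mutations
A123G
T241Del
T241TS
L46GP
```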

#### Experimental blueprint

- Hands-on time is approximately three hours (excluding protein purification), with mutant protein obtainable in roughly three days.
- Conduct two PCRs per mutant set: (A) long forward with short reverse and (B) long reverse with short forward.
- Combine 10 µL from each PCR with 10 µL H-buffer (150 mM Tris pH 8, 400 mM NaCl, 60 mM EDTA) for a 30 µL annealing reaction: 99 °C for 3 min, then two cycles of 65 °C for 5 min followed by 30 °C for 15 min, hold at 4 °C.
- Transform directly into NEB 5-alpha or BL21 (DE3) cells without additional cleanup. The protocol has been validated for simultaneous introduction of dozens of mutations.

### Gibson assembly primers

- Inputs mirror the SLIM workflow but use `data/design_gibson/`.
- Link sub-mutations with `+` to specify multi-mutation assemblies (e.g., `A123G+T150A`); see the example CSV below.
- Run:
  ```bash
  uht-tooling design-gibson \
      --gene-fasta data/design_gibson/gibson_template_gene.fasta \
      --context-fasta data/design_gibson/gibson_context.fasta \
      --mutations-csv data/design_gibson/gibson_target_mutations.csv \
      --output-dir results/design_gibson/
  ```
- Outputs include primer sets and an assembly-plan CSV.
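
Assuming `gibson_target_mutations.csv` mirrors the SLIM format (a single `mutations` column), a sketch with one `+`-linked double mutant and one single mutant would look like this:

```csv
mutations
A123G+T150A
L46GP
```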

If mutations fall within overlapping primer windows, design sequential reactions to avoid excessive primer reuse.

### Mutation caller (no UMIs)

1. Supply:
   - `data/mutation_caller/mutation_caller_template.fasta`
   - `data/mutation_caller/mutation_caller.csv` with `gene_flanks` and `gene_min_max` columns (two rows each; see the example below).
   - One or more FASTQ files via `--fastq`.
2. Run:
   ```bash
   uht-tooling mutation-caller \
       --template-fasta data/mutation_caller/mutation_caller_template.fasta \
       --flanks-csv data/mutation_caller/mutation_caller.csv \
       --fastq data/mutation_caller/*.fastq.gz \
       --output-dir results/mutation_caller/ \
       --threshold 10
   ```
3. Outputs: per-sample subdirectories with substitution summaries, co-occurrence matrices, and logs.
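
As an illustration, a `mutation_caller.csv` following the two-row layout above might look like the sketch below. The column names come from the documentation; the flank sequences and the `gene_min_max` values (read here as minimum and maximum gene length, by analogy with the UMI Hunter config) are placeholders:

```csv
gene_flanks,gene_min_max
ATGAGCAAAGGTGAAGAAC,650
CTCGAGTGCGGCCGCAAGC,800
```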

### UMI Hunter

- Inputs: `data/umi_hunter/template.fasta`, `data/umi_hunter/umi_hunter.csv`, and FASTQ reads.
- Command:
  ```bash
  uht-tooling umi-hunter \
      --template-fasta data/umi_hunter/template.fasta \
      --config-csv data/umi_hunter/umi_hunter.csv \
      --fastq data/umi_hunter/*.fastq.gz \
      --output-dir results/umi_hunter/
  ```
- Tunable parameters include `--umi-identity-threshold` and `--consensus-mutation-threshold`.
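
The `umi_hunter.csv` config feeds `load_flank_config` in `uht_tooling/workflows/umi_hunter.py` (shown in the diff above): two rows with `umi_flanks`, `umi_min_max`, and `gene_flanks` columns, where row 1 carries the upstream flanks and the minimum UMI length and row 2 the downstream flanks and the maximum UMI length. The sequences and lengths below are placeholders:

```csv
umi_flanks,umi_min_max,gene_flanks
ACGTGCTAGC,12,GGAGATATACAT
GCTAGCTAGT,18,CTCGAGCACCAC
```

The same workflow can also be driven from Python. Below is a minimal sketch based on the `run_umi_hunter` signature and return value in the source above; paths are illustrative, and MAFFT must be available on `PATH` because the consensus step shells out to it:

```python
from pathlib import Path

from uht_tooling.workflows.umi_hunter import run_umi_hunter

results = run_umi_hunter(
    template_fasta=Path("data/umi_hunter/template.fasta"),
    config_csv=Path("data/umi_hunter/umi_hunter.csv"),
    fastq_files=sorted(Path("data/umi_hunter").glob("*.fastq.gz")),
    output_dir=Path("results/umi_hunter"),
    umi_identity_threshold=0.9,            # package defaults, shown explicitly
    consensus_mutation_threshold=0.7,
    log_path=Path("results/umi_hunter/run.log"),
)
for entry in results:
    # one entry per processed FASTQ; keys match the dicts built in run_umi_hunter
    print(entry["sample"], entry["reads"], entry["clusters"], entry["fasta"])
```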

### Profile inserts

- Prepare `data/profile_inserts/sample_probes.csv` with `upstream` and `downstream` columns (see the example below).
- Run:
  ```bash
  uht-tooling profile-inserts \
      --probes-csv data/profile_inserts/sample_probes.csv \
      --fastq data/profile_inserts/*.fastq.gz \
      --output-dir results/profile_inserts/
  ```
- Outputs: extracted insert FASTA files, QC plots, metrics, and logs. Adjust fuzzy-matching strictness via `--min-ratio`.
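
A hypothetical `sample_probes.csv` is sketched below; the `upstream`/`downstream` headers follow the description above, and the probe sequences are placeholders:

```csv
upstream,downstream
GGAGATATACATATGAGC,CTCGAGCACCACCACCAC
```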

### EP library profiler (no UMIs)

- Inputs:
  - `data/ep-library-profile/region_of_interest.fasta`
  - `data/ep-library-profile/plasmid.fasta`
  - FASTQ inputs (`--fastq` accepts multiple files)
- Run:
  ```bash
  uht-tooling ep-library-profile \
      --region-fasta data/ep-library-profile/region_of_interest.fasta \
      --plasmid-fasta data/ep-library-profile/plasmid.fasta \
      --fastq data/ep-library-profile/*.fastq.gz \
      --output-dir results/ep-library-profile/
  ```
- Output bundle includes per-sample directories and a master summary TSV.

---

## GUI quick start (optional)

The Gradio GUI wraps the same workflows with upload widgets and result previews. Launch it directly:

```bash
python -m uht_tooling.workflows.gui
```

Key points:
- The server binds to `http://127.0.0.1:7860` by default and falls back to an available port if 7860 is busy; open the logged URL in your browser.
- Temporary working directories are created under the system temp folder and cleaned up automatically.
- Output archives (ZIP files) mirror the directory structure produced by the CLI.

### Tabs and capabilities

1. **Nextera XT** – forward/reverse primer inputs with CSV preview.
2. **SLIM** – template/context FASTA text areas plus mutation list.
3. **Gibson** – multi-mutation support using `+` syntax.
4. **Mutation Caller** – upload FASTQ, template FASTA, and configuration CSV.
5. **UMI Hunter** – long-read UMI clustering with configurable thresholds.
6. **Profile Inserts** – probe CSV and multiple FASTQ uploads.
7. **EP Library Profile** – FASTQ uploads plus plasmid and region FASTA inputs.

### Workflow tips

- For large FASTQ datasets, the CLI remains the most efficient option, especially for automation or batch processing.
- Pass the `--share` flag to `python -m uht_tooling.workflows.gui` if you need to expose the GUI outside localhost.

### Troubleshooting

- **Port already bound:** the launcher automatically selects the next free port and logs the chosen URL.
- **Missing dependency:** ensure you installed with `pip install "uht-tooling[gui]"`.
- **Stopping the server:** press `Ctrl+C` in the terminal session running the GUI.

---

## Logging

Every workflow writes its log to the chosen output directory. Inspect `run.log` for command echoes, parameter choices, and any warnings produced during execution. When filing bug reports, include this log file along with input metadata to streamline triage.

---

## Roadmap

- Replace deprecated Biopython command-line wrappers with native subprocess implementations.
- Expand CLI coverage to any remaining legacy scripts that are still invoked via `make`.
- Add documentation for automation pipelines and integrate continuous integration tests.

Contributions in the form of bug reports, pull requests, or feature suggestions are welcome. File issues on GitHub with clear reproduction steps and sample data when possible.

@@ -0,0 +1,17 @@
uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
uht_tooling/cli.py,sha256=G8eDBtw_YgcYV7ynk_YD9W2ua0a0pqGCqud5VaHsB6M,11695
uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
uht_tooling/workflows/design_slim.py,sha256=Qeh8N32kmVFZvohmTlBudJsLzOqLy4XcY3aXbkP-sFQ,14421
uht_tooling/workflows/gui.py,sha256=jP3gYZp8hyBCms65nzoZ_EW3rsNrn2ZGGp8gBSvny6Q,23123
uht_tooling/workflows/mut_rate.py,sha256=wjX1lNXTcaH49gfARSrpKLU1mD5hCgH0ZFTcdlNrAB4,105670
uht_tooling/workflows/mutation_caller.py,sha256=BczuNATOSUcmlw-x6qTzEQfW8MBbvGclEyqiQiBX0cg,16222
uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
uht_tooling/workflows/umi_hunter.py,sha256=kXR7Tw3vK4TnL8OShRt9kZ36ONpOSd-1txwB95Ldi-I,14470
uht_tooling-0.1.2.dist-info/METADATA,sha256=bu34FtN9RwijcENNhkRxT1LdXQyl8uBfToBOItastDU,10800
uht_tooling-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
uht_tooling-0.1.2.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
uht_tooling-0.1.2.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
uht_tooling-0.1.2.dist-info/RECORD,,