uht-tooling 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,432 @@
+ import argparse
+ import glob
+ import gzip
+ import logging
+ import os
+ import re
+ import subprocess
+ import tempfile
+ from collections import Counter, defaultdict
+ from pathlib import Path
+ from typing import Dict, Iterable, List, Optional, Sequence
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from Bio import AlignIO, SeqIO
+ from Bio.Align.Applications import MafftCommandline
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+ from scipy.stats import fisher_exact, gaussian_kde
+
+
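+ # Pipeline overview: extract gene sequences from long reads by flank matching,
+ # align them to the template with MAFFT, call codon-level substitutions, and
+ # write per-sample frequency and co-occurrence reports.
+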
+ def reverse_complement(seq: str) -> str:
+     return seq.translate(str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]
+
+
+ def build_flank_pattern(flanks_csv: Path) -> tuple[re.Pattern, int, int]:
+     df = pd.read_csv(flanks_csv)
+     gene_start = df.loc[0, "gene_flanks"]
+     gene_end = df.loc[1, "gene_flanks"]
+     gene_min = int(df.loc[0, "gene_min_max"])
+     gene_max = int(df.loc[1, "gene_min_max"])
+     pattern = re.compile(
+         rf"{gene_start}([ACGTNacgtn]{{{gene_min},{gene_max}}}){gene_end}",
+         re.IGNORECASE,
+     )
+     return pattern, gene_min, gene_max
+
+
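+ # build_flank_pattern reads the flanks CSV positionally: row 0 holds the
+ # upstream flank and the minimum gene length, row 1 the downstream flank and
+ # the maximum length. An illustrative layout (values are made up):
+ #
+ #   gene_flanks,gene_min_max
+ #   ATGGCTAGC,700
+ #   TAAGGATCC,800
+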
+ def extract_gene(seq: str, pattern: re.Pattern, gene_min: int, gene_max: int) -> Optional[str]:
+     match = pattern.search(seq)
+     if match:
+         gene = match.group(1)
+         if gene_min <= len(gene) <= gene_max:
+             return gene
+     match = pattern.search(reverse_complement(seq))
+     if match:
+         gene = match.group(1)
+         if gene_min <= len(gene) <= gene_max:
+             return gene
+     return None
+
+
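+ # extract_gene tries both orientations: if the flanked gene is not found in
+ # the read as given, the reverse complement is searched, so reads from either
+ # strand are recovered.
+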
+ def process_fastq(file_path: Path, pattern: re.Pattern, gene_min: int, gene_max: int) -> Dict[str, str]:
+     gene_reads: Dict[str, str] = {}
+     # Open transparently so both .fastq and .fastq.gz inputs work, as the CLI
+     # help advertises.
+     opener = gzip.open if file_path.suffix == ".gz" else open
+     with opener(file_path, "rt") as handle:
+         while True:
+             header = handle.readline()
+             if not header:
+                 break
+             seq = handle.readline().strip()
+             handle.readline()  # '+' separator line
+             handle.readline()  # quality line
+             gene = extract_gene(seq, pattern, gene_min, gene_max)
+             if gene:
+                 # Keep only the first whitespace-delimited token so the id
+                 # survives the FASTA round-trip through MAFFT.
+                 read_id = header.strip()[1:].split()[0]
+                 gene_reads[read_id] = gene
+     return gene_reads
+
+
+ def align_to_reference(gene_seqs: Dict[str, str], reference: str) -> tuple[str, Dict[str, str]]:
+     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta") as tmp_in:
+         SeqIO.write(SeqRecord(Seq(reference), id="REF", description=""), tmp_in, "fasta")
+         for rid, seq in gene_seqs.items():
+             SeqIO.write(SeqRecord(Seq(seq), id=rid, description=""), tmp_in, "fasta")
+         tmp_in_path = tmp_in.name
+
+     tmp_out_path = None
+     try:
+         # MafftCommandline only renders the command string; the mafft binary
+         # itself must be on PATH.
+         mafft = MafftCommandline(input=tmp_in_path)
+         proc = subprocess.Popen(
+             str(mafft),
+             shell=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             universal_newlines=True,
+         )
+         stdout, stderr = proc.communicate()
+         if proc.returncode != 0:
+             raise RuntimeError(f"MAFFT failed with exit code {proc.returncode}:\n{stderr}")
+
+         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta") as tmp_out:
+             tmp_out.write(stdout)
+             tmp_out_path = tmp_out.name
+
+         alignment = AlignIO.read(tmp_out_path, "fasta")
+     finally:
+         # Clean up the temporary files even when MAFFT fails.
+         os.remove(tmp_in_path)
+         if tmp_out_path:
+             os.remove(tmp_out_path)
+
+     aligned_ref = None
+     aligned_reads: Dict[str, str] = {}
+     for record in alignment:
+         if record.id == "REF":
+             aligned_ref = str(record.seq)
+         else:
+             aligned_reads[record.id] = str(record.seq)
+
+     if aligned_ref is None:
+         raise RuntimeError("Reference sequence missing from alignment output.")
+
+     return aligned_ref, aligned_reads
+
+
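+ # Note: Bio.Align.Applications (including MafftCommandline) is deprecated in
+ # recent Biopython releases. It is only used above to render the command
+ # string, so invoking ["mafft", input_path] with subprocess directly would be
+ # an equivalent alternative.
+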
+ def identify_substitutions(ref: str, aligned_reads: Dict[str, str]) -> Dict[str, List[str]]:
+     subs_by_read: Dict[str, List[str]] = defaultdict(list)
+
+     # Map alignment columns to ungapped reference positions, and back.
+     aln2ref: Dict[int, Optional[int]] = {}
+     ref_clean: List[str] = []
+     ref_index = 0
+     for aln_idx, base in enumerate(ref):
+         if base != "-":
+             aln2ref[aln_idx] = ref_index
+             ref_clean.append(base)
+             ref_index += 1
+         else:
+             aln2ref[aln_idx] = None
+     ref_clean_seq = "".join(ref_clean)
+
+     ref2aln: Dict[int, int] = {}
+     for aln_idx, ref_idx in aln2ref.items():
+         if ref_idx is not None and ref_idx not in ref2aln:
+             ref2aln[ref_idx] = aln_idx
+
+     codon_count = len(ref_clean_seq) // 3
+     for read_id, seq in aligned_reads.items():
+         for codon_i in range(codon_count):
+             start_r = codon_i * 3
+             codon_ref = ref_clean_seq[start_r : start_r + 3]
+             codon_read: List[str] = []
+             valid = True
+             diff = False
+             for offset in range(3):
+                 ref_pos = start_r + offset
+                 aln_idx = ref2aln.get(ref_pos)
+                 if aln_idx is None:
+                     valid = False
+                     break
+                 base_q = seq[aln_idx]
+                 if base_q == "-":
+                     valid = False
+                     break
+                 codon_read.append(base_q)
+                 if base_q != codon_ref[offset]:
+                     diff = True
+             if not valid or not diff:
+                 continue
+             try:
+                 aa_from = str(Seq(codon_ref).translate())
+                 aa_to = str(Seq("".join(codon_read)).translate())
+             except Exception:
+                 aa_from, aa_to = "?", "?"
+             aa_mut = f"{aa_from}{codon_i + 1}{aa_to}"
+             nt_mut = f"{codon_ref}->{''.join(codon_read)}"
+             subs_by_read[read_id].append(f"{nt_mut} ({aa_mut})")
+
+     return subs_by_read
+
+
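+ # identify_substitutions records each variant as "CODON->CODON (AA)", e.g. a
+ # hypothetical "GCT->ACT (A5T)" for codon 5; run_mutation_caller later splits
+ # on the space and strips the parentheses to recover the amino-acid call.
+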
+ def find_cooccurring_aa(
+     subs_by_read_aa: Dict[str, List[str]],
+     frequent_aa: set[str],
+     output_dir: Path,
+     sample_name: str,
+ ) -> tuple[Path, Path]:
+     aa_list = sorted(frequent_aa)
+     aa_idx = {aa: i for i, aa in enumerate(aa_list)}
+
+     matrix: List[List[int]] = []
+     for calls in subs_by_read_aa.values():
+         row = [0] * len(aa_list)
+         any_selected = False
+         for aa in calls:
+             if aa in aa_idx:
+                 row[aa_idx[aa]] = 1
+                 any_selected = True
+         if any_selected:
+             matrix.append(row)
+
+     baseline_path = output_dir / f"{sample_name}_cooccurring_AA_baseline.csv"
+     fisher_path = output_dir / f"{sample_name}_cooccurring_AA_fisher.csv"
+
+     if not matrix:
+         pd.DataFrame(columns=["AA1", "AA2", "Both_Count", "AA1_Count", "AA2_Count"]).to_csv(
+             baseline_path, index=False
+         )
+         pd.DataFrame(columns=["AA1", "AA2", "p-value"]).to_csv(fisher_path, index=False)
+         return baseline_path, fisher_path
+
+     df = pd.DataFrame(matrix, columns=aa_list)
+     simple: List[tuple[str, str, int, int, int]] = []
+     fisher_rows: List[tuple[str, str, float]] = []
+
+     for i in range(len(aa_list)):
+         for j in range(i + 1, len(aa_list)):
+             col_a, col_b = df.iloc[:, i], df.iloc[:, j]
+             both = int(((col_a == 1) & (col_b == 1)).sum())
+             a_tot = int((col_a == 1).sum())
+             b_tot = int((col_b == 1).sum())
+             if (both >= 2) or (both > 0 and both == a_tot == b_tot):
+                 simple.append((aa_list[i], aa_list[j], both, a_tot, b_tot))
+             table = [
+                 [both, int(((col_a == 1) & (col_b == 0)).sum())],
+                 [int(((col_a == 0) & (col_b == 1)).sum()), int(((col_a == 0) & (col_b == 0)).sum())],
+             ]
+             try:
+                 _, p_value = fisher_exact(table)
+                 if p_value < 0.05:
+                     fisher_rows.append((aa_list[i], aa_list[j], p_value))
+             except Exception:
+                 continue
+
+     pd.DataFrame(simple, columns=["AA1", "AA2", "Both_Count", "AA1_Count", "AA2_Count"]).to_csv(
+         baseline_path, index=False
+     )
+     pd.DataFrame(fisher_rows, columns=["AA1", "AA2", "p-value"]).to_csv(fisher_path, index=False)
+     return baseline_path, fisher_path
+
+
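+ # find_cooccurring_aa builds, for every pair of frequent substitutions, a 2x2
+ # contingency table over reads (both / only A / only B / neither) and applies
+ # Fisher's exact test; pairs with p < 0.05 go into the fisher CSV, while the
+ # baseline CSV lists raw co-occurrence counts.
+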
+ def run_mutation_caller(
+     template_fasta: Path,
+     flanks_csv: Path,
+     fastq_files: Sequence[Path],
+     output_dir: Path,
+     threshold: int,
+     log_path: Optional[Path] = None,
+     logger: Optional[logging.Logger] = None,
+ ) -> List[Dict[str, object]]:
+     template_fasta = Path(template_fasta)
+     flanks_csv = Path(flanks_csv)
+     fastq_files = [Path(fq) for fq in fastq_files]
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # If no logger is supplied, create one here and tear it down in the
+     # finally block below.
+     managed_logger = logger is None
+     if logger is None:
+         logger = logging.getLogger("uht_tooling.mutation_caller")
+         logger.setLevel(logging.INFO)
+         handler: logging.Handler
+         if log_path:
+             handler = logging.FileHandler(log_path, mode="w")
+         else:
+             handler = logging.StreamHandler()
+         handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
+         logger.handlers = []
+         logger.addHandler(handler)
+         logger.propagate = False
+
+     try:
+         if not fastq_files:
+             raise ValueError("No FASTQ files provided.")
+
+         pattern, gene_min, gene_max = build_flank_pattern(flanks_csv)
+         template_record = next(SeqIO.parse(str(template_fasta), "fasta"))
+         full_ref = str(template_record.seq)
+         logger.info("Loaded template sequence of length %s.", len(full_ref))
+
+         df = pd.read_csv(flanks_csv)
+         gene_start = df.loc[0, "gene_flanks"]
+         gene_end = df.loc[1, "gene_flanks"]
+         if full_ref.startswith(gene_start) and full_ref.endswith(gene_end):
+             reference = full_ref[len(gene_start) : len(full_ref) - len(gene_end)]
+             logger.info("Trimmed flanking regions from template.")
+         else:
+             reference = full_ref
+
+         results: List[Dict[str, object]] = []
+
+         for fastq in fastq_files:
+             if not fastq.exists():
+                 logger.warning("FASTQ file %s not found; skipping.", fastq)
+                 continue
+             sample_base = fastq.stem.replace(".fastq", "")
+             sample_dir = output_dir / sample_base
+             sample_dir.mkdir(parents=True, exist_ok=True)
+
+             logger.info("Processing sample %s", sample_base)
+             gene_reads = process_fastq(fastq, pattern, gene_min, gene_max)
+             if not gene_reads:
+                 logger.warning("No valid gene reads for %s; skipping.", sample_base)
+                 continue
+
+             aligned_ref, aligned_reads = align_to_reference(gene_reads, reference)
+             substitutions = identify_substitutions(aligned_ref, aligned_reads)
+             # Pull the amino-acid part out of "CODON->CODON (AA)" strings.
+             subs_aa = {
+                 rid: [item.split()[1][1:-1] for item in items if "(" in item and item.endswith(")")]
+                 for rid, items in substitutions.items()
+             }
+             counts = Counter(aa for aas in subs_aa.values() for aa in aas)
+             if not counts:
+                 logger.warning("No amino-acid substitutions detected for %s; skipping.", sample_base)
+                 continue
+
+             keys = list(counts.keys())
+             values = np.array([counts[k] for k in keys], dtype=float)
+             idx = np.arange(len(keys))
+             fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+             ax1.bar(idx, values)
+             ax1.set_xticks(idx)
+             ax1.set_xticklabels(keys, rotation=90, fontsize=8)
+             ax1.set_ylabel("Count")
+             ax1.set_title("Amino-Acid Substitution Frequencies")
+
+             try:
+                 kde = gaussian_kde(values)
+                 xmin, xmax = max(1.0, values.min()), values.max()
+                 xs = np.logspace(np.log10(xmin), np.log10(xmax), 200)
+                 ax2.plot(xs, kde(xs), linewidth=2)
+             except Exception:
+                 # KDE fails for degenerate inputs (e.g. a single distinct
+                 # count); fall back to a density histogram.
+                 ax2.hist(values, bins="auto", density=True, alpha=0.6)
+             ax2.set_xscale("log")
+             ax2.set_xlabel("Substitution Count (log scale)")
+             ax2.set_ylabel("Density")
+             ax2.set_title("KDE of AA Substitution Frequencies")
+
+             plt.tight_layout()
+             plot_path = sample_dir / f"{sample_base}_aa_substitution_frequency.png"
+             fig.savefig(plot_path)
+             plt.close(fig)
+             logger.info("Saved substitution frequency plot to %s", plot_path)
+
+             frequent = {aa for aa, count in counts.items() if count >= threshold}
+             freq_csv = sample_dir / f"{sample_base}_frequent_aa_counts.csv"
+             pd.DataFrame(
+                 sorted(((aa, counts[aa]) for aa in frequent), key=lambda x: x[0]),
+                 columns=["AA", "Count"],
+             ).to_csv(freq_csv, index=False)
+
+             baseline_path, fisher_path = find_cooccurring_aa(subs_aa, frequent, sample_dir, sample_base)
+
+             report_path = sample_dir / f"{sample_base}_report.txt"
+             with report_path.open("w") as report:
+                 report.write(f"Sample: {sample_base}\n")
+                 report.write(f"Valid gene reads: {len(gene_reads)}\n")
+                 report.write(f"Unique AA substitutions: {len(counts)}\n")
+                 report.write(f"Threshold: {threshold}\n")
+                 report.write(f"Frequent AA substitutions (≥ {threshold}): {len(frequent)}\n\n")
+                 report.write("Frequent AA counts:\n")
+                 report.write("AA\tCount\n")
+                 for aa in sorted(frequent):
+                     report.write(f"{aa}\t{counts[aa]}\n")
+                 report.write("\nGenerated files:\n")
+                 report.write(f"- Plot: {plot_path.name}\n")
+                 report.write(f"- Frequent counts: {freq_csv.name}\n")
+                 report.write(f"- Co-occurrence baseline: {baseline_path.name}\n")
+                 report.write(f"- Co-occurrence fisher: {fisher_path.name}\n")
+
+             results.append(
+                 {
+                     "sample": sample_base,
+                     "directory": sample_dir,
+                     "plot": plot_path,
+                     "frequent_counts": freq_csv,
+                     "baseline": baseline_path,
+                     "fisher": fisher_path,
+                     "report": report_path,
+                 }
+             )
+
+         if not results:
+             logger.warning("No outputs generated; check inputs and thresholds.")
+         return results
+     finally:
+         if managed_logger and logger:
+             for handler in list(logger.handlers):
+                 handler.close()
+                 logger.removeHandler(handler)
+             logger.propagate = True
+
+
+ def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
+     paths: List[Path] = []
+     for item in inputs:
+         if any(ch in item for ch in "*?[]"):
+             # Use glob.glob so absolute patterns work too; sort for determinism.
+             paths.extend(Path(p) for p in sorted(glob.glob(item)))
+         else:
+             paths.append(Path(item))
+     unique_paths: List[Path] = []
+     seen = set()
+     for path in paths:
+         resolved = path.resolve()
+         if resolved not in seen:
+             seen.add(resolved)
+             unique_paths.append(path)
+     return unique_paths
+
+
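+ # expand_fastq_inputs lets --fastq mix literal paths and shell-style patterns,
+ # e.g. --fastq run1.fastq.gz "data/*.fastq.gz" (quote patterns so the shell
+ # does not expand them first); duplicates are dropped after path resolution.
+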
+ def build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(description="Identify mutations from long-read sequencing without UMIs.")
+     parser.add_argument("--template-fasta", required=True, type=Path, help="FASTA file containing the gene template.")
+     parser.add_argument("--flanks-csv", required=True, type=Path, help="CSV describing gene flanks and length bounds.")
+     parser.add_argument(
+         "--fastq",
+         required=True,
+         nargs="+",
+         help="One or more FASTQ(.gz) paths or glob patterns (e.g., data/*.fastq.gz).",
+     )
+     parser.add_argument("--output-dir", required=True, type=Path, help="Directory to place sample outputs.")
+     parser.add_argument(
+         "--threshold",
+         type=int,
+         default=10,
+         help="Minimum AA substitution count to include in the frequent-substitution report (default: 10).",
+     )
+     parser.add_argument("--log-path", default=None, type=Path, help="Optional log file path.")
+     return parser
+
+
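+ # Illustrative invocation (module path assumed, not taken from the package):
+ #   python -m uht_tooling.mutation_caller \
+ #       --template-fasta template.fasta --flanks-csv flanks.csv \
+ #       --fastq "runs/*.fastq.gz" --output-dir results --threshold 10
+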
+ def main(argv: Optional[Sequence[str]] = None):
+     parser = build_parser()
+     args = parser.parse_args(argv)
+     fastq_files = expand_fastq_inputs(args.fastq)
+     run_mutation_caller(
+         template_fasta=args.template_fasta,
+         flanks_csv=args.flanks_csv,
+         fastq_files=fastq_files,
+         output_dir=args.output_dir,
+         threshold=args.threshold,
+         log_path=args.log_path,
+     )
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,199 @@
+ import argparse
+ import csv
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import pandas as pd
+ import yaml
+
+ # Nextera-style i7 (N7xx) index sequences keyed by index number.
+ I7_INDEXES: Dict[str, str] = {
+     "701": "TCGCCTTA",
+     "702": "CTAGTACG",
+     "703": "TTCTGCCT",
+     "704": "GCTCAGGA",
+     "705": "AGGAGTCC",
+     "706": "CATGCCTA",
+     "707": "GTAGAGAG",
+     "708": "CCTCTCTG",
+     "709": "AGCGTAGC",
+     "710": "CAGCCTCG",
+     "711": "TGCCTCTT",
+     "712": "TCCTCTAC",
+ }
+
+ # Nextera-style i5 (S5xx) index sequences keyed by index number.
+ I5_INDEXES: Dict[str, str] = {
+     "501": "TAGATCGC",
+     "502": "CTCTCTAT",
+     "503": "TATCCTCT",
+     "504": "AGAGTAGA",
+     "505": "GTAAGGAG",
+     "506": "ACTGCATA",
+     "507": "AAGGAGTA",
+     "508": "CTAAGCCT",
+     "510": "CGTCTAAT",
+     "511": "TCTCTCCG",
+     "513": "TCGACTAG",
+     "515": "TTCTAGCT",
+ }
+
+ I7_PREFIX = "CAAGCAGAAGACGGCATACGAGAT"
+ I7_SUFFIX = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG"
+ I5_PREFIX = "AATGATACGGCGACCACCGAGATCTACAC"
+ I5_SUFFIX = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
+
+
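+ # Each primer below is assembled as PREFIX + index + SUFFIX + binding region.
+ # The prefixes match the standard Illumina P7/P5 grafting sequences, and both
+ # suffixes end in the Nextera mosaic end (AGATGTGTATAAGAGACAG), so only the
+ # index and the template binding region vary between primers.
+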
+ def load_binding_sequences(csv_path: Path) -> Tuple[str, str]:
+     df = pd.read_csv(csv_path)
+     if "binding_region" not in df.columns:
+         raise ValueError("CSV must contain a 'binding_region' column")
+     if len(df) < 2:
+         raise ValueError("CSV must contain at least two rows for i7 and i5 binding regions")
+     return df["binding_region"].iloc[0], df["binding_region"].iloc[1]
+
+
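+ # Illustrative binding CSV (sequences are made up); row order matters, since
+ # row 0 is taken as the i7 binding region and row 1 as the i5 binding region:
+ #
+ #   binding_region
+ #   ACGTACGTACGTACGTACGT
+ #   TGCATGCATGCATGCATGCA
+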
+ def load_config(config_path: Optional[Path]) -> Dict[str, object]:
+     # Values may be strings (prefix/suffix overrides) or dicts (index tables).
+     if not config_path:
+         return {}
+     with open(config_path, "r", encoding="utf-8") as handle:
+         config = yaml.safe_load(handle) or {}
+     return config
+
+
+ def generate_primers(
+     template_binding_i7: str,
+     template_binding_i5: str,
+     i7_indexes: Optional[Dict[str, str]] = None,
+     i5_indexes: Optional[Dict[str, str]] = None,
+     i7_prefix: str = I7_PREFIX,
+     i7_suffix: str = I7_SUFFIX,
+     i5_prefix: str = I5_PREFIX,
+     i5_suffix: str = I5_SUFFIX,
+ ) -> List[Tuple[str, str]]:
+     primers: List[Tuple[str, str]] = []
+     i7_map = i7_indexes or I7_INDEXES
+     i5_map = i5_indexes or I5_INDEXES
+
+     for idx, seq in i7_map.items():
+         name = f"i7_{idx}"
+         full_seq = f"{i7_prefix}{seq}{i7_suffix}{template_binding_i7}"
+         primers.append((name, full_seq))
+
+     for idx, seq in i5_map.items():
+         name = f"i5_{idx}"
+         full_seq = f"{i5_prefix}{seq}{i5_suffix}{template_binding_i5}"
+         primers.append((name, full_seq))
+
+     return primers
+
+
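+ # With the defaults, the first i7 entry would be ("i7_701",
+ # I7_PREFIX + "TCGCCTTA" + I7_SUFFIX + <i7 binding region>), later written to
+ # the output CSV under the primer_name,sequence header.
+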
+ def run_nextera_primer_design(
+     binding_csv: Path,
+     output_csv: Path,
+     log_path: Optional[Path] = None,
+     config_path: Optional[Path] = None,
+     logger: Optional[logging.Logger] = None,
+ ) -> Path:
+     binding_csv = Path(binding_csv)
+     output_csv = Path(output_csv)
+     output_csv.parent.mkdir(parents=True, exist_ok=True)
+
+     # If no logger is supplied, create one here and tear it down in the
+     # finally block below.
+     managed_logger = logger is None
+     if logger is None:
+         logger = logging.getLogger("uht_tooling.nextera")
+         logger.setLevel(logging.INFO)
+         handler: logging.Handler
+         if log_path:
+             handler = logging.FileHandler(log_path, mode="w")
+         else:
+             handler = logging.StreamHandler()
+         handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
+         logger.handlers = []
+         logger.addHandler(handler)
+         logger.propagate = False
+
+     try:
+         logger.info("Loading binding sequences from %s", binding_csv)
+         template_i7, template_i5 = load_binding_sequences(binding_csv)
+         logger.info("Loaded binding regions (i7 len=%s, i5 len=%s)", len(template_i7), len(template_i5))
+
+         config = load_config(config_path)
+         if config:
+             logger.info("Loaded configuration overrides from %s", config_path)
+
+         # dict.get already falls back to the module defaults when the config
+         # is empty, so no separate "if config" branches are needed.
+         i7_indexes = config.get("i7_indexes")
+         i5_indexes = config.get("i5_indexes")
+         i7_prefix = config.get("i7_prefix", I7_PREFIX)
+         i7_suffix = config.get("i7_suffix", I7_SUFFIX)
+         i5_prefix = config.get("i5_prefix", I5_PREFIX)
+         i5_suffix = config.get("i5_suffix", I5_SUFFIX)
+
+         primers = generate_primers(
+             template_binding_i7=template_i7,
+             template_binding_i5=template_i5,
+             i7_indexes=i7_indexes,
+             i5_indexes=i5_indexes,
+             i7_prefix=i7_prefix,
+             i7_suffix=i7_suffix,
+             i5_prefix=i5_prefix,
+             i5_suffix=i5_suffix,
+         )
+         logger.info("Generated %s primers", len(primers))
+
+         with output_csv.open("w", newline="") as file:
+             writer = csv.writer(file)
+             writer.writerow(["primer_name", "sequence"])
+             writer.writerows(primers)
+
+         logger.info("Wrote primers to %s", output_csv)
+         return output_csv
+     finally:
+         if managed_logger and logger:
+             for handler in list(logger.handlers):
+                 handler.close()
+                 logger.removeHandler(handler)
+             logger.propagate = True
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(description="Generate Nextera XT primers from binding region CSV input.")
+     parser.add_argument(
+         "--binding-csv",
+         required=True,
+         type=Path,
+         help="CSV file with a 'binding_region' column; first row is i7, second row is i5.",
+     )
+     parser.add_argument(
+         "--output-csv",
+         required=True,
+         type=Path,
+         help="Path to write the generated primer CSV.",
+     )
+     parser.add_argument(
+         "--log-path",
+         default=None,
+         type=Path,
+         help="Optional path to write a log file.",
+     )
+     parser.add_argument(
+         "--config",
+         default=None,
+         type=Path,
+         help="Optional YAML file providing overrides for indexes/prefixes/suffixes.",
+     )
+     return parser
+
+
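+ # Illustrative invocation (module path assumed, not taken from the package):
+ #   python -m uht_tooling.nextera_primers \
+ #       --binding-csv binding.csv --output-csv primers.csv --config overrides.yaml
+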
+ def main(argv: Optional[List[str]] = None):
+     parser = build_parser()
+     args = parser.parse_args(argv)
+     run_nextera_primer_design(
+         binding_csv=args.binding_csv,
+         output_csv=args.output_csv,
+         log_path=args.log_path,
+         config_path=args.config,
+     )
+
+
+ if __name__ == "__main__":
+     main()