uht-tooling 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,441 @@
1
+ import argparse
2
+ import gzip
3
+ import logging
4
+ from collections import Counter
5
+ from pathlib import Path
6
+ from typing import Dict, Iterable, List, Optional, Sequence
7
+
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import pandas as pd
11
+ import seaborn as sns
12
+ from Bio.Seq import Seq
13
+ from Bio.SeqUtils import gc_fraction
14
+ from fuzzywuzzy import fuzz
15
+ from tqdm import tqdm
16
+
17
+ sns.set_palette("husl")
18
+
19
+
20
def gc_percent(sequence: str) -> float:
    """Return the GC content of *sequence* as a percentage (0-100)."""
    fraction = gc_fraction(sequence)
    return 100 * fraction
22
+
23
+
24
def load_probes(csv_path: Path, logger: logging.Logger) -> List[Dict[str, str]]:
    """Read probe pairs from *csv_path*.

    The CSV must provide 'upstream' and 'downstream' columns; an optional
    'name' column labels each pair (defaulting to ``probe_<row number>``).
    Probe sequences are upper-cased on load.

    Raises:
        ValueError: if a required column is missing.
    """
    frame = pd.read_csv(csv_path)
    required = {"upstream", "downstream"}
    if not required.issubset(frame.columns):
        raise ValueError("Probe CSV must contain 'upstream' and 'downstream' columns.")
    probes: List[Dict[str, str]] = []
    for idx, row in frame.iterrows():
        entry = {
            "upstream": str(row["upstream"]).upper(),
            "downstream": str(row["downstream"]).upper(),
            "name": row.get("name", f"probe_{idx + 1}"),
        }
        probes.append(entry)
    logger.info("Loaded %s probe pairs from %s", len(probes), csv_path)
    return probes
39
+
40
+
41
def find_probe_positions(sequence: str, probe: str, min_ratio: int) -> List[tuple[int, int, int, str]]:
    """Locate approximate occurrences of *probe* within *sequence*.

    Slides a window of ``len(probe)`` across the sequence and fuzzy-matches
    each window against the probe and against its reverse complement.

    Returns:
        List of ``(start, end, ratio, strand)`` tuples where strand is
        "forward" or "reverse"; a single window can contribute one hit per
        strand when both ratios clear *min_ratio*.
    """
    hits: List[tuple[int, int, int, str]] = []
    width = len(probe)
    reverse_probe = str(Seq(probe).reverse_complement())
    # Forward hit is recorded before the reverse hit for the same window.
    for start in range(len(sequence) - width + 1):
        end = start + width
        window = sequence[start:end]
        for target, strand in ((probe, "forward"), (reverse_probe, "reverse")):
            score = fuzz.ratio(window, target)
            if score >= min_ratio:
                hits.append((start, end, score, strand))
    return hits
54
+
55
+
56
def extract_inserts(
    fastq_path: Path,
    probes: List[Dict[str, str]],
    min_ratio: int,
    logger: logging.Logger,
) -> List[Dict[str, object]]:
    """Pull insert sequences flanked by probe pairs out of a gzipped FASTQ.

    For every read, each probe pair is fuzzy-located via
    find_probe_positions; an insert is recorded only when both probes match
    on the forward strand and the downstream probe starts after the
    upstream probe ends.  NOTE(review): reverse-strand hits are computed but
    never yield inserts — confirm this filter is intentional.

    Returns:
        One dict per insert carrying sequence, length, probe/read
        identifiers, match ratios and strand orientations.
    """
    # First pass only counts records so the progress bar has a total.
    with gzip.open(fastq_path, "rt") as handle:
        total_reads = sum(1 for _ in handle) // 4
    logger.info("Processing %s reads from %s", total_reads, fastq_path)

    inserts: List[Dict[str, object]] = []
    with gzip.open(fastq_path, "rt") as handle, tqdm(total=total_reads, desc="Processing reads") as progress:
        for header in handle:
            seq = handle.readline().strip()
            handle.readline()  # '+' separator line
            handle.readline()  # quality line (unused)
            read_id = header.strip()[1:]  # drop leading '@'
            for probe in probes:
                up_hits = find_probe_positions(seq, probe["upstream"], min_ratio)
                down_hits = find_probe_positions(seq, probe["downstream"], min_ratio)
                for up_start, up_end, up_ratio, up_strand in up_hits:
                    for down_start, down_end, down_ratio, down_strand in down_hits:
                        forward_pair = up_strand == "forward" and down_strand == "forward"
                        if not (forward_pair and down_start > up_end):
                            continue
                        insert_seq = seq[up_end:down_start]
                        if not insert_seq:
                            continue
                        inserts.append(
                            {
                                "sequence": insert_seq,
                                "length": len(insert_seq),
                                "probe_name": probe["name"],
                                "up_ratio": up_ratio,
                                "down_ratio": down_ratio,
                                "up_strand": up_strand,
                                "down_strand": down_strand,
                                "read_id": read_id,
                            }
                        )
            progress.update(1)

    logger.info("Extracted %s inserts from %s", len(inserts), fastq_path)
    return inserts
101
+
102
+
103
def calculate_qc_metrics(inserts: List[Dict[str, object]], logger: logging.Logger) -> Dict[str, object]:
    """Compute summary QC metrics for a set of extracted inserts.

    Returns an empty dict (after logging a warning) when *inserts* is
    empty; otherwise a nested dict of length statistics, GC content, base
    composition, probe/strand counters, duplication and match quality.
    """
    if not inserts:
        logger.warning("No inserts found for QC analysis")
        return {}

    lengths = [item["length"] for item in inserts]
    sequences = [item["sequence"] for item in inserts]
    up_ratios = [item["up_ratio"] for item in inserts]
    down_ratios = [item["down_ratio"] for item in inserts]

    length_stats = {
        "mean": float(np.mean(lengths)),
        "median": float(np.median(lengths)),
        "std": float(np.std(lengths)),
        "min": int(min(lengths)),
        "max": int(max(lengths)),
        "q25": float(np.percentile(lengths, 25)),
        "q75": float(np.percentile(lengths, 75)),
    }
    metrics: Dict[str, object] = {
        "total_inserts": len(inserts),
        "length_stats": length_stats,
        "gc_content": float(np.mean([gc_percent(seq) for seq in sequences])),
        "length_distribution": Counter(lengths),
        "probe_matches": Counter(item["probe_name"] for item in inserts),
        "strand_combinations": Counter(
            f"{item['up_strand']}-{item['down_strand']}" for item in inserts
        ),
        "match_quality": {
            "up_ratio_mean": float(np.mean(up_ratios)),
            "down_ratio_mean": float(np.mean(down_ratios)),
        },
    }

    # Pool every base across all inserts for a global composition profile;
    # inserts are guaranteed non-empty, so the pool cannot be empty.
    pooled = "".join(sequences)
    metrics["base_composition"] = {
        base: count / len(pooled) for base, count in Counter(pooled).items()
    }

    distinct = Counter(sequences)
    metrics["unique_sequences"] = len(distinct)
    metrics["duplicate_rate"] = 1 - (len(distinct) / len(sequences))
    logger.info("QC metrics calculated for %s inserts", len(inserts))
    return metrics
142
+
143
+
144
def create_qc_plots(inserts: List[Dict[str, object]], metrics: Dict[str, object], output_dir: Path) -> Path:
    """Render a 3x4 grid of QC plots for the inserts and save it as a PNG.

    Args:
        inserts: Per-insert records as produced by extract_inserts.
        metrics: Summary metrics as produced by calculate_qc_metrics.
        output_dir: Directory (created if needed) that receives qc_plots.png.

    Returns:
        Path to the written qc_plots.png.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    fig = plt.figure(figsize=(20, 16))

    # Per-insert series reused by several panels below.
    lengths = [insert["length"] for insert in inserts]
    gc_contents = [gc_percent(insert["sequence"]) for insert in inserts]
    up_ratios = [insert["up_ratio"] for insert in inserts]
    down_ratios = [insert["down_ratio"] for insert in inserts]

    # Panel 1: insert length histogram with mean/median markers.
    plt.subplot(3, 4, 1)
    plt.hist(lengths, bins=50, alpha=0.7, edgecolor="black")
    plt.xlabel("Insert Length (bp)")
    plt.ylabel("Frequency")
    plt.title("Insert Length Distribution")
    plt.axvline(metrics["length_stats"]["mean"], color="red", linestyle="--", label="Mean")
    plt.axvline(metrics["length_stats"]["median"], color="green", linestyle="--", label="Median")
    plt.legend()

    # Panel 2: GC content histogram with overall-mean marker.
    plt.subplot(3, 4, 2)
    plt.hist(gc_contents, bins=30, alpha=0.7, edgecolor="black")
    plt.xlabel("GC Content (%)")
    plt.ylabel("Frequency")
    plt.title("GC Content Distribution")
    plt.axvline(metrics["gc_content"], color="red", linestyle="--", label="Mean")
    plt.legend()

    # Panel 3: upstream vs downstream match-ratio scatter.
    plt.subplot(3, 4, 3)
    plt.scatter(up_ratios, down_ratios, alpha=0.6)
    plt.xlabel("Upstream Match Ratio (%)")
    plt.ylabel("Downstream Match Ratio (%)")
    plt.title("Probe Match Quality")

    # Panel 4: counts per strand combination.
    plt.subplot(3, 4, 4)
    combo_counts = metrics["strand_combinations"]
    plt.bar(combo_counts.keys(), combo_counts.values())
    plt.xlabel("Strand Combination")
    plt.ylabel("Count")
    plt.title("Strand Combination Analysis")
    plt.xticks(rotation=45)

    # Panel 5: pooled base-composition frequencies.
    plt.subplot(3, 4, 5)
    base_comp = metrics["base_composition"]
    plt.bar(base_comp.keys(), base_comp.values())
    plt.xlabel("Base")
    plt.ylabel("Frequency")
    plt.title("Base Composition")

    # Panel 6: insert counts per probe pair.
    plt.subplot(3, 4, 6)
    probe_counts = metrics["probe_matches"]
    plt.bar(probe_counts.keys(), probe_counts.values())
    plt.xlabel("Probe")
    plt.ylabel("Insert Count")
    plt.title("Probe Performance")
    plt.xticks(rotation=45)

    # Panel 7: length vs GC content scatter.
    plt.subplot(3, 4, 7)
    plt.scatter(lengths, gc_contents, alpha=0.6)
    plt.xlabel("Insert Length (bp)")
    plt.ylabel("GC Content (%)")
    plt.title("Length vs GC Content")

    # Panel 8: empirical CDF of insert lengths.
    plt.subplot(3, 4, 8)
    sorted_lengths = sorted(lengths)
    cumulative = np.arange(1, len(sorted_lengths) + 1) / len(sorted_lengths)
    plt.plot(sorted_lengths, cumulative)
    plt.xlabel("Insert Length (bp)")
    plt.ylabel("Cumulative Fraction")
    plt.title("Cumulative Length Distribution")

    # Panel 9: histogram of per-insert mean match quality.
    plt.subplot(3, 4, 9)
    quality_scores = [(u + d) / 2 for u, d in zip(up_ratios, down_ratios)]
    plt.hist(quality_scores, bins=30, alpha=0.7, edgecolor="black")
    plt.xlabel("Average Match Quality (%)")
    plt.ylabel("Frequency")
    plt.title("Match Quality Distribution")

    # Panel 10: boxplot summary of lengths.
    plt.subplot(3, 4, 10)
    plt.boxplot(lengths)
    plt.ylabel("Insert Length (bp)")
    plt.title("Length Statistics")

    # Panel 11: distribution of duplicate multiplicities (if any).
    plt.subplot(3, 4, 11)
    seq_counts = Counter(insert["sequence"] for insert in inserts)
    duplicate_counts = [count for count in seq_counts.values() if count > 1]
    if duplicate_counts:
        plt.hist(duplicate_counts, bins=20, alpha=0.7, edgecolor="black")
        plt.xlabel("Duplicate Count")
        plt.ylabel("Frequency")
        plt.title("Duplicate Distribution")
    else:
        plt.text(0.5, 0.5, "No duplicates found", ha="center", va="center", transform=plt.gca().transAxes)
        plt.title("Duplicate Distribution")

    # Panel 12: free-text summary of headline statistics.
    # NOTE(review): "Mean Match Quality" shows the upstream ratio only —
    # confirm that down_ratio_mean was not intended here as well.
    plt.subplot(3, 4, 12)
    plt.axis("off")
    summary_text = f"""
    Summary Statistics:

    Total Inserts: {metrics['total_inserts']}
    Mean Length: {metrics['length_stats']['mean']:.1f} bp
    Median Length: {metrics['length_stats']['median']:.1f} bp
    GC Content: {metrics['gc_content']:.1f}%
    Unique Sequences: {metrics['unique_sequences']}
    Duplicate Rate: {metrics['duplicate_rate']:.2%}
    Mean Match Quality: {metrics['match_quality']['up_ratio_mean']:.1f}%
    """
    plt.text(0.05, 0.95, summary_text, transform=plt.gca().transAxes, fontsize=10, va="top", family="monospace")

    plt.tight_layout()
    plot_path = output_dir / "qc_plots.png"
    fig.savefig(plot_path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    return plot_path
257
+
258
+
259
def save_inserts_fasta(inserts: List[Dict[str, object]], output_path: Path):
    """Write extracted inserts to *output_path* in FASTA format.

    Each header encodes the probe name, a 1-based insert index, the insert
    length and both (integer-truncated) probe match ratios.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    records = []
    for idx, insert in enumerate(inserts, start=1):
        header = (
            f">{insert['probe_name']}_insert_{idx}_len{insert['length']}_"
            f"up{int(insert['up_ratio'])}_down{int(insert['down_ratio'])}"
        )
        records.append(f"{header}\n{insert['sequence']}\n")
    with output_path.open("w") as handle:
        handle.writelines(records)
268
+
269
+
270
def save_qc_report(metrics: Dict[str, object], output_path: Path):
    """Render *metrics* as a plain-text QC report at *output_path*.

    Sections: headline summary, length quartiles, per-probe counts,
    strand combinations, base composition and mean match quality.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    stats = metrics["length_stats"]
    lines = [
        "PROFILE INSERTS QC REPORT\n",
        "=" * 50 + "\n\n",
        "SUMMARY STATISTICS:\n",
        f"Total inserts: {metrics['total_inserts']}\n",
        f"Mean length: {stats['mean']:.2f} bp\n",
        f"Median length: {stats['median']:.2f} bp\n",
        f"Standard deviation: {stats['std']:.2f} bp\n",
        f"Min length: {stats['min']} bp\n",
        f"Max length: {stats['max']} bp\n",
        f"GC content: {metrics['gc_content']:.2f}%\n",
        f"Unique sequences: {metrics['unique_sequences']}\n",
        f"Duplicate rate: {metrics['duplicate_rate']:.2%}\n\n",
        "LENGTH STATISTICS:\n",
        f"Q25: {stats['q25']:.2f} bp\n",
        f"Q75: {stats['q75']:.2f} bp\n\n",
        "PROBE PERFORMANCE:\n",
    ]
    lines.extend(f"{probe}: {count} inserts\n" for probe, count in metrics["probe_matches"].items())
    lines.append("\n")
    lines.append("STRAND COMBINATIONS:\n")
    lines.extend(f"{combo}: {count} inserts\n" for combo, count in metrics["strand_combinations"].items())
    lines.append("\n")
    lines.append("BASE COMPOSITION:\n")
    lines.extend(f"{base}: {freq:.3f}\n" for base, freq in metrics["base_composition"].items())
    lines.append("\n")
    lines.append("MATCH QUALITY:\n")
    lines.append(f"Mean upstream ratio: {metrics['match_quality']['up_ratio_mean']:.2f}%\n")
    lines.append(f"Mean downstream ratio: {metrics['match_quality']['down_ratio_mean']:.2f}%\n")
    with output_path.open("w") as handle:
        handle.writelines(lines)
308
+
309
+
310
def run_profile_inserts(
    probes_csv: Path,
    fastq_files: Sequence[Path],
    output_dir: Path,
    min_ratio: int = 80,
    log_path: Optional[Path] = None,
    logger: Optional[logging.Logger] = None,
) -> List[Dict[str, Path]]:
    """Run the full insert-profiling pipeline over one or more FASTQ files.

    For each FASTQ: extract probe-flanked inserts, compute QC metrics, and
    write a FASTA, a text report and a plot grid into a per-sample
    subdirectory of *output_dir*.  Missing FASTQ files and samples with no
    inserts/metrics are skipped with a warning.

    Args:
        probes_csv: CSV of upstream/downstream probe pairs (see load_probes).
        fastq_files: Gzipped FASTQ inputs.
        output_dir: Root directory for per-sample output folders.
        min_ratio: Minimum fuzzy match ratio (0-100) for probe detection.
        log_path: Optional log file; only used when *logger* is not supplied.
        logger: Pre-configured logger; when omitted, one is created here and
            torn down in the ``finally`` block.

    Returns:
        One dict per successfully processed sample with keys
        "sample", "directory", "fasta", "report", "plots".

    Raises:
        ValueError: if *fastq_files* is empty.
    """
    probes_csv = Path(probes_csv)
    fastq_files = [Path(path) for path in fastq_files]
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Track whether WE created the logger: only then do we tear it down.
    managed_logger = logger is None
    if logger is None:
        logger = logging.getLogger("uht_tooling.profile_inserts")
        logger.setLevel(logging.INFO)
        handler: logging.Handler
        if log_path:
            handler = logging.FileHandler(log_path, mode="w")
        else:
            handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
        # Drop any handlers left over from earlier runs to avoid duplicates.
        logger.handlers = []
        logger.addHandler(handler)
        logger.propagate = False

    try:
        if not fastq_files:
            raise ValueError("No FASTQ files provided.")

        probes = load_probes(probes_csv, logger)
        results: List[Dict[str, Path]] = []
        for fastq in fastq_files:
            if not fastq.exists():
                logger.warning("FASTQ file %s not found; skipping.", fastq)
                continue
            # "x.fastq.gz" -> stem "x.fastq" -> sample name "x".
            sample_base = fastq.stem.replace(".fastq", "")
            sample_dir = output_dir / sample_base
            sample_dir.mkdir(parents=True, exist_ok=True)

            inserts = extract_inserts(fastq, probes, min_ratio, logger)
            if not inserts:
                logger.warning("No inserts extracted for %s; skipping.", fastq)
                continue

            metrics = calculate_qc_metrics(inserts, logger)
            if not metrics:
                logger.warning("Metrics unavailable for %s; skipping.", fastq)
                continue

            fasta_path = sample_dir / "extracted_inserts.fasta"
            save_inserts_fasta(inserts, fasta_path)

            report_path = sample_dir / "qc_report.txt"
            save_qc_report(metrics, report_path)

            plot_path = create_qc_plots(inserts, metrics, sample_dir)

            results.append(
                {
                    "sample": sample_base,
                    "directory": sample_dir,
                    "fasta": fasta_path,
                    "report": report_path,
                    "plots": plot_path,
                }
            )

        if not results:
            logger.warning("No profile inserts outputs generated.")
        return results
    finally:
        # Close and detach only handlers we installed, so repeated calls
        # don't duplicate output or leak file descriptors.
        if managed_logger and logger:
            for handler in list(logger.handlers):
                handler.close()
                logger.removeHandler(handler)
            logger.propagate = True
388
+
389
+
390
def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
    """Expand raw CLI path arguments into a deduplicated list of Paths.

    Arguments containing glob metacharacters (*, ?, [, ]) are expanded with
    ``glob.glob``; literal paths are kept as given (existence is checked by
    the caller).  Duplicates (after resolving) are dropped while preserving
    first-seen order.

    Bug fix: the previous ``Path().glob(item)`` raised for absolute
    patterns such as ``/data/*.fastq.gz`` ("Non-relative patterns are
    unsupported"); ``glob.glob`` handles both relative and absolute
    patterns.  Glob matches are additionally sorted so the processing
    order is deterministic.
    """
    import glob  # stdlib; local import keeps the change self-contained

    paths: List[Path] = []
    for item in inputs:
        if any(ch in item for ch in "*?[]"):
            paths.extend(Path(match) for match in sorted(glob.glob(item)))
        else:
            paths.append(Path(item))
    unique_paths: List[Path] = []
    seen = set()
    for path in paths:
        resolved = path.resolve()
        if resolved not in seen:
            seen.add(resolved)
            unique_paths.append(path)
    return unique_paths
405
+
406
+
407
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the profile-inserts tool."""
    parser = argparse.ArgumentParser(
        description="Profile insert sequences using probe pairs and FASTQ data."
    )
    parser.add_argument(
        "--probes-csv",
        required=True,
        type=Path,
        help="CSV containing upstream/downstream probes.",
    )
    parser.add_argument(
        "--fastq",
        required=True,
        nargs="+",
        help="One or more FASTQ(.gz) paths or glob patterns.",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        type=Path,
        help="Directory for per-sample outputs.",
    )
    parser.add_argument(
        "--min-ratio",
        type=int,
        default=80,
        help="Minimum fuzzy match ratio (0-100) for probe detection (default: 80).",
    )
    parser.add_argument(
        "--log-path",
        default=None,
        type=Path,
        help="Optional log file path.",
    )
    return parser
425
+
426
+
427
def main(argv: Optional[Sequence[str]] = None):
    """CLI entry point: parse arguments and run the insert-profiling pipeline."""
    args = build_parser().parse_args(argv)
    run_profile_inserts(
        probes_csv=args.probes_csv,
        fastq_files=expand_fastq_inputs(args.fastq),
        output_dir=args.output_dir,
        min_ratio=args.min_ratio,
        log_path=args.log_path,
    )
438
+
439
+
440
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()