uht-tooling 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
uht_tooling/cli.py CHANGED
@@ -233,6 +233,11 @@ def umi_hunter_command(
233
233
  max=1.0,
234
234
  help="Mutation threshold for consensus calling (default: 0.7).",
235
235
  ),
236
+ min_cluster_size: int = typer.Option(
237
+ 1,
238
+ min=1,
239
+ help="Minimum number of reads required in a UMI cluster before a consensus is generated.",
240
+ ),
236
241
  log_path: Optional[Path] = typer.Option(
237
242
  None,
238
243
  dir_okay=False,
@@ -249,6 +254,7 @@ def umi_hunter_command(
249
254
  output_dir=output_dir,
250
255
  umi_identity_threshold=umi_identity_threshold,
251
256
  consensus_mutation_threshold=consensus_mutation_threshold,
257
+ min_cluster_size=min_cluster_size,
252
258
  log_path=log_path,
253
259
  )
254
260
  if not results:
@@ -256,7 +262,12 @@ def umi_hunter_command(
256
262
  else:
257
263
  typer.echo("UMI hunter outputs:")
258
264
  for entry in results:
259
- typer.echo(f" Sample {entry['sample']}: {entry['directory']}")
265
+ total_clusters = entry.get("clusters_total", entry.get("clusters", 0))
266
+ typer.echo(
267
+ f" Sample {entry['sample']}: "
268
+ f"{entry.get('clusters', 0)} consensus clusters "
269
+ f"(from {total_clusters} total) → {entry['directory']}"
270
+ )
260
271
 
261
272
 
262
273
  @app.command("ep-library-profile", help="Profile mutation rates for ep-library sequencing data.")
@@ -10,7 +10,7 @@ import tempfile
10
10
  import textwrap
11
11
  import zipfile
12
12
  from pathlib import Path
13
- from typing import Iterable, List, Optional, Sequence, Tuple
13
+ from typing import Any, Iterable, List, Optional, Sequence, Tuple
14
14
 
15
15
  try:
16
16
  import gradio as gr
@@ -241,28 +241,60 @@ def run_gui_design_gibson(
241
241
  def run_gui_mutation_caller(
242
242
  fastq_file: Optional[str],
243
243
  template_file: Optional[str],
244
- config_csv_file: Optional[str],
244
+ upstream_flank: str,
245
+ downstream_flank: str,
246
+ min_gene_length: Optional[float],
247
+ max_gene_length: Optional[float],
245
248
  ) -> Tuple[str, Optional[str]]:
249
+ config_dir: Optional[Path] = None
250
+ output_dir: Optional[Path] = None
246
251
  try:
247
- if not fastq_file or not template_file or not config_csv_file:
248
- raise ValueError("Upload a FASTQ(.gz), template FASTA, and configuration CSV.")
252
+ if not fastq_file or not template_file:
253
+ raise ValueError("Upload a FASTQ(.gz) read file and the reference template FASTA.")
254
+
255
+ gene_start = _ensure_text(upstream_flank, "Upstream flank")
256
+ gene_end = _ensure_text(downstream_flank, "Downstream flank")
257
+ if min_gene_length is None or max_gene_length is None:
258
+ raise ValueError("Provide minimum and maximum gene lengths (in nucleotides).")
259
+
260
+ gene_min = int(min_gene_length)
261
+ gene_max = int(max_gene_length)
262
+ if gene_min <= 0 or gene_max <= 0:
263
+ raise ValueError("Gene length bounds must be positive integers.")
264
+ if gene_min > gene_max:
265
+ raise ValueError("Minimum gene length cannot exceed the maximum gene length.")
266
+
267
+ config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_mutation_cfg_"))
268
+ config_csv = config_dir / "mutation_flanks.csv"
269
+ pd.DataFrame(
270
+ {
271
+ "gene_flanks": [gene_start.upper(), gene_end.upper()],
272
+ "gene_min_max": [gene_min, gene_max],
273
+ }
274
+ ).to_csv(config_csv, index=False)
249
275
 
250
276
  output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_mutation_out_"))
251
277
  results = run_mutation_caller(
252
278
  template_fasta=Path(template_file),
253
- flanks_csv=Path(config_csv_file),
279
+ flanks_csv=config_csv,
254
280
  fastq_files=[Path(fastq_file)],
255
281
  output_dir=output_dir,
256
282
  threshold=10,
257
283
  )
258
284
 
259
285
  if not results:
260
- return "No amino-acid substitutions detected.", None
261
-
262
- lines = ["### Mutation Caller", ""]
286
+ return "No amino-acid substitutions detected. Check flank selections and read quality.", None
287
+
288
+ lines = [
289
+ "### Mutation Caller",
290
+ "",
291
+ "Long-read reads were aligned to the provided template, flank-delimited coding regions were extracted, and amino-acid substitutions were summarised.",
292
+ "",
293
+ "**Run outputs**",
294
+ ]
263
295
  sample_dirs = []
264
296
  for entry in results:
265
- lines.append(f"**{entry['sample']}** → {entry['directory']}")
297
+ lines.append(f"- **{entry['sample']}** → {entry['directory']}")
266
298
  sample_dirs.append(Path(entry["directory"]))
267
299
  summary = "\n".join(lines)
268
300
  archive = _zip_paths(sample_dirs, "mutation_caller")
@@ -270,33 +302,94 @@ def run_gui_mutation_caller(
270
302
  except Exception as exc: # pragma: no cover
271
303
  _LOGGER.exception("Mutation caller GUI failure")
272
304
  return f"⚠️ Error: {exc}", None
305
+ finally:
306
+ if config_dir:
307
+ _clean_temp_path(config_dir)
308
+ if output_dir:
309
+ _clean_temp_path(output_dir)
273
310
 
274
311
 
275
312
  def run_gui_umi_hunter(
276
313
  fastq_file: Optional[str],
277
314
  template_file: Optional[str],
278
- config_csv_file: Optional[str],
315
+ umi_start: str,
316
+ umi_end: str,
317
+ umi_min_length: Optional[float],
318
+ umi_max_length: Optional[float],
319
+ gene_start: str,
320
+ gene_end: str,
321
+ umi_identity_threshold: float,
322
+ consensus_threshold: float,
323
+ min_cluster_size: int,
279
324
  ) -> Tuple[str, Optional[str]]:
325
+ config_dir: Optional[Path] = None
326
+ output_dir: Optional[Path] = None
280
327
  try:
281
- if not fastq_file or not template_file or not config_csv_file:
282
- raise ValueError("Upload a FASTQ(.gz), template FASTA, and configuration CSV.")
328
+ if not fastq_file or not template_file:
329
+ raise ValueError("Upload a FASTQ(.gz) read file and the template FASTA.")
330
+
331
+ umi_start_clean = _ensure_text(umi_start, "UMI upstream flank").upper()
332
+ umi_end_clean = _ensure_text(umi_end, "UMI downstream flank").upper()
333
+ gene_start_clean = _ensure_text(gene_start, "Gene upstream flank").upper()
334
+ gene_end_clean = _ensure_text(gene_end, "Gene downstream flank").upper()
335
+ if umi_min_length is None or umi_max_length is None:
336
+ raise ValueError("Provide minimum and maximum UMI lengths.")
337
+
338
+ umi_min = int(umi_min_length)
339
+ umi_max = int(umi_max_length)
340
+ if umi_min <= 0 or umi_max <= 0:
341
+ raise ValueError("UMI length bounds must be positive integers.")
342
+ if umi_min > umi_max:
343
+ raise ValueError("Minimum UMI length cannot exceed the maximum length.")
344
+ if not (0.0 <= umi_identity_threshold <= 1.0):
345
+ raise ValueError("UMI identity threshold must be between 0 and 1.")
346
+ if not (0.0 <= consensus_threshold <= 1.0):
347
+ raise ValueError("Consensus mutation threshold must be between 0 and 1.")
348
+ if min_cluster_size is None or int(min_cluster_size) < 1:
349
+ raise ValueError("Minimum cluster size must be at least 1.")
350
+ min_cluster_size_int = int(min_cluster_size)
351
+
352
+ config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_umi_cfg_"))
353
+ config_csv = config_dir / "umi_config.csv"
354
+ pd.DataFrame(
355
+ {
356
+ "umi_flanks": [umi_start_clean, umi_end_clean],
357
+ "umi_min_max": [umi_min, umi_max],
358
+ "gene_flanks": [gene_start_clean, gene_end_clean],
359
+ }
360
+ ).to_csv(config_csv, index=False)
283
361
 
284
362
  output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_umi_out_"))
285
363
  results = run_umi_hunter(
286
364
  template_fasta=Path(template_file),
287
- config_csv=Path(config_csv_file),
365
+ config_csv=config_csv,
288
366
  fastq_files=[Path(fastq_file)],
289
367
  output_dir=output_dir,
368
+ umi_identity_threshold=umi_identity_threshold,
369
+ consensus_mutation_threshold=consensus_threshold,
370
+ min_cluster_size=min_cluster_size_int,
290
371
  )
291
372
 
292
373
  if not results:
293
- return "No UMI clusters were generated. Check input quality and thresholds.", None
374
+ return (
375
+ "No UMI clusters were generated. Double-check flank selections and threshold settings.",
376
+ None,
377
+ )
294
378
 
295
- lines = ["### UMI Hunter", ""]
379
+ lines = [
380
+ "### UMI Hunter",
381
+ "",
382
+ "Reads were scanned for UMI and gene flanks, deduplicated by UMI, and consensus alleles were generated.",
383
+ "",
384
+ "**Run outputs**",
385
+ ]
296
386
  sample_dirs = []
297
387
  for entry in results:
388
+ total_clusters = entry.get("clusters_total", entry["clusters"])
298
389
  lines.append(
299
- f"**{entry['sample']}** → {entry['clusters']} clusters, results in {entry['directory']}"
390
+ f"- **{entry['sample']}** → {entry['clusters']} consensus clusters "
391
+ f"(≥ {min_cluster_size_int} reads) from {total_clusters} total, "
392
+ f"results in {entry['directory']}"
300
393
  )
301
394
  sample_dirs.append(Path(entry["directory"]))
302
395
  summary = "\n".join(lines)
@@ -305,35 +398,82 @@ def run_gui_umi_hunter(
305
398
  except Exception as exc: # pragma: no cover
306
399
  _LOGGER.exception("UMI hunter GUI failure")
307
400
  return f"⚠️ Error: {exc}", None
401
+ finally:
402
+ if config_dir:
403
+ _clean_temp_path(config_dir)
404
+ if output_dir:
405
+ _clean_temp_path(output_dir)
308
406
 
309
407
 
310
408
  def run_gui_profile_inserts(
311
- probes_csv_path: Optional[str],
409
+ probes_table: Any,
312
410
  fastq_files: Sequence[str],
411
+ min_ratio: int,
313
412
  ) -> Tuple[str, Optional[str]]:
413
+ config_dir: Optional[Path] = None
414
+ output_dir: Optional[Path] = None
314
415
  try:
315
- if not probes_csv_path or not fastq_files:
316
- raise ValueError("Upload the probe CSV and at least one FASTQ(.gz) file.")
416
+ if not fastq_files:
417
+ raise ValueError("Upload at least one FASTQ(.gz) file.")
418
+ if probes_table is None:
419
+ raise ValueError("Provide at least one probe pair.")
420
+
421
+ if isinstance(probes_table, pd.DataFrame):
422
+ df = probes_table.copy()
423
+ else:
424
+ df = pd.DataFrame(probes_table or [], columns=["name", "upstream", "downstream"])
425
+
426
+ # Normalise and validate probe entries
427
+ df = df.replace({pd.NA: "", None: ""})
428
+ for column in df.columns:
429
+ if df[column].dtype == object:
430
+ df[column] = df[column].map(lambda x: x.strip() if isinstance(x, str) else x)
431
+
432
+ if "upstream" not in df.columns or "downstream" not in df.columns:
433
+ raise ValueError("Probe table must contain 'upstream' and 'downstream' columns.")
434
+
435
+ df_valid = df[(df["upstream"] != "") & (df["downstream"] != "")].copy()
436
+ if df_valid.empty:
437
+ raise ValueError("Enter at least one probe pair with both upstream and downstream sequences.")
438
+
439
+ df_valid = df_valid.reset_index(drop=True)
440
+ if "name" not in df_valid.columns:
441
+ df_valid["name"] = [f"probe_{i + 1}" for i in range(len(df_valid))]
442
+ else:
443
+ fallback_names = pd.Series(
444
+ [f"probe_{i + 1}" for i in range(len(df_valid))], index=df_valid.index
445
+ )
446
+ df_valid["name"] = df_valid["name"].replace("", pd.NA).fillna(fallback_names)
447
+
448
+ config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_profile_cfg_"))
449
+ probes_csv = config_dir / "probes.csv"
450
+ df_valid.to_csv(probes_csv, index=False)
317
451
 
318
452
  output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_profile_out_"))
319
453
  results = run_profile_inserts(
320
- probes_csv=Path(probes_csv_path),
454
+ probes_csv=probes_csv,
321
455
  fastq_files=[Path(f) for f in fastq_files],
322
456
  output_dir=output_dir,
457
+ min_ratio=int(min_ratio),
323
458
  )
324
459
 
325
460
  if not results:
326
- return "No inserts were extracted. Adjust probe settings and try again.", None
461
+ return "No inserts were extracted. Adjust probe sequences or similarity threshold and try again.", None
327
462
 
328
463
  first_insert = results[0]["fasta"] if isinstance(results, list) else None
329
464
  preview = "*(preview unavailable)*"
330
465
  if first_insert and Path(first_insert).exists():
331
- preview = Path(first_insert).read_text().splitlines()[0][:80] + "..."
466
+ preview = Path(first_insert).read_text().splitlines()[0][:120] + "..."
332
467
 
333
468
  summary = textwrap.dedent(
334
469
  """
335
470
  ### Insert Profiling
336
- Extracted inserts and generated QC metrics. Download the archive for full outputs.
471
+ Probe-defined regions were scanned in the provided FASTQ files, inserts were extracted, and QC metrics were generated.
472
+
473
+ **Key outputs**
474
+ - FASTA files containing extracted inserts per probe pair
475
+ - Summary tables covering length, GC content, duplicate rate, and probe match quality
476
+ - A gallery of QC plots (length distributions, base composition, probe performance)
337
477
  """
338
478
  )
339
479
  archive = _zip_paths([Path(r["directory"]) for r in results], "profile_inserts")
@@ -341,6 +481,11 @@ def run_gui_profile_inserts(
341
481
  except Exception as exc: # pragma: no cover
342
482
  _LOGGER.exception("Profile inserts GUI failure")
343
483
  return f"⚠️ Error: {exc}", None
484
+ finally:
485
+ if config_dir:
486
+ _clean_temp_path(config_dir)
487
+ if output_dir:
488
+ _clean_temp_path(output_dir)
344
489
 
345
490
 
346
491
  def run_gui_ep_library_profile(
@@ -406,18 +551,34 @@ def create_gui() -> gr.Blocks:
406
551
  textwrap.dedent(
407
552
  """
408
553
  # uht-tooling
409
- A guided graphical interface for primer design and sequencing analysis.
410
- Use the tabs below, supply the required inputs, and download the generated results.
554
+ A guided graphical interface for primer design and sequencing analysis. Each tab mirrors the command-line workflows documented in the README and bundles results, logs, and QC artefacts for download.
555
+
556
+ **How to use**
557
+ 1. Select the workflow that matches your experiment.
558
+ 2. Provide the required inputs (text fields, FASTQ/FASTA uploads, or probe tables).
559
+ 3. Run the analysis and download the ZIP archive for complete outputs.
560
+
561
+ Need automation or batch processing? Use the Typer CLI (`uht-tooling ...`) with the same arguments shown here.
411
562
  """
412
563
  )
413
564
  )
414
565
 
415
566
  with gr.Tab("Nextera XT"): # --- Nextera ---
416
567
  gr.Markdown(
417
- """
418
- ### Illumina-Compatible Primer Design
419
- Provide the forward and reverse binding regions in 5'→3' orientation.
420
- """
568
+ textwrap.dedent(
569
+ """
570
+ ### Illumina-Compatible Primer Design
571
+ Generates Nextera XT-ready primers from forward/reverse binding regions. The workflow preloads 12 i5 and 12 i7 indices (144 combinations) and mirrors the “One-PCR-to-flowcell” process described in the README.
572
+
573
+ **Inputs**
574
+ - Forward primer binding region (5'→3')
575
+ - Reverse primer binding region (5'→3')
576
+
577
+ **Outputs**
578
+ - CSV with i5/i7 indices, primer sequences, and ordering-ready metadata.
579
+ - Run log noting index selection and any validation warnings.
580
+ """
581
+ )
421
582
  )
422
583
  forward = gr.Textbox(label="Forward primer (5'→3')")
423
584
  reverse = gr.Textbox(label="Reverse primer (5'→3')")
@@ -429,13 +590,34 @@ def create_gui() -> gr.Blocks:
429
590
  inputs=[forward, reverse],
430
591
  outputs=[nextera_summary, nextera_download],
431
592
  )
593
+ with gr.Accordion("Wet-lab guidance", open=False):
594
+ gr.Markdown(
595
+ textwrap.dedent(
596
+ """
597
+ - Monitor amplification by qPCR and cap the cycle count to reach roughly 10 % yield to limit bias.
598
+ - Purify products with SPRIselect beads (~0.65:1 bead:DNA ratio) to remove residual primers.
599
+ - Confirm primer depletion via electrophoresis (e.g., BioAnalyzer) before sequencing prep.
600
+ """
601
+ )
602
+ )
432
603
 
433
604
  with gr.Tab("SLIM"):
434
605
  gr.Markdown(
435
- """
436
- ### Sequence-Ligation Independent Mutagenesis
437
- Paste the gene coding sequence, the plasmid context, and one mutation per line.
438
- """
606
+ textwrap.dedent(
607
+ """
608
+ ### Sequence-Ligation Independent Mutagenesis
609
+ Designs paired short/long primers to introduce targeted mutations by SLIM cloning, matching the workflow outlined in the README.
610
+
611
+ **Inputs**
612
+ - Target gene coding sequence (FASTA content).
613
+ - Plasmid or genomic context containing the gene.
614
+ - Mutations (one per line, e.g. substitution `A123G`, deletion `T241Del`, insertion `T241TS`).
615
+
616
+ **Outputs**
617
+ - `SLIM_primers.csv` with primer sequences and annealing temperatures.
618
+ - Log file capturing primer QC and any design warnings.
619
+ """
620
+ )
439
621
  )
440
622
  slim_gene = gr.Textbox(label="Gene sequence", lines=4)
441
623
  slim_context = gr.Textbox(label="Plasmid context", lines=4)
@@ -448,13 +630,36 @@ def create_gui() -> gr.Blocks:
448
630
  inputs=[slim_gene, slim_context, slim_mutations],
449
631
  outputs=[slim_summary, slim_download],
450
632
  )
633
+ with gr.Accordion("Bench workflow blueprint", open=False):
634
+ gr.Markdown(
635
+ textwrap.dedent(
636
+ """
637
+ 1. Run two PCRs: (A) long forward + short reverse, (B) long reverse + short forward.
638
+ 2. Combine 10 µL from each PCR with 10 µL H-buffer (150 mM Tris pH 8, 400 mM NaCl, 60 mM EDTA).
639
+ 3. Thermocycle: 99 °C 3 min → 2× (65 °C 5 min → 30 °C 15 min) → hold at 4 °C.
640
+ 4. Transform directly into NEB 5-alpha or BL21 (DE3); the method scales to dozens of mutants simultaneously.
641
+ """
642
+ )
643
+ )
451
644
 
452
645
  with gr.Tab("Gibson"):
453
646
  gr.Markdown(
454
- """
455
- ### Gibson Assembly Primer Design
456
- Use `+` to combine multiple mutations applied simultaneously.
457
- """
647
+ textwrap.dedent(
648
+ """
649
+ ### Gibson Assembly Primer Design
650
+ Plans primer sets and assembly steps for Gibson mutagenesis, supporting multi-mutation constructs using the `+` syntax (e.g. `A123G+T150A`).
651
+
652
+ **Inputs**
653
+ - Coding sequence for the gene of interest.
654
+ - Circular plasmid context sequence.
655
+ - Mutation definitions (one per line; use `+` to bundle simultaneous edits).
656
+
657
+ **Outputs**
658
+ - Primer CSV with overlap sequences and melting temperatures.
659
+ - Assembly plan CSV detailing fragment combinations.
660
+ - Log summarising design decisions and any warnings about overlapping regions.
661
+ """
662
+ )
458
663
  )
459
664
  gibson_gene = gr.Textbox(label="Gene sequence", lines=4)
460
665
  gibson_context = gr.Textbox(label="Plasmid context", lines=4)
@@ -467,74 +672,270 @@ def create_gui() -> gr.Blocks:
467
672
  inputs=[gibson_gene, gibson_context, gibson_mutations],
468
673
  outputs=[gibson_summary, gibson_download],
469
674
  )
675
+ with gr.Accordion("Tips for multi-mutation designs", open=False):
676
+ gr.Markdown(
677
+ textwrap.dedent(
678
+ """
679
+ - If two mutations compete for primer space, design them in sequential runs to avoid overly long primers.
680
+ - Use the assembly plan CSV to map which fragments to combine in each Gibson reaction.
681
+ - When replacing entire codons (e.g. `L46GP`), ensure the plasmid context covers both flanks to maintain overlap.
682
+ """
683
+ )
684
+ )
470
685
 
471
686
  with gr.Tab("Mutation Caller"):
472
687
  gr.Markdown(
473
- """
474
- ### Long-read Mutation Analysis
475
- Upload a FASTQ(.gz), the template FASTA, and the mutation_caller CSV configuration.
476
- """
688
+ textwrap.dedent(
689
+ """
690
+ ### Long-read Mutation Analysis
691
+ Extracts coding regions bounded by user-defined flanks, aligns them to the template, and reports amino-acid substitutions alongside co-occurrence summaries.
692
+
693
+ **Required inputs**
694
+ - FASTQ (.fastq.gz): Oxford Nanopore or other long-read data.
695
+ - Template FASTA: coding sequence used as the reference for alignment.
696
+ - Flank sequences: short 8–12 bp motifs immediately upstream and downstream of the gene.
697
+ - Gene length bounds: acceptable size window (in nucleotides) for the extracted gene segment.
698
+ """
699
+ )
477
700
  )
478
- mc_fastq = gr.File(label="FASTQ (.fastq.gz)", file_types=[".fastq", ".gz"], type="filepath")
479
- mc_template = gr.File(label="Template FASTA", file_types=[".fasta", ".fa"], type="filepath")
480
- mc_config = gr.File(label="Configuration CSV", file_types=[".csv"], type="filepath")
701
+ with gr.Row():
702
+ mc_fastq = gr.File(
703
+ label="FASTQ (.fastq.gz)",
704
+ file_types=[".fastq", ".gz"],
705
+ type="filepath",
706
+ )
707
+ mc_template = gr.File(
708
+ label="Template FASTA",
709
+ file_types=[".fasta", ".fa"],
710
+ type="filepath",
711
+ )
712
+ with gr.Row():
713
+ mc_upstream = gr.Textbox(
714
+ label="Upstream flank (5'→3')",
715
+ placeholder="e.g. ACTGTTAG",
716
+ )
717
+ mc_downstream = gr.Textbox(
718
+ label="Downstream flank (5'→3')",
719
+ placeholder="e.g. CGAACCTA",
720
+ )
721
+ with gr.Row():
722
+ mc_min_len = gr.Number(
723
+ label="Minimum gene length (nt)",
724
+ value=900,
725
+ precision=0,
726
+ )
727
+ mc_max_len = gr.Number(
728
+ label="Maximum gene length (nt)",
729
+ value=1200,
730
+ precision=0,
731
+ )
481
732
  mc_btn = gr.Button("Run mutation caller", variant="primary")
482
733
  mc_summary = gr.Markdown(label="Summary")
483
734
  mc_download = gr.File(label="Download results", file_count="single")
484
735
  mc_btn.click(
485
736
  fn=run_gui_mutation_caller,
486
- inputs=[mc_fastq, mc_template, mc_config],
737
+ inputs=[
738
+ mc_fastq,
739
+ mc_template,
740
+ mc_upstream,
741
+ mc_downstream,
742
+ mc_min_len,
743
+ mc_max_len,
744
+ ],
487
745
  outputs=[mc_summary, mc_download],
488
746
  )
747
+ with gr.Accordion("What happens under the hood", open=False):
748
+ gr.Markdown(
749
+ textwrap.dedent(
750
+ """
751
+ - Reads are scanned for the upstream and downstream flanks; the sequence between them is treated as the gene of interest if it falls within the specified length window.
752
+ - MAFFT aligns recovered genes to the reference template and the pipeline annotates amino-acid substitutions, co-occurrence networks, and depth statistics.
753
+ - Outputs mirror the CLI version: per-sample directories with CSV summaries, JSON co-occurrence graphs, QC plots, and a detailed `run.log`.
754
+ """
755
+ )
756
+ )
489
757
 
490
758
  with gr.Tab("UMI Hunter"):
491
759
  gr.Markdown(
492
- """
493
- ### UMI-Gene Pair Clustering
494
- Upload a FASTQ(.gz), template FASTA, and the UMI configuration CSV.
495
- """
760
+ textwrap.dedent(
761
+ """
762
+ ### UMI–Gene Pair Clustering
763
+ Detects UMI barcodes, extracts paired gene inserts, clusters reads by UMI identity, and emits consensus sequences with abundance tables.
764
+
765
+ **Required inputs**
766
+ - FASTQ (.fastq.gz) containing UMI-tagged reads.
767
+ - Template FASTA for downstream consensus calling.
768
+ - UMI and gene flank sequences marking the barcode and insert boundaries.
769
+ - UMI length bounds plus clustering thresholds.
770
+ - Minimum reads per cluster to keep (clusters below the threshold are reported but no consensus is generated).
771
+ """
772
+ )
773
+ )
774
+ with gr.Row():
775
+ umi_fastq = gr.File(
776
+ label="FASTQ (.fastq.gz)",
777
+ file_types=[".fastq", ".gz"],
778
+ type="filepath",
779
+ )
780
+ umi_template = gr.File(
781
+ label="Template FASTA",
782
+ file_types=[".fasta", ".fa"],
783
+ type="filepath",
784
+ )
785
+ with gr.Row():
786
+ umi_start = gr.Textbox(
787
+ label="UMI upstream flank (5'→3')",
788
+ placeholder="e.g. ACACTCTTTCCCTACACGAC",
789
+ )
790
+ umi_end = gr.Textbox(
791
+ label="UMI downstream flank (5'→3')",
792
+ placeholder="e.g. GACTGGAGTTCAGACGTGTG",
793
+ )
794
+ with gr.Row():
795
+ gene_start = gr.Textbox(
796
+ label="Gene upstream flank (5'→3')",
797
+ placeholder="e.g. ATG...",
798
+ )
799
+ gene_end = gr.Textbox(
800
+ label="Gene downstream flank (5'→3')",
801
+ placeholder="e.g. TTA...",
802
+ )
803
+ with gr.Row():
804
+ umi_min_len = gr.Number(
805
+ label="Minimum UMI length (nt)",
806
+ value=8,
807
+ precision=0,
808
+ )
809
+ umi_max_len = gr.Number(
810
+ label="Maximum UMI length (nt)",
811
+ value=14,
812
+ precision=0,
813
+ )
814
+ with gr.Row():
815
+ umi_identity = gr.Slider(
816
+ label="UMI clustering identity",
817
+ minimum=0.5,
818
+ maximum=1.0,
819
+ value=0.9,
820
+ step=0.05,
821
+ )
822
+ consensus_threshold = gr.Slider(
823
+ label="Consensus mutation threshold",
824
+ minimum=0.5,
825
+ maximum=1.0,
826
+ value=0.7,
827
+ step=0.05,
828
+ )
829
+ umi_min_cluster = gr.Slider(
830
+ label="Minimum reads per cluster",
831
+ minimum=1,
832
+ maximum=50,
833
+ value=3,
834
+ step=1,
496
835
  )
497
- umi_fastq = gr.File(label="FASTQ (.fastq.gz)", file_types=[".fastq", ".gz"], type="filepath")
498
- umi_template = gr.File(label="Template FASTA", file_types=[".fasta", ".fa"], type="filepath")
499
- umi_config = gr.File(label="UMI config CSV", file_types=[".csv"], type="filepath")
500
836
  umi_btn = gr.Button("Run UMI hunter", variant="primary")
501
837
  umi_summary = gr.Markdown(label="Summary")
502
838
  umi_download = gr.File(label="Download results", file_count="single")
503
839
  umi_btn.click(
504
840
  fn=run_gui_umi_hunter,
505
- inputs=[umi_fastq, umi_template, umi_config],
841
+ inputs=[
842
+ umi_fastq,
843
+ umi_template,
844
+ umi_start,
845
+ umi_end,
846
+ umi_min_len,
847
+ umi_max_len,
848
+ gene_start,
849
+ gene_end,
850
+ umi_identity,
851
+ consensus_threshold,
852
+ umi_min_cluster,
853
+ ],
506
854
  outputs=[umi_summary, umi_download],
507
855
  )
856
+ with gr.Accordion("What the pipeline generates", open=False):
857
+ gr.Markdown(
858
+ textwrap.dedent(
859
+ """
860
+ - Reads are searched for the UMI barcode and gene flanks on both strands; valid pairs feed into UMI grouping.
861
+ - UMIs within the chosen identity threshold are merged, and consensus sequences are computed with the mutation threshold.
862
+ - Outputs include per-sample summaries, consensus FASTA files, cluster membership tables, QC plots, and logs mirroring the CLI workflow.
863
+ """
864
+ )
865
+ )
508
866
 
509
867
  with gr.Tab("Profile Inserts"):
510
868
  gr.Markdown(
511
- """
512
- ### Insert Profiling
513
- Upload the probe CSV and one or more FASTQ(.gz) files containing reads.
514
- """
869
+ textwrap.dedent(
870
+ """
871
+ ### Probe-Guided Insert Profiling
872
+ Characterises inserts demarcated by user-supplied upstream/downstream probes, extracts sequences, and produces QC plots plus summary tables.
873
+
874
+ **Required inputs**
875
+ - FASTQ reads containing the inserts of interest.
876
+ - One or more probe pairs: 5'→3' sequences for the upstream and downstream anchors (reverse complements are matched automatically).
877
+ """
878
+ )
879
+ )
880
+ probes_table = gr.Dataframe(
881
+ headers=["name (optional)", "upstream", "downstream"],
882
+ datatype=["str", "str", "str"],
883
+ row_count=(1, "dynamic"),
884
+ col_count=3,
885
+ value=[["probe_1", "", ""]],
886
+ interactive=True,
887
+ label="Probe pairs",
515
888
  )
516
- pi_csv = gr.File(label="Probe CSV", file_types=[".csv"], type="filepath")
517
889
  pi_fastq = gr.File(
518
- label="FASTQ files",
890
+ label="FASTQ files (.fastq/.gz)",
519
891
  file_types=[".fastq", ".gz"],
520
892
  file_count="multiple",
521
893
  type="filepath",
522
894
  )
895
+ pi_ratio = gr.Slider(
896
+ label="Minimum fuzzy-match ratio",
897
+ minimum=50,
898
+ maximum=100,
899
+ value=80,
900
+ step=1,
901
+ )
523
902
  pi_btn = gr.Button("Profile inserts", variant="primary")
524
903
  pi_summary = gr.Markdown(label="Summary")
525
904
  pi_download = gr.File(label="Download results", file_count="single")
526
905
  pi_btn.click(
527
906
  fn=run_gui_profile_inserts,
528
- inputs=[pi_csv, pi_fastq],
907
+ inputs=[probes_table, pi_fastq, pi_ratio],
529
908
  outputs=[pi_summary, pi_download],
530
909
  )
910
+ with gr.Accordion("Output overview", open=False):
911
+ gr.Markdown(
912
+ textwrap.dedent(
913
+ """
914
+ - Inserts are extracted whenever probe matches are detected above the chosen similarity threshold (default 80).
915
+ - A FASTA file of inserts, probe-level QC metrics, base composition summaries, and a suite of plots (length distribution, GC content, duplicate rate, probe performance) are packaged for each input FASTQ.
916
+ - Logs are stored alongside the results so runs remain fully reproducible.
917
+ """
918
+ )
919
+ )
531
920
 
532
921
  with gr.Tab("EP Library Profile"):
533
922
  gr.Markdown(
534
- """
535
- ### Library Profiling Without UMIs
536
- Upload one or more FASTQ(.gz) files plus the region and plasmid references.
537
- """
923
+ textwrap.dedent(
924
+ """
925
+ ### Library Profiling Without UMIs
926
+ Estimates background and target mutation rates for enzyme evolution libraries without UMI barcodes.
927
+
928
+ **Inputs**
929
+ - FASTQ reads (*.fastq/.gz) from the ep-library experiment.
930
+ - Region-of-interest FASTA delineating the mutational window.
931
+ - Plasmid FASTA providing the full reference context.
932
+
933
+ **Outputs**
934
+ - Per-sample directories with coverage tables, mutation rate statistics, and QC plots.
935
+ - `master_summary.txt` aggregating condition-level metrics.
936
+ - Verbose logs recording alignment commands and rate calculations.
937
+ """
938
+ )
538
939
  )
539
940
  ep_fastq = gr.File(
540
941
  label="FASTQ files",
@@ -552,6 +953,17 @@ def create_gui() -> gr.Blocks:
552
953
  inputs=[ep_fastq, ep_region, ep_plasmid],
553
954
  outputs=[ep_summary, ep_download],
554
955
  )
956
+ with gr.Accordion("How mutation rates are derived", open=False):
957
+ gr.Markdown(
958
+ textwrap.dedent(
959
+ """
960
+ - Reads are aligned against both the region-of-interest and the full plasmid to measure target and background mismatch rates; their difference yields the net nucleotide mutation rate with propagated binomial and quality-score uncertainty.
961
+ - The net per-base rate is multiplied by the CDS length to obtain λ₍bp₎ (mutations per copy), then Monte Carlo simulations flip random bases, translate the mutated CDS, and count amino-acid differences—those simulated means and confidence intervals are the values plotted in the QC figure.
962
+ - When multiple Q-score thresholds are analysed, the CLI combines them via a precision-weighted consensus (after discarding filters with <1000 mappable bases). The consensus AA mutation rate is written to `aa_mutation_consensus.txt` and drawn as a horizontal guide in the plot.
963
+ - Download the archive to inspect per-sample plots, TSV summaries, the consensus summary, and logs for troubleshooting.
964
+ """
965
+ )
966
+ )
555
967
 
556
968
  gr.Markdown(
557
969
  textwrap.dedent(
@@ -15,7 +15,7 @@ import matplotlib.pyplot as plt
15
15
  import math
16
16
  import tempfile
17
17
  from pathlib import Path
18
- from typing import Dict, Iterable, List, Optional, Sequence
18
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
19
19
 
20
20
  # Use a built-in Matplotlib style ("ggplot") for consistency
21
21
  plt.style.use("ggplot")
@@ -505,94 +505,153 @@ def run_qc_analysis(fastq_path, results_dir, ref_hit_fasta, plasmid_fasta):
505
505
  else:
506
506
  logging.warning(f"Failed to calculate mutation rate for quality threshold {q_threshold}")
507
507
 
508
- # Find optimal Q-score threshold (lowest empirical error)
509
- optimal_qscore, optimal_result = find_optimal_qscore_simple(qc_results)
508
+ # Derive consensus AA mutation estimates across valid Q-score thresholds
509
+ consensus_info, _ = compute_consensus_aa_mutation(qc_results)
510
510
 
511
511
  # Create QC plots
512
512
  if len(qc_results) >= 2:
513
- create_simple_qc_plots(successful_thresholds, qc_results, results_dir, optimal_qscore, optimal_result)
513
+ create_simple_qc_plots(
514
+ successful_thresholds,
515
+ qc_results,
516
+ results_dir,
517
+ consensus_info=consensus_info,
518
+ )
514
519
  else:
515
520
  logging.warning("Insufficient data points for QC plots (need at least 2)")
516
521
 
517
- # Save optimal Q-score information
518
- if optimal_qscore is not None:
519
- optimal_qscore_path = os.path.join(results_dir, "optimal_qscore_analysis.txt")
520
- with open(optimal_qscore_path, 'w') as f:
521
- f.write("=== OPTIMAL Q-SCORE ANALYSIS (PRECISION-WEIGHTED) ===\n")
522
- f.write(f"Optimal Q-score threshold: {optimal_qscore}\n")
523
- f.write(f"Precision-weighted score: {(1.0 / optimal_result['std_aa_mutations']) * optimal_qscore:.6f}\n" if optimal_result['std_aa_mutations'] > 0 else "Precision-weighted score: inf (perfect precision)\n")
524
- f.write(f"Empirical error (std): {optimal_result['std_aa_mutations']:.6f}\n")
525
- f.write(f"AA mutations per gene: {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}\n")
526
- f.write(f"95% Confidence Interval: [{optimal_result['ci_lower']:.4f}, {optimal_result['ci_upper']:.4f}]\n")
527
- f.write(f"Total mappable bases: {optimal_result['total_mappable_bases']}\n")
528
- f.write(f"Number of segments: {optimal_result['n_segments']}\n")
529
- f.write("\n=== ALL Q-SCORE COMPARISON ===\n")
530
- f.write("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases\tAA_Mutations\tCI_Lower\tCI_Upper\n")
531
- for result in qc_results:
532
- precision_score = (1.0 / result['std_aa_mutations']) * result['quality_threshold'] if result['std_aa_mutations'] > 0 else float('inf')
533
- f.write(f"{result['quality_threshold']}\t{result['std_aa_mutations']:.6f}\t{precision_score:.6f}\t{result['total_mappable_bases']}\t{result['mean_aa_mutations']:.4f}\t{result['ci_lower']:.4f}\t{result['ci_upper']:.4f}\n")
534
-
535
- logging.info(f"Optimal Q-score analysis saved to: {optimal_qscore_path}")
522
+ # Save consensus summary
523
+ consensus_summary_path = os.path.join(results_dir, "aa_mutation_consensus.txt")
524
+ with open(consensus_summary_path, "w") as f:
525
+ f.write("=== CONSENSUS AMINO-ACID MUTATION ESTIMATE ===\n")
526
+ if consensus_info:
527
+ f.write(f"Minimum mappable bases required: {consensus_info['min_mappable_bases']}\n")
528
+ f.write(
529
+ f"Consensus AA mutations per gene: {consensus_info['consensus_mean']:.4f} ± "
530
+ f"{consensus_info['consensus_std']:.4f}\n"
531
+ )
532
+ f.write(f"Thresholds contributing: {consensus_info['thresholds_used']}\n")
533
+ f.write(f"Normalized weights: {consensus_info['weights']}\n")
534
+ if consensus_info.get("note"):
535
+ f.write(f"Note: {consensus_info['note']}\n")
536
+ else:
537
+ f.write("Consensus AA mutation rate could not be computed; see QC logs for details.\n")
538
+ f.write("\n=== ALL Q-SCORE RESULTS ===\n")
539
+ f.write(
540
+ "Q-score\tMean_AA\tStd_AA\tCI_Lower\tCI_Upper\tMappable_Bases\tSegments\n"
541
+ )
542
+ for result in qc_results:
543
+ f.write(
544
+ f"{result['quality_threshold']}\t"
545
+ f"{result['mean_aa_mutations']:.6f}\t"
546
+ f"{result['std_aa_mutations']:.6f}\t"
547
+ f"{result['ci_lower']:.6f}\t"
548
+ f"{result['ci_upper']:.6f}\t"
549
+ f"{result['total_mappable_bases']}\t"
550
+ f"{result['n_segments']}\n"
551
+ )
552
+ logging.info("Consensus AA mutation summary saved to: %s", consensus_summary_path)
536
553
 
537
554
  # Clean up segment files
538
- import shutil
539
555
  segment_dir = os.path.dirname(segment_files[0])
540
556
  if os.path.exists(segment_dir):
541
557
  shutil.rmtree(segment_dir)
542
558
  logging.info(f"Cleaned up segment directory: {segment_dir}")
543
559
 
544
- # Return both QC results and optimal Q-score for use in main analysis
545
- return qc_results, optimal_qscore
560
+ # Return QC results and consensus information for downstream analysis
561
+ return qc_results, consensus_info
546
562
 
547
- def find_optimal_qscore_simple(qc_results):
563
+ def compute_consensus_aa_mutation(
564
+ qc_results: List[dict],
565
+ min_mappable_bases: int = 1000,
566
+ ) -> Tuple[Optional[dict], List[dict]]:
548
567
  """
549
- Find the Q-score threshold with the highest precision-weighted score.
550
- Precision-weighted score = (1 / standard_deviation) * q_score
551
-
552
- Args:
553
- qc_results: List of segmentation analysis results
554
-
568
+ Derive a consensus amino-acid mutation estimate across Q-score thresholds.
569
+
570
+ Each threshold must meet a minimum coverage requirement. The consensus is a
571
+ precision-weighted average (weights = 1 / std_aa_mutations).
572
+
555
573
  Returns:
556
- tuple: (optimal_qscore, optimal_result)
574
+ consensus_info (dict or None)
575
+ {
576
+ 'consensus_mean': float,
577
+ 'consensus_std': float,
578
+ 'thresholds_used': List[int],
579
+ 'weights': List[float],
580
+ 'min_mappable_bases': int,
581
+ }
582
+ valid_results: list of QC result dicts that were included in the consensus
557
583
  """
558
- logging.info("=== FINDING OPTIMAL Q-SCORE THRESHOLD (PRECISION-WEIGHTED) ===")
559
-
560
584
  if not qc_results:
561
- return None, None
562
-
563
- # Find Q-score with highest precision-weighted score
564
- max_score = -1
565
- optimal_result = None
566
- optimal_qscore = None
567
-
568
- logging.info("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases")
569
- logging.info("-" * 60)
570
-
585
+ return None, []
586
+
587
+ valid_results = []
571
588
  for result in qc_results:
572
- qscore = result['quality_threshold']
573
- empirical_error = result['std_aa_mutations']
574
- mappable_bases = result['total_mappable_bases']
575
-
576
- # Calculate precision-weighted score: (1/sd) * q_score
577
- if empirical_error > 0:
578
- precision_score = (1.0 / empirical_error) * qscore
579
- else:
580
- precision_score = float('inf') # Perfect precision
581
-
582
- logging.info(f"Q{qscore}\t{empirical_error:.6f}\t{precision_score:.6f}\t{mappable_bases}")
583
-
584
- if precision_score > max_score:
585
- max_score = precision_score
586
- optimal_result = result
587
- optimal_qscore = qscore
588
-
589
- logging.info("-" * 60)
590
- logging.info(f"OPTIMAL Q-SCORE: Q{optimal_qscore} (highest precision-weighted score: {max_score:.6f})")
591
- logging.info(f"Optimal result: AA mutations = {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}")
592
-
593
- return optimal_qscore, optimal_result
589
+ total_bases = result.get("total_mappable_bases", 0)
590
+ std_aa = result.get("std_aa_mutations", 0.0)
591
+ if total_bases is None:
592
+ total_bases = 0
593
+ if total_bases >= min_mappable_bases and std_aa is not None:
594
+ valid_results.append(result)
595
+
596
+ if not valid_results:
597
+ logging.warning(
598
+ "No Q-score thresholds met the minimum mappable base requirement (%s). "
599
+ "Consensus AA mutation rate will fall back to the threshold with the highest coverage.",
600
+ min_mappable_bases,
601
+ )
602
+ best_by_coverage = max(qc_results, key=lambda r: r.get("total_mappable_bases", 0))
603
+ fallback_std = best_by_coverage.get("std_aa_mutations", 0.0)
604
+ consensus_info = {
605
+ "consensus_mean": best_by_coverage.get("mean_aa_mutations", 0.0),
606
+ "consensus_std": fallback_std,
607
+ "thresholds_used": [best_by_coverage.get("quality_threshold")],
608
+ "weights": [1.0],
609
+ "min_mappable_bases": min_mappable_bases,
610
+ "note": "FELL_BACK_TO_MAX_COVERAGE",
611
+ }
612
+ return consensus_info, [best_by_coverage]
613
+
614
+ weights = []
615
+ means = []
616
+ variances = []
617
+ thresholds = []
618
+ for result in valid_results:
619
+ std_aa = result.get("std_aa_mutations", 0.0) or 0.0
620
+ weight = 1.0 / max(std_aa, 1e-9) # Avoid division by zero; effectively a very large weight.
621
+ weights.append(weight)
622
+ means.append(result.get("mean_aa_mutations", 0.0))
623
+ variances.append(std_aa**2)
624
+ thresholds.append(result.get("quality_threshold"))
625
+
626
+ weight_sum = float(np.sum(weights))
627
+ normalized_weights = [w / weight_sum for w in weights]
628
+ consensus_mean = float(np.sum(np.array(normalized_weights) * np.array(means)))
629
+
630
+ combined_variance = 0.0
631
+ for w, mean, var in zip(normalized_weights, means, variances):
632
+ combined_variance += w * (var + (mean - consensus_mean) ** 2)
633
+ combined_variance = max(combined_variance, 0.0)
634
+ consensus_std = float(np.sqrt(combined_variance))
635
+
636
+ consensus_info = {
637
+ "consensus_mean": consensus_mean,
638
+ "consensus_std": consensus_std,
639
+ "thresholds_used": thresholds,
640
+ "weights": normalized_weights,
641
+ "min_mappable_bases": min_mappable_bases,
642
+ "note": "WEIGHTED_AVERAGE",
643
+ }
644
+
645
+ logging.info(
646
+ "Consensus AA mutation estimate: %.4f ± %.4f (thresholds used: %s)",
647
+ consensus_mean,
648
+ consensus_std,
649
+ thresholds,
650
+ )
594
651
 
595
- def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_qscore=None, optimal_result=None):
652
+ return consensus_info, valid_results
653
+
654
+ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensus_info=None):
596
655
  """
597
656
  Create simple QC plots with empirical error bars.
598
657
 
@@ -600,8 +659,7 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_
600
659
  quality_thresholds: List of quality score thresholds
601
660
  qc_results: List of segmentation analysis results
602
661
  results_dir: Directory to save the plots
603
- optimal_qscore: Optimal Q-score threshold (optional)
604
- optimal_result: Optimal result data (optional)
662
+ consensus_info: Optional dict describing the consensus AA mutation estimate.
605
663
  """
606
664
  try:
607
665
  # Extract data for plotting
@@ -624,10 +682,17 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_
624
682
  ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
625
683
  alpha=0.3, color=color1, label='95% Confidence Interval')
626
684
 
627
- # Highlight optimal Q-score
628
- if optimal_qscore is not None:
629
- ax1.axvline(x=optimal_qscore, color='red', linestyle='--', alpha=0.7,
630
- label=f'Optimal Q{optimal_qscore}')
685
+ # Add consensus AA mutation estimate if available
686
+ if consensus_info and consensus_info.get("consensus_mean") is not None:
687
+ consensus_mean = consensus_info["consensus_mean"]
688
+ consensus_std = consensus_info.get("consensus_std", 0.0)
689
+ ax1.axhline(
690
+ y=consensus_mean,
691
+ color='red',
692
+ linestyle='--',
693
+ alpha=0.7,
694
+ label=f"Consensus AA mutations ({consensus_mean:.3f}±{consensus_std:.3f})",
695
+ )
631
696
 
632
697
  ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
633
698
  ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
@@ -1185,16 +1250,23 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
1185
1250
  bg_rate = bg_mis / bg_cov if bg_cov > 0 else 0
1186
1251
  net_rate = max(hit_rate - bg_rate, 0.0)
1187
1252
 
1188
- # Calculate AA mutations per gene (simplified)
1253
+ # Calculate AA mutations per gene via Monte Carlo simulation
1189
1254
  lambda_bp = net_rate * len(hit_seq)
1190
- aa_mutations = lambda_bp / 3.0 # Approximate: 3 bp per AA
1255
+ aa_samples = simulate_aa_distribution(lambda_bp, hit_seq, n_trials=500)
1256
+ if len(aa_samples) > 1:
1257
+ aa_mean = float(np.mean(aa_samples))
1258
+ aa_var = float(np.var(aa_samples, ddof=1))
1259
+ else:
1260
+ aa_mean = float(aa_samples[0]) if aa_samples else 0.0
1261
+ aa_var = 0.0
1191
1262
 
1192
1263
  segment_results.append({
1193
1264
  'segment': i+1,
1194
1265
  'hit_rate': hit_rate,
1195
1266
  'bg_rate': bg_rate,
1196
1267
  'net_rate': net_rate,
1197
- 'aa_mutations': aa_mutations,
1268
+ 'aa_mutations': aa_mean,
1269
+ 'aa_variance': aa_var,
1198
1270
  'mappable_bases': hit_cov,
1199
1271
  'hit_mismatches': hit_mis,
1200
1272
  'hit_coverage': hit_cov
@@ -1204,29 +1276,44 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
1204
1276
  return None
1205
1277
 
1206
1278
  # Calculate empirical statistics
1207
- aa_mutations_list = [r['aa_mutations'] for r in segment_results]
1208
- net_rates_list = [r['net_rate'] for r in segment_results]
1209
- mappable_bases_list = [r['mappable_bases'] for r in segment_results]
1279
+ aa_mutations_list = np.array([r['aa_mutations'] for r in segment_results], dtype=float)
1280
+ aa_variances = np.array([r.get('aa_variance', 0.0) for r in segment_results], dtype=float)
1281
+ net_rates_list = np.array([r['net_rate'] for r in segment_results], dtype=float)
1282
+ mappable_bases_list = np.array([r['mappable_bases'] for r in segment_results], dtype=float)
1283
+
1284
+ total_mappable_bases = float(mappable_bases_list.sum())
1285
+ if total_mappable_bases > 0:
1286
+ weights = mappable_bases_list
1287
+ mean_aa = float(np.average(aa_mutations_list, weights=weights))
1288
+ mean_net_rate = float(np.average(net_rates_list, weights=weights))
1289
+ weighted_var = float(
1290
+ np.sum(weights * (aa_variances + (aa_mutations_list - mean_aa) ** 2)) / total_mappable_bases
1291
+ )
1292
+ weighted_net_var = float(
1293
+ np.sum(weights * ( (net_rates_list - mean_net_rate) ** 2 )) / total_mappable_bases
1294
+ )
1295
+ else:
1296
+ weights = None
1297
+ mean_aa = float(np.mean(aa_mutations_list))
1298
+ mean_net_rate = float(np.mean(net_rates_list))
1299
+ weighted_var = float(np.var(aa_mutations_list, ddof=1)) if len(aa_mutations_list) > 1 else 0.0
1300
+ weighted_net_var = float(np.var(net_rates_list, ddof=1)) if len(net_rates_list) > 1 else 0.0
1210
1301
 
1211
- mean_aa = np.mean(aa_mutations_list)
1212
- std_aa = np.std(aa_mutations_list, ddof=1) # Sample standard deviation
1213
- mean_net_rate = np.mean(net_rates_list)
1214
- std_net_rate = np.std(net_rates_list, ddof=1)
1215
- total_mappable_bases = sum(mappable_bases_list)
1302
+ std_aa = float(np.sqrt(max(weighted_var, 0.0)))
1303
+ std_net_rate = float(np.sqrt(max(weighted_net_var, 0.0)))
1216
1304
 
1217
1305
  # Calculate 95% confidence interval using the normal approximation (z = 1.96)
1218
1306
  n_segments = len(segment_results)
1219
1307
  if n_segments > 1:
1220
- # 95% confidence interval
1221
- from scipy.stats import t
1222
- t_val = t.ppf(0.975, n_segments - 1)
1223
1308
  se_aa = std_aa / np.sqrt(n_segments)
1224
- ci_lower = mean_aa - t_val * se_aa
1225
- ci_upper = mean_aa + t_val * se_aa
1309
+ ci_lower = mean_aa - 1.96 * se_aa
1310
+ ci_upper = mean_aa + 1.96 * se_aa
1226
1311
  else:
1227
1312
  ci_lower = mean_aa
1228
1313
  ci_upper = mean_aa
1229
1314
 
1315
+ ci_lower = max(ci_lower, 0.0)
1316
+
1230
1317
  return {
1231
1318
  'mean_aa_mutations': mean_aa,
1232
1319
  'std_aa_mutations': std_aa,
@@ -2072,18 +2159,23 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2072
2159
  ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
2073
2160
  ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
2074
2161
  else:
2075
- # Not protein or no AA differences
2076
- ax3.text(0.5, 0.5, "Not a protein‐coding region",
2077
- horizontalalignment='center', verticalalignment='center',
2078
- fontsize=12, color='gray', transform=ax3.transAxes)
2079
-
2080
- ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
2081
- ax3.set_xlabel("Number of AA Mutations", fontsize=12)
2082
- ax3.set_ylabel("Density", fontsize=12)
2083
- ax3.spines['top'].set_visible(False)
2084
- ax3.spines['right'].set_visible(False)
2085
- ax3.set_xticks([])
2086
- ax3.set_yticks([])
2162
+ # Not protein or no AA differences — display an informative message
2163
+ ax3.text(
2164
+ 0.5,
2165
+ 0.5,
2166
+ "Amino-acid distribution unavailable",
2167
+ horizontalalignment="center",
2168
+ verticalalignment="center",
2169
+ fontsize=12,
2170
+ color="gray",
2171
+ transform=ax3.transAxes,
2172
+ )
2173
+
2174
+ ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
2175
+ ax3.set_xlabel("Number of AA Mutations", fontsize=12)
2176
+ ax3.set_ylabel("Density", fontsize=12)
2177
+ ax3.spines['top'].set_visible(False)
2178
+ ax3.spines['right'].set_visible(False)
2087
2179
 
2088
2180
  # Save the combined figure as both PNG and PDF
2089
2181
  panel_path_png = os.path.join(qscore_results_dir, "summary_panels.png")
@@ -2412,7 +2504,7 @@ def process_single_fastq(
2412
2504
  logging.info("Running QC analysis to get Q-score results...")
2413
2505
  qc_results = None
2414
2506
  try:
2415
- qc_results, optimal_qscore = run_qc_analysis(
2507
+ qc_results, consensus_info = run_qc_analysis(
2416
2508
  str(fastq_path),
2417
2509
  str(results_dir),
2418
2510
  str(region_fasta),
@@ -2420,8 +2512,13 @@ def process_single_fastq(
2420
2512
  )
2421
2513
  if qc_results is not None:
2422
2514
  logging.info("QC analysis completed successfully. Found %s Q-score results.", len(qc_results))
2423
- if optimal_qscore is not None:
2424
- logging.info("Optimal Q-score determined: %s", optimal_qscore)
2515
+ if consensus_info and consensus_info.get("consensus_mean") is not None:
2516
+ logging.info(
2517
+ "Consensus AA mutations per gene: %.4f ± %.4f (thresholds used: %s)",
2518
+ consensus_info["consensus_mean"],
2519
+ consensus_info.get("consensus_std", 0.0),
2520
+ consensus_info.get("thresholds_used"),
2521
+ )
2425
2522
  else:
2426
2523
  logging.warning("QC analysis completed but no Q-score results found.")
2427
2524
  except Exception as exc:
@@ -264,6 +264,7 @@ def run_umi_hunter(
264
264
  output_dir: Path,
265
265
  umi_identity_threshold: float = 0.9,
266
266
  consensus_mutation_threshold: float = 0.7,
267
+ min_cluster_size: int = 1,
267
268
  log_path: Optional[Path] = None,
268
269
  logger: Optional[logging.Logger] = None,
269
270
  ) -> List[Dict[str, Path]]:
@@ -291,6 +292,9 @@ def run_umi_hunter(
291
292
  if not fastq_files:
292
293
  raise ValueError("No FASTQ files provided.")
293
294
 
295
+ if min_cluster_size < 1:
296
+ raise ValueError("Minimum cluster size must be at least 1.")
297
+
294
298
  cfg = load_flank_config(config_csv)
295
299
  pattern_umi, pattern_gene = build_patterns(cfg)
296
300
  reference_record = next(SeqIO.parse(str(template_fasta), "fasta"))
@@ -314,10 +318,20 @@ def run_umi_hunter(
314
318
  umi_csv = sample_dir / f"{sample_base}_UMI_clusters.csv"
315
319
  write_umi_csv(umi_csv, clusters)
316
320
 
321
+ significant_clusters = [
322
+ cluster for cluster in clusters if cluster["total_count"] >= min_cluster_size
323
+ ]
324
+ if not significant_clusters:
325
+ logger.info(
326
+ "No clusters met the minimum size threshold (%s reads) for %s.",
327
+ min_cluster_size,
328
+ sample_base,
329
+ )
330
+
317
331
  gene_csv = sample_dir / f"{sample_base}_gene_consensus.csv"
318
332
  consensus_records = write_gene_csv(
319
333
  gene_csv,
320
- clusters,
334
+ significant_clusters,
321
335
  reference_record,
322
336
  consensus_mutation_threshold,
323
337
  logger,
@@ -334,7 +348,8 @@ def run_umi_hunter(
334
348
  "gene_csv": gene_csv,
335
349
  "fasta": fasta_out,
336
350
  "reads": read_count,
337
- "clusters": len(clusters),
351
+ "clusters": len(significant_clusters),
352
+ "clusters_total": len(clusters),
338
353
  }
339
354
  )
340
355
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: uht-tooling
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Tooling for ultra-high throughput screening workflows.
5
5
  Author: Matt115A
6
6
  License: MIT
@@ -35,7 +35,7 @@ Automation helpers for ultra-high-throughput molecular biology workflows. The pa
35
35
 
36
36
  ### Quick install (recommended, easiest file maintenance)
37
37
  ```bash
38
- pip install "uht-tooling[gui]==0.1.3"
38
+ pip install "uht-tooling[gui]==0.1.5"
39
39
 
40
40
  ```
41
41
 
@@ -189,9 +189,10 @@ If mutations fall within overlapping primer windows, design sequential reactions
189
189
  --fastq data/umi_hunter/*.fastq.gz \
190
190
  --output-dir results/umi_hunter/
191
191
  ```
192
- - Tunable parameters include `--umi-identity-threshold` and `--consensus-mutation-threshold`.
193
- - --umi-identity-threshold is a decimal between 0-1 and defines how similar two UMIs have to be to be considered grouped.
194
- - --consensus-mutation-threshold is the minimum group size to report a consensus sequence.
192
+ - Tunable parameters include `--umi-identity-threshold`, `--consensus-mutation-threshold`, and `--min-cluster-size`.
193
+ - `--umi-identity-threshold` (0–1) controls how similar two UMIs must be to fall into the same cluster.
194
+ - `--consensus-mutation-threshold` (0–1) is the fraction of reads within a cluster that must agree on a base before it is written into the consensus sequence.
195
+ - `--min-cluster-size` sets the minimum number of reads required in a cluster before a consensus is generated (smaller clusters remain listed in the raw UMI CSV but no consensus FASTA is produced).
195
196
 
196
197
  Please be aware, this toolkit will not scale well beyond around 50k reads/sample. See UMIC-seq pipelines for efficient UMI-gene dictionary generation.
197
198
 
@@ -221,7 +222,14 @@ Please be aware, this toolkit will not scale well beyond around 50k reads/sample
221
222
  --fastq data/ep-library-profile/*.fastq.gz \
222
223
  --output-dir results/ep-library-profile/
223
224
  ```
224
- - Output bundle includes per-sample directories and a master summary TSV.
225
+ - Output bundle includes per-sample directories, a master summary TSV, and a `summary_panels` figure that visualises positional mutation rates, coverage, and amino-acid simulations.
226
+
227
+ **How the mutation rate and AA expectations are derived**
228
+
229
+ 1. Reads are aligned to both the region of interest and the full plasmid. Mismatches in the region define the “target” rate; mismatches elsewhere provide the background.
230
+ 2. The per-base background rate is subtracted from the target rate to yield a net nucleotide mutation rate, and the standard deviation reflects binomial sampling and quality-score uncertainty.
231
+ 3. The net rate is multiplied by the CDS length to estimate λ_bp (mutations per copy). Monte Carlo simulations then flip random bases, translate the mutated CDS, and count amino-acid differences across 1,000 trials—these simulations drive the AA mutation mean/variance that appear in the panel plot.
232
+ 4. If multiple Q-score thresholds are analysed, the CLI aggregates them via a precision-weighted consensus (1 / standard deviation weighting) after filtering out thresholds with insufficient coverage; the consensus value is written to `aa_mutation_consensus.txt` and plotted as a horizontal guide.
225
233
 
226
234
  ---
227
235
 
@@ -243,9 +251,9 @@ Key points:
243
251
  1. **Nextera XT** – forward/reverse primer inputs with CSV preview.
244
252
  2. **SLIM** – template/context FASTA text areas plus mutation list.
245
253
  3. **Gibson** – multi-mutation support using `+` syntax.
246
- 4. **Mutation Caller** – upload FASTQ, template FASTA, and configuration CSV.
247
- 5. **UMI Hunter** – long-read UMI clustering with configurable thresholds.
248
- 6. **Profile Inserts** – probe CSV and multiple FASTQ uploads.
254
+ 4. **Mutation Caller** – upload FASTQ and template FASTA, then enter flanks and gene length bounds inline.
255
+ 5. **UMI Hunter** – long-read UMI clustering with flank entry, UMI length bounds, mutation threshold, and minimum cluster size.
256
+ 6. **Profile Inserts** – interactive probe table plus multiple FASTQ uploads with adjustable fuzzy-match ratio.
249
257
  7. **EP Library Profile** – FASTQ uploads plus plasmid and region FASTA inputs.
250
258
 
251
259
  ### Workflow tips
@@ -1,17 +1,17 @@
1
1
  uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
2
- uht_tooling/cli.py,sha256=sQU0duLmMOqvqzB6hDV7GIQYdvzAKKK3rLx0Iq07ZR4,12432
2
+ uht_tooling/cli.py,sha256=yKTPqWwYAs7tzO_TeyaLhSfzkNoCUPnc0wU2fgOR2wk,12882
3
3
  uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
6
6
  uht_tooling/workflows/design_slim.py,sha256=Qeh8N32kmVFZvohmTlBudJsLzOqLy4XcY3aXbkP-sFQ,14421
7
- uht_tooling/workflows/gui.py,sha256=jP3gYZp8hyBCms65nzoZ_EW3rsNrn2ZGGp8gBSvny6Q,23123
8
- uht_tooling/workflows/mut_rate.py,sha256=wjX1lNXTcaH49gfARSrpKLU1mD5hCgH0ZFTcdlNrAB4,105670
7
+ uht_tooling/workflows/gui.py,sha256=P4FdZWsS0NLX5VmOZZ-WO-biVEhbfa6M1gY6DFcgR7k,43153
8
+ uht_tooling/workflows/mut_rate.py,sha256=j8QzYe9QrT_yyhSYUbH3MHyvUp61U_h0w1bEd8b3aFI,109038
9
9
  uht_tooling/workflows/mutation_caller.py,sha256=BczuNATOSUcmlw-x6qTzEQfW8MBbvGclEyqiQiBX0cg,16222
10
10
  uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
11
11
  uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
12
- uht_tooling/workflows/umi_hunter.py,sha256=kXR7Tw3vK4TnL8OShRt9kZ36ONpOSd-1txwB95Ldi-I,14470
13
- uht_tooling-0.1.3.dist-info/METADATA,sha256=0bPz8odnvbX13BvlQC4HsXvJwu7dRK7YyQ2nD7KwHEA,11220
14
- uht_tooling-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- uht_tooling-0.1.3.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
16
- uht_tooling-0.1.3.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
17
- uht_tooling-0.1.3.dist-info/RECORD,,
12
+ uht_tooling/workflows/umi_hunter.py,sha256=baycWycqVzUfMp5u2WZdHRl0sNuykTjy-iqtj5ahucU,15075
13
+ uht_tooling-0.1.5.dist-info/METADATA,sha256=rqbE3jGdLJvbUEXlLmS-VcDMKJHCw0-7l8NKosD9WEQ,12751
14
+ uht_tooling-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ uht_tooling-0.1.5.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
16
+ uht_tooling-0.1.5.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
17
+ uht_tooling-0.1.5.dist-info/RECORD,,