PyPI - uht-tooling - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

uht-tooling 0.1.3 (py3-none-any.whl) → 0.1.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

uht_tooling/cli.py CHANGED Viewed

@@ -233,6 +233,11 @@ def umi_hunter_command(
         max=1.0,
         help="Mutation threshold for consensus calling (default: 0.7).",
     ),
+    min_cluster_size: int = typer.Option(
+        1,
+        min=1,
+        help="Minimum number of reads required in a UMI cluster before a consensus is generated.",
+    ),
     log_path: Optional[Path] = typer.Option(
         None,
         dir_okay=False,
@@ -249,6 +254,7 @@ def umi_hunter_command(
         output_dir=output_dir,
         umi_identity_threshold=umi_identity_threshold,
         consensus_mutation_threshold=consensus_mutation_threshold,
+        min_cluster_size=min_cluster_size,
         log_path=log_path,
     )
     if not results:
@@ -256,7 +262,12 @@ def umi_hunter_command(
     else:
         typer.echo("UMI hunter outputs:")
         for entry in results:
-            typer.echo(f"  Sample {entry['sample']}: {entry['directory']}")
+            total_clusters = entry.get("clusters_total", entry.get("clusters", 0))
+            typer.echo(
+                f"  Sample {entry['sample']}: "
+                f"{entry.get('clusters', 0)} consensus clusters "
+                f"(from {total_clusters} total) → {entry['directory']}"
+            )
 @app.command("ep-library-profile", help="Profile mutation rates for ep-library sequencing data.")

uht_tooling/workflows/gui.py CHANGED Viewed

@@ -10,7 +10,7 @@ import tempfile
 import textwrap
 import zipfile
 from pathlib import Path
-from typing import Iterable, List, Optional, Sequence, Tuple
+from typing import Any, Iterable, List, Optional, Sequence, Tuple
 try:
     import gradio as gr
@@ -241,28 +241,60 @@ def run_gui_design_gibson(
 def run_gui_mutation_caller(
     fastq_file: Optional[str],
     template_file: Optional[str],
-    config_csv_file: Optional[str],
+    upstream_flank: str,
+    downstream_flank: str,
+    min_gene_length: Optional[float],
+    max_gene_length: Optional[float],
 ) -> Tuple[str, Optional[str]]:
+    config_dir: Optional[Path] = None
+    output_dir: Optional[Path] = None
     try:
-        if not fastq_file or not template_file or not config_csv_file:
-            raise ValueError("Upload a FASTQ(.gz), template FASTA, and configuration CSV.")
+        if not fastq_file or not template_file:
+            raise ValueError("Upload a FASTQ(.gz) read file and the reference template FASTA.")
+        gene_start = _ensure_text(upstream_flank, "Upstream flank")
+        gene_end = _ensure_text(downstream_flank, "Downstream flank")
+        if min_gene_length is None or max_gene_length is None:
+            raise ValueError("Provide minimum and maximum gene lengths (in nucleotides).")
+        gene_min = int(min_gene_length)
+        gene_max = int(max_gene_length)
+        if gene_min <= 0 or gene_max <= 0:
+            raise ValueError("Gene length bounds must be positive integers.")
+        if gene_min > gene_max:
+            raise ValueError("Minimum gene length cannot exceed the maximum gene length.")
+        config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_mutation_cfg_"))
+        config_csv = config_dir / "mutation_flanks.csv"
+        pd.DataFrame(
+            {
+                "gene_flanks": [gene_start.upper(), gene_end.upper()],
+                "gene_min_max": [gene_min, gene_max],
+            }
+        ).to_csv(config_csv, index=False)
         output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_mutation_out_"))
         results = run_mutation_caller(
             template_fasta=Path(template_file),
-            flanks_csv=Path(config_csv_file),
+            flanks_csv=config_csv,
             fastq_files=[Path(fastq_file)],
             output_dir=output_dir,
             threshold=10,
         )
         if not results:
-            return "No amino-acid substitutions detected.", None
-        lines = ["### Mutation Caller", ""]
+            return "No amino-acid substitutions detected. Check flank selections and read quality.", None
+        lines = [
+            "### Mutation Caller",
+            "",
+            "Long-read reads were aligned to the provided template, flank-delimited coding regions were extracted, and amino-acid substitutions were summarised.",
+            "",
+            "**Run outputs**",
+        ]
         sample_dirs = []
         for entry in results:
-            lines.append(f"**{entry['sample']}** → {entry['directory']}")
+            lines.append(f"- **{entry['sample']}** → {entry['directory']}")
             sample_dirs.append(Path(entry["directory"]))
         summary = "\n".join(lines)
         archive = _zip_paths(sample_dirs, "mutation_caller")
@@ -270,33 +302,94 @@ def run_gui_mutation_caller(
     except Exception as exc:  # pragma: no cover
         _LOGGER.exception("Mutation caller GUI failure")
         return f"⚠️ Error: {exc}", None
+    finally:
+        if config_dir:
+            _clean_temp_path(config_dir)
+        if output_dir:
+            _clean_temp_path(output_dir)
 def run_gui_umi_hunter(
     fastq_file: Optional[str],
     template_file: Optional[str],
-    config_csv_file: Optional[str],
+    umi_start: str,
+    umi_end: str,
+    umi_min_length: Optional[float],
+    umi_max_length: Optional[float],
+    gene_start: str,
+    gene_end: str,
+    umi_identity_threshold: float,
+    consensus_threshold: float,
+    min_cluster_size: int,
 ) -> Tuple[str, Optional[str]]:
+    config_dir: Optional[Path] = None
+    output_dir: Optional[Path] = None
     try:
-        if not fastq_file or not template_file or not config_csv_file:
-            raise ValueError("Upload a FASTQ(.gz), template FASTA, and configuration CSV.")
+        if not fastq_file or not template_file:
+            raise ValueError("Upload a FASTQ(.gz) read file and the template FASTA.")
+        umi_start_clean = _ensure_text(umi_start, "UMI upstream flank").upper()
+        umi_end_clean = _ensure_text(umi_end, "UMI downstream flank").upper()
+        gene_start_clean = _ensure_text(gene_start, "Gene upstream flank").upper()
+        gene_end_clean = _ensure_text(gene_end, "Gene downstream flank").upper()
+        if umi_min_length is None or umi_max_length is None:
+            raise ValueError("Provide minimum and maximum UMI lengths.")
+        umi_min = int(umi_min_length)
+        umi_max = int(umi_max_length)
+        if umi_min <= 0 or umi_max <= 0:
+            raise ValueError("UMI length bounds must be positive integers.")
+        if umi_min > umi_max:
+            raise ValueError("Minimum UMI length cannot exceed the maximum length.")
+        if not (0.0 <= umi_identity_threshold <= 1.0):
+            raise ValueError("UMI identity threshold must be between 0 and 1.")
+        if not (0.0 <= consensus_threshold <= 1.0):
+            raise ValueError("Consensus mutation threshold must be between 0 and 1.")
+        if min_cluster_size is None or int(min_cluster_size) < 1:
+            raise ValueError("Minimum cluster size must be at least 1.")
+        min_cluster_size_int = int(min_cluster_size)
+        config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_umi_cfg_"))
+        config_csv = config_dir / "umi_config.csv"
+        pd.DataFrame(
+            {
+                "umi_flanks": [umi_start_clean, umi_end_clean],
+                "umi_min_max": [umi_min, umi_max],
+                "gene_flanks": [gene_start_clean, gene_end_clean],
+            }
+        ).to_csv(config_csv, index=False)
         output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_umi_out_"))
         results = run_umi_hunter(
             template_fasta=Path(template_file),
-            config_csv=Path(config_csv_file),
+            config_csv=config_csv,
             fastq_files=[Path(fastq_file)],
             output_dir=output_dir,
+            umi_identity_threshold=umi_identity_threshold,
+            consensus_mutation_threshold=consensus_threshold,
+            min_cluster_size=min_cluster_size_int,
         )
         if not results:
-            return "No UMI clusters were generated. Check input quality and thresholds.", None
+            return (
+                "No UMI clusters were generated. Double-check flank selections and threshold settings.",
+                None,
+            )
-        lines = ["### UMI Hunter", ""]
+        lines = [
+            "### UMI Hunter",
+            "",
+            "Reads were scanned for UMI and gene flanks, deduplicated by UMI, and consensus alleles were generated.",
+            "",
+            "**Run outputs**",
+        ]
         sample_dirs = []
         for entry in results:
+            total_clusters = entry.get("clusters_total", entry["clusters"])
             lines.append(
-                f"**{entry['sample']}** → {entry['clusters']} clusters, results in {entry['directory']}"
+                f"- **{entry['sample']}** → {entry['clusters']} consensus clusters "
+                f"(≥ {min_cluster_size_int} reads) from {total_clusters} total, "
+                f"results in {entry['directory']}"
             )
             sample_dirs.append(Path(entry["directory"]))
         summary = "\n".join(lines)
@@ -305,35 +398,82 @@ def run_gui_umi_hunter(
     except Exception as exc:  # pragma: no cover
         _LOGGER.exception("UMI hunter GUI failure")
         return f"⚠️ Error: {exc}", None
+    finally:
+        if config_dir:
+            _clean_temp_path(config_dir)
+        if output_dir:
+            _clean_temp_path(output_dir)
 def run_gui_profile_inserts(
-    probes_csv_path: Optional[str],
+    probes_table: Any,
     fastq_files: Sequence[str],
+    min_ratio: int,
 ) -> Tuple[str, Optional[str]]:
+    config_dir: Optional[Path] = None
+    output_dir: Optional[Path] = None
     try:
-        if not probes_csv_path or not fastq_files:
-            raise ValueError("Upload the probe CSV and at least one FASTQ(.gz) file.")
+        if not fastq_files:
+            raise ValueError("Upload at least one FASTQ(.gz) file.")
+        if probes_table is None:
+            raise ValueError("Provide at least one probe pair.")
+        if isinstance(probes_table, pd.DataFrame):
+            df = probes_table.copy()
+        else:
+            df = pd.DataFrame(probes_table or [], columns=["name", "upstream", "downstream"])
+        # Normalise and validate probe entries
+        df = df.replace({pd.NA: "", None: ""})
+        for column in df.columns:
+            if df[column].dtype == object:
+                df[column] = df[column].map(lambda x: x.strip() if isinstance(x, str) else x)
+        if "upstream" not in df.columns or "downstream" not in df.columns:
+            raise ValueError("Probe table must contain 'upstream' and 'downstream' columns.")
+        df_valid = df[(df["upstream"] != "") & (df["downstream"] != "")].copy()
+        if df_valid.empty:
+            raise ValueError("Enter at least one probe pair with both upstream and downstream sequences.")
+        df_valid = df_valid.reset_index(drop=True)
+        if "name" not in df_valid.columns:
+            df_valid["name"] = [f"probe_{i + 1}" for i in range(len(df_valid))]
+        else:
+            fallback_names = pd.Series(
+                [f"probe_{i + 1}" for i in range(len(df_valid))], index=df_valid.index
+            )
+            df_valid["name"] = df_valid["name"].replace("", pd.NA).fillna(fallback_names)
+        config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_profile_cfg_"))
+        probes_csv = config_dir / "probes.csv"
+        df_valid.to_csv(probes_csv, index=False)
         output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_profile_out_"))
         results = run_profile_inserts(
-            probes_csv=Path(probes_csv_path),
+            probes_csv=probes_csv,
             fastq_files=[Path(f) for f in fastq_files],
             output_dir=output_dir,
+            min_ratio=int(min_ratio),
         )
         if not results:
-            return "No inserts were extracted. Adjust probe settings and try again.", None
+            return "No inserts were extracted. Adjust probe sequences or similarity threshold and try again.", None
         first_insert = results[0]["fasta"] if isinstance(results, list) else None
         preview = "*(preview unavailable)*"
         if first_insert and Path(first_insert).exists():
-            preview = Path(first_insert).read_text().splitlines()[0][:80] + "..."
+            preview = Path(first_insert).read_text().splitlines()[0][:120] + "..."
         summary = textwrap.dedent(
             """
             ### Insert Profiling
-            Extracted inserts and generated QC metrics. Download the archive for full outputs.
+            Probe-defined regions were scanned in the provided FASTQ files, inserts were extracted, and QC metrics were generated.
+            **Key outputs**
+            - FASTA files containing extracted inserts per probe pair
+            - Summary tables covering length, GC content, duplicate rate, and probe match quality
+            - A gallery of QC plots (length distributions, base composition, probe performance)
             """
         )
         archive = _zip_paths([Path(r["directory"]) for r in results], "profile_inserts")
@@ -341,6 +481,11 @@ def run_gui_profile_inserts(
     except Exception as exc:  # pragma: no cover
         _LOGGER.exception("Profile inserts GUI failure")
         return f"⚠️ Error: {exc}", None
+    finally:
+        if config_dir:
+            _clean_temp_path(config_dir)
+        if output_dir:
+            _clean_temp_path(output_dir)
 def run_gui_ep_library_profile(
@@ -406,18 +551,34 @@ def create_gui() -> gr.Blocks:
                 textwrap.dedent(
                     """
                     # uht-tooling
-                    A guided graphical interface for primer design and sequencing analysis.
-                    Use the tabs below, supply the required inputs, and download the generated results.
+                    A guided graphical interface for primer design and sequencing analysis. Each tab mirrors the command-line workflows documented in the README and bundles results, logs, and QC artefacts for download.
+                    **How to use**
+                    1. Select the workflow that matches your experiment.
+                    2. Provide the required inputs (text fields, FASTQ/FASTA uploads, or probe tables).
+                    3. Run the analysis and download the ZIP archive for complete outputs.
+                    Need automation or batch processing? Use the Typer CLI (`uht-tooling ...`) with the same arguments shown here.
                     """
                 )
             )
         with gr.Tab("Nextera XT"):  # --- Nextera ---
             gr.Markdown(
-                """
-                ### Illumina-Compatible Primer Design
-                Provide the forward and reverse binding regions in 5'→3' orientation.
-                """
+                textwrap.dedent(
+                    """
+                    ### Illumina-Compatible Primer Design
+                    Generates Nextera XT-ready primers from forward/reverse binding regions. The workflow preloads 12 i5 and 12 i7 indices (144 combinations) and mirrors the “One-PCR-to-flowcell” process described in the README.
+                    **Inputs**
+                    - Forward primer binding region (5'→3')
+                    - Reverse primer binding region (5'→3')
+                    **Outputs**
+                    - CSV with i5/i7 indices, primer sequences, and ordering-ready metadata.
+                    - Run log noting index selection and any validation warnings.
+                    """
+                )
             )
             forward = gr.Textbox(label="Forward primer (5'→3')")
             reverse = gr.Textbox(label="Reverse primer (5'→3')")
@@ -429,13 +590,34 @@ def create_gui() -> gr.Blocks:
                 inputs=[forward, reverse],
                 outputs=[nextera_summary, nextera_download],
             )
+            with gr.Accordion("Wet-lab guidance", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        - Monitor amplification by qPCR and cap the cycle count to reach roughly 10 % yield to limit bias.
+                        - Purify products with SPRIselect beads (~0.65:1 bead:DNA ratio) to remove residual primers.
+                        - Confirm primer depletion via electrophoresis (e.g., BioAnalyzer) before sequencing prep.
+                        """
+                    )
+                )
         with gr.Tab("SLIM"):
             gr.Markdown(
-                """
-                ### Sequence-Ligation Independent Mutagenesis
-                Paste the gene coding sequence, the plasmid context, and one mutation per line.
-                """
+                textwrap.dedent(
+                    """
+                    ### Sequence-Ligation Independent Mutagenesis
+                    Designs paired short/long primers to introduce targeted mutations by SLIM cloning, matching the workflow outlined in the README.
+                    **Inputs**
+                    - Target gene coding sequence (FASTA content).
+                    - Plasmid or genomic context containing the gene.
+                    - Mutations (one per line, e.g. substitution `A123G`, deletion `T241Del`, insertion `T241TS`).
+                    **Outputs**
+                    - `SLIM_primers.csv` with primer sequences and annealing temperatures.
+                    - Log file capturing primer QC and any design warnings.
+                    """
+                )
             )
             slim_gene = gr.Textbox(label="Gene sequence", lines=4)
             slim_context = gr.Textbox(label="Plasmid context", lines=4)
@@ -448,13 +630,36 @@ def create_gui() -> gr.Blocks:
                 inputs=[slim_gene, slim_context, slim_mutations],
                 outputs=[slim_summary, slim_download],
             )
+            with gr.Accordion("Bench workflow blueprint", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        1. Run two PCRs: (A) long forward + short reverse, (B) long reverse + short forward.
+                        2. Combine 10 µL from each PCR with 10 µL H-buffer (150 mM Tris pH 8, 400 mM NaCl, 60 mM EDTA).
+                        3. Thermocycle: 99 °C 3 min → 2× (65 °C 5 min → 30 °C 15 min) → hold at 4 °C.
+                        4. Transform directly into NEB 5-alpha or BL21 (DE3); the method scales to dozens of mutants simultaneously.
+                        """
+                    )
+                )
         with gr.Tab("Gibson"):
             gr.Markdown(
-                """
-                ### Gibson Assembly Primer Design
-                Use `+` to combine multiple mutations applied simultaneously.
-                """
+                textwrap.dedent(
+                    """
+                    ### Gibson Assembly Primer Design
+                    Plans primer sets and assembly steps for Gibson mutagenesis, supporting multi-mutation constructs using the `+` syntax (e.g. `A123G+T150A`).
+                    **Inputs**
+                    - Coding sequence for the gene of interest.
+                    - Circular plasmid context sequence.
+                    - Mutation definitions (one per line; use `+` to bundle simultaneous edits).
+                    **Outputs**
+                    - Primer CSV with overlap sequences and melting temperatures.
+                    - Assembly plan CSV detailing fragment combinations.
+                    - Log summarising design decisions and any warnings about overlapping regions.
+                    """
+                )
             )
             gibson_gene = gr.Textbox(label="Gene sequence", lines=4)
             gibson_context = gr.Textbox(label="Plasmid context", lines=4)
@@ -467,74 +672,270 @@ def create_gui() -> gr.Blocks:
                 inputs=[gibson_gene, gibson_context, gibson_mutations],
                 outputs=[gibson_summary, gibson_download],
             )
+            with gr.Accordion("Tips for multi-mutation designs", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        - If two mutations compete for primer space, design them in sequential runs to avoid overly long primers.
+                        - Use the assembly plan CSV to map which fragments to combine in each Gibson reaction.
+                        - When replacing entire codons (e.g. `L46GP`), ensure the plasmid context covers both flanks to maintain overlap.
+                        """
+                    )
+                )
         with gr.Tab("Mutation Caller"):
             gr.Markdown(
-                """
-                ### Long-read Mutation Analysis
-                Upload a FASTQ(.gz), the template FASTA, and the mutation_caller CSV configuration.
-                """
+                textwrap.dedent(
+                    """
+                    ### Long-read Mutation Analysis
+                    Extracts coding regions bounded by user-defined flanks, aligns them to the template, and reports amino-acid substitutions alongside co-occurrence summaries.
+                    **Required inputs**
+                    - FASTQ (.fastq.gz): Oxford Nanopore or other long-read data.
+                    - Template FASTA: coding sequence used as the reference for alignment.
+                    - Flank sequences: short 8–12 bp motifs immediately upstream and downstream of the gene.
+                    - Gene length bounds: acceptable size window (in nucleotides) for the extracted gene segment.
+                    """
+                )
             )
-            mc_fastq = gr.File(label="FASTQ (.fastq.gz)", file_types=[".fastq", ".gz"], type="filepath")
-            mc_template = gr.File(label="Template FASTA", file_types=[".fasta", ".fa"], type="filepath")
-            mc_config = gr.File(label="Configuration CSV", file_types=[".csv"], type="filepath")
+            with gr.Row():
+                mc_fastq = gr.File(
+                    label="FASTQ (.fastq.gz)",
+                    file_types=[".fastq", ".gz"],
+                    type="filepath",
+                )
+                mc_template = gr.File(
+                    label="Template FASTA",
+                    file_types=[".fasta", ".fa"],
+                    type="filepath",
+                )
+            with gr.Row():
+                mc_upstream = gr.Textbox(
+                    label="Upstream flank (5'→3')",
+                    placeholder="e.g. ACTGTTAG",
+                )
+                mc_downstream = gr.Textbox(
+                    label="Downstream flank (5'→3')",
+                    placeholder="e.g. CGAACCTA",
+                )
+            with gr.Row():
+                mc_min_len = gr.Number(
+                    label="Minimum gene length (nt)",
+                    value=900,
+                    precision=0,
+                )
+                mc_max_len = gr.Number(
+                    label="Maximum gene length (nt)",
+                    value=1200,
+                    precision=0,
+                )
             mc_btn = gr.Button("Run mutation caller", variant="primary")
             mc_summary = gr.Markdown(label="Summary")
             mc_download = gr.File(label="Download results", file_count="single")
             mc_btn.click(
                 fn=run_gui_mutation_caller,
-                inputs=[mc_fastq, mc_template, mc_config],
+                inputs=[
+                    mc_fastq,
+                    mc_template,
+                    mc_upstream,
+                    mc_downstream,
+                    mc_min_len,
+                    mc_max_len,
+                ],
                 outputs=[mc_summary, mc_download],
             )
+            with gr.Accordion("What happens under the hood", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        - Reads are scanned for the upstream and downstream flanks; the sequence between them is treated as the gene of interest if it falls within the specified length window.
+                        - MAFFT aligns recovered genes to the reference template and the pipeline annotates amino-acid substitutions, co-occurrence networks, and depth statistics.
+                        - Outputs mirror the CLI version: per-sample directories with CSV summaries, JSON co-occurrence graphs, QC plots, and a detailed `run.log`.
+                        """
+                    )
+                )
         with gr.Tab("UMI Hunter"):
             gr.Markdown(
-                """
-                ### UMI-Gene Pair Clustering
-                Upload a FASTQ(.gz), template FASTA, and the UMI configuration CSV.
-                """
+                textwrap.dedent(
+                    """
+                    ### UMI–Gene Pair Clustering
+                    Detects UMI barcodes, extracts paired gene inserts, clusters reads by UMI identity, and emits consensus sequences with abundance tables.
+                    **Required inputs**
+                    - FASTQ (.fastq.gz) containing UMI-tagged reads.
+                    - Template FASTA for downstream consensus calling.
+                    - UMI and gene flank sequences marking the barcode and insert boundaries.
+                    - UMI length bounds plus clustering thresholds.
+                    - Minimum reads per cluster to keep (clusters below the threshold are reported but no consensus is generated).
+                    """
+                )
+            )
+            with gr.Row():
+                umi_fastq = gr.File(
+                    label="FASTQ (.fastq.gz)",
+                    file_types=[".fastq", ".gz"],
+                    type="filepath",
+                )
+                umi_template = gr.File(
+                    label="Template FASTA",
+                    file_types=[".fasta", ".fa"],
+                    type="filepath",
+                )
+            with gr.Row():
+                umi_start = gr.Textbox(
+                    label="UMI upstream flank (5'→3')",
+                    placeholder="e.g. ACACTCTTTCCCTACACGAC",
+                )
+                umi_end = gr.Textbox(
+                    label="UMI downstream flank (5'→3')",
+                    placeholder="e.g. GACTGGAGTTCAGACGTGTG",
+                )
+            with gr.Row():
+                gene_start = gr.Textbox(
+                    label="Gene upstream flank (5'→3')",
+                    placeholder="e.g. ATG...",
+                )
+                gene_end = gr.Textbox(
+                    label="Gene downstream flank (5'→3')",
+                    placeholder="e.g. TTA...",
+                )
+            with gr.Row():
+                umi_min_len = gr.Number(
+                    label="Minimum UMI length (nt)",
+                    value=8,
+                    precision=0,
+                )
+                umi_max_len = gr.Number(
+                    label="Maximum UMI length (nt)",
+                    value=14,
+                    precision=0,
+                )
+            with gr.Row():
+                umi_identity = gr.Slider(
+                    label="UMI clustering identity",
+                    minimum=0.5,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.05,
+                )
+                consensus_threshold = gr.Slider(
+                    label="Consensus mutation threshold",
+                    minimum=0.5,
+                    maximum=1.0,
+                    value=0.7,
+                    step=0.05,
+                )
+            umi_min_cluster = gr.Slider(
+                label="Minimum reads per cluster",
+                minimum=1,
+                maximum=50,
+                value=3,
+                step=1,
             )
-            umi_fastq = gr.File(label="FASTQ (.fastq.gz)", file_types=[".fastq", ".gz"], type="filepath")
-            umi_template = gr.File(label="Template FASTA", file_types=[".fasta", ".fa"], type="filepath")
-            umi_config = gr.File(label="UMI config CSV", file_types=[".csv"], type="filepath")
             umi_btn = gr.Button("Run UMI hunter", variant="primary")
             umi_summary = gr.Markdown(label="Summary")
             umi_download = gr.File(label="Download results", file_count="single")
             umi_btn.click(
                 fn=run_gui_umi_hunter,
-                inputs=[umi_fastq, umi_template, umi_config],
+                inputs=[
+                    umi_fastq,
+                    umi_template,
+                    umi_start,
+                    umi_end,
+                    umi_min_len,
+                    umi_max_len,
+                    gene_start,
+                    gene_end,
+                    umi_identity,
+                    consensus_threshold,
+                    umi_min_cluster,
+                ],
                 outputs=[umi_summary, umi_download],
             )
+            with gr.Accordion("What the pipeline generates", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        - Reads are searched for the UMI barcode and gene flanks on both strands; valid pairs feed into UMI grouping.
+                        - UMIs within the chosen identity threshold are merged, and consensus sequences are computed with the mutation threshold.
+                        - Outputs include per-sample summaries, consensus FASTA files, cluster membership tables, QC plots, and logs mirroring the CLI workflow.
+                        """
+                    )
+                )
         with gr.Tab("Profile Inserts"):
             gr.Markdown(
-                """
-                ### Insert Profiling
-                Upload the probe CSV and one or more FASTQ(.gz) files containing reads.
-                """
+                textwrap.dedent(
+                    """
+                    ### Probe-Guided Insert Profiling
+                    Characterises inserts demarcated by user-supplied upstream/downstream probes, extracts sequences, and produces QC plots plus summary tables.
+                    **Required inputs**
+                    - FASTQ reads containing the inserts of interest.
+                    - One or more probe pairs: 5'→3' sequences for the upstream and downstream anchors (reverse complements are matched automatically).
+                    """
+                )
+            )
+            probes_table = gr.Dataframe(
+                headers=["name (optional)", "upstream", "downstream"],
+                datatype=["str", "str", "str"],
+                row_count=(1, "dynamic"),
+                col_count=3,
+                value=[["probe_1", "", ""]],
+                interactive=True,
+                label="Probe pairs",
             )
-            pi_csv = gr.File(label="Probe CSV", file_types=[".csv"], type="filepath")
             pi_fastq = gr.File(
-                label="FASTQ files",
+                label="FASTQ files (.fastq/.gz)",
                 file_types=[".fastq", ".gz"],
                 file_count="multiple",
                 type="filepath",
             )
+            pi_ratio = gr.Slider(
+                label="Minimum fuzzy-match ratio",
+                minimum=50,
+                maximum=100,
+                value=80,
+                step=1,
+            )
             pi_btn = gr.Button("Profile inserts", variant="primary")
             pi_summary = gr.Markdown(label="Summary")
             pi_download = gr.File(label="Download results", file_count="single")
             pi_btn.click(
                 fn=run_gui_profile_inserts,
-                inputs=[pi_csv, pi_fastq],
+                inputs=[probes_table, pi_fastq, pi_ratio],
                 outputs=[pi_summary, pi_download],
             )
+            with gr.Accordion("Output overview", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        - Inserts are extracted whenever probe matches are detected above the chosen similarity threshold (default 80).
+                        - A FASTA file of inserts, probe-level QC metrics, base composition summaries, and a suite of plots (length distribution, GC content, duplicate rate, probe performance) are packaged for each input FASTQ.
+                        - Logs are stored alongside the results so runs remain fully reproducible.
+                        """
+                    )
+                )
         with gr.Tab("EP Library Profile"):
             gr.Markdown(
-                """
-                ### Library Profiling Without UMIs
-                Upload one or more FASTQ(.gz) files plus the region and plasmid references.
-                """
+                textwrap.dedent(
+                    """
+                    ### Library Profiling Without UMIs
+                    Estimates background and target mutation rates for enzyme evolution libraries without UMI barcodes.
+                    **Inputs**
+                    - FASTQ reads (*.fastq/.gz) from the ep-library experiment.
+                    - Region-of-interest FASTA delineating the mutational window.
+                    - Plasmid FASTA providing the full reference context.
+                    **Outputs**
+                    - Per-sample directories with coverage tables, mutation rate statistics, and QC plots.
+                    - `master_summary.txt` aggregating condition-level metrics.
+                    - Verbose logs recording alignment commands and rate calculations.
+                    """
+                )
             )
             ep_fastq = gr.File(
                 label="FASTQ files",
@@ -552,6 +953,17 @@ def create_gui() -> gr.Blocks:
                 inputs=[ep_fastq, ep_region, ep_plasmid],
                 outputs=[ep_summary, ep_download],
             )
+            with gr.Accordion("How mutation rates are derived", open=False):
+                gr.Markdown(
+                    textwrap.dedent(
+                        """
+                        - Reads are aligned against both the region-of-interest and the full plasmid to measure target and background mismatch rates; their difference yields the net nucleotide mutation rate with propagated binomial and quality-score uncertainty.
+                        - The net per-base rate is multiplied by the CDS length to obtain λ₍bp₎ (mutations per copy), then Monte Carlo simulations flip random bases, translate the mutated CDS, and count amino-acid differences—those simulated means and confidence intervals are the values plotted in the QC figure.
+                        - When multiple Q-score thresholds are analysed, the CLI combines them via a precision-weighted consensus (after discarding filters with <1000 mappable bases). The consensus AA mutation rate is written to `aa_mutation_consensus.txt` and drawn as a horizontal guide in the plot.
+                        - Download the archive to inspect per-sample plots, TSV summaries, the consensus summary, and logs for troubleshooting.
+                        """
+                    )
+                )
         gr.Markdown(
             textwrap.dedent(

uht_tooling/workflows/mut_rate.py CHANGED Viewed

@@ -15,7 +15,7 @@ import matplotlib.pyplot as plt
 import math
 import tempfile
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Sequence
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple
 # Use a built-in Matplotlib style ("ggplot") for consistency
 plt.style.use("ggplot")
@@ -505,94 +505,153 @@ def run_qc_analysis(fastq_path, results_dir, ref_hit_fasta, plasmid_fasta):
             else:
                 logging.warning(f"Failed to calculate mutation rate for quality threshold {q_threshold}")
-        # Find optimal Q-score threshold (lowest empirical error)
-        optimal_qscore, optimal_result = find_optimal_qscore_simple(qc_results)
+        # Derive consensus AA mutation estimates across valid Q-score thresholds
+        consensus_info, _ = compute_consensus_aa_mutation(qc_results)
         # Create QC plots
         if len(qc_results) >= 2:
-            create_simple_qc_plots(successful_thresholds, qc_results, results_dir, optimal_qscore, optimal_result)
+            create_simple_qc_plots(
+                successful_thresholds,
+                qc_results,
+                results_dir,
+                consensus_info=consensus_info,
+            )
         else:
             logging.warning("Insufficient data points for QC plots (need at least 2)")
-        # Save optimal Q-score information
-        if optimal_qscore is not None:
-            optimal_qscore_path = os.path.join(results_dir, "optimal_qscore_analysis.txt")
-            with open(optimal_qscore_path, 'w') as f:
-                f.write("=== OPTIMAL Q-SCORE ANALYSIS (PRECISION-WEIGHTED) ===\n")
-                f.write(f"Optimal Q-score threshold: {optimal_qscore}\n")
-                f.write(f"Precision-weighted score: {(1.0 / optimal_result['std_aa_mutations']) * optimal_qscore:.6f}\n" if optimal_result['std_aa_mutations'] > 0 else "Precision-weighted score: inf (perfect precision)\n")
-                f.write(f"Empirical error (std): {optimal_result['std_aa_mutations']:.6f}\n")
-                f.write(f"AA mutations per gene: {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}\n")
-                f.write(f"95% Confidence Interval: [{optimal_result['ci_lower']:.4f}, {optimal_result['ci_upper']:.4f}]\n")
-                f.write(f"Total mappable bases: {optimal_result['total_mappable_bases']}\n")
-                f.write(f"Number of segments: {optimal_result['n_segments']}\n")
-                f.write("\n=== ALL Q-SCORE COMPARISON ===\n")
-                f.write("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases\tAA_Mutations\tCI_Lower\tCI_Upper\n")
-                for result in qc_results:
-                    precision_score = (1.0 / result['std_aa_mutations']) * result['quality_threshold'] if result['std_aa_mutations'] > 0 else float('inf')
-                    f.write(f"{result['quality_threshold']}\t{result['std_aa_mutations']:.6f}\t{precision_score:.6f}\t{result['total_mappable_bases']}\t{result['mean_aa_mutations']:.4f}\t{result['ci_lower']:.4f}\t{result['ci_upper']:.4f}\n")
-            logging.info(f"Optimal Q-score analysis saved to: {optimal_qscore_path}")
+        # Save consensus summary
+        consensus_summary_path = os.path.join(results_dir, "aa_mutation_consensus.txt")
+        with open(consensus_summary_path, "w") as f:
+            f.write("=== CONSENSUS AMINO-ACID MUTATION ESTIMATE ===\n")
+            if consensus_info:
+                f.write(f"Minimum mappable bases required: {consensus_info['min_mappable_bases']}\n")
+                f.write(
+                    f"Consensus AA mutations per gene: {consensus_info['consensus_mean']:.4f} ± "
+                    f"{consensus_info['consensus_std']:.4f}\n"
+                )
+                f.write(f"Thresholds contributing: {consensus_info['thresholds_used']}\n")
+                f.write(f"Normalized weights: {consensus_info['weights']}\n")
+                if consensus_info.get("note"):
+                    f.write(f"Note: {consensus_info['note']}\n")
+            else:
+                f.write("Consensus AA mutation rate could not be computed; see QC logs for details.\n")
+            f.write("\n=== ALL Q-SCORE RESULTS ===\n")
+            f.write(
+                "Q-score\tMean_AA\tStd_AA\tCI_Lower\tCI_Upper\tMappable_Bases\tSegments\n"
+            )
+            for result in qc_results:
+                f.write(
+                    f"{result['quality_threshold']}\t"
+                    f"{result['mean_aa_mutations']:.6f}\t"
+                    f"{result['std_aa_mutations']:.6f}\t"
+                    f"{result['ci_lower']:.6f}\t"
+                    f"{result['ci_upper']:.6f}\t"
+                    f"{result['total_mappable_bases']}\t"
+                    f"{result['n_segments']}\n"
+                )
+        logging.info("Consensus AA mutation summary saved to: %s", consensus_summary_path)
         # Clean up segment files
-        import shutil
         segment_dir = os.path.dirname(segment_files[0])
         if os.path.exists(segment_dir):
             shutil.rmtree(segment_dir)
             logging.info(f"Cleaned up segment directory: {segment_dir}")
-        # Return both QC results and optimal Q-score for use in main analysis
-        return qc_results, optimal_qscore
+        # Return QC results and consensus information for downstream analysis
+        return qc_results, consensus_info
-def find_optimal_qscore_simple(qc_results):
+def compute_consensus_aa_mutation(
+    qc_results: List[dict],
+    min_mappable_bases: int = 1000,
+) -> Tuple[Optional[dict], List[dict]]:
     """
-    Find the Q-score threshold with the highest precision-weighted score.
-    Precision-weighted score = (1 / standard_deviation) * q_score
-    Args:
-        qc_results: List of segmentation analysis results
+    Derive a consensus amino-acid mutation estimate across Q-score thresholds.
+    Each threshold must meet a minimum coverage requirement. The consensus is a
+    precision-weighted average (weights = 1 / std_aa_mutations).
     Returns:
-        tuple: (optimal_qscore, optimal_result)
+        consensus_info (dict or None)
+            {
+                'consensus_mean': float,
+                'consensus_std': float,
+                'thresholds_used': List[int],
+                'weights': List[float],
+                'min_mappable_bases': int,
+            }
+        valid_results: list of QC result dicts that were included in the consensus
     """
-    logging.info("=== FINDING OPTIMAL Q-SCORE THRESHOLD (PRECISION-WEIGHTED) ===")
     if not qc_results:
-        return None, None
-    # Find Q-score with highest precision-weighted score
-    max_score = -1
-    optimal_result = None
-    optimal_qscore = None
-    logging.info("Q-score\tEmpirical_Error\tPrecision_Score\tMappable_Bases")
-    logging.info("-" * 60)
+        return None, []
+    valid_results = []
     for result in qc_results:
-        qscore = result['quality_threshold']
-        empirical_error = result['std_aa_mutations']
-        mappable_bases = result['total_mappable_bases']
-        # Calculate precision-weighted score: (1/sd) * q_score
-        if empirical_error > 0:
-            precision_score = (1.0 / empirical_error) * qscore
-        else:
-            precision_score = float('inf')  # Perfect precision
-        logging.info(f"Q{qscore}\t{empirical_error:.6f}\t{precision_score:.6f}\t{mappable_bases}")
-        if precision_score > max_score:
-            max_score = precision_score
-            optimal_result = result
-            optimal_qscore = qscore
-    logging.info("-" * 60)
-    logging.info(f"OPTIMAL Q-SCORE: Q{optimal_qscore} (highest precision-weighted score: {max_score:.6f})")
-    logging.info(f"Optimal result: AA mutations = {optimal_result['mean_aa_mutations']:.4f} ± {optimal_result['std_aa_mutations']:.4f}")
-    return optimal_qscore, optimal_result
+        total_bases = result.get("total_mappable_bases", 0)
+        std_aa = result.get("std_aa_mutations", 0.0)
+        if total_bases is None:
+            total_bases = 0
+        if total_bases >= min_mappable_bases and std_aa is not None:
+            valid_results.append(result)
+    if not valid_results:
+        logging.warning(
+            "No Q-score thresholds met the minimum mappable base requirement (%s). "
+            "Consensus AA mutation rate will fall back to the threshold with the highest coverage.",
+            min_mappable_bases,
+        )
+        best_by_coverage = max(qc_results, key=lambda r: r.get("total_mappable_bases", 0))
+        fallback_std = best_by_coverage.get("std_aa_mutations", 0.0)
+        consensus_info = {
+            "consensus_mean": best_by_coverage.get("mean_aa_mutations", 0.0),
+            "consensus_std": fallback_std,
+            "thresholds_used": [best_by_coverage.get("quality_threshold")],
+            "weights": [1.0],
+            "min_mappable_bases": min_mappable_bases,
+            "note": "FELL_BACK_TO_MAX_COVERAGE",
+        }
+        return consensus_info, [best_by_coverage]
+    weights = []
+    means = []
+    variances = []
+    thresholds = []
+    for result in valid_results:
+        std_aa = result.get("std_aa_mutations", 0.0) or 0.0
+        weight = 1.0 / max(std_aa, 1e-9)  # Avoid division by zero; effectively a very large weight.
+        weights.append(weight)
+        means.append(result.get("mean_aa_mutations", 0.0))
+        variances.append(std_aa**2)
+        thresholds.append(result.get("quality_threshold"))
+    weight_sum = float(np.sum(weights))
+    normalized_weights = [w / weight_sum for w in weights]
+    consensus_mean = float(np.sum(np.array(normalized_weights) * np.array(means)))
+    combined_variance = 0.0
+    for w, mean, var in zip(normalized_weights, means, variances):
+        combined_variance += w * (var + (mean - consensus_mean) ** 2)
+    combined_variance = max(combined_variance, 0.0)
+    consensus_std = float(np.sqrt(combined_variance))
+    consensus_info = {
+        "consensus_mean": consensus_mean,
+        "consensus_std": consensus_std,
+        "thresholds_used": thresholds,
+        "weights": normalized_weights,
+        "min_mappable_bases": min_mappable_bases,
+        "note": "WEIGHTED_AVERAGE",
+    }
+    logging.info(
+        "Consensus AA mutation estimate: %.4f ± %.4f (thresholds used: %s)",
+        consensus_mean,
+        consensus_std,
+        thresholds,
+    )
-def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_qscore=None, optimal_result=None):
+    return consensus_info, valid_results
+def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensus_info=None):
     """
     Create simple QC plots with empirical error bars.
@@ -600,8 +659,7 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_
         quality_thresholds: List of quality score thresholds
         qc_results: List of segmentation analysis results
         results_dir: Directory to save the plots
-        optimal_qscore: Optimal Q-score threshold (optional)
-        optimal_result: Optimal result data (optional)
+        consensus_info: Optional dict describing the consensus AA mutation estimate.
     """
     try:
         # Extract data for plotting
@@ -624,10 +682,17 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, optimal_
         ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
                         alpha=0.3, color=color1, label='95% Confidence Interval')
-        # Highlight optimal Q-score
-        if optimal_qscore is not None:
-            ax1.axvline(x=optimal_qscore, color='red', linestyle='--', alpha=0.7,
-                       label=f'Optimal Q{optimal_qscore}')
+        # Add consensus AA mutation estimate if available
+        if consensus_info and consensus_info.get("consensus_mean") is not None:
+            consensus_mean = consensus_info["consensus_mean"]
+            consensus_std = consensus_info.get("consensus_std", 0.0)
+            ax1.axhline(
+                y=consensus_mean,
+                color='red',
+                linestyle='--',
+                alpha=0.7,
+                label=f"Consensus AA mutations ({consensus_mean:.3f}±{consensus_std:.3f})",
+            )
         ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
         ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
@@ -1185,16 +1250,23 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
             bg_rate = bg_mis / bg_cov if bg_cov > 0 else 0
             net_rate = max(hit_rate - bg_rate, 0.0)
-            # Calculate AA mutations per gene (simplified)
+            # Calculate AA mutations per gene via Monte Carlo simulation
             lambda_bp = net_rate * len(hit_seq)
-            aa_mutations = lambda_bp / 3.0  # Approximate: 3 bp per AA
+            aa_samples = simulate_aa_distribution(lambda_bp, hit_seq, n_trials=500)
+            if len(aa_samples) > 1:
+                aa_mean = float(np.mean(aa_samples))
+                aa_var = float(np.var(aa_samples, ddof=1))
+            else:
+                aa_mean = float(aa_samples[0]) if aa_samples else 0.0
+                aa_var = 0.0
             segment_results.append({
                 'segment': i+1,
                 'hit_rate': hit_rate,
                 'bg_rate': bg_rate,
                 'net_rate': net_rate,
-                'aa_mutations': aa_mutations,
+                'aa_mutations': aa_mean,
+                'aa_variance': aa_var,
                 'mappable_bases': hit_cov,
                 'hit_mismatches': hit_mis,
                 'hit_coverage': hit_cov
@@ -1204,29 +1276,44 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
             return None
         # Calculate empirical statistics
-        aa_mutations_list = [r['aa_mutations'] for r in segment_results]
-        net_rates_list = [r['net_rate'] for r in segment_results]
-        mappable_bases_list = [r['mappable_bases'] for r in segment_results]
+        aa_mutations_list = np.array([r['aa_mutations'] for r in segment_results], dtype=float)
+        aa_variances = np.array([r.get('aa_variance', 0.0) for r in segment_results], dtype=float)
+        net_rates_list = np.array([r['net_rate'] for r in segment_results], dtype=float)
+        mappable_bases_list = np.array([r['mappable_bases'] for r in segment_results], dtype=float)
+        total_mappable_bases = float(mappable_bases_list.sum())
+        if total_mappable_bases > 0:
+            weights = mappable_bases_list
+            mean_aa = float(np.average(aa_mutations_list, weights=weights))
+            mean_net_rate = float(np.average(net_rates_list, weights=weights))
+            weighted_var = float(
+                np.sum(weights * (aa_variances + (aa_mutations_list - mean_aa) ** 2)) / total_mappable_bases
+            )
+            weighted_net_var = float(
+                np.sum(weights * ( (net_rates_list - mean_net_rate) ** 2 )) / total_mappable_bases
+            )
+        else:
+            weights = None
+            mean_aa = float(np.mean(aa_mutations_list))
+            mean_net_rate = float(np.mean(net_rates_list))
+            weighted_var = float(np.var(aa_mutations_list, ddof=1)) if len(aa_mutations_list) > 1 else 0.0
+            weighted_net_var = float(np.var(net_rates_list, ddof=1)) if len(net_rates_list) > 1 else 0.0
-        mean_aa = np.mean(aa_mutations_list)
-        std_aa = np.std(aa_mutations_list, ddof=1)  # Sample standard deviation
-        mean_net_rate = np.mean(net_rates_list)
-        std_net_rate = np.std(net_rates_list, ddof=1)
-        total_mappable_bases = sum(mappable_bases_list)
+        std_aa = float(np.sqrt(max(weighted_var, 0.0)))
+        std_net_rate = float(np.sqrt(max(weighted_net_var, 0.0)))
         # Calculate 95% confidence interval using a normal approximation (z = 1.96)
         n_segments = len(segment_results)
         if n_segments > 1:
-            # 95% confidence interval
-            from scipy.stats import t
-            t_val = t.ppf(0.975, n_segments - 1)
             se_aa = std_aa / np.sqrt(n_segments)
-            ci_lower = mean_aa - t_val * se_aa
-            ci_upper = mean_aa + t_val * se_aa
+            ci_lower = mean_aa - 1.96 * se_aa
+            ci_upper = mean_aa + 1.96 * se_aa
         else:
             ci_lower = mean_aa
             ci_upper = mean_aa
+        ci_lower = max(ci_lower, 0.0)
         return {
             'mean_aa_mutations': mean_aa,
             'std_aa_mutations': std_aa,
@@ -2072,18 +2159,23 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
             ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
             ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
     else:
-        # Not protein or no AA differences
-        ax3.text(0.5, 0.5, "Not a protein‐coding region",
-                 horizontalalignment='center', verticalalignment='center',
-                 fontsize=12, color='gray', transform=ax3.transAxes)
-        ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
-        ax3.set_xlabel("Number of AA Mutations", fontsize=12)
-        ax3.set_ylabel("Density", fontsize=12)
-        ax3.spines['top'].set_visible(False)
-        ax3.spines['right'].set_visible(False)
-        ax3.set_xticks([])
-        ax3.set_yticks([])
+        # Not protein or no AA differences — display an informative message
+        ax3.text(
+            0.5,
+            0.5,
+            "Amino-acid distribution unavailable",
+            horizontalalignment="center",
+            verticalalignment="center",
+            fontsize=12,
+            color="gray",
+            transform=ax3.transAxes,
+        )
+    ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
+    ax3.set_xlabel("Number of AA Mutations", fontsize=12)
+    ax3.set_ylabel("Density", fontsize=12)
+    ax3.spines['top'].set_visible(False)
+    ax3.spines['right'].set_visible(False)
     # Save the combined figure as both PNG and PDF
     panel_path_png = os.path.join(qscore_results_dir, "summary_panels.png")
@@ -2412,7 +2504,7 @@ def process_single_fastq(
     logging.info("Running QC analysis to get Q-score results...")
     qc_results = None
     try:
-        qc_results, optimal_qscore = run_qc_analysis(
+        qc_results, consensus_info = run_qc_analysis(
             str(fastq_path),
             str(results_dir),
             str(region_fasta),
@@ -2420,8 +2512,13 @@ def process_single_fastq(
         )
         if qc_results is not None:
             logging.info("QC analysis completed successfully. Found %s Q-score results.", len(qc_results))
-            if optimal_qscore is not None:
-                logging.info("Optimal Q-score determined: %s", optimal_qscore)
+            if consensus_info and consensus_info.get("consensus_mean") is not None:
+                logging.info(
+                    "Consensus AA mutations per gene: %.4f ± %.4f (thresholds used: %s)",
+                    consensus_info["consensus_mean"],
+                    consensus_info.get("consensus_std", 0.0),
+                    consensus_info.get("thresholds_used"),
+                )
         else:
             logging.warning("QC analysis completed but no Q-score results found.")
     except Exception as exc:

uht_tooling/workflows/umi_hunter.py CHANGED Viewed

@@ -264,6 +264,7 @@ def run_umi_hunter(
     output_dir: Path,
     umi_identity_threshold: float = 0.9,
     consensus_mutation_threshold: float = 0.7,
+    min_cluster_size: int = 1,
     log_path: Optional[Path] = None,
     logger: Optional[logging.Logger] = None,
 ) -> List[Dict[str, Path]]:
@@ -291,6 +292,9 @@ def run_umi_hunter(
         if not fastq_files:
             raise ValueError("No FASTQ files provided.")
+        if min_cluster_size < 1:
+            raise ValueError("Minimum cluster size must be at least 1.")
         cfg = load_flank_config(config_csv)
         pattern_umi, pattern_gene = build_patterns(cfg)
         reference_record = next(SeqIO.parse(str(template_fasta), "fasta"))
@@ -314,10 +318,20 @@ def run_umi_hunter(
             umi_csv = sample_dir / f"{sample_base}_UMI_clusters.csv"
             write_umi_csv(umi_csv, clusters)
+            significant_clusters = [
+                cluster for cluster in clusters if cluster["total_count"] >= min_cluster_size
+            ]
+            if not significant_clusters:
+                logger.info(
+                    "No clusters met the minimum size threshold (%s reads) for %s.",
+                    min_cluster_size,
+                    sample_base,
+                )
             gene_csv = sample_dir / f"{sample_base}_gene_consensus.csv"
             consensus_records = write_gene_csv(
                 gene_csv,
-                clusters,
+                significant_clusters,
                 reference_record,
                 consensus_mutation_threshold,
                 logger,
@@ -334,7 +348,8 @@ def run_umi_hunter(
                     "gene_csv": gene_csv,
                     "fasta": fasta_out,
                     "reads": read_count,
-                    "clusters": len(clusters),
+                    "clusters": len(significant_clusters),
+                    "clusters_total": len(clusters),
                 }
             )

{uht_tooling-0.1.3.dist-info → uht_tooling-0.1.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: uht-tooling
-Version: 0.1.3
+Version: 0.1.5
 Summary: Tooling for ultra-high throughput screening workflows.
 Author: Matt115A
 License: MIT
@@ -35,7 +35,7 @@ Automation helpers for ultra-high-throughput molecular biology workflows. The pa
 ### Quick install (recommended, easiest file maintenance)
 ```bash
-pip install "uht-tooling[gui]==0.1.3"
+pip install "uht-tooling[gui]==0.1.4"
 ```
@@ -189,9 +189,10 @@ If mutations fall within overlapping primer windows, design sequential reactions
     --fastq data/umi_hunter/*.fastq.gz \
     --output-dir results/umi_hunter/
   ```
-- Tunable parameters include `--umi-identity-threshold` and `--consensus-mutation-threshold`.
-- --umi-identity-threshold is a decimal between 0-1 and defines how similar two UMIs have to be to be considered grouped.
-- --consensus-mutation-threshold is the minimum group size to report a consensus sequence.
+- Tunable parameters include `--umi-identity-threshold`, `--consensus-mutation-threshold`, and `--min-cluster-size`.
+- `--umi-identity-threshold` (0–1) controls how similar two UMIs must be to fall into the same cluster.
+- `--consensus-mutation-threshold` (0–1) is the fraction of reads within a cluster that must agree on a base before it is written into the consensus sequence.
+- `--min-cluster-size` sets the minimum number of reads required in a cluster before a consensus is generated (smaller clusters remain listed in the raw UMI CSV but no consensus FASTA is produced).
 Please be aware, this toolkit will not scale well beyond around 50k reads/sample. See UMIC-seq pipelines for efficient UMI-gene dictionary generation.
@@ -221,7 +222,14 @@ Please be aware, this toolkit will not scale well beyond around 50k reads/sample
     --fastq data/ep-library-profile/*.fastq.gz \
     --output-dir results/ep-library-profile/
   ```
-- Output bundle includes per-sample directories and a master summary TSV.
+- Output bundle includes per-sample directories, a master summary TSV, and a `summary_panels` figure that visualises positional mutation rates, coverage, and amino-acid simulations.
+**How the mutation rate and AA expectations are derived**
+1. Reads are aligned to both the region of interest and the full plasmid. Mismatches in the region define the “target” rate; mismatches elsewhere provide the background.
+2. The per-base background rate is subtracted from the target rate to yield a net nucleotide mutation rate, and the standard deviation reflects binomial sampling and quality-score uncertainty.
+3. The net rate is multiplied by the CDS length to estimate λ_bp (mutations per copy). Monte Carlo simulations then flip random bases, translate the mutated CDS, and count amino-acid differences across 1,000 trials—these drive the AA mutation mean/variance that appear in the panel plot.
+4. If multiple Q-score thresholds are analysed, the CLI aggregates them via a precision-weighted consensus (1 / standard deviation weighting) after filtering out thresholds with insufficient coverage; the consensus value is written to `aa_mutation_consensus.txt` and plotted as a horizontal guide.
 ---
@@ -243,9 +251,9 @@ Key points:
 1. **Nextera XT** – forward/reverse primer inputs with CSV preview.
 2. **SLIM** – template/context FASTA text areas plus mutation list.
 3. **Gibson** – multi-mutation support using `+` syntax.
-4. **Mutation Caller** – upload FASTQ, template FASTA, and configuration CSV.
-5. **UMI Hunter** – long-read UMI clustering with configurable thresholds.
-6. **Profile Inserts** – probe CSV and multiple FASTQ uploads.
+4. **Mutation Caller** – upload FASTQ and template FASTA, then enter flanks and gene length bounds inline.
+5. **UMI Hunter** – long-read UMI clustering with flank entry, UMI length bounds, mutation threshold, and minimum cluster size.
+6. **Profile Inserts** – interactive probe table plus multiple FASTQ uploads with adjustable fuzzy-match ratio.
 7. **EP Library Profile** – FASTQ uploads plus plasmid and region FASTA inputs.
 ### Workflow tips

{uht_tooling-0.1.3.dist-info → uht_tooling-0.1.5.dist-info}/RECORD RENAMED Viewed

@@ -1,17 +1,17 @@
 uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
-uht_tooling/cli.py,sha256=sQU0duLmMOqvqzB6hDV7GIQYdvzAKKK3rLx0Iq07ZR4,12432
+uht_tooling/cli.py,sha256=yKTPqWwYAs7tzO_TeyaLhSfzkNoCUPnc0wU2fgOR2wk,12882
 uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
 uht_tooling/workflows/design_slim.py,sha256=Qeh8N32kmVFZvohmTlBudJsLzOqLy4XcY3aXbkP-sFQ,14421
-uht_tooling/workflows/gui.py,sha256=jP3gYZp8hyBCms65nzoZ_EW3rsNrn2ZGGp8gBSvny6Q,23123
-uht_tooling/workflows/mut_rate.py,sha256=wjX1lNXTcaH49gfARSrpKLU1mD5hCgH0ZFTcdlNrAB4,105670
+uht_tooling/workflows/gui.py,sha256=P4FdZWsS0NLX5VmOZZ-WO-biVEhbfa6M1gY6DFcgR7k,43153
+uht_tooling/workflows/mut_rate.py,sha256=j8QzYe9QrT_yyhSYUbH3MHyvUp61U_h0w1bEd8b3aFI,109038
 uht_tooling/workflows/mutation_caller.py,sha256=BczuNATOSUcmlw-x6qTzEQfW8MBbvGclEyqiQiBX0cg,16222
 uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
 uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
-uht_tooling/workflows/umi_hunter.py,sha256=kXR7Tw3vK4TnL8OShRt9kZ36ONpOSd-1txwB95Ldi-I,14470
-uht_tooling-0.1.3.dist-info/METADATA,sha256=0bPz8odnvbX13BvlQC4HsXvJwu7dRK7YyQ2nD7KwHEA,11220
-uht_tooling-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-uht_tooling-0.1.3.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
-uht_tooling-0.1.3.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
-uht_tooling-0.1.3.dist-info/RECORD,,
+uht_tooling/workflows/umi_hunter.py,sha256=baycWycqVzUfMp5u2WZdHRl0sNuykTjy-iqtj5ahucU,15075
+uht_tooling-0.1.5.dist-info/METADATA,sha256=rqbE3jGdLJvbUEXlLmS-VcDMKJHCw0-7l8NKosD9WEQ,12751
+uht_tooling-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+uht_tooling-0.1.5.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
+uht_tooling-0.1.5.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
+uht_tooling-0.1.5.dist-info/RECORD,,

{uht_tooling-0.1.3.dist-info → uht_tooling-0.1.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{uht_tooling-0.1.3.dist-info → uht_tooling-0.1.5.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{uht_tooling-0.1.3.dist-info → uht_tooling-0.1.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

uht-tooling 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

uht-tooling 0.1.3py3-none-any.whl → 0.1.5py3-none-any.whl