PyPI - uht-tooling - Versions diffs - 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (13) hide show

uht_tooling/cli.py +153 -4
uht_tooling/config.py +137 -0
uht_tooling/tools.py +143 -0
uht_tooling/workflows/gui.py +19 -0
uht_tooling/workflows/mut_rate.py +484 -124
uht_tooling/workflows/mutation_caller.py +11 -2
uht_tooling/workflows/umi_hunter.py +9 -4
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/METADATA +123 -5
uht_tooling-0.3.0.dist-info/RECORD +20 -0
uht_tooling-0.1.9.dist-info/RECORD +0 -18
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/WHEEL +0 -0
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/entry_points.txt +0 -0
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/top_level.txt +0 -0

uht_tooling/workflows/mutation_caller.py CHANGED Viewed

@@ -17,6 +17,7 @@ from Bio.Align.Applications import MafftCommandline
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from scipy.stats import fisher_exact, gaussian_kde
+from tqdm import tqdm
 def reverse_complement(seq: str) -> str:
@@ -52,8 +53,16 @@ def extract_gene(seq: str, pattern: re.Pattern, gene_min: int, gene_max: int) ->
 def process_fastq(file_path: Path, pattern: re.Pattern, gene_min: int, gene_max: int) -> Dict[str, str]:
     gene_reads: Dict[str, str] = {}
+    # Count total reads for progress bar
+    total_reads = 0
+    with gzip.open(file_path, "rt") as handle:
+        for _ in handle:
+            total_reads += 1
+    total_reads = total_reads // 4
     with gzip.open(file_path, "rt") as handle:
-        while True:
+        for _ in tqdm(range(total_reads), desc=f"Processing {file_path.name}", unit="read"):
             header = handle.readline()
             if not header:
                 break
@@ -274,7 +283,7 @@ def run_mutation_caller(
         results: List[Dict[str, Path]] = []
-        for fastq in fastq_files:
+        for fastq in tqdm(fastq_files, desc="Processing samples", unit="sample"):
             if not fastq.exists():
                 logger.warning("FASTQ file %s not found; skipping.", fastq)
                 continue

uht_tooling/workflows/umi_hunter.py CHANGED Viewed

@@ -68,11 +68,18 @@ def process_fastq(
     pattern_gene: re.Pattern,
     logger: logging.Logger,
 ) -> tuple[int, Dict[str, List[str]]]:
+    # Count total reads for progress bar
+    total_reads = 0
+    with gzip.open(file_path, "rt") as handle:
+        for _ in handle:
+            total_reads += 1
+    total_reads = total_reads // 4
     read_count = 0
     umi_info: Dict[str, List[str]] = {}
     extracted = 0
     with gzip.open(file_path, "rt") as handle:
-        while True:
+        for _ in tqdm(range(total_reads), desc=f"Processing {file_path.name}", unit="read"):
             header = handle.readline()
             if not header:
                 break
@@ -85,8 +92,6 @@ def process_fastq(
             if umi and gene:
                 umi_info.setdefault(umi, []).append(gene)
                 extracted += 1
-            if read_count % 100000 == 0:
-                logger.info("Processed %s reads so far in %s", read_count, file_path.name)
     logger.info(
         "Finished reading %s: total reads=%s, extracted pairs=%s",
         file_path,
@@ -129,7 +134,7 @@ def cluster_umis(
     logger.info("Clustering %s unique UMIs with threshold %.2f", len(umi_info), threshold)
     sorted_umis = sorted(umi_info.items(), key=lambda item: len(item[1]), reverse=True)
     clusters: List[dict] = []
-    for umi, gene_list in sorted_umis:
+    for umi, gene_list in tqdm(sorted_umis, desc="Clustering UMIs", unit="UMI"):
         count = len(gene_list)
         for cluster in clusters:
             if percent_identity(umi, cluster["rep"]) >= threshold:

{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: uht-tooling
-Version: 0.1.9
+Version: 0.3.0
 Summary: Tooling for ultra-high throughput screening workflows.
 Author: Matt115A
 License-Expression: MIT
@@ -47,7 +47,22 @@ This installs the core workflows plus the optional GUI dependency (Gradio). Omit
 pip install uht-tooling
 ```
-You will need a functioning version of mafft - you should install this separately and it should be accessible from your environment.
+### External Tools
+Some workflows require external bioinformatics tools:
+| Workflow | Required Tools |
+|----------|---------------|
+| mutation-caller | mafft |
+| umi-hunter | mafft |
+| ep-library-profile | minimap2, NanoFilt |
+Install via conda:
+```bash
+conda install -c bioconda mafft minimap2 nanofilt
+```
+The CLI and GUI will validate tool availability before running and provide clear error messages if tools are missing.
 ### Development install
 ```bash
@@ -95,10 +110,69 @@ Each command provides detailed help, including option descriptions and expected
 uht-tooling mutation-caller --help
 ```
+### Short Flags
+All commands support short flags for common options:
+```bash
+# Long form
+uht-tooling design-slim --gene-fasta gene.fa --context-fasta ctx.fa --mutations-csv mut.csv --output-dir out/
+# Short form
+uht-tooling design-slim -g gene.fa -c ctx.fa -m mut.csv -o out/
+```
+| Long Flag | Short | Commands |
+|-----------|-------|----------|
+| `--gene-fasta` | `-g` | design-slim, design-kld, design-gibson |
+| `--context-fasta` | `-c` | design-slim, design-kld, design-gibson |
+| `--mutations-csv` | `-m` | design-slim, design-kld, design-gibson |
+| `--output-dir` | `-o` | 7 commands |
+| `--log-path` | `-l` | 7 commands |
+| `--template-fasta` | `-t` | mutation-caller, umi-hunter |
+| `--fastq` | `-q` | 4 commands |
+| `--threshold` | `-T` | mutation-caller |
+| `--config-csv` | `-C` | umi-hunter |
+| `--binding-csv` | `-b` | nextera-primers |
+| `--probes-csv` | `-P` | profile-inserts |
+| `--region-fasta` | `-R` | ep-library-profile |
+| `--plasmid-fasta` | `-p` | ep-library-profile |
+| `--work-dir` | `-w` | ep-library-profile |
+| `--config` | `-K` | global (all commands) |
 You can pass multiple FASTQ paths using repeated `--fastq` options or glob patterns. Optional `--log-path` flags redirect logs if you prefer a location outside the default results directory.
 ---
+## Configuration File
+uht-tooling supports a YAML configuration file for default options.
+**Auto-discovery locations** (in order):
+1. `$UHT_TOOLING_CONFIG` environment variable
+2. `~/.uht-tooling.yaml`
+3. `~/.config/uht-tooling/config.yaml`
+4. `.uht-tooling.yaml` (current directory)
+Or specify explicitly: `uht-tooling --config my-config.yaml ...`
+**Example ~/.uht-tooling.yaml:**
+```yaml
+paths:
+  output_dir: ~/results/uht-tooling
+defaults:
+  mutation_caller:
+    threshold: 15
+  umi_hunter:
+    umi_identity_threshold: 0.85
+    min_cluster_size: 5
+```
+CLI options always take precedence over config values.
+---
 ## Workflow reference
 ### Nextera XT primer design
@@ -313,13 +387,57 @@ Please be aware, this toolkit will not scale well beyond around 50k reads/sample
     --fastq data/ep-library-profile/*.fastq.gz \
     --output-dir results/ep-library-profile/
   ```
-- Output bundle includes per-sample directories, a master summary TSV, and a `summary_panels` figure that visualises positional mutation rates, coverage, and amino-acid simulations.
+**Output structure**
+Each sample produces an organized output directory:
+```
+sample_name/
+├── KEY_FINDINGS.txt              # Lay-user executive summary
+├── summary_panels.png/pdf        # Main visualization
+├── aa_mutation_consensus.txt     # Consensus estimate details
+├── run.log                       # Analysis log
+└── detailed/                     # Technical outputs
+    ├── methodology_notes.txt     # Documents which lambda drives what
+    ├── lambda_comparison.csv     # Side-by-side lambda comparison
+    ├── gene_mismatch_rates.csv
+    ├── base_distribution.csv
+    ├── aa_substitutions.csv
+    ├── plasmid_coverage.csv
+    ├── aa_mutation_distribution.csv
+    ├── comprehensive_qc_data.csv
+    ├── simple_qc_data.csv
+    └── qc_plots/                 # QC visualizations
+        ├── qc_plot_*.png
+        ├── comprehensive_qc_analysis.png
+        ├── error_analysis.png
+        └── qc_mutation_rate_vs_quality.png/csv
+```
+**Lambda estimates: which to use**
+The profiler calculates lambda (mutations per gene copy) via two methods:
+| Method | Formula | Error Quantified? | Used For |
+|--------|---------|-------------------|----------|
+| Simple | `(hit_rate - bg_rate) × seq_len` | No | KDE plot, Monte Carlo simulation |
+| Consensus | Precision-weighted average across Q-scores | Yes | Recommended for reporting |
+- **For publication/reporting**: Use the consensus value from `KEY_FINDINGS.txt` or `aa_mutation_consensus.txt`.
+- **For understanding distribution shape**: See the KDE plot in `summary_panels.png` (note: uses simple lambda).
+- **For detailed error analysis**: See `detailed/comprehensive_qc_data.csv`.
+The `KEY_FINDINGS.txt` file provides a plain-language summary including:
+- Expected AA mutations per gene copy
+- Poisson-based interpretation (% wild-type, % 1 mutation, % 2+ mutations)
+- Quality assessment (GOOD/ACCEPTABLE/LOW COVERAGE)
 **How the mutation rate and AA expectations are derived**
-1. Reads are aligned to both the region of interest and the full plasmid. Mismatches in the region define the “target” rate; mismatches elsewhere provide the background.
+1. Reads are aligned to both the region of interest and the full plasmid. Mismatches in the region define the "target" rate; mismatches elsewhere provide the background.
 2. The per-base background rate is subtracted from the target rate to yield a net nucleotide mutation rate, and the standard deviation reflects binomial sampling and quality-score uncertainty.
-3. The net rate is multiplied by the CDS length to estimate λ_bp (mutations per copy). Monte Carlo simulations then flip random bases, translate the mutated CDS, and count amino-acid differences across 1,000 trials—these drives the AA mutation mean/variance that appear in the panel plot.
+3. The net rate is multiplied by the CDS length to estimate λ_bp (mutations per copy). Monte Carlo simulations then flip random bases, translate the mutated CDS, and count amino-acid differences across 1,000 trials—these drive the AA mutation mean/variance that appear in the panel plot.
 4. If multiple Q-score thresholds are analysed, the CLI aggregates them via a precision-weighted consensus (1 / standard deviation weighting) after filtering out thresholds with insufficient coverage; the consensus value is written to `aa_mutation_consensus.txt` and plotted as a horizontal guide.
 ---

uht_tooling-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,20 @@
+uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
+uht_tooling/cli.py,sha256=EHuBzQIG3sDyujbhoUvIEo1J7KayH2Z76I0uvaCIHs0,18068
+uht_tooling/config.py,sha256=FnI1IRDGLoYt9MnT_s_1Qz80uK2Wb8HCFoHCmriOFL0,3644
+uht_tooling/tools.py,sha256=msQVdllP5NqG6UtfM0EIxYiJuUrVqOaYmBx33wVGx64,4109
+uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
+uht_tooling/workflows/design_kld.py,sha256=SWbKVfi1JgJ7cN9TU3dLEiYmZT7LQiGL_mUZ-n3PdzE,27368
+uht_tooling/workflows/design_slim.py,sha256=wGXnmaJCzlAZTjf2SRupwt_3MBl5cgZr1O9nnMQyoGo,17767
+uht_tooling/workflows/gui.py,sha256=aruae9_8OCacjTJkGGeavbjJSC69XqfenmhgI9O3zcM,47128
+uht_tooling/workflows/mut_rate.py,sha256=mFhIuGfKXvAFmsY7wZHHy6cWgtR04pUAcwIEZM4jwr4,122890
+uht_tooling/workflows/mutation_caller.py,sha256=ZGPXf8WcbmsVVnAJGcqjklHlnmUe-T_-3my-CBwHIQ0,16564
+uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
+uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
+uht_tooling/workflows/umi_hunter.py,sha256=DlkJxaJEExIwhe7P0_aWqrM8xPbupc7oLQPZA7iylV8,15256
+uht_tooling-0.3.0.dist-info/METADATA,sha256=Ge6mux5hYlR4koBeLiruBtFrTVJgIXrYYgD6xT428tQ,20211
+uht_tooling-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+uht_tooling-0.3.0.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
+uht_tooling-0.3.0.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
+uht_tooling-0.3.0.dist-info/RECORD,,

uht_tooling-0.1.9.dist-info/RECORD DELETED Viewed

@@ -1,18 +0,0 @@
-uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
-uht_tooling/cli.py,sha256=3QUxYBFqhQyeZ9xM_JTlqhr_UJhb_PRj7Y_UMH5Tslc,14366
-uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
-uht_tooling/workflows/design_kld.py,sha256=SWbKVfi1JgJ7cN9TU3dLEiYmZT7LQiGL_mUZ-n3PdzE,27368
-uht_tooling/workflows/design_slim.py,sha256=wGXnmaJCzlAZTjf2SRupwt_3MBl5cgZr1O9nnMQyoGo,17767
-uht_tooling/workflows/gui.py,sha256=FpzxgjOo8SQCPJRM7ltVLk3bcwZ_AxjQzZxwz7J_c1M,46436
-uht_tooling/workflows/mut_rate.py,sha256=Sv4OU68RNTOOsKV0QSbJ7FOgxh3vQeUeib_5mrXqyHg,109074
-uht_tooling/workflows/mutation_caller.py,sha256=BczuNATOSUcmlw-x6qTzEQfW8MBbvGclEyqiQiBX0cg,16222
-uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
-uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
-uht_tooling/workflows/umi_hunter.py,sha256=baycWycqVzUfMp5u2WZdHRl0sNuykTjy-iqtj5ahucU,15075
-uht_tooling-0.1.9.dist-info/METADATA,sha256=mMC92ln1dMYhDQFlKRfBsMMsMVvy0p0LDY8s6aX-4Ig,16399
-uht_tooling-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-uht_tooling-0.1.9.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
-uht_tooling-0.1.9.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
-uht_tooling-0.1.9.dist-info/RECORD,,

{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl