uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ from Bio.Align.Applications import MafftCommandline
17
17
  from Bio.Seq import Seq
18
18
  from Bio.SeqRecord import SeqRecord
19
19
  from scipy.stats import fisher_exact, gaussian_kde
20
+ from tqdm import tqdm
20
21
 
21
22
 
22
23
  def reverse_complement(seq: str) -> str:
@@ -52,8 +53,16 @@ def extract_gene(seq: str, pattern: re.Pattern, gene_min: int, gene_max: int) ->
52
53
 
53
54
  def process_fastq(file_path: Path, pattern: re.Pattern, gene_min: int, gene_max: int) -> Dict[str, str]:
54
55
  gene_reads: Dict[str, str] = {}
56
+
57
+ # Count total reads for progress bar
58
+ total_reads = 0
59
+ with gzip.open(file_path, "rt") as handle:
60
+ for _ in handle:
61
+ total_reads += 1
62
+ total_reads = total_reads // 4
63
+
55
64
  with gzip.open(file_path, "rt") as handle:
56
- while True:
65
+ for _ in tqdm(range(total_reads), desc=f"Processing {file_path.name}", unit="read"):
57
66
  header = handle.readline()
58
67
  if not header:
59
68
  break
@@ -274,7 +283,7 @@ def run_mutation_caller(
274
283
 
275
284
  results: List[Dict[str, Path]] = []
276
285
 
277
- for fastq in fastq_files:
286
+ for fastq in tqdm(fastq_files, desc="Processing samples", unit="sample"):
278
287
  if not fastq.exists():
279
288
  logger.warning("FASTQ file %s not found; skipping.", fastq)
280
289
  continue
@@ -68,11 +68,18 @@ def process_fastq(
68
68
  pattern_gene: re.Pattern,
69
69
  logger: logging.Logger,
70
70
  ) -> tuple[int, Dict[str, List[str]]]:
71
+ # Count total reads for progress bar
72
+ total_reads = 0
73
+ with gzip.open(file_path, "rt") as handle:
74
+ for _ in handle:
75
+ total_reads += 1
76
+ total_reads = total_reads // 4
77
+
71
78
  read_count = 0
72
79
  umi_info: Dict[str, List[str]] = {}
73
80
  extracted = 0
74
81
  with gzip.open(file_path, "rt") as handle:
75
- while True:
82
+ for _ in tqdm(range(total_reads), desc=f"Processing {file_path.name}", unit="read"):
76
83
  header = handle.readline()
77
84
  if not header:
78
85
  break
@@ -85,8 +92,6 @@ def process_fastq(
85
92
  if umi and gene:
86
93
  umi_info.setdefault(umi, []).append(gene)
87
94
  extracted += 1
88
- if read_count % 100000 == 0:
89
- logger.info("Processed %s reads so far in %s", read_count, file_path.name)
90
95
  logger.info(
91
96
  "Finished reading %s: total reads=%s, extracted pairs=%s",
92
97
  file_path,
@@ -129,7 +134,7 @@ def cluster_umis(
129
134
  logger.info("Clustering %s unique UMIs with threshold %.2f", len(umi_info), threshold)
130
135
  sorted_umis = sorted(umi_info.items(), key=lambda item: len(item[1]), reverse=True)
131
136
  clusters: List[dict] = []
132
- for umi, gene_list in sorted_umis:
137
+ for umi, gene_list in tqdm(sorted_umis, desc="Clustering UMIs", unit="UMI"):
133
138
  count = len(gene_list)
134
139
  for cluster in clusters:
135
140
  if percent_identity(umi, cluster["rep"]) >= threshold:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: uht-tooling
3
- Version: 0.1.9
3
+ Version: 0.3.0
4
4
  Summary: Tooling for ultra-high throughput screening workflows.
5
5
  Author: Matt115A
6
6
  License-Expression: MIT
@@ -47,7 +47,22 @@ This installs the core workflows plus the optional GUI dependency (Gradio). Omit
47
47
  pip install uht-tooling
48
48
  ```
49
49
 
50
- You will need a functioning version of mafft - you should install this separately and it should be accessible from your environment.
50
+ ### External Tools
51
+
52
+ Some workflows require external bioinformatics tools:
53
+
54
+ | Workflow | Required Tools |
55
+ |----------|---------------|
56
+ | mutation-caller | mafft |
57
+ | umi-hunter | mafft |
58
+ | ep-library-profile | minimap2, NanoFilt |
59
+
60
+ Install via conda:
61
+ ```bash
62
+ conda install -c bioconda mafft minimap2 nanofilt
63
+ ```
64
+
65
+ The CLI and GUI will validate tool availability before running and provide clear error messages if tools are missing.
51
66
 
52
67
  ### Development install
53
68
  ```bash
@@ -95,10 +110,69 @@ Each command provides detailed help, including option descriptions and expected
95
110
  uht-tooling mutation-caller --help
96
111
  ```
97
112
 
113
+ ### Short Flags
114
+
115
+ All commands support short flags for common options:
116
+
117
+ ```bash
118
+ # Long form
119
+ uht-tooling design-slim --gene-fasta gene.fa --context-fasta ctx.fa --mutations-csv mut.csv --output-dir out/
120
+
121
+ # Short form
122
+ uht-tooling design-slim -g gene.fa -c ctx.fa -m mut.csv -o out/
123
+ ```
124
+
125
+ | Long Flag | Short | Commands |
126
+ |-----------|-------|----------|
127
+ | `--gene-fasta` | `-g` | design-slim, design-kld, design-gibson |
128
+ | `--context-fasta` | `-c` | design-slim, design-kld, design-gibson |
129
+ | `--mutations-csv` | `-m` | design-slim, design-kld, design-gibson |
130
+ | `--output-dir` | `-o` | 7 commands |
131
+ | `--log-path` | `-l` | 7 commands |
132
+ | `--template-fasta` | `-t` | mutation-caller, umi-hunter |
133
+ | `--fastq` | `-q` | 4 commands |
134
+ | `--threshold` | `-T` | mutation-caller |
135
+ | `--config-csv` | `-C` | umi-hunter |
136
+ | `--binding-csv` | `-b` | nextera-primers |
137
+ | `--probes-csv` | `-P` | profile-inserts |
138
+ | `--region-fasta` | `-R` | ep-library-profile |
139
+ | `--plasmid-fasta` | `-p` | ep-library-profile |
140
+ | `--work-dir` | `-w` | ep-library-profile |
141
+ | `--config` | `-K` | global (all commands) |
142
+
98
143
  You can pass multiple FASTQ paths using repeated `--fastq` options or glob patterns. Optional `--log-path` flags redirect logs if you prefer a location outside the default results directory.
99
144
 
100
145
  ---
101
146
 
147
+ ## Configuration File
148
+
149
+ uht-tooling supports a YAML configuration file for default options.
150
+
151
+ **Auto-discovery locations** (in order):
152
+ 1. `$UHT_TOOLING_CONFIG` environment variable
153
+ 2. `~/.uht-tooling.yaml`
154
+ 3. `~/.config/uht-tooling/config.yaml`
155
+ 4. `.uht-tooling.yaml` (current directory)
156
+
157
+ Or specify explicitly: `uht-tooling --config my-config.yaml ...`
158
+
159
+ **Example ~/.uht-tooling.yaml:**
160
+ ```yaml
161
+ paths:
162
+ output_dir: ~/results/uht-tooling
163
+
164
+ defaults:
165
+ mutation_caller:
166
+ threshold: 15
167
+ umi_hunter:
168
+ umi_identity_threshold: 0.85
169
+ min_cluster_size: 5
170
+ ```
171
+
172
+ CLI options always take precedence over config values.
173
+
174
+ ---
175
+
102
176
  ## Workflow reference
103
177
 
104
178
  ### Nextera XT primer design
@@ -313,13 +387,57 @@ Please be aware, this toolkit will not scale well beyond around 50k reads/sample
313
387
  --fastq data/ep-library-profile/*.fastq.gz \
314
388
  --output-dir results/ep-library-profile/
315
389
  ```
316
- - Output bundle includes per-sample directories, a master summary TSV, and a `summary_panels` figure that visualises positional mutation rates, coverage, and amino-acid simulations.
390
+
391
+ **Output structure**
392
+
393
+ Each sample produces an organized output directory:
394
+
395
+ ```
396
+ sample_name/
397
+ ├── KEY_FINDINGS.txt # Lay-user executive summary
398
+ ├── summary_panels.png/pdf # Main visualization
399
+ ├── aa_mutation_consensus.txt # Consensus estimate details
400
+ ├── run.log # Analysis log
401
+ └── detailed/ # Technical outputs
402
+ ├── methodology_notes.txt # Documents which lambda estimate drives which output
403
+ ├── lambda_comparison.csv # Side-by-side lambda comparison
404
+ ├── gene_mismatch_rates.csv
405
+ ├── base_distribution.csv
406
+ ├── aa_substitutions.csv
407
+ ├── plasmid_coverage.csv
408
+ ├── aa_mutation_distribution.csv
409
+ ├── comprehensive_qc_data.csv
410
+ ├── simple_qc_data.csv
411
+ └── qc_plots/ # QC visualizations
412
+ ├── qc_plot_*.png
413
+ ├── comprehensive_qc_analysis.png
414
+ ├── error_analysis.png
415
+ └── qc_mutation_rate_vs_quality.png/csv
416
+ ```
417
+
418
+ **Lambda estimates: which to use**
419
+
420
+ The profiler calculates lambda (mutations per gene copy) via two methods:
421
+
422
+ | Method | Formula | Error Quantified? | Used For |
423
+ |--------|---------|-------------------|----------|
424
+ | Simple | `(hit_rate - bg_rate) × seq_len` | No | KDE plot, Monte Carlo simulation |
425
+ | Consensus | Precision-weighted average across Q-scores | Yes | Recommended for reporting |
426
+
427
+ - **For publication/reporting**: Use the consensus value from `KEY_FINDINGS.txt` or `aa_mutation_consensus.txt`.
428
+ - **For understanding distribution shape**: See the KDE plot in `summary_panels.png` (note: the plot is driven by the simple lambda, not the consensus value).
429
+ - **For detailed error analysis**: See `detailed/comprehensive_qc_data.csv`.
430
+
431
+ The `KEY_FINDINGS.txt` file provides a plain-language summary including:
432
+ - Expected AA mutations per gene copy
433
+ - Poisson-based interpretation (% wild-type, % 1 mutation, % 2+ mutations)
434
+ - Quality assessment (GOOD/ACCEPTABLE/LOW COVERAGE)
317
435
 
318
436
  **How the mutation rate and AA expectations are derived**
319
437
 
320
- 1. Reads are aligned to both the region of interest and the full plasmid. Mismatches in the region define the target rate; mismatches elsewhere provide the background.
438
+ 1. Reads are aligned to both the region of interest and the full plasmid. Mismatches in the region define the "target" rate; mismatches elsewhere provide the background.
321
439
  2. The per-base background rate is subtracted from the target rate to yield a net nucleotide mutation rate, and the standard deviation reflects binomial sampling and quality-score uncertainty.
322
- 3. The net rate is multiplied by the CDS length to estimate λ_bp (mutations per copy). Monte Carlo simulations then flip random bases, translate the mutated CDS, and count amino-acid differences across 1,000 trials—these drives the AA mutation mean/variance that appear in the panel plot.
440
+ 3. The net rate is multiplied by the CDS length to estimate λ_bp (mutations per copy). Monte Carlo simulations then flip random bases, translate the mutated CDS, and count amino-acid differences across 1,000 trials—these drive the AA mutation mean/variance that appear in the panel plot.
323
441
  4. If multiple Q-score thresholds are analysed, the CLI aggregates them via a precision-weighted consensus (1 / standard deviation weighting) after filtering out thresholds with insufficient coverage; the consensus value is written to `aa_mutation_consensus.txt` and plotted as a horizontal guide.
324
442
 
325
443
  ---
@@ -0,0 +1,20 @@
1
+ uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
2
+ uht_tooling/cli.py,sha256=EHuBzQIG3sDyujbhoUvIEo1J7KayH2Z76I0uvaCIHs0,18068
3
+ uht_tooling/config.py,sha256=FnI1IRDGLoYt9MnT_s_1Qz80uK2Wb8HCFoHCmriOFL0,3644
4
+ uht_tooling/tools.py,sha256=msQVdllP5NqG6UtfM0EIxYiJuUrVqOaYmBx33wVGx64,4109
5
+ uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
8
+ uht_tooling/workflows/design_kld.py,sha256=SWbKVfi1JgJ7cN9TU3dLEiYmZT7LQiGL_mUZ-n3PdzE,27368
9
+ uht_tooling/workflows/design_slim.py,sha256=wGXnmaJCzlAZTjf2SRupwt_3MBl5cgZr1O9nnMQyoGo,17767
10
+ uht_tooling/workflows/gui.py,sha256=aruae9_8OCacjTJkGGeavbjJSC69XqfenmhgI9O3zcM,47128
11
+ uht_tooling/workflows/mut_rate.py,sha256=mFhIuGfKXvAFmsY7wZHHy6cWgtR04pUAcwIEZM4jwr4,122890
12
+ uht_tooling/workflows/mutation_caller.py,sha256=ZGPXf8WcbmsVVnAJGcqjklHlnmUe-T_-3my-CBwHIQ0,16564
13
+ uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
14
+ uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
15
+ uht_tooling/workflows/umi_hunter.py,sha256=DlkJxaJEExIwhe7P0_aWqrM8xPbupc7oLQPZA7iylV8,15256
16
+ uht_tooling-0.3.0.dist-info/METADATA,sha256=Ge6mux5hYlR4koBeLiruBtFrTVJgIXrYYgD6xT428tQ,20211
17
+ uht_tooling-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
18
+ uht_tooling-0.3.0.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
19
+ uht_tooling-0.3.0.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
20
+ uht_tooling-0.3.0.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
2
- uht_tooling/cli.py,sha256=3QUxYBFqhQyeZ9xM_JTlqhr_UJhb_PRj7Y_UMH5Tslc,14366
3
- uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
6
- uht_tooling/workflows/design_kld.py,sha256=SWbKVfi1JgJ7cN9TU3dLEiYmZT7LQiGL_mUZ-n3PdzE,27368
7
- uht_tooling/workflows/design_slim.py,sha256=wGXnmaJCzlAZTjf2SRupwt_3MBl5cgZr1O9nnMQyoGo,17767
8
- uht_tooling/workflows/gui.py,sha256=FpzxgjOo8SQCPJRM7ltVLk3bcwZ_AxjQzZxwz7J_c1M,46436
9
- uht_tooling/workflows/mut_rate.py,sha256=Sv4OU68RNTOOsKV0QSbJ7FOgxh3vQeUeib_5mrXqyHg,109074
10
- uht_tooling/workflows/mutation_caller.py,sha256=BczuNATOSUcmlw-x6qTzEQfW8MBbvGclEyqiQiBX0cg,16222
11
- uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
12
- uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
13
- uht_tooling/workflows/umi_hunter.py,sha256=baycWycqVzUfMp5u2WZdHRl0sNuykTjy-iqtj5ahucU,15075
14
- uht_tooling-0.1.9.dist-info/METADATA,sha256=mMC92ln1dMYhDQFlKRfBsMMsMVvy0p0LDY8s6aX-4Ig,16399
15
- uht_tooling-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
16
- uht_tooling-0.1.9.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
17
- uht_tooling-0.1.9.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
18
- uht_tooling-0.1.9.dist-info/RECORD,,