uht-tooling 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uht_tooling/cli.py +41 -4
- uht_tooling/workflows/gui.py +475 -64
- uht_tooling/workflows/umi_hunter.py +17 -2
- {uht_tooling-0.1.2.dist-info → uht_tooling-0.1.4.dist-info}/METADATA +25 -19
- {uht_tooling-0.1.2.dist-info → uht_tooling-0.1.4.dist-info}/RECORD +8 -8
- {uht_tooling-0.1.2.dist-info → uht_tooling-0.1.4.dist-info}/WHEEL +0 -0
- {uht_tooling-0.1.2.dist-info → uht_tooling-0.1.4.dist-info}/entry_points.txt +0 -0
- {uht_tooling-0.1.2.dist-info → uht_tooling-0.1.4.dist-info}/top_level.txt +0 -0
uht_tooling/cli.py
CHANGED
|
@@ -18,6 +18,7 @@ from uht_tooling.workflows.umi_hunter import (
|
|
|
18
18
|
expand_fastq_inputs as expand_fastq_inputs_umi,
|
|
19
19
|
run_umi_hunter,
|
|
20
20
|
)
|
|
21
|
+
from uht_tooling.workflows.gui import launch_gui
|
|
21
22
|
|
|
22
23
|
app = typer.Typer(help="Command-line interface for the uht-tooling package.")
|
|
23
24
|
|
|
@@ -232,6 +233,11 @@ def umi_hunter_command(
|
|
|
232
233
|
max=1.0,
|
|
233
234
|
help="Mutation threshold for consensus calling (default: 0.7).",
|
|
234
235
|
),
|
|
236
|
+
min_cluster_size: int = typer.Option(
|
|
237
|
+
1,
|
|
238
|
+
min=1,
|
|
239
|
+
help="Minimum number of reads required in a UMI cluster before a consensus is generated.",
|
|
240
|
+
),
|
|
235
241
|
log_path: Optional[Path] = typer.Option(
|
|
236
242
|
None,
|
|
237
243
|
dir_okay=False,
|
|
@@ -248,6 +254,7 @@ def umi_hunter_command(
|
|
|
248
254
|
output_dir=output_dir,
|
|
249
255
|
umi_identity_threshold=umi_identity_threshold,
|
|
250
256
|
consensus_mutation_threshold=consensus_mutation_threshold,
|
|
257
|
+
min_cluster_size=min_cluster_size,
|
|
251
258
|
log_path=log_path,
|
|
252
259
|
)
|
|
253
260
|
if not results:
|
|
@@ -255,7 +262,12 @@ def umi_hunter_command(
|
|
|
255
262
|
else:
|
|
256
263
|
typer.echo("UMI hunter outputs:")
|
|
257
264
|
for entry in results:
|
|
258
|
-
|
|
265
|
+
total_clusters = entry.get("clusters_total", entry.get("clusters", 0))
|
|
266
|
+
typer.echo(
|
|
267
|
+
f" Sample {entry['sample']}: "
|
|
268
|
+
f"{entry.get('clusters', 0)} consensus clusters "
|
|
269
|
+
f"(from {total_clusters} total) → {entry['directory']}"
|
|
270
|
+
)
|
|
259
271
|
|
|
260
272
|
|
|
261
273
|
@app.command("ep-library-profile", help="Profile mutation rates for ep-library sequencing data.")
|
|
@@ -355,9 +367,34 @@ def profile_inserts_command(
|
|
|
355
367
|
typer.echo(f" Sample {entry['sample']}: {entry['directory']}")
|
|
356
368
|
|
|
357
369
|
|
|
358
|
-
@app.command("gui", help="Launch the graphical interface
|
|
359
|
-
def gui_command(
|
|
360
|
-
|
|
370
|
+
@app.command("gui", help="Launch the graphical interface.")
|
|
371
|
+
def gui_command(
|
|
372
|
+
server_name: str = typer.Option(
|
|
373
|
+
"127.0.0.1",
|
|
374
|
+
"--server-name",
|
|
375
|
+
"-n",
|
|
376
|
+
help="Hostname or IP address to bind the GUI server.",
|
|
377
|
+
),
|
|
378
|
+
server_port: Optional[int] = typer.Option(
|
|
379
|
+
7860,
|
|
380
|
+
"--server-port",
|
|
381
|
+
"-p",
|
|
382
|
+
help="Preferred port for the GUI (falls back automatically if unavailable).",
|
|
383
|
+
),
|
|
384
|
+
share: bool = typer.Option(
|
|
385
|
+
False,
|
|
386
|
+
"--share",
|
|
387
|
+
help="Enable Gradio's public sharing tunnel (requires network access).",
|
|
388
|
+
),
|
|
389
|
+
):
|
|
390
|
+
"""Launch the Gradio GUI."""
|
|
391
|
+
try:
|
|
392
|
+
launch_gui(server_name=server_name, server_port=server_port, share=share)
|
|
393
|
+
except KeyboardInterrupt:
|
|
394
|
+
typer.echo("GUI stopped by user.")
|
|
395
|
+
except Exception as exc:
|
|
396
|
+
typer.echo(f"Failed to start GUI: {exc}")
|
|
397
|
+
raise typer.Exit(1)
|
|
361
398
|
|
|
362
399
|
|
|
363
400
|
def main():
|
uht_tooling/workflows/gui.py
CHANGED
|
@@ -10,7 +10,7 @@ import tempfile
|
|
|
10
10
|
import textwrap
|
|
11
11
|
import zipfile
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from typing import Iterable, List, Optional, Sequence, Tuple
|
|
13
|
+
from typing import Any, Iterable, List, Optional, Sequence, Tuple
|
|
14
14
|
|
|
15
15
|
try:
|
|
16
16
|
import gradio as gr
|
|
@@ -241,28 +241,60 @@ def run_gui_design_gibson(
|
|
|
241
241
|
def run_gui_mutation_caller(
|
|
242
242
|
fastq_file: Optional[str],
|
|
243
243
|
template_file: Optional[str],
|
|
244
|
-
|
|
244
|
+
upstream_flank: str,
|
|
245
|
+
downstream_flank: str,
|
|
246
|
+
min_gene_length: Optional[float],
|
|
247
|
+
max_gene_length: Optional[float],
|
|
245
248
|
) -> Tuple[str, Optional[str]]:
|
|
249
|
+
config_dir: Optional[Path] = None
|
|
250
|
+
output_dir: Optional[Path] = None
|
|
246
251
|
try:
|
|
247
|
-
if not fastq_file or not template_file
|
|
248
|
-
raise ValueError("Upload a FASTQ(.gz)
|
|
252
|
+
if not fastq_file or not template_file:
|
|
253
|
+
raise ValueError("Upload a FASTQ(.gz) read file and the reference template FASTA.")
|
|
254
|
+
|
|
255
|
+
gene_start = _ensure_text(upstream_flank, "Upstream flank")
|
|
256
|
+
gene_end = _ensure_text(downstream_flank, "Downstream flank")
|
|
257
|
+
if min_gene_length is None or max_gene_length is None:
|
|
258
|
+
raise ValueError("Provide minimum and maximum gene lengths (in nucleotides).")
|
|
259
|
+
|
|
260
|
+
gene_min = int(min_gene_length)
|
|
261
|
+
gene_max = int(max_gene_length)
|
|
262
|
+
if gene_min <= 0 or gene_max <= 0:
|
|
263
|
+
raise ValueError("Gene length bounds must be positive integers.")
|
|
264
|
+
if gene_min > gene_max:
|
|
265
|
+
raise ValueError("Minimum gene length cannot exceed the maximum gene length.")
|
|
266
|
+
|
|
267
|
+
config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_mutation_cfg_"))
|
|
268
|
+
config_csv = config_dir / "mutation_flanks.csv"
|
|
269
|
+
pd.DataFrame(
|
|
270
|
+
{
|
|
271
|
+
"gene_flanks": [gene_start.upper(), gene_end.upper()],
|
|
272
|
+
"gene_min_max": [gene_min, gene_max],
|
|
273
|
+
}
|
|
274
|
+
).to_csv(config_csv, index=False)
|
|
249
275
|
|
|
250
276
|
output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_mutation_out_"))
|
|
251
277
|
results = run_mutation_caller(
|
|
252
278
|
template_fasta=Path(template_file),
|
|
253
|
-
flanks_csv=
|
|
279
|
+
flanks_csv=config_csv,
|
|
254
280
|
fastq_files=[Path(fastq_file)],
|
|
255
281
|
output_dir=output_dir,
|
|
256
282
|
threshold=10,
|
|
257
283
|
)
|
|
258
284
|
|
|
259
285
|
if not results:
|
|
260
|
-
return "No amino-acid substitutions detected.", None
|
|
261
|
-
|
|
262
|
-
lines = [
|
|
286
|
+
return "No amino-acid substitutions detected. Check flank selections and read quality.", None
|
|
287
|
+
|
|
288
|
+
lines = [
|
|
289
|
+
"### Mutation Caller",
|
|
290
|
+
"",
|
|
291
|
+
"Long-read reads were aligned to the provided template, flank-delimited coding regions were extracted, and amino-acid substitutions were summarised.",
|
|
292
|
+
"",
|
|
293
|
+
"**Run outputs**",
|
|
294
|
+
]
|
|
263
295
|
sample_dirs = []
|
|
264
296
|
for entry in results:
|
|
265
|
-
lines.append(f"**{entry['sample']}** → {entry['directory']}")
|
|
297
|
+
lines.append(f"- **{entry['sample']}** → {entry['directory']}")
|
|
266
298
|
sample_dirs.append(Path(entry["directory"]))
|
|
267
299
|
summary = "\n".join(lines)
|
|
268
300
|
archive = _zip_paths(sample_dirs, "mutation_caller")
|
|
@@ -270,33 +302,94 @@ def run_gui_mutation_caller(
|
|
|
270
302
|
except Exception as exc: # pragma: no cover
|
|
271
303
|
_LOGGER.exception("Mutation caller GUI failure")
|
|
272
304
|
return f"⚠️ Error: {exc}", None
|
|
305
|
+
finally:
|
|
306
|
+
if config_dir:
|
|
307
|
+
_clean_temp_path(config_dir)
|
|
308
|
+
if output_dir:
|
|
309
|
+
_clean_temp_path(output_dir)
|
|
273
310
|
|
|
274
311
|
|
|
275
312
|
def run_gui_umi_hunter(
|
|
276
313
|
fastq_file: Optional[str],
|
|
277
314
|
template_file: Optional[str],
|
|
278
|
-
|
|
315
|
+
umi_start: str,
|
|
316
|
+
umi_end: str,
|
|
317
|
+
umi_min_length: Optional[float],
|
|
318
|
+
umi_max_length: Optional[float],
|
|
319
|
+
gene_start: str,
|
|
320
|
+
gene_end: str,
|
|
321
|
+
umi_identity_threshold: float,
|
|
322
|
+
consensus_threshold: float,
|
|
323
|
+
min_cluster_size: int,
|
|
279
324
|
) -> Tuple[str, Optional[str]]:
|
|
325
|
+
config_dir: Optional[Path] = None
|
|
326
|
+
output_dir: Optional[Path] = None
|
|
280
327
|
try:
|
|
281
|
-
if not fastq_file or not template_file
|
|
282
|
-
raise ValueError("Upload a FASTQ(.gz)
|
|
328
|
+
if not fastq_file or not template_file:
|
|
329
|
+
raise ValueError("Upload a FASTQ(.gz) read file and the template FASTA.")
|
|
330
|
+
|
|
331
|
+
umi_start_clean = _ensure_text(umi_start, "UMI upstream flank").upper()
|
|
332
|
+
umi_end_clean = _ensure_text(umi_end, "UMI downstream flank").upper()
|
|
333
|
+
gene_start_clean = _ensure_text(gene_start, "Gene upstream flank").upper()
|
|
334
|
+
gene_end_clean = _ensure_text(gene_end, "Gene downstream flank").upper()
|
|
335
|
+
if umi_min_length is None or umi_max_length is None:
|
|
336
|
+
raise ValueError("Provide minimum and maximum UMI lengths.")
|
|
337
|
+
|
|
338
|
+
umi_min = int(umi_min_length)
|
|
339
|
+
umi_max = int(umi_max_length)
|
|
340
|
+
if umi_min <= 0 or umi_max <= 0:
|
|
341
|
+
raise ValueError("UMI length bounds must be positive integers.")
|
|
342
|
+
if umi_min > umi_max:
|
|
343
|
+
raise ValueError("Minimum UMI length cannot exceed the maximum length.")
|
|
344
|
+
if not (0.0 <= umi_identity_threshold <= 1.0):
|
|
345
|
+
raise ValueError("UMI identity threshold must be between 0 and 1.")
|
|
346
|
+
if not (0.0 <= consensus_threshold <= 1.0):
|
|
347
|
+
raise ValueError("Consensus mutation threshold must be between 0 and 1.")
|
|
348
|
+
if min_cluster_size is None or int(min_cluster_size) < 1:
|
|
349
|
+
raise ValueError("Minimum cluster size must be at least 1.")
|
|
350
|
+
min_cluster_size_int = int(min_cluster_size)
|
|
351
|
+
|
|
352
|
+
config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_umi_cfg_"))
|
|
353
|
+
config_csv = config_dir / "umi_config.csv"
|
|
354
|
+
pd.DataFrame(
|
|
355
|
+
{
|
|
356
|
+
"umi_flanks": [umi_start_clean, umi_end_clean],
|
|
357
|
+
"umi_min_max": [umi_min, umi_max],
|
|
358
|
+
"gene_flanks": [gene_start_clean, gene_end_clean],
|
|
359
|
+
}
|
|
360
|
+
).to_csv(config_csv, index=False)
|
|
283
361
|
|
|
284
362
|
output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_umi_out_"))
|
|
285
363
|
results = run_umi_hunter(
|
|
286
364
|
template_fasta=Path(template_file),
|
|
287
|
-
config_csv=
|
|
365
|
+
config_csv=config_csv,
|
|
288
366
|
fastq_files=[Path(fastq_file)],
|
|
289
367
|
output_dir=output_dir,
|
|
368
|
+
umi_identity_threshold=umi_identity_threshold,
|
|
369
|
+
consensus_mutation_threshold=consensus_threshold,
|
|
370
|
+
min_cluster_size=min_cluster_size_int,
|
|
290
371
|
)
|
|
291
372
|
|
|
292
373
|
if not results:
|
|
293
|
-
return
|
|
374
|
+
return (
|
|
375
|
+
"No UMI clusters were generated. Double-check flank selections and threshold settings.",
|
|
376
|
+
None,
|
|
377
|
+
)
|
|
294
378
|
|
|
295
|
-
lines = [
|
|
379
|
+
lines = [
|
|
380
|
+
"### UMI Hunter",
|
|
381
|
+
"",
|
|
382
|
+
"Reads were scanned for UMI and gene flanks, deduplicated by UMI, and consensus alleles were generated.",
|
|
383
|
+
"",
|
|
384
|
+
"**Run outputs**",
|
|
385
|
+
]
|
|
296
386
|
sample_dirs = []
|
|
297
387
|
for entry in results:
|
|
388
|
+
total_clusters = entry.get("clusters_total", entry["clusters"])
|
|
298
389
|
lines.append(
|
|
299
|
-
f"**{entry['sample']}** → {entry['clusters']} clusters
|
|
390
|
+
f"- **{entry['sample']}** → {entry['clusters']} consensus clusters "
|
|
391
|
+
f"(≥ {min_cluster_size_int} reads) from {total_clusters} total, "
|
|
392
|
+
f"results in {entry['directory']}"
|
|
300
393
|
)
|
|
301
394
|
sample_dirs.append(Path(entry["directory"]))
|
|
302
395
|
summary = "\n".join(lines)
|
|
@@ -305,35 +398,82 @@ def run_gui_umi_hunter(
|
|
|
305
398
|
except Exception as exc: # pragma: no cover
|
|
306
399
|
_LOGGER.exception("UMI hunter GUI failure")
|
|
307
400
|
return f"⚠️ Error: {exc}", None
|
|
401
|
+
finally:
|
|
402
|
+
if config_dir:
|
|
403
|
+
_clean_temp_path(config_dir)
|
|
404
|
+
if output_dir:
|
|
405
|
+
_clean_temp_path(output_dir)
|
|
308
406
|
|
|
309
407
|
|
|
310
408
|
def run_gui_profile_inserts(
|
|
311
|
-
|
|
409
|
+
probes_table: Any,
|
|
312
410
|
fastq_files: Sequence[str],
|
|
411
|
+
min_ratio: int,
|
|
313
412
|
) -> Tuple[str, Optional[str]]:
|
|
413
|
+
config_dir: Optional[Path] = None
|
|
414
|
+
output_dir: Optional[Path] = None
|
|
314
415
|
try:
|
|
315
|
-
if not
|
|
316
|
-
raise ValueError("Upload
|
|
416
|
+
if not fastq_files:
|
|
417
|
+
raise ValueError("Upload at least one FASTQ(.gz) file.")
|
|
418
|
+
if probes_table is None:
|
|
419
|
+
raise ValueError("Provide at least one probe pair.")
|
|
420
|
+
|
|
421
|
+
if isinstance(probes_table, pd.DataFrame):
|
|
422
|
+
df = probes_table.copy()
|
|
423
|
+
else:
|
|
424
|
+
df = pd.DataFrame(probes_table or [], columns=["name", "upstream", "downstream"])
|
|
425
|
+
|
|
426
|
+
# Normalise and validate probe entries
|
|
427
|
+
df = df.replace({pd.NA: "", None: ""})
|
|
428
|
+
for column in df.columns:
|
|
429
|
+
if df[column].dtype == object:
|
|
430
|
+
df[column] = df[column].map(lambda x: x.strip() if isinstance(x, str) else x)
|
|
431
|
+
|
|
432
|
+
if "upstream" not in df.columns or "downstream" not in df.columns:
|
|
433
|
+
raise ValueError("Probe table must contain 'upstream' and 'downstream' columns.")
|
|
434
|
+
|
|
435
|
+
df_valid = df[(df["upstream"] != "") & (df["downstream"] != "")].copy()
|
|
436
|
+
if df_valid.empty:
|
|
437
|
+
raise ValueError("Enter at least one probe pair with both upstream and downstream sequences.")
|
|
438
|
+
|
|
439
|
+
df_valid = df_valid.reset_index(drop=True)
|
|
440
|
+
if "name" not in df_valid.columns:
|
|
441
|
+
df_valid["name"] = [f"probe_{i + 1}" for i in range(len(df_valid))]
|
|
442
|
+
else:
|
|
443
|
+
fallback_names = pd.Series(
|
|
444
|
+
[f"probe_{i + 1}" for i in range(len(df_valid))], index=df_valid.index
|
|
445
|
+
)
|
|
446
|
+
df_valid["name"] = df_valid["name"].replace("", pd.NA).fillna(fallback_names)
|
|
447
|
+
|
|
448
|
+
config_dir = Path(tempfile.mkdtemp(prefix="uht_gui_profile_cfg_"))
|
|
449
|
+
probes_csv = config_dir / "probes.csv"
|
|
450
|
+
df_valid.to_csv(probes_csv, index=False)
|
|
317
451
|
|
|
318
452
|
output_dir = Path(tempfile.mkdtemp(prefix="uht_gui_profile_out_"))
|
|
319
453
|
results = run_profile_inserts(
|
|
320
|
-
probes_csv=
|
|
454
|
+
probes_csv=probes_csv,
|
|
321
455
|
fastq_files=[Path(f) for f in fastq_files],
|
|
322
456
|
output_dir=output_dir,
|
|
457
|
+
min_ratio=int(min_ratio),
|
|
323
458
|
)
|
|
324
459
|
|
|
325
460
|
if not results:
|
|
326
|
-
return "No inserts were extracted. Adjust probe
|
|
461
|
+
return "No inserts were extracted. Adjust probe sequences or similarity threshold and try again.", None
|
|
327
462
|
|
|
328
463
|
first_insert = results[0]["fasta"] if isinstance(results, list) else None
|
|
329
464
|
preview = "*(preview unavailable)*"
|
|
330
465
|
if first_insert and Path(first_insert).exists():
|
|
331
|
-
preview = Path(first_insert).read_text().splitlines()[0][:
|
|
466
|
+
preview = Path(first_insert).read_text().splitlines()[0][:120] + "..."
|
|
332
467
|
|
|
333
468
|
summary = textwrap.dedent(
|
|
334
469
|
"""
|
|
335
470
|
### Insert Profiling
|
|
336
|
-
|
|
471
|
+
Probe-defined regions were scanned in the provided FASTQ files, inserts were extracted, and QC metrics were generated.
|
|
472
|
+
|
|
473
|
+
**Key outputs**
|
|
474
|
+
- FASTA files containing extracted inserts per probe pair
|
|
475
|
+
- Summary tables covering length, GC content, duplicate rate, and probe match quality
|
|
476
|
+
- A gallery of QC plots (length distributions, base composition, probe performance)
|
|
337
477
|
"""
|
|
338
478
|
)
|
|
339
479
|
archive = _zip_paths([Path(r["directory"]) for r in results], "profile_inserts")
|
|
@@ -341,6 +481,11 @@ def run_gui_profile_inserts(
|
|
|
341
481
|
except Exception as exc: # pragma: no cover
|
|
342
482
|
_LOGGER.exception("Profile inserts GUI failure")
|
|
343
483
|
return f"⚠️ Error: {exc}", None
|
|
484
|
+
finally:
|
|
485
|
+
if config_dir:
|
|
486
|
+
_clean_temp_path(config_dir)
|
|
487
|
+
if output_dir:
|
|
488
|
+
_clean_temp_path(output_dir)
|
|
344
489
|
|
|
345
490
|
|
|
346
491
|
def run_gui_ep_library_profile(
|
|
@@ -406,18 +551,34 @@ def create_gui() -> gr.Blocks:
|
|
|
406
551
|
textwrap.dedent(
|
|
407
552
|
"""
|
|
408
553
|
# uht-tooling
|
|
409
|
-
A guided graphical interface for primer design and sequencing analysis.
|
|
410
|
-
|
|
554
|
+
A guided graphical interface for primer design and sequencing analysis. Each tab mirrors the command-line workflows documented in the README and bundles results, logs, and QC artefacts for download.
|
|
555
|
+
|
|
556
|
+
**How to use**
|
|
557
|
+
1. Select the workflow that matches your experiment.
|
|
558
|
+
2. Provide the required inputs (text fields, FASTQ/FASTA uploads, or probe tables).
|
|
559
|
+
3. Run the analysis and download the ZIP archive for complete outputs.
|
|
560
|
+
|
|
561
|
+
Need automation or batch processing? Use the Typer CLI (`uht-tooling ...`) with the same arguments shown here.
|
|
411
562
|
"""
|
|
412
563
|
)
|
|
413
564
|
)
|
|
414
565
|
|
|
415
566
|
with gr.Tab("Nextera XT"): # --- Nextera ---
|
|
416
567
|
gr.Markdown(
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
568
|
+
textwrap.dedent(
|
|
569
|
+
"""
|
|
570
|
+
### Illumina-Compatible Primer Design
|
|
571
|
+
Generates Nextera XT-ready primers from forward/reverse binding regions. The workflow preloads 12 i5 and 12 i7 indices (144 combinations) and mirrors the “One-PCR-to-flowcell” process described in the README.
|
|
572
|
+
|
|
573
|
+
**Inputs**
|
|
574
|
+
- Forward primer binding region (5'→3')
|
|
575
|
+
- Reverse primer binding region (5'→3')
|
|
576
|
+
|
|
577
|
+
**Outputs**
|
|
578
|
+
- CSV with i5/i7 indices, primer sequences, and ordering-ready metadata.
|
|
579
|
+
- Run log noting index selection and any validation warnings.
|
|
580
|
+
"""
|
|
581
|
+
)
|
|
421
582
|
)
|
|
422
583
|
forward = gr.Textbox(label="Forward primer (5'→3')")
|
|
423
584
|
reverse = gr.Textbox(label="Reverse primer (5'→3')")
|
|
@@ -429,13 +590,34 @@ def create_gui() -> gr.Blocks:
|
|
|
429
590
|
inputs=[forward, reverse],
|
|
430
591
|
outputs=[nextera_summary, nextera_download],
|
|
431
592
|
)
|
|
593
|
+
with gr.Accordion("Wet-lab guidance", open=False):
|
|
594
|
+
gr.Markdown(
|
|
595
|
+
textwrap.dedent(
|
|
596
|
+
"""
|
|
597
|
+
- Monitor amplification by qPCR and cap the cycle count to reach roughly 10 % yield to limit bias.
|
|
598
|
+
- Purify products with SPRIselect beads (~0.65:1 bead:DNA ratio) to remove residual primers.
|
|
599
|
+
- Confirm primer depletion via electrophoresis (e.g., BioAnalyzer) before sequencing prep.
|
|
600
|
+
"""
|
|
601
|
+
)
|
|
602
|
+
)
|
|
432
603
|
|
|
433
604
|
with gr.Tab("SLIM"):
|
|
434
605
|
gr.Markdown(
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
606
|
+
textwrap.dedent(
|
|
607
|
+
"""
|
|
608
|
+
### Sequence-Ligation Independent Mutagenesis
|
|
609
|
+
Designs paired short/long primers to introduce targeted mutations by SLIM cloning, matching the workflow outlined in the README.
|
|
610
|
+
|
|
611
|
+
**Inputs**
|
|
612
|
+
- Target gene coding sequence (FASTA content).
|
|
613
|
+
- Plasmid or genomic context containing the gene.
|
|
614
|
+
- Mutations (one per line, e.g. substitution `A123G`, deletion `T241Del`, insertion `T241TS`).
|
|
615
|
+
|
|
616
|
+
**Outputs**
|
|
617
|
+
- `SLIM_primers.csv` with primer sequences and annealing temperatures.
|
|
618
|
+
- Log file capturing primer QC and any design warnings.
|
|
619
|
+
"""
|
|
620
|
+
)
|
|
439
621
|
)
|
|
440
622
|
slim_gene = gr.Textbox(label="Gene sequence", lines=4)
|
|
441
623
|
slim_context = gr.Textbox(label="Plasmid context", lines=4)
|
|
@@ -448,13 +630,36 @@ def create_gui() -> gr.Blocks:
|
|
|
448
630
|
inputs=[slim_gene, slim_context, slim_mutations],
|
|
449
631
|
outputs=[slim_summary, slim_download],
|
|
450
632
|
)
|
|
633
|
+
with gr.Accordion("Bench workflow blueprint", open=False):
|
|
634
|
+
gr.Markdown(
|
|
635
|
+
textwrap.dedent(
|
|
636
|
+
"""
|
|
637
|
+
1. Run two PCRs: (A) long forward + short reverse, (B) long reverse + short forward.
|
|
638
|
+
2. Combine 10 µL from each PCR with 10 µL H-buffer (150 mM Tris pH 8, 400 mM NaCl, 60 mM EDTA).
|
|
639
|
+
3. Thermocycle: 99 °C 3 min → 2× (65 °C 5 min → 30 °C 15 min) → hold at 4 °C.
|
|
640
|
+
4. Transform directly into NEB 5-alpha or BL21 (DE3); the method scales to dozens of mutants simultaneously.
|
|
641
|
+
"""
|
|
642
|
+
)
|
|
643
|
+
)
|
|
451
644
|
|
|
452
645
|
with gr.Tab("Gibson"):
|
|
453
646
|
gr.Markdown(
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
647
|
+
textwrap.dedent(
|
|
648
|
+
"""
|
|
649
|
+
### Gibson Assembly Primer Design
|
|
650
|
+
Plans primer sets and assembly steps for Gibson mutagenesis, supporting multi-mutation constructs using the `+` syntax (e.g. `A123G+T150A`).
|
|
651
|
+
|
|
652
|
+
**Inputs**
|
|
653
|
+
- Coding sequence for the gene of interest.
|
|
654
|
+
- Circular plasmid context sequence.
|
|
655
|
+
- Mutation definitions (one per line; use `+` to bundle simultaneous edits).
|
|
656
|
+
|
|
657
|
+
**Outputs**
|
|
658
|
+
- Primer CSV with overlap sequences and melting temperatures.
|
|
659
|
+
- Assembly plan CSV detailing fragment combinations.
|
|
660
|
+
- Log summarising design decisions and any warnings about overlapping regions.
|
|
661
|
+
"""
|
|
662
|
+
)
|
|
458
663
|
)
|
|
459
664
|
gibson_gene = gr.Textbox(label="Gene sequence", lines=4)
|
|
460
665
|
gibson_context = gr.Textbox(label="Plasmid context", lines=4)
|
|
@@ -467,74 +672,270 @@ def create_gui() -> gr.Blocks:
|
|
|
467
672
|
inputs=[gibson_gene, gibson_context, gibson_mutations],
|
|
468
673
|
outputs=[gibson_summary, gibson_download],
|
|
469
674
|
)
|
|
675
|
+
with gr.Accordion("Tips for multi-mutation designs", open=False):
|
|
676
|
+
gr.Markdown(
|
|
677
|
+
textwrap.dedent(
|
|
678
|
+
"""
|
|
679
|
+
- If two mutations compete for primer space, design them in sequential runs to avoid overly long primers.
|
|
680
|
+
- Use the assembly plan CSV to map which fragments to combine in each Gibson reaction.
|
|
681
|
+
- When replacing entire codons (e.g. `L46GP`), ensure the plasmid context covers both flanks to maintain overlap.
|
|
682
|
+
"""
|
|
683
|
+
)
|
|
684
|
+
)
|
|
470
685
|
|
|
471
686
|
with gr.Tab("Mutation Caller"):
|
|
472
687
|
gr.Markdown(
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
688
|
+
textwrap.dedent(
|
|
689
|
+
"""
|
|
690
|
+
### Long-read Mutation Analysis
|
|
691
|
+
Extracts coding regions bounded by user-defined flanks, aligns them to the template, and reports amino-acid substitutions alongside co-occurrence summaries.
|
|
692
|
+
|
|
693
|
+
**Required inputs**
|
|
694
|
+
- FASTQ (.fastq.gz): Oxford Nanopore or other long-read data.
|
|
695
|
+
- Template FASTA: coding sequence used as the reference for alignment.
|
|
696
|
+
- Flank sequences: short 8–12 bp motifs immediately upstream and downstream of the gene.
|
|
697
|
+
- Gene length bounds: acceptable size window (in nucleotides) for the extracted gene segment.
|
|
698
|
+
"""
|
|
699
|
+
)
|
|
477
700
|
)
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
701
|
+
with gr.Row():
|
|
702
|
+
mc_fastq = gr.File(
|
|
703
|
+
label="FASTQ (.fastq.gz)",
|
|
704
|
+
file_types=[".fastq", ".gz"],
|
|
705
|
+
type="filepath",
|
|
706
|
+
)
|
|
707
|
+
mc_template = gr.File(
|
|
708
|
+
label="Template FASTA",
|
|
709
|
+
file_types=[".fasta", ".fa"],
|
|
710
|
+
type="filepath",
|
|
711
|
+
)
|
|
712
|
+
with gr.Row():
|
|
713
|
+
mc_upstream = gr.Textbox(
|
|
714
|
+
label="Upstream flank (5'→3')",
|
|
715
|
+
placeholder="e.g. ACTGTTAG",
|
|
716
|
+
)
|
|
717
|
+
mc_downstream = gr.Textbox(
|
|
718
|
+
label="Downstream flank (5'→3')",
|
|
719
|
+
placeholder="e.g. CGAACCTA",
|
|
720
|
+
)
|
|
721
|
+
with gr.Row():
|
|
722
|
+
mc_min_len = gr.Number(
|
|
723
|
+
label="Minimum gene length (nt)",
|
|
724
|
+
value=900,
|
|
725
|
+
precision=0,
|
|
726
|
+
)
|
|
727
|
+
mc_max_len = gr.Number(
|
|
728
|
+
label="Maximum gene length (nt)",
|
|
729
|
+
value=1200,
|
|
730
|
+
precision=0,
|
|
731
|
+
)
|
|
481
732
|
mc_btn = gr.Button("Run mutation caller", variant="primary")
|
|
482
733
|
mc_summary = gr.Markdown(label="Summary")
|
|
483
734
|
mc_download = gr.File(label="Download results", file_count="single")
|
|
484
735
|
mc_btn.click(
|
|
485
736
|
fn=run_gui_mutation_caller,
|
|
486
|
-
inputs=[
|
|
737
|
+
inputs=[
|
|
738
|
+
mc_fastq,
|
|
739
|
+
mc_template,
|
|
740
|
+
mc_upstream,
|
|
741
|
+
mc_downstream,
|
|
742
|
+
mc_min_len,
|
|
743
|
+
mc_max_len,
|
|
744
|
+
],
|
|
487
745
|
outputs=[mc_summary, mc_download],
|
|
488
746
|
)
|
|
747
|
+
with gr.Accordion("What happens under the hood", open=False):
|
|
748
|
+
gr.Markdown(
|
|
749
|
+
textwrap.dedent(
|
|
750
|
+
"""
|
|
751
|
+
- Reads are scanned for the upstream and downstream flanks; the sequence between them is treated as the gene of interest if it falls within the specified length window.
|
|
752
|
+
- MAFFT aligns recovered genes to the reference template and the pipeline annotates amino-acid substitutions, co-occurrence networks, and depth statistics.
|
|
753
|
+
- Outputs mirror the CLI version: per-sample directories with CSV summaries, JSON co-occurrence graphs, QC plots, and a detailed `run.log`.
|
|
754
|
+
"""
|
|
755
|
+
)
|
|
756
|
+
)
|
|
489
757
|
|
|
490
758
|
with gr.Tab("UMI Hunter"):
|
|
491
759
|
gr.Markdown(
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
760
|
+
textwrap.dedent(
|
|
761
|
+
"""
|
|
762
|
+
### UMI–Gene Pair Clustering
|
|
763
|
+
Detects UMI barcodes, extracts paired gene inserts, clusters reads by UMI identity, and emits consensus sequences with abundance tables.
|
|
764
|
+
|
|
765
|
+
**Required inputs**
|
|
766
|
+
- FASTQ (.fastq.gz) containing UMI-tagged reads.
|
|
767
|
+
- Template FASTA for downstream consensus calling.
|
|
768
|
+
- UMI and gene flank sequences marking the barcode and insert boundaries.
|
|
769
|
+
- UMI length bounds plus clustering thresholds.
|
|
770
|
+
- Minimum reads per cluster to keep (clusters below the threshold are reported but no consensus is generated).
|
|
771
|
+
"""
|
|
772
|
+
)
|
|
773
|
+
)
|
|
774
|
+
with gr.Row():
|
|
775
|
+
umi_fastq = gr.File(
|
|
776
|
+
label="FASTQ (.fastq.gz)",
|
|
777
|
+
file_types=[".fastq", ".gz"],
|
|
778
|
+
type="filepath",
|
|
779
|
+
)
|
|
780
|
+
umi_template = gr.File(
|
|
781
|
+
label="Template FASTA",
|
|
782
|
+
file_types=[".fasta", ".fa"],
|
|
783
|
+
type="filepath",
|
|
784
|
+
)
|
|
785
|
+
with gr.Row():
|
|
786
|
+
umi_start = gr.Textbox(
|
|
787
|
+
label="UMI upstream flank (5'→3')",
|
|
788
|
+
placeholder="e.g. ACACTCTTTCCCTACACGAC",
|
|
789
|
+
)
|
|
790
|
+
umi_end = gr.Textbox(
|
|
791
|
+
label="UMI downstream flank (5'→3')",
|
|
792
|
+
placeholder="e.g. GACTGGAGTTCAGACGTGTG",
|
|
793
|
+
)
|
|
794
|
+
with gr.Row():
|
|
795
|
+
gene_start = gr.Textbox(
|
|
796
|
+
label="Gene upstream flank (5'→3')",
|
|
797
|
+
placeholder="e.g. ATG...",
|
|
798
|
+
)
|
|
799
|
+
gene_end = gr.Textbox(
|
|
800
|
+
label="Gene downstream flank (5'→3')",
|
|
801
|
+
placeholder="e.g. TTA...",
|
|
802
|
+
)
|
|
803
|
+
with gr.Row():
|
|
804
|
+
umi_min_len = gr.Number(
|
|
805
|
+
label="Minimum UMI length (nt)",
|
|
806
|
+
value=8,
|
|
807
|
+
precision=0,
|
|
808
|
+
)
|
|
809
|
+
umi_max_len = gr.Number(
|
|
810
|
+
label="Maximum UMI length (nt)",
|
|
811
|
+
value=14,
|
|
812
|
+
precision=0,
|
|
813
|
+
)
|
|
814
|
+
with gr.Row():
|
|
815
|
+
umi_identity = gr.Slider(
|
|
816
|
+
label="UMI clustering identity",
|
|
817
|
+
minimum=0.5,
|
|
818
|
+
maximum=1.0,
|
|
819
|
+
value=0.9,
|
|
820
|
+
step=0.05,
|
|
821
|
+
)
|
|
822
|
+
consensus_threshold = gr.Slider(
|
|
823
|
+
label="Consensus mutation threshold",
|
|
824
|
+
minimum=0.5,
|
|
825
|
+
maximum=1.0,
|
|
826
|
+
value=0.7,
|
|
827
|
+
step=0.05,
|
|
828
|
+
)
|
|
829
|
+
umi_min_cluster = gr.Slider(
|
|
830
|
+
label="Minimum reads per cluster",
|
|
831
|
+
minimum=1,
|
|
832
|
+
maximum=50,
|
|
833
|
+
value=3,
|
|
834
|
+
step=1,
|
|
496
835
|
)
|
|
497
|
-
umi_fastq = gr.File(label="FASTQ (.fastq.gz)", file_types=[".fastq", ".gz"], type="filepath")
|
|
498
|
-
umi_template = gr.File(label="Template FASTA", file_types=[".fasta", ".fa"], type="filepath")
|
|
499
|
-
umi_config = gr.File(label="UMI config CSV", file_types=[".csv"], type="filepath")
|
|
500
836
|
umi_btn = gr.Button("Run UMI hunter", variant="primary")
|
|
501
837
|
umi_summary = gr.Markdown(label="Summary")
|
|
502
838
|
umi_download = gr.File(label="Download results", file_count="single")
|
|
503
839
|
umi_btn.click(
|
|
504
840
|
fn=run_gui_umi_hunter,
|
|
505
|
-
inputs=[
|
|
841
|
+
inputs=[
|
|
842
|
+
umi_fastq,
|
|
843
|
+
umi_template,
|
|
844
|
+
umi_start,
|
|
845
|
+
umi_end,
|
|
846
|
+
umi_min_len,
|
|
847
|
+
umi_max_len,
|
|
848
|
+
gene_start,
|
|
849
|
+
gene_end,
|
|
850
|
+
umi_identity,
|
|
851
|
+
consensus_threshold,
|
|
852
|
+
umi_min_cluster,
|
|
853
|
+
],
|
|
506
854
|
outputs=[umi_summary, umi_download],
|
|
507
855
|
)
|
|
856
|
+
with gr.Accordion("What the pipeline generates", open=False):
|
|
857
|
+
gr.Markdown(
|
|
858
|
+
textwrap.dedent(
|
|
859
|
+
"""
|
|
860
|
+
- Reads are searched for the UMI barcode and gene flanks on both strands; valid pairs feed into UMI grouping.
|
|
861
|
+
- UMIs within the chosen identity threshold are merged, and consensus sequences are computed with the mutation threshold.
|
|
862
|
+
- Outputs include per-sample summaries, consensus FASTA files, cluster membership tables, QC plots, and logs mirroring the CLI workflow.
|
|
863
|
+
"""
|
|
864
|
+
)
|
|
865
|
+
)
|
|
508
866
|
|
|
509
867
|
with gr.Tab("Profile Inserts"):
|
|
510
868
|
gr.Markdown(
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
869
|
+
textwrap.dedent(
|
|
870
|
+
"""
|
|
871
|
+
### Probe-Guided Insert Profiling
|
|
872
|
+
Characterises inserts demarcated by user-supplied upstream/downstream probes, extracts sequences, and produces QC plots plus summary tables.
|
|
873
|
+
|
|
874
|
+
**Required inputs**
|
|
875
|
+
- FASTQ reads containing the inserts of interest.
|
|
876
|
+
- One or more probe pairs: 5'→3' sequences for the upstream and downstream anchors (reverse complements are matched automatically).
|
|
877
|
+
"""
|
|
878
|
+
)
|
|
879
|
+
)
|
|
880
|
+
probes_table = gr.Dataframe(
|
|
881
|
+
headers=["name (optional)", "upstream", "downstream"],
|
|
882
|
+
datatype=["str", "str", "str"],
|
|
883
|
+
row_count=(1, "dynamic"),
|
|
884
|
+
col_count=3,
|
|
885
|
+
value=[["probe_1", "", ""]],
|
|
886
|
+
interactive=True,
|
|
887
|
+
label="Probe pairs",
|
|
515
888
|
)
|
|
516
|
-
pi_csv = gr.File(label="Probe CSV", file_types=[".csv"], type="filepath")
|
|
517
889
|
pi_fastq = gr.File(
|
|
518
|
-
label="FASTQ files",
|
|
890
|
+
label="FASTQ files (.fastq/.gz)",
|
|
519
891
|
file_types=[".fastq", ".gz"],
|
|
520
892
|
file_count="multiple",
|
|
521
893
|
type="filepath",
|
|
522
894
|
)
|
|
895
|
+
pi_ratio = gr.Slider(
|
|
896
|
+
label="Minimum fuzzy-match ratio",
|
|
897
|
+
minimum=50,
|
|
898
|
+
maximum=100,
|
|
899
|
+
value=80,
|
|
900
|
+
step=1,
|
|
901
|
+
)
|
|
523
902
|
pi_btn = gr.Button("Profile inserts", variant="primary")
|
|
524
903
|
pi_summary = gr.Markdown(label="Summary")
|
|
525
904
|
pi_download = gr.File(label="Download results", file_count="single")
|
|
526
905
|
pi_btn.click(
|
|
527
906
|
fn=run_gui_profile_inserts,
|
|
528
|
-
inputs=[
|
|
907
|
+
inputs=[probes_table, pi_fastq, pi_ratio],
|
|
529
908
|
outputs=[pi_summary, pi_download],
|
|
530
909
|
)
|
|
910
|
+
with gr.Accordion("Output overview", open=False):
|
|
911
|
+
gr.Markdown(
|
|
912
|
+
textwrap.dedent(
|
|
913
|
+
"""
|
|
914
|
+
- Inserts are extracted whenever probe matches are detected above the chosen similarity threshold (default 80).
|
|
915
|
+
- A FASTA file of inserts, probe-level QC metrics, base composition summaries, and a suite of plots (length distribution, GC content, duplicate rate, probe performance) are packaged for each input FASTQ.
|
|
916
|
+
- Logs are stored alongside the results so runs remain fully reproducible.
|
|
917
|
+
"""
|
|
918
|
+
)
|
|
919
|
+
)
|
|
531
920
|
|
|
532
921
|
with gr.Tab("EP Library Profile"):
|
|
533
922
|
gr.Markdown(
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
923
|
+
textwrap.dedent(
|
|
924
|
+
"""
|
|
925
|
+
### Library Profiling Without UMIs
|
|
926
|
+
Estimates background and target mutation rates for enzyme evolution libraries without UMI barcodes.
|
|
927
|
+
|
|
928
|
+
**Inputs**
|
|
929
|
+
- FASTQ reads (*.fastq/.gz) from the ep-library experiment.
|
|
930
|
+
- Region-of-interest FASTA delineating the mutational window.
|
|
931
|
+
- Plasmid FASTA providing the full reference context.
|
|
932
|
+
|
|
933
|
+
**Outputs**
|
|
934
|
+
- Per-sample directories with coverage tables, mutation rate statistics, and QC plots.
|
|
935
|
+
- `master_summary.txt` aggregating condition-level metrics.
|
|
936
|
+
- Verbose logs recording alignment commands and rate calculations.
|
|
937
|
+
"""
|
|
938
|
+
)
|
|
538
939
|
)
|
|
539
940
|
ep_fastq = gr.File(
|
|
540
941
|
label="FASTQ files",
|
|
@@ -552,6 +953,16 @@ def create_gui() -> gr.Blocks:
|
|
|
552
953
|
inputs=[ep_fastq, ep_region, ep_plasmid],
|
|
553
954
|
outputs=[ep_summary, ep_download],
|
|
554
955
|
)
|
|
956
|
+
with gr.Accordion("How mutation rates are derived", open=False):
|
|
957
|
+
gr.Markdown(
|
|
958
|
+
textwrap.dedent(
|
|
959
|
+
"""
|
|
960
|
+
- Reads are aligned against the plasmid reference; mismatches inside the region-of-interest drive target rate estimates, while mismatches elsewhere define the background rate.
|
|
961
|
+
- Z-scores and p-values summarise enrichment versus background, mirroring the CLI outputs.
|
|
962
|
+
- Download the archive to inspect per-sample plots, TSV summaries, and logs for troubleshooting.
|
|
963
|
+
"""
|
|
964
|
+
)
|
|
965
|
+
)
|
|
555
966
|
|
|
556
967
|
gr.Markdown(
|
|
557
968
|
textwrap.dedent(
|
|
@@ -264,6 +264,7 @@ def run_umi_hunter(
|
|
|
264
264
|
output_dir: Path,
|
|
265
265
|
umi_identity_threshold: float = 0.9,
|
|
266
266
|
consensus_mutation_threshold: float = 0.7,
|
|
267
|
+
min_cluster_size: int = 1,
|
|
267
268
|
log_path: Optional[Path] = None,
|
|
268
269
|
logger: Optional[logging.Logger] = None,
|
|
269
270
|
) -> List[Dict[str, Path]]:
|
|
@@ -291,6 +292,9 @@ def run_umi_hunter(
|
|
|
291
292
|
if not fastq_files:
|
|
292
293
|
raise ValueError("No FASTQ files provided.")
|
|
293
294
|
|
|
295
|
+
if min_cluster_size < 1:
|
|
296
|
+
raise ValueError("Minimum cluster size must be at least 1.")
|
|
297
|
+
|
|
294
298
|
cfg = load_flank_config(config_csv)
|
|
295
299
|
pattern_umi, pattern_gene = build_patterns(cfg)
|
|
296
300
|
reference_record = next(SeqIO.parse(str(template_fasta), "fasta"))
|
|
@@ -314,10 +318,20 @@ def run_umi_hunter(
|
|
|
314
318
|
umi_csv = sample_dir / f"{sample_base}_UMI_clusters.csv"
|
|
315
319
|
write_umi_csv(umi_csv, clusters)
|
|
316
320
|
|
|
321
|
+
significant_clusters = [
|
|
322
|
+
cluster for cluster in clusters if cluster["total_count"] >= min_cluster_size
|
|
323
|
+
]
|
|
324
|
+
if not significant_clusters:
|
|
325
|
+
logger.info(
|
|
326
|
+
"No clusters met the minimum size threshold (%s reads) for %s.",
|
|
327
|
+
min_cluster_size,
|
|
328
|
+
sample_base,
|
|
329
|
+
)
|
|
330
|
+
|
|
317
331
|
gene_csv = sample_dir / f"{sample_base}_gene_consensus.csv"
|
|
318
332
|
consensus_records = write_gene_csv(
|
|
319
333
|
gene_csv,
|
|
320
|
-
|
|
334
|
+
significant_clusters,
|
|
321
335
|
reference_record,
|
|
322
336
|
consensus_mutation_threshold,
|
|
323
337
|
logger,
|
|
@@ -334,7 +348,8 @@ def run_umi_hunter(
|
|
|
334
348
|
"gene_csv": gene_csv,
|
|
335
349
|
"fasta": fasta_out,
|
|
336
350
|
"reads": read_count,
|
|
337
|
-
"clusters": len(
|
|
351
|
+
"clusters": len(significant_clusters),
|
|
352
|
+
"clusters_total": len(clusters),
|
|
338
353
|
}
|
|
339
354
|
)
|
|
340
355
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: uht-tooling
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Tooling for ultra-high throughput screening workflows.
|
|
5
5
|
Author: Matt115A
|
|
6
6
|
License: MIT
|
|
@@ -27,7 +27,7 @@ Requires-Dist: ruff==0.14.4; extra == "dev"
|
|
|
27
27
|
|
|
28
28
|
# uht-tooling
|
|
29
29
|
|
|
30
|
-
Automation helpers for ultra-high-throughput molecular biology workflows. The package ships both a
|
|
30
|
+
Automation helpers for ultra-high-throughput molecular biology workflows. The package ships both a CLI and an optional GUI that wrap the same workflow code paths.
|
|
31
31
|
|
|
32
32
|
---
|
|
33
33
|
|
|
@@ -35,19 +35,20 @@ Automation helpers for ultra-high-throughput molecular biology workflows. The pa
|
|
|
35
35
|
|
|
36
36
|
### Quick install (recommended, easiest file maintenance)
|
|
37
37
|
```bash
|
|
38
|
-
|
|
38
|
+
pip install "uht-tooling[gui]==0.1.4"
|
|
39
|
+
|
|
39
40
|
```
|
|
40
41
|
|
|
41
42
|
This installs the core workflows plus the optional GUI dependencies (Gradio, pandas). Omit the `[gui]` extras if you only need the CLI:
|
|
42
43
|
|
|
43
44
|
```bash
|
|
44
|
-
|
|
45
|
+
pip install uht-tooling
|
|
45
46
|
```
|
|
46
47
|
|
|
47
48
|
### Development install
|
|
48
49
|
```bash
|
|
49
|
-
git clone https://github.com/Matt115A/uht-tooling.git
|
|
50
|
-
cd uht-tooling
|
|
50
|
+
git clone https://github.com/Matt115A/uht-tooling-packaged.git
|
|
51
|
+
cd uht-tooling-packaged
|
|
51
52
|
python -m pip install -e ".[gui,dev]"
|
|
52
53
|
```
|
|
53
54
|
|
|
@@ -57,7 +58,7 @@ The editable install exposes the latest sources, while the `dev` extras add lint
|
|
|
57
58
|
|
|
58
59
|
## Directory layout
|
|
59
60
|
|
|
60
|
-
- Reference inputs
|
|
61
|
+
- Reference inputs can live anywhere (you specify their paths on the CLI), but we recommend using `data/<workflow>/`.
|
|
61
62
|
- Outputs (CSV, FASTA, plots, logs) are written to `results/<workflow>/`.
|
|
62
63
|
- All workflows log to `results/<workflow>/run.log` for reproducibility and debugging.
|
|
63
64
|
|
|
@@ -79,9 +80,9 @@ Each command mirrors a workflow module. Common entry points:
|
|
|
79
80
|
| `uht-tooling design-slim` | Design SLIM mutagenesis primers from FASTA/CSV inputs. |
|
|
80
81
|
| `uht-tooling design-gibson` | Produce Gibson mutagenesis primers and assembly plans. |
|
|
81
82
|
| `uht-tooling mutation-caller` | Summarise amino-acid substitutions from long-read FASTQ files. |
|
|
82
|
-
| `uht-tooling umi-hunter` | Cluster UMIs and call consensus
|
|
83
|
-
| `uht-tooling ep-library-profile` | Measure mutation rates without UMIs. |
|
|
84
|
-
| `uht-tooling profile-inserts` | Extract inserts defined by probe pairs. |
|
|
83
|
+
| `uht-tooling umi-hunter` | Cluster UMIs and call consensus genes. |
|
|
84
|
+
| `uht-tooling ep-library-profile` | Measure mutation rates in plasmid libraries without UMIs. |
|
|
85
|
+
| `uht-tooling profile-inserts` | Extract and analyse inserts defined by flanking probe pairs. |
|
|
85
86
|
|
|
86
87
|
Each command provides detailed help, including option descriptions and expected file formats:
|
|
87
88
|
|
|
@@ -107,13 +108,13 @@ You can pass multiple FASTQ paths using repeated `--fastq` options or glob patte
|
|
|
107
108
|
```
|
|
108
109
|
4. Primer CSVs will be written to `results/nextera_designer/`, accompanied by a log file.
|
|
109
110
|
|
|
110
|
-
The helper is preloaded with twelve i5 and twelve i7 indices, enabling up to 144 unique amplicons.
|
|
111
|
+
The helper is preloaded with twelve i5 and twelve i7 indices, enabling up to 144 unique amplicons.
|
|
111
112
|
|
|
112
113
|
#### Wet-lab workflow notes
|
|
113
114
|
|
|
114
115
|
- Perform the initial amplification with an i5/i7 primer pair and monitor a small aliquot by qPCR. Cap thermocycling early so you only generate ~10% of the theoretical yield—this minimizes amplification bias.
|
|
115
116
|
- Purify the product with SPRIselect beads at approximately a 0.65:1 bead:DNA volume ratio to remove residual primers and short fragments.
|
|
116
|
-
- Confirm primer removal using electrophoresis (e.g., BioAnalyzer DNA chip) before moving to
|
|
117
|
+
- Confirm primer removal and quantify DNA using electrophoresis (e.g., BioAnalyzer DNA chip) before moving to the flow cell.
|
|
117
118
|
|
|
118
119
|
### SLIM primer design
|
|
119
120
|
|
|
@@ -158,7 +159,7 @@ Mutation nomenclature examples:
|
|
|
158
159
|
```
|
|
159
160
|
- Outputs include primer sets and an assembly-plan CSV.
|
|
160
161
|
|
|
161
|
-
If mutations fall within overlapping primer windows, design sequential reactions
|
|
162
|
+
If mutations fall within overlapping primer windows, design sequential reactions.
|
|
162
163
|
|
|
163
164
|
### Mutation caller (no UMIs)
|
|
164
165
|
|
|
@@ -175,7 +176,7 @@ If mutations fall within overlapping primer windows, design sequential reactions
|
|
|
175
176
|
--output-dir results/mutation_caller/ \
|
|
176
177
|
--threshold 10
|
|
177
178
|
```
|
|
178
|
-
3. Outputs: per-sample subdirectories with substitution summaries, co-occurrence matrices, and logs.
|
|
179
|
+
3. Outputs: per-sample subdirectories with substitution summaries, co-occurrence matrices, and logs. Co-occurrence matrices are experimental and should not yet be relied on.
|
|
179
180
|
|
|
180
181
|
### UMI Hunter
|
|
181
182
|
|
|
@@ -188,7 +189,12 @@ If mutations fall within overlapping primer windows, design sequential reactions
|
|
|
188
189
|
--fastq data/umi_hunter/*.fastq.gz \
|
|
189
190
|
--output-dir results/umi_hunter/
|
|
190
191
|
```
|
|
191
|
-
- Tunable parameters include `--umi-identity-threshold
|
|
192
|
+
- Tunable parameters include `--umi-identity-threshold`, `--consensus-mutation-threshold`, and `--min-cluster-size`.
|
|
193
|
+
- `--umi-identity-threshold` (0–1) controls how similar two UMIs must be to fall into the same cluster.
|
|
194
|
+
- `--consensus-mutation-threshold` (0–1) is the fraction of reads within a cluster that must agree on a base before it is written into the consensus sequence.
|
|
195
|
+
- `--min-cluster-size` sets the minimum number of reads required in a cluster before a consensus is generated (smaller clusters remain listed in the raw UMI CSV but no consensus FASTA is produced).
|
|
196
|
+
|
|
197
|
+
Please be aware that this toolkit will not scale well beyond roughly 50k reads per sample. See UMIC-seq pipelines for efficient UMI-gene dictionary generation.
|
|
192
198
|
|
|
193
199
|
### Profile inserts
|
|
194
200
|
|
|
@@ -229,7 +235,7 @@ python -m uht_tooling.workflows.gui
|
|
|
229
235
|
```
|
|
230
236
|
|
|
231
237
|
Key points:
|
|
232
|
-
- The server binds to `http://127.0.0.1:7860` by default and falls back to an available port if 7860 is busy. Copy http://127.0.0.1:7860 into your browser.
|
|
238
|
+
- The server binds to `http://127.0.0.1:7860` by default and falls back to an available port if 7860 is busy. Copy http://127.0.0.1:7860 into your browser to interface with the GUI.
|
|
233
239
|
- Temporary working directories are created under the system temp folder and cleaned automatically.
|
|
234
240
|
- Output archives (ZIP files) mirror the directory structure produced by the CLI.
|
|
235
241
|
|
|
@@ -238,9 +244,9 @@ Key points:
|
|
|
238
244
|
1. **Nextera XT** – forward/reverse primer inputs with CSV preview.
|
|
239
245
|
2. **SLIM** – template/context FASTA text areas plus mutation list.
|
|
240
246
|
3. **Gibson** – multi-mutation support using `+` syntax.
|
|
241
|
-
4. **Mutation Caller** – upload FASTQ
|
|
242
|
-
5. **UMI Hunter** – long-read UMI clustering with
|
|
243
|
-
6. **Profile Inserts** – probe
|
|
247
|
+
4. **Mutation Caller** – upload FASTQ and template FASTA, then enter flanks and gene length bounds inline.
|
|
248
|
+
5. **UMI Hunter** – long-read UMI clustering with flank entry, UMI length bounds, mutation threshold, and minimum cluster size.
|
|
249
|
+
6. **Profile Inserts** – interactive probe table plus multiple FASTQ uploads with adjustable fuzzy-match ratio.
|
|
244
250
|
7. **EP Library Profile** – FASTQ uploads plus plasmid and region FASTA inputs.
|
|
245
251
|
|
|
246
252
|
### Workflow tips
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
uht_tooling/__init__.py,sha256=hf0tJaa4_9y9aYb8OB1FtJh1FOuX08dQ6_MCveWFNAc,242
|
|
2
|
-
uht_tooling/cli.py,sha256=
|
|
2
|
+
uht_tooling/cli.py,sha256=yKTPqWwYAs7tzO_TeyaLhSfzkNoCUPnc0wU2fgOR2wk,12882
|
|
3
3
|
uht_tooling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
uht_tooling/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
uht_tooling/workflows/design_gibson.py,sha256=SQEThq6dxPMPCsUrwqMUaG5I-diE9jUXPRii9Y7O_7U,13617
|
|
6
6
|
uht_tooling/workflows/design_slim.py,sha256=Qeh8N32kmVFZvohmTlBudJsLzOqLy4XcY3aXbkP-sFQ,14421
|
|
7
|
-
uht_tooling/workflows/gui.py,sha256=
|
|
7
|
+
uht_tooling/workflows/gui.py,sha256=g3lhHBWuKi1qEpY4iBnhr-tROSGepMQJm-fjiVCrk08,42559
|
|
8
8
|
uht_tooling/workflows/mut_rate.py,sha256=wjX1lNXTcaH49gfARSrpKLU1mD5hCgH0ZFTcdlNrAB4,105670
|
|
9
9
|
uht_tooling/workflows/mutation_caller.py,sha256=BczuNATOSUcmlw-x6qTzEQfW8MBbvGclEyqiQiBX0cg,16222
|
|
10
10
|
uht_tooling/workflows/nextera_designer.py,sha256=8MZ_DyQ0JwPojXH5mZ6bAGAkqki_0qQGac45T_Ll8FQ,6170
|
|
11
11
|
uht_tooling/workflows/profile_inserts.py,sha256=C-SZ10YefiV_4QZbo1oEkI4qYipwaYqPP5jF-MC5O58,16947
|
|
12
|
-
uht_tooling/workflows/umi_hunter.py,sha256=
|
|
13
|
-
uht_tooling-0.1.
|
|
14
|
-
uht_tooling-0.1.
|
|
15
|
-
uht_tooling-0.1.
|
|
16
|
-
uht_tooling-0.1.
|
|
17
|
-
uht_tooling-0.1.
|
|
12
|
+
uht_tooling/workflows/umi_hunter.py,sha256=baycWycqVzUfMp5u2WZdHRl0sNuykTjy-iqtj5ahucU,15075
|
|
13
|
+
uht_tooling-0.1.4.dist-info/METADATA,sha256=l7CcHpNlvnxYghc8eqw1PGRaPcqxSHeL_GJuRJNifSI,11626
|
|
14
|
+
uht_tooling-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
uht_tooling-0.1.4.dist-info/entry_points.txt,sha256=t3_bMkEnlnV4vd6nrjNQxHDsHzHHoZenhmxuIYLcRBY,53
|
|
16
|
+
uht_tooling-0.1.4.dist-info/top_level.txt,sha256=iTCCiSn0OjrTx1VOdxXhUlPi1TR9LxaJEZJoMyRcv9c,12
|
|
17
|
+
uht_tooling-0.1.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|