merge-cli 3.5.0__tar.gz → 3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {merge_cli-3.5.0 → merge_cli-3.6}/PKG-INFO +2 -2
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/cli.py +204 -44
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/models/ensemble_predict.py +3 -3
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/ensemble_predict.py +3 -3
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/local_engine.py +221 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/output.py +16 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli.egg-info/PKG-INFO +2 -2
- {merge_cli-3.5.0 → merge_cli-3.6}/pyproject.toml +2 -2
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/__init__.py +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/api.py +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/cli_env_patch.py +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/config.py +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/__init__.py +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/models/.gitkeep +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/models/BestModel_coding.pkl +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/models/BestModel_noncoding.pkl +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/models/BestModel_splice.pkl +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli/data/models/__init__.py +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli.egg-info/SOURCES.txt +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli.egg-info/dependency_links.txt +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli.egg-info/entry_points.txt +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli.egg-info/requires.txt +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/merge_cli.egg-info/top_level.txt +0 -0
- {merge_cli-3.5.0 → merge_cli-3.6}/setup.cfg +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: merge-cli
|
|
3
|
-
Version: 3.5.0
|
|
4
|
-
Summary: MERGE
|
|
3
|
+
Version: 3.6
|
|
4
|
+
Summary: MERGE variant pathogenicity prediction CLI (fixed server, integrated model embedded, no manual configuration required)
|
|
5
5
|
Project-URL: Homepage, https://merge.fanglab.cn
|
|
6
6
|
Requires-Python: >=3.11
|
|
7
7
|
Requires-Dist: click>=8.1
|
|
@@ -7,7 +7,8 @@ Changes (v3.5.0):
|
|
|
7
7
|
3. Added --genome-ref option to specify reference genome FASTA path
|
|
8
8
|
4. Added --no-shap option for skipping SHAP analysis
|
|
9
9
|
"""
|
|
10
|
-
import os, sys, time
|
|
10
|
+
import os, sys, time, subprocess
|
|
11
|
+
import requests
|
|
11
12
|
import click
|
|
12
13
|
from rich.console import Console
|
|
13
14
|
from rich.table import Table
|
|
@@ -251,46 +252,62 @@ def predict(chrom, pos, ref, alt, genome, fmt, no_ensemble,
|
|
|
251
252
|
skip_shap = not click.confirm("\nGenerate SHAP analysis plot?", default=True)
|
|
252
253
|
variant_info = f"{chrom}:{pos} {ref}>{alt}"
|
|
253
254
|
|
|
254
|
-
# ──
|
|
255
|
-
#
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
255
|
+
# ── Pre-computed fast path ────────────────────────────────
|
|
256
|
+
# Both remote (/predict/ returns prediction["precomputed"]) and local
|
|
257
|
+
# (predict_local returns prediction["precomputed"]) use the same key.
|
|
258
|
+
precomp = prediction.get("precomputed")
|
|
259
|
+
if precomp:
|
|
260
|
+
model_key = precomp.get("model_used", "MERGE_Precomputed")
|
|
261
|
+
ensemble = {
|
|
262
|
+
"success": True,
|
|
263
|
+
"ensemble_results": {model_key: precomp},
|
|
264
|
+
"raw_features": {},
|
|
265
|
+
"from_cache": True,
|
|
266
|
+
}
|
|
267
|
+
prediction["_variant_type"] = precomp.get("variant_type")
|
|
268
|
+
console.print("[dim]⚡ Pre-computed score retrieved from cache[/dim]")
|
|
269
|
+
# ─────────────────────────────────────────────────────────
|
|
270
|
+
else:
|
|
271
|
+
# ── Determine variant type ────────────────────────────────────
|
|
272
|
+
# Priority: CLI --ensemble-type > prediction["_variant_type"] > remote ANNOVAR API
|
|
273
|
+
effective_ensemble_type = ensemble_type
|
|
274
|
+
if not effective_ensemble_type:
|
|
275
|
+
effective_ensemble_type = prediction.get("_variant_type")
|
|
276
|
+
if not effective_ensemble_type:
|
|
277
|
+
with console.status("[bold cyan]Detecting variant type...[/bold cyan]"):
|
|
278
|
+
try:
|
|
279
|
+
from . import local_engine as _le
|
|
280
|
+
effective_ensemble_type = _le.get_variant_type(chrom, pos, ref, alt, genome)
|
|
281
|
+
except Exception:
|
|
282
|
+
effective_ensemble_type = None
|
|
283
|
+
if effective_ensemble_type:
|
|
284
|
+
prediction["_variant_type"] = effective_ensemble_type
|
|
285
|
+
|
|
286
|
+
with console.status("[bold cyan]Computing MERGE ensemble score...[/bold cyan]"):
|
|
261
287
|
try:
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
ensemble_type=effective_ensemble_type)
|
|
286
|
-
if ens_resp.get("success"):
|
|
287
|
-
ensemble = ens_resp
|
|
288
|
-
for _v in (ensemble.get("ensemble_results") or {}).values():
|
|
289
|
-
if isinstance(_v, dict) and _v.get("variant_type"):
|
|
290
|
-
prediction["_variant_type"] = _v["variant_type"]
|
|
291
|
-
break
|
|
292
|
-
except Exception as e:
|
|
293
|
-
console.print(f"[yellow]Ensemble score failed (raw scores unaffected): {e}[/yellow]")
|
|
288
|
+
if is_local:
|
|
289
|
+
from . import local_engine
|
|
290
|
+
dbnsfp = prediction.get("dbnsfp") or {}
|
|
291
|
+
transcripts = dbnsfp.get("transcripts") or ([dbnsfp] if dbnsfp else [])
|
|
292
|
+
ens_result, ens_err = local_engine.run_local_ensemble(
|
|
293
|
+
prediction, transcripts, ensemble_type=effective_ensemble_type,
|
|
294
|
+
skip_shap=skip_shap, variant_info=variant_info)
|
|
295
|
+
if ens_err:
|
|
296
|
+
console.print(f"[yellow]Ensemble model warning: {ens_err}[/yellow]")
|
|
297
|
+
ensemble = {"success": True, **ens_result}
|
|
298
|
+
else:
|
|
299
|
+
dbnsfp = prediction.get("dbnsfp") or {}
|
|
300
|
+
transcripts = [dbnsfp] if dbnsfp else []
|
|
301
|
+
ens_resp = api.predict_ensemble(prediction, transcripts,
|
|
302
|
+
ensemble_type=effective_ensemble_type)
|
|
303
|
+
if ens_resp.get("success"):
|
|
304
|
+
ensemble = ens_resp
|
|
305
|
+
for _v in (ensemble.get("ensemble_results") or {}).values():
|
|
306
|
+
if isinstance(_v, dict) and _v.get("variant_type"):
|
|
307
|
+
prediction["_variant_type"] = _v["variant_type"]
|
|
308
|
+
break
|
|
309
|
+
except Exception as e:
|
|
310
|
+
console.print(f"[yellow]Ensemble score failed (raw scores unaffected): {e}[/yellow]")
|
|
294
311
|
|
|
295
312
|
output.render_single(prediction, ensemble, fmt=fmt,
|
|
296
313
|
errors=prediction.get("errors"))
|
|
@@ -428,15 +445,33 @@ def local_setup():
|
|
|
428
445
|
# ── merge local download ──────────────────────────────────────
|
|
429
446
|
@local.command("download")
|
|
430
447
|
@click.option("--file", "file_type",
|
|
431
|
-
type=click.Choice(["all", "dbnsfp", "gpn-msa", "popeve"]),
|
|
448
|
+
type=click.Choice(["all", "dbnsfp", "gpn-msa", "popeve", "precomputed"]),
|
|
432
449
|
default="all",
|
|
433
|
-
help="Show download instructions for specific file")
|
|
434
|
-
|
|
435
|
-
|
|
450
|
+
help="Show download instructions for specific file (or auto-download precomputed VCF)")
|
|
451
|
+
@click.option("--genome", default="all", type=click.Choice(["all", "hg38", "hg19"]),
|
|
452
|
+
help="Genome version for precomputed VCF download (default: both)")
|
|
453
|
+
@click.option("--variant-type", "variant_type", default="all",
|
|
454
|
+
type=click.Choice(["all", "coding", "splicing"]),
|
|
455
|
+
help="Variant type for precomputed VCF download (default: both)")
|
|
456
|
+
def local_download(file_type, genome, variant_type):
|
|
457
|
+
"""Show official download URLs for pre-computed data files.
|
|
458
|
+
|
|
459
|
+
\b
|
|
460
|
+
The precomputed VCF cache files are auto-downloaded from merge.fanglab.cn:
|
|
461
|
+
merge local download --file precomputed # download all 4 files
|
|
462
|
+
merge local download --file precomputed --genome hg38
|
|
463
|
+
merge local download --file precomputed --genome hg38 --variant-type coding
|
|
464
|
+
"""
|
|
436
465
|
cfg = get_local_config()
|
|
437
466
|
data_dir = cfg["data_dir"]
|
|
438
467
|
os.makedirs(data_dir, exist_ok=True)
|
|
439
468
|
|
|
469
|
+
# ── Auto-download precomputed VCF files ───────────────────────
|
|
470
|
+
if file_type == "precomputed":
|
|
471
|
+
_download_precomputed_vcfs(data_dir, genome, variant_type)
|
|
472
|
+
return
|
|
473
|
+
# ─────────────────────────────────────────────────────────────
|
|
474
|
+
|
|
440
475
|
INSTRUCTIONS = {
|
|
441
476
|
"dbnsfp": {
|
|
442
477
|
"label": "dbNSFP (ESM1b + AlphaMissense)",
|
|
@@ -472,6 +507,8 @@ def local_download(file_type):
|
|
|
472
507
|
"\n[bold green]✓ Ensemble models (pkl files)[/bold green] are bundled with pip install, "
|
|
473
508
|
"no manual download needed.\n"
|
|
474
509
|
" If model files are missing, run: [bold]pip install --force-reinstall merge-cli[/bold]\n"
|
|
510
|
+
"\n[bold yellow]⚡ Pre-computed VCF cache[/bold yellow] can be auto-downloaded:\n"
|
|
511
|
+
" [bold]merge local download --file precomputed[/bold]\n"
|
|
475
512
|
)
|
|
476
513
|
|
|
477
514
|
show_keys = (list(INSTRUCTIONS.keys()) if file_type == "all"
|
|
@@ -487,6 +524,129 @@ def local_download(file_type):
|
|
|
487
524
|
console.print()
|
|
488
525
|
|
|
489
526
|
|
|
527
|
+
def _download_precomputed_vcfs(data_dir: str, genome: str = "all", variant_type: str = "all"):
|
|
528
|
+
"""Auto-download pre-computed VCF files + tabix indices from merge.fanglab.cn."""
|
|
529
|
+
import shutil
|
|
530
|
+
|
|
531
|
+
# Files hosted at: https://merge.fanglab.cn/static/precomputed/<filename>
|
|
532
|
+
# Adjust BASE_STATIC_URL if you move the files elsewhere (e.g. OSS / GitHub Releases)
|
|
533
|
+
BASE_STATIC_URL = f"{FIXED_API_URL}/static/precomputed"
|
|
534
|
+
|
|
535
|
+
ALL_FILES = {
|
|
536
|
+
("hg38", "coding"): "coding_merged.vcf.gz",
|
|
537
|
+
("hg38", "splicing"): "splicing_merged.vcf.gz",
|
|
538
|
+
("hg19", "coding"): "coding_merged_hg19.vcf.gz",
|
|
539
|
+
("hg19", "splicing"): "splicing_merged_hg19.vcf.gz",
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
# Filter by --genome and --variant-type
|
|
543
|
+
targets = {
|
|
544
|
+
(g, vt): fname for (g, vt), fname in ALL_FILES.items()
|
|
545
|
+
if (genome == "all" or g == genome)
|
|
546
|
+
and (variant_type == "all" or vt == variant_type)
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
if not targets:
|
|
550
|
+
console.print("[red]No files match the specified genome/variant-type combination.[/red]")
|
|
551
|
+
return
|
|
552
|
+
|
|
553
|
+
has_tabix = bool(shutil.which("tabix"))
|
|
554
|
+
if not has_tabix:
|
|
555
|
+
console.print(
|
|
556
|
+
"[yellow]⚠ tabix not found in PATH. VCF files will be downloaded but NOT indexed.\n"
|
|
557
|
+
" Install htslib (conda install -c bioconda htslib) then run this command again\n"
|
|
558
|
+
" or index manually: tabix -p vcf <file.vcf.gz>[/yellow]\n"
|
|
559
|
+
)
|
|
560
|
+
|
|
561
|
+
ok_count = 0
|
|
562
|
+
for (g, vt), fname in targets.items():
|
|
563
|
+
dest = os.path.join(data_dir, fname)
|
|
564
|
+
dest_tbi = dest + ".tbi"
|
|
565
|
+
url = f"{BASE_STATIC_URL}/{fname}"
|
|
566
|
+
url_tbi = f"{BASE_STATIC_URL}/{fname}.tbi"
|
|
567
|
+
|
|
568
|
+
console.print(f"\n[bold cyan]── {g} {vt}: {fname} ──[/bold cyan]")
|
|
569
|
+
|
|
570
|
+
# ── Download .vcf.gz ──────────────────────────────────────
|
|
571
|
+
if os.path.exists(dest):
|
|
572
|
+
size_mb = os.path.getsize(dest) / 1e6
|
|
573
|
+
console.print(f" [green]✓ Already exists ({size_mb:.0f} MB), skipping.[/green]")
|
|
574
|
+
else:
|
|
575
|
+
console.print(f" Downloading {url} …")
|
|
576
|
+
success = _stream_download(url, dest)
|
|
577
|
+
if not success:
|
|
578
|
+
console.print(f" [red]✗ Download failed. Check URL or network.[/red]")
|
|
579
|
+
continue
|
|
580
|
+
|
|
581
|
+
# ── Download .tbi (pre-built index) ──────────────────────
|
|
582
|
+
if os.path.exists(dest_tbi):
|
|
583
|
+
console.print(f" [green]✓ Index already exists, skipping.[/green]")
|
|
584
|
+
else:
|
|
585
|
+
console.print(f" Downloading index {url_tbi} …")
|
|
586
|
+
idx_ok = _stream_download(url_tbi, dest_tbi)
|
|
587
|
+
if not idx_ok:
|
|
588
|
+
# Fall back to local tabix
|
|
589
|
+
if has_tabix:
|
|
590
|
+
console.print(" Pre-built index not found, building with local tabix…")
|
|
591
|
+
result = subprocess.run(
|
|
592
|
+
["tabix", "-p", "vcf", dest],
|
|
593
|
+
capture_output=True, text=True, timeout=300,
|
|
594
|
+
)
|
|
595
|
+
if result.returncode == 0:
|
|
596
|
+
console.print(" [green]✓ Index built.[/green]")
|
|
597
|
+
else:
|
|
598
|
+
console.print(f" [red]✗ tabix failed: {result.stderr.strip()}[/red]")
|
|
599
|
+
continue
|
|
600
|
+
else:
|
|
601
|
+
console.print(
|
|
602
|
+
" [red]✗ No pre-built index and tabix not found.\n"
|
|
603
|
+
" Install htslib and run: tabix -p vcf " + dest + "[/red]"
|
|
604
|
+
)
|
|
605
|
+
continue
|
|
606
|
+
|
|
607
|
+
console.print(f" [green]✓ Ready: {dest}[/green]")
|
|
608
|
+
ok_count += 1
|
|
609
|
+
|
|
610
|
+
console.print(f"\n[bold green]✓ Done: {ok_count}/{len(targets)} files ready in {data_dir}[/bold green]")
|
|
611
|
+
if ok_count > 0:
|
|
612
|
+
console.print(
|
|
613
|
+
" Pre-computed scores will now be used automatically for common variants.\n"
|
|
614
|
+
" Run [bold]merge local status[/bold] to verify."
|
|
615
|
+
)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def _stream_download(url: str, dest: str, chunk_size: int = 1 << 20) -> bool:
|
|
619
|
+
"""Download url → dest with a progress bar. Returns True on success."""
|
|
620
|
+
import math
|
|
621
|
+
tmp = dest + ".part"
|
|
622
|
+
try:
|
|
623
|
+
r = requests.get(url, stream=True, timeout=30)
|
|
624
|
+
if r.status_code == 404:
|
|
625
|
+
return False
|
|
626
|
+
r.raise_for_status()
|
|
627
|
+
total = int(r.headers.get("content-length", 0))
|
|
628
|
+
downloaded = 0
|
|
629
|
+
with open(tmp, "wb") as f:
|
|
630
|
+
for chunk in r.iter_content(chunk_size=chunk_size):
|
|
631
|
+
if chunk:
|
|
632
|
+
f.write(chunk)
|
|
633
|
+
downloaded += len(chunk)
|
|
634
|
+
if total:
|
|
635
|
+
pct = downloaded / total * 100
|
|
636
|
+
done = int(pct / 5)
|
|
637
|
+
bar = "█" * done + "░" * (20 - done)
|
|
638
|
+
mb = downloaded / 1e6
|
|
639
|
+
print(f"\r [{bar}] {pct:5.1f}% {mb:.0f} MB", end="", flush=True)
|
|
640
|
+
print() # newline after progress bar
|
|
641
|
+
os.replace(tmp, dest)
|
|
642
|
+
return True
|
|
643
|
+
except Exception as exc:
|
|
644
|
+
if os.path.exists(tmp):
|
|
645
|
+
os.unlink(tmp)
|
|
646
|
+
console.print(f"\n [red]Download error: {exc}[/red]")
|
|
647
|
+
return False
|
|
648
|
+
|
|
649
|
+
|
|
490
650
|
# ── merge local predict ───────────────────────────────────────
|
|
491
651
|
@local.command("predict")
|
|
492
652
|
@click.option("--chrom", required=True)
|
|
@@ -122,9 +122,9 @@ sys.modules['__main__'].FeatureEngineer = FeatureEngineer
|
|
|
122
122
|
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
123
123
|
|
|
124
124
|
MODEL_PATHS = {
|
|
125
|
-
'ClinVar': os.path.join(_THIS_DIR, '
|
|
126
|
-
'Splice_ClinVar_GnomAD': os.path.join(_THIS_DIR, '
|
|
127
|
-
'NonCoding_ClinVar': os.path.join(_THIS_DIR, '
|
|
125
|
+
'ClinVar': os.path.join(_THIS_DIR, 'BestModel_coding.pkl'),
|
|
126
|
+
'Splice_ClinVar_GnomAD': os.path.join(_THIS_DIR, 'BestModel_splice.pkl'),
|
|
127
|
+
'NonCoding_ClinVar': os.path.join(_THIS_DIR, 'BestModel_noncoding.pkl'),
|
|
128
128
|
}
|
|
129
129
|
|
|
130
130
|
for _name, _path in MODEL_PATHS.items():
|
|
@@ -122,9 +122,9 @@ sys.modules['__main__'].FeatureEngineer = FeatureEngineer
|
|
|
122
122
|
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
123
123
|
|
|
124
124
|
MODEL_PATHS = {
|
|
125
|
-
'ClinVar': os.path.join(_THIS_DIR, '
|
|
126
|
-
'Splice_ClinVar_GnomAD': os.path.join(_THIS_DIR, '
|
|
127
|
-
'NonCoding_ClinVar': os.path.join(_THIS_DIR, '
|
|
125
|
+
'ClinVar': os.path.join(_THIS_DIR, 'BestModel_coding.pkl'),
|
|
126
|
+
'Splice_ClinVar_GnomAD': os.path.join(_THIS_DIR, 'BestModel_splice.pkl'),
|
|
127
|
+
'NonCoding_ClinVar': os.path.join(_THIS_DIR, 'BestModel_noncoding.pkl'),
|
|
128
128
|
}
|
|
129
129
|
|
|
130
130
|
for _name, _path in MODEL_PATHS.items():
|
|
@@ -1251,11 +1251,190 @@ def run_local_ensemble(pred_data, all_transcripts, ensemble_type=None, skip_shap
|
|
|
1251
1251
|
return {chosen_key: result}, None
|
|
1252
1252
|
|
|
1253
1253
|
|
|
1254
|
+
# ─── Pre-computed VCF Cache ───────────────────────────────────────
|
|
1255
|
+
|
|
1256
|
+
# VCF filenames mirror the server-side PRECOMPUTED_VCFS config in views.py
|
|
1257
|
+
_PRECOMPUTED_VCF_NAMES = {
|
|
1258
|
+
"hg38": {
|
|
1259
|
+
"coding": "coding_merged.vcf.gz",
|
|
1260
|
+
"splicing": "splicing_merged.vcf.gz",
|
|
1261
|
+
},
|
|
1262
|
+
"hg19": {
|
|
1263
|
+
"coding": "coding_merged_hg19.vcf.gz",
|
|
1264
|
+
"splicing": "splicing_merged_hg19.vcf.gz",
|
|
1265
|
+
},
|
|
1266
|
+
}
|
|
1267
|
+
|
|
1268
|
+
|
|
1269
|
+
def query_precomputed_local(chrom: str, pos, ref: str, alt: str,
|
|
1270
|
+
genome: str = "hg38") -> Optional[dict]:
|
|
1271
|
+
"""
|
|
1272
|
+
Query the local pre-computed VCF cache files (same format as the server).
|
|
1273
|
+
Splicing is checked first to prevent coding-model mis-classification.
|
|
1274
|
+
|
|
1275
|
+
Returns the same dict shape as views.query_precomputed_vcf on hit, or
|
|
1276
|
+
None on miss / unavailable.
|
|
1277
|
+
|
|
1278
|
+
INFO fields expected (set by the pre-computation pipeline):
|
|
1279
|
+
ENSEMBLE – MERGE score (0-1)
|
|
1280
|
+
INTERP – interpretation label (spaces encoded as underscores)
|
|
1281
|
+
HYENA – HyenaDNA score
|
|
1282
|
+
NT – Nucleotide Transformer score
|
|
1283
|
+
GPN – GPN-MSA score
|
|
1284
|
+
POPEVE – popEVE score
|
|
1285
|
+
EVO2 – Evo2 LLR score
|
|
1286
|
+
AM – AlphaMissense score
|
|
1287
|
+
ESM1B – ESM1b score
|
|
1288
|
+
AG_SPLICE – AlphaGenome splicing composite (splice variants)
|
|
1289
|
+
AG_RAW_MEAN / MAX / MIN – AlphaGenome raw scores (coding)
|
|
1290
|
+
AG_Q_MEAN / MAX / MIN – AlphaGenome quantile scores (coding)
|
|
1291
|
+
"""
|
|
1292
|
+
try:
|
|
1293
|
+
import pysam
|
|
1294
|
+
except ImportError:
|
|
1295
|
+
return None # pysam unavailable; remote fallback will be used
|
|
1296
|
+
|
|
1297
|
+
data_dir = _cfg()["data_dir"]
|
|
1298
|
+
names = _PRECOMPUTED_VCF_NAMES.get(genome, {})
|
|
1299
|
+
|
|
1300
|
+
for vtype in ("splicing", "coding"): # splicing first — mirrors server logic
|
|
1301
|
+
fname = names.get(vtype)
|
|
1302
|
+
if not fname:
|
|
1303
|
+
continue
|
|
1304
|
+
fpath = os.path.join(data_dir, fname)
|
|
1305
|
+
if not os.path.exists(fpath):
|
|
1306
|
+
continue
|
|
1307
|
+
|
|
1308
|
+
try:
|
|
1309
|
+
tbx = pysam.TabixFile(fpath)
|
|
1310
|
+
pos_int = int(pos)
|
|
1311
|
+
chrom_queries = [chrom,
|
|
1312
|
+
("chr" + chrom) if not chrom.startswith("chr") else chrom.lstrip("chr")]
|
|
1313
|
+
records = []
|
|
1314
|
+
for cq in chrom_queries:
|
|
1315
|
+
try:
|
|
1316
|
+
records = list(tbx.fetch(cq, pos_int - 1, pos_int))
|
|
1317
|
+
if records:
|
|
1318
|
+
break
|
|
1319
|
+
except Exception:
|
|
1320
|
+
continue
|
|
1321
|
+
tbx.close()
|
|
1322
|
+
|
|
1323
|
+
for record in records:
|
|
1324
|
+
parts = record.split("\t")
|
|
1325
|
+
if len(parts) < 8 or parts[3] != ref or parts[4] != alt:
|
|
1326
|
+
continue
|
|
1327
|
+
|
|
1328
|
+
info: dict = {}
|
|
1329
|
+
for item in parts[7].split(";"):
|
|
1330
|
+
if "=" in item:
|
|
1331
|
+
k, v = item.split("=", 1)
|
|
1332
|
+
info[k] = v
|
|
1333
|
+
|
|
1334
|
+
if "ENSEMBLE" not in info:
|
|
1335
|
+
continue
|
|
1336
|
+
|
|
1337
|
+
score_val = _safe_float(info["ENSEMBLE"]) or 0.0
|
|
1338
|
+
interp_val = info.get("INTERP", "Precomputed").replace("_", " ")
|
|
1339
|
+
|
|
1340
|
+
sub_models = {
|
|
1341
|
+
"alphagenome": {"statistics": {}},
|
|
1342
|
+
"hyenadna": {"score": _safe_float(info.get("HYENA"))},
|
|
1343
|
+
"nt": {"score": _safe_float(info.get("NT"))},
|
|
1344
|
+
"gpn_msa": {"score": _safe_float(info.get("GPN"))},
|
|
1345
|
+
"popeve": {"score": _safe_float(info.get("POPEVE"))},
|
|
1346
|
+
"evo2": {
|
|
1347
|
+
"llr_score": _safe_float(info.get("EVO2")),
|
|
1348
|
+
"ref_score": None,
|
|
1349
|
+
"var_score": None,
|
|
1350
|
+
"context_length": 8192,
|
|
1351
|
+
"interpretation": "Precomputed cache",
|
|
1352
|
+
"score_class": (
|
|
1353
|
+
"benign" if (_safe_float(info.get("EVO2")) or 0) > 0
|
|
1354
|
+
else "deleterious" if (_safe_float(info.get("EVO2")) or 0) < 0
|
|
1355
|
+
else "unknown"
|
|
1356
|
+
),
|
|
1357
|
+
},
|
|
1358
|
+
"AlphaMissense": {"score": _safe_float(info.get("AM"))},
|
|
1359
|
+
"ESM1b": {"score": _safe_float(info.get("ESM1B"))},
|
|
1360
|
+
}
|
|
1361
|
+
|
|
1362
|
+
if vtype == "splicing":
|
|
1363
|
+
sub_models["alphagenome"]["statistics"]["alphagenome_splicing"] = \
|
|
1364
|
+
_safe_float(info.get("AG_SPLICE"))
|
|
1365
|
+
else:
|
|
1366
|
+
sub_models["alphagenome"]["statistics"].update({
|
|
1367
|
+
"raw_score_mean": _safe_float(info.get("AG_RAW_MEAN")),
|
|
1368
|
+
"raw_score_max": _safe_float(info.get("AG_RAW_MAX")),
|
|
1369
|
+
"raw_score_min": _safe_float(info.get("AG_RAW_MIN")),
|
|
1370
|
+
"quantile_score_mean": _safe_float(info.get("AG_Q_MEAN")),
|
|
1371
|
+
"quantile_score_max": _safe_float(info.get("AG_Q_MAX")),
|
|
1372
|
+
"quantile_score_min": _safe_float(info.get("AG_Q_MIN")),
|
|
1373
|
+
})
|
|
1374
|
+
|
|
1375
|
+
return {
|
|
1376
|
+
"ensemble": {
|
|
1377
|
+
"score": score_val,
|
|
1378
|
+
"interpretation": {
|
|
1379
|
+
"label": interp_val,
|
|
1380
|
+
"badge": "⚡ Cache",
|
|
1381
|
+
"color": "#ff6b6b" if score_val > 0.5 else "#6bcf7f",
|
|
1382
|
+
},
|
|
1383
|
+
"variant_type": "splice" if vtype == "splicing" else "coding",
|
|
1384
|
+
"model_used": ("Splice_ClinVar_GnomAD"
|
|
1385
|
+
if vtype == "splicing" else "ClinVar"),
|
|
1386
|
+
"from_cache": True,
|
|
1387
|
+
"features_raw_aligned": {},
|
|
1388
|
+
"imputed_features": {},
|
|
1389
|
+
"shap_plot": None,
|
|
1390
|
+
},
|
|
1391
|
+
"sub_models": sub_models,
|
|
1392
|
+
}
|
|
1393
|
+
|
|
1394
|
+
except Exception as exc:
|
|
1395
|
+
logger.debug(f"[precomputed_local] {vtype}/{genome}: {exc}")
|
|
1396
|
+
continue
|
|
1397
|
+
|
|
1398
|
+
return None # cache miss
|
|
1399
|
+
|
|
1400
|
+
|
|
1254
1401
|
# ─── Top-level Prediction Entry ───────────────────────────────────
|
|
1255
1402
|
|
|
1256
1403
|
def predict_local(chrom, pos, ref, alt, genome="hg38", ensemble_type=None, **flags) -> dict:
|
|
1257
1404
|
vtype = ensemble_type or get_variant_type(chrom, pos, ref, alt, genome)
|
|
1258
1405
|
|
|
1406
|
+
# ── Pre-computed cache fast path ──────────────────────────────
|
|
1407
|
+
cached = query_precomputed_local(chrom, pos, ref, alt, genome)
|
|
1408
|
+
if cached:
|
|
1409
|
+
logger.debug(f"[precomputed_local] cache hit: {chrom}:{pos} {ref}>{alt}")
|
|
1410
|
+
ens = cached["ensemble"]
|
|
1411
|
+
subs = cached["sub_models"]
|
|
1412
|
+
return {
|
|
1413
|
+
"success": True,
|
|
1414
|
+
"from_cache": True,
|
|
1415
|
+
"prediction": {
|
|
1416
|
+
"input": {"chrom": chrom, "pos": pos, "ref": ref, "alt": alt},
|
|
1417
|
+
"genome_version": genome,
|
|
1418
|
+
"_variant_type": ens.get("variant_type", vtype),
|
|
1419
|
+
"precomputed": ens,
|
|
1420
|
+
"dbnsfp": {
|
|
1421
|
+
"annotations": {},
|
|
1422
|
+
"dl_models": {
|
|
1423
|
+
"AlphaMissense": subs.get("AlphaMissense", {}),
|
|
1424
|
+
"ESM1b": subs.get("ESM1b", {}),
|
|
1425
|
+
},
|
|
1426
|
+
},
|
|
1427
|
+
"alphagenome": subs.get("alphagenome"),
|
|
1428
|
+
"hyenadna": subs.get("hyenadna"),
|
|
1429
|
+
"nt": subs.get("nt"),
|
|
1430
|
+
"gpn_msa": subs.get("gpn_msa"),
|
|
1431
|
+
"popeve": subs.get("popeve"),
|
|
1432
|
+
"evo2": subs.get("evo2"),
|
|
1433
|
+
"errors": {},
|
|
1434
|
+
},
|
|
1435
|
+
}
|
|
1436
|
+
# ─────────────────────────────────────────────────────────────
|
|
1437
|
+
|
|
1259
1438
|
tasks = {
|
|
1260
1439
|
"dbnsfp": lambda: query_dbnsfp_local(chrom, pos, ref, alt, genome),
|
|
1261
1440
|
"hyenadna": lambda: call_local_service("hyenadna", chrom, pos, ref, alt, genome),
|
|
@@ -1317,6 +1496,42 @@ def predict_local_batch(vcf_path: str, genome: str = "hg38",
|
|
|
1317
1496
|
_log(f" [{i}/{len(variants)}] {chrom}:{pos} {ref}>{alt}…")
|
|
1318
1497
|
try:
|
|
1319
1498
|
vtype = get_variant_type(chrom, pos, ref, alt, genome, ensemble_type)
|
|
1499
|
+
|
|
1500
|
+
# ── Pre-computed cache fast path ──────────────────────
|
|
1501
|
+
cached = query_precomputed_local(chrom, pos, ref, alt, genome)
|
|
1502
|
+
if cached:
|
|
1503
|
+
_log(f" ⚡ Pre-computed cache hit")
|
|
1504
|
+
ens = cached["ensemble"]
|
|
1505
|
+
subs = cached["sub_models"]
|
|
1506
|
+
ag_stats = (subs.get("alphagenome") or {}).get("statistics", {})
|
|
1507
|
+
row = {
|
|
1508
|
+
"chrom": chrom, "pos": pos, "ref": ref, "alt": alt,
|
|
1509
|
+
"genome": genome,
|
|
1510
|
+
"variant_type": ens.get("variant_type", vtype),
|
|
1511
|
+
"gene": "-",
|
|
1512
|
+
"transcript": "-",
|
|
1513
|
+
"merge_ensemble_score": ens.get("score"),
|
|
1514
|
+
"merge_label": (ens.get("interpretation") or {}).get("label"),
|
|
1515
|
+
"from_cache": True,
|
|
1516
|
+
"alphamissense_score": (subs.get("AlphaMissense") or {}).get("score"),
|
|
1517
|
+
"esm1b_score": (subs.get("ESM1b") or {}).get("score"),
|
|
1518
|
+
"gpn_msa_score": (subs.get("gpn_msa") or {}).get("score"),
|
|
1519
|
+
"popeve_score": (subs.get("popeve") or {}).get("score"),
|
|
1520
|
+
"hyenadna_score": (subs.get("hyenadna") or {}).get("score"),
|
|
1521
|
+
"nt_score": (subs.get("nt") or {}).get("score"),
|
|
1522
|
+
"evo2_score": (subs.get("evo2") or {}).get("llr_score"),
|
|
1523
|
+
"alphagenome_raw_max": ag_stats.get("raw_score_max"),
|
|
1524
|
+
"alphagenome_raw_min": ag_stats.get("raw_score_min"),
|
|
1525
|
+
"alphagenome_raw_mean": ag_stats.get("raw_score_mean"),
|
|
1526
|
+
"alphagenome_quantile_max": ag_stats.get("quantile_score_max"),
|
|
1527
|
+
"alphagenome_quantile_min": ag_stats.get("quantile_score_min"),
|
|
1528
|
+
"alphagenome_quantile_mean": ag_stats.get("quantile_score_mean"),
|
|
1529
|
+
"alphagenome_splicing": ag_stats.get("alphagenome_splicing"),
|
|
1530
|
+
"errors": "",
|
|
1531
|
+
}
|
|
1532
|
+
results_rows.append(row)
|
|
1533
|
+
continue
|
|
1534
|
+
# ─────────────────────────────────────────────────────
|
|
1320
1535
|
resp = predict_local(chrom, pos, ref, alt, genome,
|
|
1321
1536
|
local_genome_path=None, ensemble_type=vtype, **model_flags)
|
|
1322
1537
|
pred = resp.get("prediction", {})
|
|
@@ -1399,6 +1614,12 @@ def check_local_files() -> dict:
|
|
|
1399
1614
|
"dbNSFP (hg19)": os.path.join(data_dir, "dbNSFP5.3a_grch37.gz"),
|
|
1400
1615
|
"GPN-MSA": os.path.join(data_dir, "scores.tsv.bgz"),
|
|
1401
1616
|
"popEVE": os.path.join(data_dir, "grch38_popEVE_ukbb_20250715.vcf.gz"),
|
|
1617
|
+
# ── Pre-computed VCF cache ──────────────────────────────────
|
|
1618
|
+
"Pre-computed coding (hg38)": os.path.join(data_dir, "coding_merged.vcf.gz"),
|
|
1619
|
+
"Pre-computed splicing (hg38)": os.path.join(data_dir, "splicing_merged.vcf.gz"),
|
|
1620
|
+
"Pre-computed coding (hg19)": os.path.join(data_dir, "coding_merged_hg19.vcf.gz"),
|
|
1621
|
+
"Pre-computed splicing (hg19)": os.path.join(data_dir, "splicing_merged_hg19.vcf.gz"),
|
|
1622
|
+
# ───────────────────────────────────────────────────────────
|
|
1402
1623
|
f"Evo2 local weights ({evo2_model_name}, optional)": evo2_weight_path,
|
|
1403
1624
|
}
|
|
1404
1625
|
result = {}
|
|
@@ -62,11 +62,13 @@ def _extract_flat(prediction: dict, ensemble=None) -> dict:
|
|
|
62
62
|
|
|
63
63
|
ens_score = ens_label = ens_model = ens_variant_type = None
|
|
64
64
|
imputed_feats = {}
|
|
65
|
+
_from_cache = False
|
|
65
66
|
|
|
66
67
|
if ensemble:
|
|
67
68
|
ens_results = ensemble.get("ensemble_results") or {
|
|
68
69
|
k: v for k, v in ensemble.items() if k not in ("success", "error")
|
|
69
70
|
}
|
|
71
|
+
_from_cache = bool(ensemble.get("from_cache"))
|
|
70
72
|
for _key, _val in ens_results.items():
|
|
71
73
|
if isinstance(_val, dict):
|
|
72
74
|
ens_score = _val.get("score")
|
|
@@ -74,6 +76,8 @@ def _extract_flat(prediction: dict, ensemble=None) -> dict:
|
|
|
74
76
|
ens_model = _key
|
|
75
77
|
ens_variant_type = _val.get("variant_type")
|
|
76
78
|
imputed_feats = _val.get("imputed_features", {})
|
|
79
|
+
if _val.get("from_cache"):
|
|
80
|
+
_from_cache = True
|
|
77
81
|
break
|
|
78
82
|
|
|
79
83
|
def _get_val(raw_val, imputed_key):
|
|
@@ -115,6 +119,7 @@ def _extract_flat(prediction: dict, ensemble=None) -> dict:
|
|
|
115
119
|
"AG_raw_mean": _ag("alphagenome_raw_score_mean", "raw_score_mean"),
|
|
116
120
|
"AG_quantile_mean":_ag("alphagenome_quantile_score_mean","quantile_score_mean"),
|
|
117
121
|
"AG_Splicing": _ag("alphagenome_splicing", "alphagenome_splicing"),
|
|
122
|
+
"from_cache": _from_cache,
|
|
118
123
|
}
|
|
119
124
|
|
|
120
125
|
|
|
@@ -172,6 +177,17 @@ def render_single(prediction: dict, ensemble=None, fmt="table", errors=None) ->
|
|
|
172
177
|
vt_style = {"coding": "bold magenta", "noncoding": "bold blue", "splice": "bold yellow"}.get(str(vt).lower(), "white")
|
|
173
178
|
t.add_row("Variant Type", Text(str(vt), style=vt_style))
|
|
174
179
|
|
|
180
|
+
# Cache indicator
|
|
181
|
+
_is_cached = (
|
|
182
|
+
(ensemble or {}).get("from_cache") or
|
|
183
|
+
any(
|
|
184
|
+
isinstance(v, dict) and v.get("from_cache")
|
|
185
|
+
for v in ((ensemble or {}).get("ensemble_results") or {}).values()
|
|
186
|
+
)
|
|
187
|
+
)
|
|
188
|
+
if _is_cached:
|
|
189
|
+
t.add_row("Score Source", Text("⚡ Pre-computed cache", style="bold yellow"))
|
|
190
|
+
|
|
175
191
|
# MERGE ensemble
|
|
176
192
|
t.add_section()
|
|
177
193
|
t.add_row("MERGE Pathogenicity", _score_text(flat["MERGE_Score"], flat["MERGE_Label"]))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: merge-cli
|
|
3
|
-
Version: 3.5.0
|
|
4
|
-
Summary: MERGE
|
|
3
|
+
Version: 3.6
|
|
4
|
+
Summary: MERGE variant pathogenicity prediction CLI (fixed server, integrated model embedded, no manual configuration required)
|
|
5
5
|
Project-URL: Homepage, https://merge.fanglab.cn
|
|
6
6
|
Requires-Python: >=3.11
|
|
7
7
|
Requires-Dist: click>=8.1
|
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "merge-cli"
|
|
7
|
-
version = "3.5.0"
|
|
8
|
-
description = "MERGE
|
|
7
|
+
version = "3.6"
|
|
8
|
+
description = "MERGE variant pathogenicity prediction CLI (fixed server, integrated model embedded, no manual configuration required)"
|
|
9
9
|
requires-python = ">=3.11"
|
|
10
10
|
dependencies = [
|
|
11
11
|
# 核心 CLI 框架
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|