merge-cli 3.5.1__tar.gz → 3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {merge_cli-3.5.1 → merge_cli-3.6}/PKG-INFO +2 -2
  2. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/cli.py +204 -44
  3. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/local_engine.py +221 -0
  4. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/output.py +16 -0
  5. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli.egg-info/PKG-INFO +2 -2
  6. {merge_cli-3.5.1 → merge_cli-3.6}/pyproject.toml +2 -2
  7. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/__init__.py +0 -0
  8. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/api.py +0 -0
  9. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/cli_env_patch.py +0 -0
  10. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/config.py +0 -0
  11. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/__init__.py +0 -0
  12. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/models/.gitkeep +0 -0
  13. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/models/BestModel_coding.pkl +0 -0
  14. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/models/BestModel_noncoding.pkl +0 -0
  15. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/models/BestModel_splice.pkl +0 -0
  16. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/models/__init__.py +0 -0
  17. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/data/models/ensemble_predict.py +0 -0
  18. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli/ensemble_predict.py +0 -0
  19. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli.egg-info/SOURCES.txt +0 -0
  20. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli.egg-info/dependency_links.txt +0 -0
  21. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli.egg-info/entry_points.txt +0 -0
  22. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli.egg-info/requires.txt +0 -0
  23. {merge_cli-3.5.1 → merge_cli-3.6}/merge_cli.egg-info/top_level.txt +0 -0
  24. {merge_cli-3.5.1 → merge_cli-3.6}/setup.cfg +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: merge-cli
3
- Version: 3.5.1
4
- Summary: MERGE 变异致病性预测 CLI(服务器固定,集成模型内嵌,无需手动配置)
3
+ Version: 3.6
4
+ Summary: MERGE variant pathogenicity prediction CLI (fixed server, integrated model embedded, no manual configuration required)
5
5
  Project-URL: Homepage, https://merge.fanglab.cn
6
6
  Requires-Python: >=3.11
7
7
  Requires-Dist: click>=8.1
@@ -7,7 +7,8 @@ Changes (v3.5.0):
7
7
  3. Added --genome-ref option to specify reference genome FASTA path
8
8
  4. Added --no-shap option for skipping SHAP analysis
9
9
  """
10
- import os, sys, time
10
+ import os, sys, time, subprocess
11
+ import requests
11
12
  import click
12
13
  from rich.console import Console
13
14
  from rich.table import Table
@@ -251,46 +252,62 @@ def predict(chrom, pos, ref, alt, genome, fmt, no_ensemble,
251
252
  skip_shap = not click.confirm("\nGenerate SHAP analysis plot?", default=True)
252
253
  variant_info = f"{chrom}:{pos} {ref}>{alt}"
253
254
 
254
- # ── Determine variant type ────────────────────────────────────
255
- # Priority: CLI --ensemble-type > prediction["_variant_type"] > remote ANNOVAR API
256
- effective_ensemble_type = ensemble_type
257
- if not effective_ensemble_type:
258
- effective_ensemble_type = prediction.get("_variant_type")
259
- if not effective_ensemble_type:
260
- with console.status("[bold cyan]Detecting variant type...[/bold cyan]"):
255
+ # ── Pre-computed fast path ────────────────────────────────
256
+ # Both remote (/predict/ returns prediction["precomputed"]) and local
257
+ # (predict_local returns prediction["precomputed"]) use the same key.
258
+ precomp = prediction.get("precomputed")
259
+ if precomp:
260
+ model_key = precomp.get("model_used", "MERGE_Precomputed")
261
+ ensemble = {
262
+ "success": True,
263
+ "ensemble_results": {model_key: precomp},
264
+ "raw_features": {},
265
+ "from_cache": True,
266
+ }
267
+ prediction["_variant_type"] = precomp.get("variant_type")
268
+ console.print("[dim]⚡ Pre-computed score retrieved from cache[/dim]")
269
+ # ─────────────────────────────────────────────────────────
270
+ else:
271
+ # ── Determine variant type ────────────────────────────────────
272
+ # Priority: CLI --ensemble-type > prediction["_variant_type"] > remote ANNOVAR API
273
+ effective_ensemble_type = ensemble_type
274
+ if not effective_ensemble_type:
275
+ effective_ensemble_type = prediction.get("_variant_type")
276
+ if not effective_ensemble_type:
277
+ with console.status("[bold cyan]Detecting variant type...[/bold cyan]"):
278
+ try:
279
+ from . import local_engine as _le
280
+ effective_ensemble_type = _le.get_variant_type(chrom, pos, ref, alt, genome)
281
+ except Exception:
282
+ effective_ensemble_type = None
283
+ if effective_ensemble_type:
284
+ prediction["_variant_type"] = effective_ensemble_type
285
+
286
+ with console.status("[bold cyan]Computing MERGE ensemble score...[/bold cyan]"):
261
287
  try:
262
- from . import local_engine as _le
263
- effective_ensemble_type = _le.get_variant_type(chrom, pos, ref, alt, genome)
264
- except Exception:
265
- effective_ensemble_type = None
266
- if effective_ensemble_type:
267
- prediction["_variant_type"] = effective_ensemble_type
268
-
269
- with console.status("[bold cyan]Computing MERGE ensemble score...[/bold cyan]"):
270
- try:
271
- if is_local:
272
- from . import local_engine
273
- dbnsfp = prediction.get("dbnsfp") or {}
274
- transcripts = dbnsfp.get("transcripts") or ([dbnsfp] if dbnsfp else [])
275
- ens_result, ens_err = local_engine.run_local_ensemble(
276
- prediction, transcripts, ensemble_type=effective_ensemble_type,
277
- skip_shap=skip_shap, variant_info=variant_info)
278
- if ens_err:
279
- console.print(f"[yellow]Ensemble model warning: {ens_err}[/yellow]")
280
- ensemble = {"success": True, **ens_result}
281
- else:
282
- dbnsfp = prediction.get("dbnsfp") or {}
283
- transcripts = [dbnsfp] if dbnsfp else []
284
- ens_resp = api.predict_ensemble(prediction, transcripts,
285
- ensemble_type=effective_ensemble_type)
286
- if ens_resp.get("success"):
287
- ensemble = ens_resp
288
- for _v in (ensemble.get("ensemble_results") or {}).values():
289
- if isinstance(_v, dict) and _v.get("variant_type"):
290
- prediction["_variant_type"] = _v["variant_type"]
291
- break
292
- except Exception as e:
293
- console.print(f"[yellow]Ensemble score failed (raw scores unaffected): {e}[/yellow]")
288
+ if is_local:
289
+ from . import local_engine
290
+ dbnsfp = prediction.get("dbnsfp") or {}
291
+ transcripts = dbnsfp.get("transcripts") or ([dbnsfp] if dbnsfp else [])
292
+ ens_result, ens_err = local_engine.run_local_ensemble(
293
+ prediction, transcripts, ensemble_type=effective_ensemble_type,
294
+ skip_shap=skip_shap, variant_info=variant_info)
295
+ if ens_err:
296
+ console.print(f"[yellow]Ensemble model warning: {ens_err}[/yellow]")
297
+ ensemble = {"success": True, **ens_result}
298
+ else:
299
+ dbnsfp = prediction.get("dbnsfp") or {}
300
+ transcripts = [dbnsfp] if dbnsfp else []
301
+ ens_resp = api.predict_ensemble(prediction, transcripts,
302
+ ensemble_type=effective_ensemble_type)
303
+ if ens_resp.get("success"):
304
+ ensemble = ens_resp
305
+ for _v in (ensemble.get("ensemble_results") or {}).values():
306
+ if isinstance(_v, dict) and _v.get("variant_type"):
307
+ prediction["_variant_type"] = _v["variant_type"]
308
+ break
309
+ except Exception as e:
310
+ console.print(f"[yellow]Ensemble score failed (raw scores unaffected): {e}[/yellow]")
294
311
 
295
312
  output.render_single(prediction, ensemble, fmt=fmt,
296
313
  errors=prediction.get("errors"))
@@ -428,15 +445,33 @@ def local_setup():
428
445
  # ── merge local download ──────────────────────────────────────
429
446
  @local.command("download")
430
447
  @click.option("--file", "file_type",
431
- type=click.Choice(["all", "dbnsfp", "gpn-msa", "popeve"]),
448
+ type=click.Choice(["all", "dbnsfp", "gpn-msa", "popeve", "precomputed"]),
432
449
  default="all",
433
- help="Show download instructions for specific file")
434
- def local_download(file_type):
435
- """Show official download URLs for pre-computed data files."""
450
+ help="Show download instructions for specific file (or auto-download precomputed VCF)")
451
+ @click.option("--genome", default="all", type=click.Choice(["all", "hg38", "hg19"]),
452
+ help="Genome version for precomputed VCF download (default: both)")
453
+ @click.option("--variant-type", "variant_type", default="all",
454
+ type=click.Choice(["all", "coding", "splicing"]),
455
+ help="Variant type for precomputed VCF download (default: both)")
456
+ def local_download(file_type, genome, variant_type):
457
+ """Show official download URLs for pre-computed data files.
458
+
459
+ \b
460
+ The precomputed VCF cache files are auto-downloaded from merge.fanglab.cn:
461
+ merge local download --file precomputed # download all 4 files
462
+ merge local download --file precomputed --genome hg38
463
+ merge local download --file precomputed --genome hg38 --variant-type coding
464
+ """
436
465
  cfg = get_local_config()
437
466
  data_dir = cfg["data_dir"]
438
467
  os.makedirs(data_dir, exist_ok=True)
439
468
 
469
+ # ── Auto-download precomputed VCF files ───────────────────────
470
+ if file_type == "precomputed":
471
+ _download_precomputed_vcfs(data_dir, genome, variant_type)
472
+ return
473
+ # ─────────────────────────────────────────────────────────────
474
+
440
475
  INSTRUCTIONS = {
441
476
  "dbnsfp": {
442
477
  "label": "dbNSFP (ESM1b + AlphaMissense)",
@@ -472,6 +507,8 @@ def local_download(file_type):
472
507
  "\n[bold green]✓ Ensemble models (pkl files)[/bold green] are bundled with pip install, "
473
508
  "no manual download needed.\n"
474
509
  " If model files are missing, run: [bold]pip install --force-reinstall merge-cli[/bold]\n"
510
+ "\n[bold yellow]⚡ Pre-computed VCF cache[/bold yellow] can be auto-downloaded:\n"
511
+ " [bold]merge local download --file precomputed[/bold]\n"
475
512
  )
476
513
 
477
514
  show_keys = (list(INSTRUCTIONS.keys()) if file_type == "all"
@@ -487,6 +524,129 @@ def local_download(file_type):
487
524
  console.print()
488
525
 
489
526
 
527
def _build_local_tabix_index(vcf_path: str) -> bool:
    """Index *vcf_path* with the local ``tabix`` binary; return True on success.

    Failures (non-zero exit, timeout, or the binary not being runnable) are
    reported on the console and turned into ``False`` so that one bad file
    does not abort the remaining downloads.
    """
    from rich.markup import escape

    try:
        result = subprocess.run(
            ["tabix", "-p", "vcf", vcf_path],
            capture_output=True, text=True, timeout=300,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        # TimeoutExpired: indexing exceeded the 300 s budget.
        # OSError: binary vanished or is not executable.
        console.print(f" [red]✗ tabix failed: {escape(str(exc))}[/red]")
        return False

    if result.returncode != 0:
        # stderr is external text; escape it so stray "[...]" is not parsed as markup.
        console.print(f" [red]✗ tabix failed: {escape(result.stderr.strip())}[/red]")
        return False

    console.print(" [green]✓ Index built.[/green]")
    return True


def _download_precomputed_vcfs(data_dir: str, genome: str = "all", variant_type: str = "all"):
    """Auto-download pre-computed VCF files + tabix indices from merge.fanglab.cn.

    Args:
        data_dir: Directory the ``.vcf.gz`` / ``.vcf.gz.tbi`` files are written to.
        genome: ``"hg38"``, ``"hg19"`` or ``"all"`` (no genome filter).
        variant_type: ``"coding"``, ``"splicing"`` or ``"all"`` (no type filter).

    For every selected file the VCF is downloaded unless it already exists,
    then the pre-built ``.tbi`` index is downloaded; if that download fails
    the index is built with a local ``tabix`` when one is on PATH.  Progress
    and a final ``ready/total`` summary are printed to the console.  Nothing
    is returned.
    """
    import shutil

    # Files hosted at: https://merge.fanglab.cn/static/precomputed/<filename>
    # Adjust BASE_STATIC_URL if you move the files elsewhere (e.g. OSS / GitHub Releases)
    BASE_STATIC_URL = f"{FIXED_API_URL}/static/precomputed"

    ALL_FILES = {
        ("hg38", "coding"): "coding_merged.vcf.gz",
        ("hg38", "splicing"): "splicing_merged.vcf.gz",
        ("hg19", "coding"): "coding_merged_hg19.vcf.gz",
        ("hg19", "splicing"): "splicing_merged_hg19.vcf.gz",
    }

    # Filter by --genome and --variant-type
    targets = {
        (g, vt): fname for (g, vt), fname in ALL_FILES.items()
        if (genome == "all" or g == genome)
        and (variant_type == "all" or vt == variant_type)
    }

    if not targets:
        console.print("[red]No files match the specified genome/variant-type combination.[/red]")
        return

    has_tabix = bool(shutil.which("tabix"))
    if not has_tabix:
        console.print(
            "[yellow]⚠ tabix not found in PATH. VCF files will be downloaded but NOT indexed.\n"
            " Install htslib (conda install -c bioconda htslib) then run this command again\n"
            " or index manually: tabix -p vcf <file.vcf.gz>[/yellow]\n"
        )

    ok_count = 0
    for (g, vt), fname in targets.items():
        dest = os.path.join(data_dir, fname)
        dest_tbi = dest + ".tbi"
        url = f"{BASE_STATIC_URL}/{fname}"
        url_tbi = f"{BASE_STATIC_URL}/{fname}.tbi"

        console.print(f"\n[bold cyan]── {g} {vt}: {fname} ──[/bold cyan]")

        # ── Download .vcf.gz ──────────────────────────────────────
        if os.path.exists(dest):
            size_mb = os.path.getsize(dest) / 1e6
            console.print(f" [green]✓ Already exists ({size_mb:.0f} MB), skipping.[/green]")
        else:
            console.print(f" Downloading {url} …")
            if not _stream_download(url, dest):
                console.print(" [red]✗ Download failed. Check URL or network.[/red]")
                continue

        # ── Download .tbi (pre-built index) ──────────────────────
        if os.path.exists(dest_tbi):
            console.print(" [green]✓ Index already exists, skipping.[/green]")
        else:
            console.print(f" Downloading index {url_tbi} …")
            if not _stream_download(url_tbi, dest_tbi):
                # Fall back to local tabix
                if not has_tabix:
                    console.print(
                        " [red]✗ No pre-built index and tabix not found.\n"
                        " Install htslib and run: tabix -p vcf " + dest + "[/red]"
                    )
                    continue
                console.print(" Pre-built index not found, building with local tabix…")
                if not _build_local_tabix_index(dest):
                    continue

        console.print(f" [green]✓ Ready: {dest}[/green]")
        ok_count += 1

    console.print(f"\n[bold green]✓ Done: {ok_count}/{len(targets)} files ready in {data_dir}[/bold green]")
    if ok_count > 0:
        console.print(
            " Pre-computed scores will now be used automatically for common variants.\n"
            " Run [bold]merge local status[/bold] to verify."
        )
616
+
617
+
618
def _stream_download(url: str, dest: str, chunk_size: int = 1 << 20) -> bool:
    """Download url → dest with a progress bar. Returns True on success.

    Args:
        url: HTTP(S) URL to fetch.
        dest: Final path of the downloaded file.
        chunk_size: Bytes requested per streamed chunk (default 1 MiB).

    The body is streamed into ``dest + ".part"`` and moved onto ``dest`` only
    once the transfer has completed, so ``dest`` never holds a partial file.
    A 404 returns ``False`` silently (callers use it to probe for optional
    files); any other failure is printed to the console and also returns
    ``False``.
    """
    from rich.markup import escape

    tmp = dest + ".part"
    try:
        # Context manager releases the connection on every exit path,
        # including the early 404 return.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code == 404:
                return False
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0))
            downloaded = 0
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        # Clamp: decoded bytes may exceed Content-Length when
                        # the server applies a content encoding.
                        pct = min(downloaded / total * 100, 100.0)
                        done = int(pct / 5)
                        bar = "█" * done + "░" * (20 - done)
                        mb = downloaded / 1e6
                        print(f"\r [{bar}] {pct:5.1f}% {mb:.0f} MB", end="", flush=True)
            if total:
                print()  # newline after progress bar
        os.replace(tmp, dest)
        return True
    except Exception as exc:
        # Boundary handler: report and signal failure rather than crash the CLI.
        # The message is escaped so "[...]" inside it is not parsed as markup.
        console.print(f"\n [red]Download error: {escape(str(exc))}[/red]")
        return False
    finally:
        # After a successful os.replace the temp file is gone; otherwise remove
        # the partial download — this also runs on Ctrl-C (KeyboardInterrupt).
        if os.path.exists(tmp):
            os.unlink(tmp)
648
+
649
+
490
650
  # ── merge local predict ───────────────────────────────────────
491
651
  @local.command("predict")
492
652
  @click.option("--chrom", required=True)
@@ -1251,11 +1251,190 @@ def run_local_ensemble(pred_data, all_transcripts, ensemble_type=None, skip_shap
1251
1251
  return {chosen_key: result}, None
1252
1252
 
1253
1253
 
1254
+ # ─── Pre-computed VCF Cache ───────────────────────────────────────
1255
+
1256
+ # VCF filenames mirror the server-side PRECOMPUTED_VCFS config in views.py
1257
+ _PRECOMPUTED_VCF_NAMES = {
1258
+ "hg38": {
1259
+ "coding": "coding_merged.vcf.gz",
1260
+ "splicing": "splicing_merged.vcf.gz",
1261
+ },
1262
+ "hg19": {
1263
+ "coding": "coding_merged_hg19.vcf.gz",
1264
+ "splicing": "splicing_merged_hg19.vcf.gz",
1265
+ },
1266
+ }
1267
+
1268
+
1269
def _fetch_precomputed_records(fpath: str, chrom: str, pos_int: int) -> list:
    """Return the raw tabix lines covering 1-based *pos_int* in *fpath*.

    Both contig spellings ("1" and "chr1") are tried, because the cache files
    and the caller may disagree on the "chr" prefix.  The file handle is
    always closed, even when a fetch raises.
    """
    import pysam

    # removeprefix, not lstrip: lstrip("chr") strips any leading 'c'/'h'/'r'
    # characters rather than the literal prefix.
    other_name = chrom.removeprefix("chr") if chrom.startswith("chr") else "chr" + chrom

    tbx = pysam.TabixFile(fpath)
    try:
        for contig in (chrom, other_name):
            try:
                records = list(tbx.fetch(contig, pos_int - 1, pos_int))
            except Exception:
                # Best effort: a contig name absent from the index is expected
                # for one of the two spellings.
                continue
            if records:
                return records
        return []
    finally:
        tbx.close()


def _build_precomputed_hit(info: dict, vtype: str, score_val: float) -> dict:
    """Assemble the cache-hit payload from parsed VCF INFO fields."""
    evo2_llr = _safe_float(info.get("EVO2"))
    evo2_sign = evo2_llr or 0
    if evo2_sign > 0:
        evo2_class = "benign"
    elif evo2_sign < 0:
        evo2_class = "deleterious"
    else:
        evo2_class = "unknown"

    if vtype == "splicing":
        ag_stats = {"alphagenome_splicing": _safe_float(info.get("AG_SPLICE"))}
    else:
        ag_stats = {
            "raw_score_mean": _safe_float(info.get("AG_RAW_MEAN")),
            "raw_score_max": _safe_float(info.get("AG_RAW_MAX")),
            "raw_score_min": _safe_float(info.get("AG_RAW_MIN")),
            "quantile_score_mean": _safe_float(info.get("AG_Q_MEAN")),
            "quantile_score_max": _safe_float(info.get("AG_Q_MAX")),
            "quantile_score_min": _safe_float(info.get("AG_Q_MIN")),
        }

    sub_models = {
        "alphagenome": {"statistics": ag_stats},
        "hyenadna": {"score": _safe_float(info.get("HYENA"))},
        "nt": {"score": _safe_float(info.get("NT"))},
        "gpn_msa": {"score": _safe_float(info.get("GPN"))},
        "popeve": {"score": _safe_float(info.get("POPEVE"))},
        "evo2": {
            "llr_score": evo2_llr,
            "ref_score": None,
            "var_score": None,
            "context_length": 8192,
            "interpretation": "Precomputed cache",
            "score_class": evo2_class,
        },
        "AlphaMissense": {"score": _safe_float(info.get("AM"))},
        "ESM1b": {"score": _safe_float(info.get("ESM1B"))},
    }

    is_splice = vtype == "splicing"
    return {
        "ensemble": {
            "score": score_val,
            "interpretation": {
                # INTERP encodes spaces as underscores.
                "label": info.get("INTERP", "Precomputed").replace("_", " "),
                "badge": "⚡ Cache",
                "color": "#ff6b6b" if score_val > 0.5 else "#6bcf7f",
            },
            "variant_type": "splice" if is_splice else "coding",
            "model_used": "Splice_ClinVar_GnomAD" if is_splice else "ClinVar",
            "from_cache": True,
            "features_raw_aligned": {},
            "imputed_features": {},
            "shap_plot": None,
        },
        "sub_models": sub_models,
    }


def query_precomputed_local(chrom: str, pos, ref: str, alt: str,
                            genome: str = "hg38") -> Optional[dict]:
    """
    Query the local pre-computed VCF cache files (same format as the server).
    Splicing is checked first to prevent coding-model mis-classification.

    Args:
        chrom: Contig name, with or without the "chr" prefix.
        pos: 1-based position (anything ``int()`` accepts).
        ref: Reference allele; must equal the VCF REF column exactly.
        alt: Alternate allele; must equal the VCF ALT column exactly.
        genome: Key into ``_PRECOMPUTED_VCF_NAMES`` ("hg38" or "hg19").

    Returns:
        ``{"ensemble": {...}, "sub_models": {...}}`` on a hit (intended to
        match the server-side ``views.query_precomputed_vcf`` shape), or
        ``None`` on miss / unavailable (pysam missing, cache file absent,
        unreadable index, unparseable position or score).

    INFO fields expected (set by the pre-computation pipeline):
      ENSEMBLE – MERGE score (0-1)
      INTERP – interpretation label (spaces encoded as underscores)
      HYENA – HyenaDNA score
      NT – Nucleotide Transformer score
      GPN – GPN-MSA score
      POPEVE – popEVE score
      EVO2 – Evo2 LLR score
      AM – AlphaMissense score
      ESM1B – ESM1b score
      AG_SPLICE – AlphaGenome splicing composite (splice variants)
      AG_RAW_MEAN / MAX / MIN – AlphaGenome raw scores (coding)
      AG_Q_MEAN / MAX / MIN – AlphaGenome quantile scores (coding)
    """
    try:
        import pysam  # noqa: F401 — availability probe only
    except ImportError:
        return None  # pysam unavailable; remote fallback will be used

    try:
        pos_int = int(pos)
    except (TypeError, ValueError):
        logger.debug("[precomputed_local] invalid position: %r", pos)
        return None
    chrom = str(chrom)

    data_dir = _cfg()["data_dir"]
    names = _PRECOMPUTED_VCF_NAMES.get(genome, {})

    for vtype in ("splicing", "coding"):  # splicing first — mirrors server logic
        fname = names.get(vtype)
        if not fname:
            continue
        fpath = os.path.join(data_dir, fname)
        if not os.path.exists(fpath):
            continue

        try:
            for record in _fetch_precomputed_records(fpath, chrom, pos_int):
                parts = record.split("\t")
                if len(parts) < 8 or parts[3] != ref or parts[4] != alt:
                    continue

                info = dict(
                    item.split("=", 1)
                    for item in parts[7].split(";")
                    if "=" in item
                )
                if "ENSEMBLE" not in info:
                    continue

                score_val = _safe_float(info["ENSEMBLE"])
                if score_val is None:
                    # Treat an unparseable score as a miss instead of reporting
                    # 0.0, which would be rendered as a confident benign call.
                    # NOTE(review): assumes _safe_float returns None on failure.
                    continue

                return _build_precomputed_hit(info, vtype, score_val)

        except Exception as exc:
            # Best effort: a broken cache file must never block live prediction.
            logger.debug("[precomputed_local] %s/%s: %s", vtype, genome, exc)
            continue

    return None  # cache miss
1399
+
1400
+
1254
1401
  # ─── Top-level Prediction Entry ───────────────────────────────────
1255
1402
 
1256
1403
  def predict_local(chrom, pos, ref, alt, genome="hg38", ensemble_type=None, **flags) -> dict:
1257
1404
  vtype = ensemble_type or get_variant_type(chrom, pos, ref, alt, genome)
1258
1405
 
1406
+ # ── Pre-computed cache fast path ──────────────────────────────
1407
+ cached = query_precomputed_local(chrom, pos, ref, alt, genome)
1408
+ if cached:
1409
+ logger.debug(f"[precomputed_local] cache hit: {chrom}:{pos} {ref}>{alt}")
1410
+ ens = cached["ensemble"]
1411
+ subs = cached["sub_models"]
1412
+ return {
1413
+ "success": True,
1414
+ "from_cache": True,
1415
+ "prediction": {
1416
+ "input": {"chrom": chrom, "pos": pos, "ref": ref, "alt": alt},
1417
+ "genome_version": genome,
1418
+ "_variant_type": ens.get("variant_type", vtype),
1419
+ "precomputed": ens,
1420
+ "dbnsfp": {
1421
+ "annotations": {},
1422
+ "dl_models": {
1423
+ "AlphaMissense": subs.get("AlphaMissense", {}),
1424
+ "ESM1b": subs.get("ESM1b", {}),
1425
+ },
1426
+ },
1427
+ "alphagenome": subs.get("alphagenome"),
1428
+ "hyenadna": subs.get("hyenadna"),
1429
+ "nt": subs.get("nt"),
1430
+ "gpn_msa": subs.get("gpn_msa"),
1431
+ "popeve": subs.get("popeve"),
1432
+ "evo2": subs.get("evo2"),
1433
+ "errors": {},
1434
+ },
1435
+ }
1436
+ # ─────────────────────────────────────────────────────────────
1437
+
1259
1438
  tasks = {
1260
1439
  "dbnsfp": lambda: query_dbnsfp_local(chrom, pos, ref, alt, genome),
1261
1440
  "hyenadna": lambda: call_local_service("hyenadna", chrom, pos, ref, alt, genome),
@@ -1317,6 +1496,42 @@ def predict_local_batch(vcf_path: str, genome: str = "hg38",
1317
1496
  _log(f" [{i}/{len(variants)}] {chrom}:{pos} {ref}>{alt}…")
1318
1497
  try:
1319
1498
  vtype = get_variant_type(chrom, pos, ref, alt, genome, ensemble_type)
1499
+
1500
+ # ── Pre-computed cache fast path ──────────────────────
1501
+ cached = query_precomputed_local(chrom, pos, ref, alt, genome)
1502
+ if cached:
1503
+ _log(f" ⚡ Pre-computed cache hit")
1504
+ ens = cached["ensemble"]
1505
+ subs = cached["sub_models"]
1506
+ ag_stats = (subs.get("alphagenome") or {}).get("statistics", {})
1507
+ row = {
1508
+ "chrom": chrom, "pos": pos, "ref": ref, "alt": alt,
1509
+ "genome": genome,
1510
+ "variant_type": ens.get("variant_type", vtype),
1511
+ "gene": "-",
1512
+ "transcript": "-",
1513
+ "merge_ensemble_score": ens.get("score"),
1514
+ "merge_label": (ens.get("interpretation") or {}).get("label"),
1515
+ "from_cache": True,
1516
+ "alphamissense_score": (subs.get("AlphaMissense") or {}).get("score"),
1517
+ "esm1b_score": (subs.get("ESM1b") or {}).get("score"),
1518
+ "gpn_msa_score": (subs.get("gpn_msa") or {}).get("score"),
1519
+ "popeve_score": (subs.get("popeve") or {}).get("score"),
1520
+ "hyenadna_score": (subs.get("hyenadna") or {}).get("score"),
1521
+ "nt_score": (subs.get("nt") or {}).get("score"),
1522
+ "evo2_score": (subs.get("evo2") or {}).get("llr_score"),
1523
+ "alphagenome_raw_max": ag_stats.get("raw_score_max"),
1524
+ "alphagenome_raw_min": ag_stats.get("raw_score_min"),
1525
+ "alphagenome_raw_mean": ag_stats.get("raw_score_mean"),
1526
+ "alphagenome_quantile_max": ag_stats.get("quantile_score_max"),
1527
+ "alphagenome_quantile_min": ag_stats.get("quantile_score_min"),
1528
+ "alphagenome_quantile_mean": ag_stats.get("quantile_score_mean"),
1529
+ "alphagenome_splicing": ag_stats.get("alphagenome_splicing"),
1530
+ "errors": "",
1531
+ }
1532
+ results_rows.append(row)
1533
+ continue
1534
+ # ─────────────────────────────────────────────────────
1320
1535
  resp = predict_local(chrom, pos, ref, alt, genome,
1321
1536
  local_genome_path=None, ensemble_type=vtype, **model_flags)
1322
1537
  pred = resp.get("prediction", {})
@@ -1399,6 +1614,12 @@ def check_local_files() -> dict:
1399
1614
  "dbNSFP (hg19)": os.path.join(data_dir, "dbNSFP5.3a_grch37.gz"),
1400
1615
  "GPN-MSA": os.path.join(data_dir, "scores.tsv.bgz"),
1401
1616
  "popEVE": os.path.join(data_dir, "grch38_popEVE_ukbb_20250715.vcf.gz"),
1617
+ # ── Pre-computed VCF cache ──────────────────────────────────
1618
+ "Pre-computed coding (hg38)": os.path.join(data_dir, "coding_merged.vcf.gz"),
1619
+ "Pre-computed splicing (hg38)": os.path.join(data_dir, "splicing_merged.vcf.gz"),
1620
+ "Pre-computed coding (hg19)": os.path.join(data_dir, "coding_merged_hg19.vcf.gz"),
1621
+ "Pre-computed splicing (hg19)": os.path.join(data_dir, "splicing_merged_hg19.vcf.gz"),
1622
+ # ───────────────────────────────────────────────────────────
1402
1623
  f"Evo2 local weights ({evo2_model_name}, optional)": evo2_weight_path,
1403
1624
  }
1404
1625
  result = {}
@@ -62,11 +62,13 @@ def _extract_flat(prediction: dict, ensemble=None) -> dict:
62
62
 
63
63
  ens_score = ens_label = ens_model = ens_variant_type = None
64
64
  imputed_feats = {}
65
+ _from_cache = False
65
66
 
66
67
  if ensemble:
67
68
  ens_results = ensemble.get("ensemble_results") or {
68
69
  k: v for k, v in ensemble.items() if k not in ("success", "error")
69
70
  }
71
+ _from_cache = bool(ensemble.get("from_cache"))
70
72
  for _key, _val in ens_results.items():
71
73
  if isinstance(_val, dict):
72
74
  ens_score = _val.get("score")
@@ -74,6 +76,8 @@ def _extract_flat(prediction: dict, ensemble=None) -> dict:
74
76
  ens_model = _key
75
77
  ens_variant_type = _val.get("variant_type")
76
78
  imputed_feats = _val.get("imputed_features", {})
79
+ if _val.get("from_cache"):
80
+ _from_cache = True
77
81
  break
78
82
 
79
83
  def _get_val(raw_val, imputed_key):
@@ -115,6 +119,7 @@ def _extract_flat(prediction: dict, ensemble=None) -> dict:
115
119
  "AG_raw_mean": _ag("alphagenome_raw_score_mean", "raw_score_mean"),
116
120
  "AG_quantile_mean":_ag("alphagenome_quantile_score_mean","quantile_score_mean"),
117
121
  "AG_Splicing": _ag("alphagenome_splicing", "alphagenome_splicing"),
122
+ "from_cache": _from_cache,
118
123
  }
119
124
 
120
125
 
@@ -172,6 +177,17 @@ def render_single(prediction: dict, ensemble=None, fmt="table", errors=None) ->
172
177
  vt_style = {"coding": "bold magenta", "noncoding": "bold blue", "splice": "bold yellow"}.get(str(vt).lower(), "white")
173
178
  t.add_row("Variant Type", Text(str(vt), style=vt_style))
174
179
 
180
+ # Cache indicator
181
+ _is_cached = (
182
+ (ensemble or {}).get("from_cache") or
183
+ any(
184
+ isinstance(v, dict) and v.get("from_cache")
185
+ for v in ((ensemble or {}).get("ensemble_results") or {}).values()
186
+ )
187
+ )
188
+ if _is_cached:
189
+ t.add_row("Score Source", Text("⚡ Pre-computed cache", style="bold yellow"))
190
+
175
191
  # MERGE ensemble
176
192
  t.add_section()
177
193
  t.add_row("MERGE Pathogenicity", _score_text(flat["MERGE_Score"], flat["MERGE_Label"]))
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: merge-cli
3
- Version: 3.5.1
4
- Summary: MERGE 变异致病性预测 CLI(服务器固定,集成模型内嵌,无需手动配置)
3
+ Version: 3.6
4
+ Summary: MERGE variant pathogenicity prediction CLI (fixed server, integrated model embedded, no manual configuration required)
5
5
  Project-URL: Homepage, https://merge.fanglab.cn
6
6
  Requires-Python: >=3.11
7
7
  Requires-Dist: click>=8.1
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "merge-cli"
7
- version = "3.5.1"
8
- description = "MERGE 变异致病性预测 CLI(服务器固定,集成模型内嵌,无需手动配置)"
7
+ version = "3.6"
8
+ description = "MERGE variant pathogenicity prediction CLI (fixed server, integrated model embedded, no manual configuration required)"
9
9
  requires-python = ">=3.11"
10
10
  dependencies = [
11
11
  # 核心 CLI 框架
File without changes
File without changes
File without changes
File without changes