allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
allelix/cli.py ADDED
@@ -0,0 +1,1524 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Allelix command-line interface."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import sys
9
+ import time
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, NamedTuple
12
+
13
+ import click
14
+ from rich.console import Console
15
+ from rich.table import Table
16
+
17
+ from allelix import __version__
18
+ from allelix.annotators import get_annotators
19
+ from allelix.databases import resolve_data_dir
20
+ from allelix.parsers import ParserNotFoundError, detect_parser, get_parser_by_name
21
+ from allelix.reports._pipeline import run_analysis
22
+ from allelix.reports.diff import compute_diff, load_previous_report
23
+ from allelix.reports.high_value import format_warnings, load_high_value_snps, scan_no_calls
24
+ from allelix.reports.html import render_html
25
+ from allelix.reports.json_report import render_json
26
+ from allelix.reports.methylation import METHYLATION_PANEL_GENES
27
+ from allelix.reports.terminal import render_terminal, render_terminal_diff
28
+
29
+ if TYPE_CHECKING:
30
+ from allelix.annotators.base import Annotator
31
+ from allelix.models import Variant
32
+ from allelix.parsers.base import GenotypeParser
33
+
34
# Shared Rich console; the commands and helpers in this module print through it.
console = Console()
35
+
36
# Sort 1-22 numerically, then X, Y, MT, then anything else alphabetically.
_NAMED_CHROM_ORDER = {"X": 0, "Y": 1, "MT": 2}


def _chrom_sort_key(chrom: str) -> tuple[int, int, str]:
    """Return a sort key that orders chromosome labels for display.

    Purely numeric labels come first in numeric order, then the named
    chromosomes in ``_NAMED_CHROM_ORDER`` (X, Y, MT), then every other label
    in plain string order.

    Args:
        chrom: Chromosome label as reported by the parser (e.g. ``"7"``, ``"X"``).

    Returns:
        A ``(group, numeric_rank, label)`` tuple suitable for ``sorted(key=...)``.
    """
    # ``isdecimal`` (not ``isdigit``): ``isdigit`` is also true for characters
    # such as "²" that ``int()`` cannot parse, which would raise ValueError
    # in the middle of a sort. Every ``isdecimal`` string is int()-parseable.
    if chrom.isdecimal():
        return (0, int(chrom), "")
    named_rank = _NAMED_CHROM_ORDER.get(chrom)
    if named_rank is not None:
        return (1, named_rank, "")
    return (2, 0, chrom)
47
+
48
+
49
def _percent(part: int, total: int) -> str:
    """Format ``part`` as a percentage of ``total`` with two decimal places.

    A ``total`` of zero yields ``"0.00%"`` instead of dividing by zero.
    """
    if total != 0:
        share = part / total * 100
        return f"{share:.2f}%"
    return "0.00%"
53
+
54
+
55
class _WarningCounter(logging.Handler):
    """Logging handler that tallies records instead of writing them anywhere.

    The handler's threshold is ``logging.WARNING``; ``count`` is the number of
    records that have been passed to :meth:`emit`.
    """

    def __init__(self) -> None:
        logging.Handler.__init__(self, level=logging.WARNING)
        # Running tally of emitted records.
        self.count = 0

    def emit(self, record: logging.LogRecord) -> None:
        """Bump the tally; the record's contents are not inspected."""
        self.count = self.count + 1
64
+
65
+
66
class _LoggerSnapshot(NamedTuple):
    """Logger settings saved before the CLI changes them, so they can be put back."""

    # Numeric level the logger had (as read from ``logger.level``).
    level: int
    # Whether the logger was propagating records to its ancestors.
    propagate: bool
71
+
72
+
73
def _wire_parser_logging() -> tuple[_WarningCounter, logging.Handler, _LoggerSnapshot]:
    """Install warning counting and a stderr echo on the ``allelix.parsers`` logger.

    Returns:
        The counting handler, the stderr handler, and a snapshot of the logger's
        previous ``level`` / ``propagate`` values. Pass all three to
        ``_unwire_parser_logging`` to undo the changes.
    """
    target = logging.getLogger("allelix.parsers")
    # Capture the pre-existing state before anything is mutated.
    previous = _LoggerSnapshot(level=target.level, propagate=target.propagate)

    tally = _WarningCounter()
    echo = logging.StreamHandler(sys.stderr)
    echo.setLevel(logging.WARNING)
    echo.setFormatter(logging.Formatter("warning: %(message)s"))

    for handler in (tally, echo):
        target.addHandler(handler)
    target.setLevel(logging.WARNING)
    # Stop records from also travelling to ancestor loggers while wired.
    target.propagate = False
    return tally, echo, previous
86
+
87
+
88
def _unwire_parser_logging(
    counter: _WarningCounter,
    stderr_handler: logging.Handler,
    snapshot: _LoggerSnapshot,
) -> None:
    """Detach both handlers from ``allelix.parsers`` and restore its saved settings.

    Args:
        counter: Counting handler to remove.
        stderr_handler: Stderr handler to remove.
        snapshot: ``level`` / ``propagate`` values to put back on the logger.
    """
    target = logging.getLogger("allelix.parsers")
    for handler in (counter, stderr_handler):
        target.removeHandler(handler)
    target.propagate = snapshot.propagate
    target.setLevel(snapshot.level)
98
+
99
+
100
def _resolve_parser(file_path: Path, fmt: str | None) -> GenotypeParser:
    """Choose the parser for *file_path*.

    A truthy *fmt* selects the parser by name; otherwise the parser is detected
    from the file.

    Raises:
        click.ClickException: Carrying the message of the underlying
            ``ParserNotFoundError``.
    """
    try:
        if fmt:
            return get_parser_by_name(fmt)
        return detect_parser(file_path)
    except ParserNotFoundError as exc:
        raise click.ClickException(str(exc)) from exc
105
+
106
+
107
def _ready_annotators(
    data_dir: Path | None,
    *,
    include_benign: bool = False,
    gwas_filter_traits: bool = True,
    cadd_full: bool = False,
) -> tuple[Path, list[Annotator], list[Annotator]]:
    """Build the annotators for *data_dir* and partition them by readiness.

    Returns:
        ``(resolved_data_dir, ready, not_ready)``, each list in registration order.

    Raises:
        click.ClickException: When no annotator reports ready.
    """
    resolved = resolve_data_dir(data_dir)
    registered = get_annotators(
        resolved,
        include_benign=include_benign,
        gwas_filter_traits=gwas_filter_traits,
        cadd_full=cadd_full,
    )

    ready: list[Annotator] = []
    not_ready: list[Annotator] = []
    for annotator in registered:
        bucket = ready if annotator.is_ready() else not_ready
        bucket.append(annotator)

    if ready:
        return resolved, ready, not_ready

    names = ", ".join(a.name for a in registered)
    raise click.ClickException(
        f"No annotators are ready. Run `allelix db update` first. Registered: {names}"
    )
134
+
135
+
136
_STALENESS_SECONDS = 7 * 24 * 60 * 60  # 7 days


def _maybe_refresh_databases(data_dir: Path) -> None:
    """Refresh downloaded databases that are stale and whose remote signal changed.

    For each annotator that requires a download, is ready, and uses
    server-driven freshness, the newest mtime among its ``*sqlite*`` files in
    *data_dir* is compared with ``_STALENESS_SECONDS``. Stale databases are
    re-set-up only when the freshly fetched remote signal differs from the
    cached one. When no remote signal can be fetched, a notice is printed and
    the existing cache is kept.
    """
    started = time.time()
    for annotator in get_annotators(data_dir):
        with annotator:
            if not (annotator.requires_download and annotator.is_ready()):
                continue
            # Code-driven sources (commit-pinned HF caches) never change
            # at a fixed URL — skip the HEAD request. See ADR-0030.
            if not annotator.server_driven_freshness:
                continue

            candidates = list(data_dir.glob(f"{annotator.name}*sqlite*"))
            if not candidates:
                continue
            age = started - max(path.stat().st_mtime for path in candidates)
            if age <= _STALENESS_SECONDS:
                continue

            remote = annotator.fetch_remote_signal()
            if remote is None:
                # No signal available: report the age and keep the current cache.
                age_days = int(age / 86400)
                console.print(
                    f"[yellow]{annotator.display_name} database is {age_days} days old. "
                    "Run `allelix db update` when online.[/yellow]"
                )
                continue

            if annotator.cached_remote_signal() == remote:
                continue

            console.print(f"[bold]Updating {annotator.display_name}…[/bold]")
            if _run_setup(annotator):
                console.print(
                    f"[green]✓ {annotator.display_name} updated[/green] "
                    f"(version {annotator.version() or '(unknown)'})"
                )
182
+
183
+
184
def _format_from_path(output: Path, override: str | None) -> str:
    """Decide the report format for *output*.

    A truthy *override* wins and is returned lower-cased. Otherwise the format
    is inferred from the file suffix (``.html`` or ``.json``, case-insensitive).

    Raises:
        click.ClickException: When there is no override and the suffix is
            neither ``.html`` nor ``.json``.
    """
    if override:
        return override.lower()

    by_suffix = {".html": "html", ".json": "json"}
    inferred = by_suffix.get(output.suffix.lower())
    if inferred is None:
        raise click.ClickException(
            f"Cannot infer report format from {output.name!r}. "
            "Pass --report-format html|json explicitly."
        )
    return inferred
196
+
197
+
198
def _split_off_annotator(
    annotators: list[Annotator],
    annotator_cls: type | None,
    name: str,
) -> tuple[Annotator | None, list[Annotator]]:
    """Pull one enrichment annotator out of *annotators*.

    Returns the first instance of *annotator_cls* (``None`` when *annotator_cls*
    is ``None`` or nothing matches) together with a new list that omits every
    annotator whose ``name`` equals *name*.
    """
    picked = None
    if annotator_cls is not None:
        picked = next((a for a in annotators if isinstance(a, annotator_cls)), None)
    return picked, [a for a in annotators if a.name != name]


def _source_floor_overrides(
    gwas_min_magnitude: float | None,
    snpedia_min_magnitude: float | None,
) -> dict[str, float] | None:
    """Collect per-source magnitude floors, or ``None`` when neither is given."""
    floors: dict[str, float] = {}
    if gwas_min_magnitude is not None:
        floors["gwas"] = gwas_min_magnitude
    if snpedia_min_magnitude is not None:
        floors["snpedia"] = snpedia_min_magnitude
    return floors or None


def _run_analysis_command(
    file_path: Path,
    fmt: str | None,
    data_dir: Path | None,
    output: Path | None,
    report_format: str | None,
    min_magnitude: float,
    category: str | None,
    genes: frozenset[str] | None,
    build: str | None = None,
    include_benign: bool = False,
    gwas_min_magnitude: float | None = None,
    snpedia_min_magnitude: float | None = None,
    exclude_sources: frozenset[str] | None = None,
    gwas_all: bool = False,
    diff_path: Path | None = None,
    no_update: bool = False,
    no_gnomad: bool = False,
    no_alphamissense: bool = False,
) -> None:
    """Run the annotate-and-report pipeline shared by the analysis commands.

    Steps, in order: optional database freshness check, parser resolution,
    annotator selection, the analysis run (with parser warnings counted and
    echoed to stderr), build diagnostics, the high-value no-call scan, an
    optional diff against a previous JSON report, and finally rendering to the
    terminal or to an HTML/JSON file.

    Args:
        file_path: Genotype file to analyse.
        fmt: Parser name to force; falsy means auto-detect.
        data_dir: Database cache location override.
        output: Report file to write; ``None`` renders to the terminal.
        report_format: Explicit report format, otherwise inferred from *output*.
        min_magnitude: Magnitude filter passed to the renderers.
        category: Category filter passed to the renderers.
        genes: Gene filter passed to the renderers.
        build: Build override passed to the analysis; ``None`` lets it decide.
        include_benign: Forwarded to annotator construction.
        gwas_min_magnitude: Optional magnitude floor for the ``gwas`` source.
        snpedia_min_magnitude: Optional magnitude floor for the ``snpedia`` source.
        exclude_sources: Annotator names to drop before the run.
        gwas_all: When true, GWAS trait filtering is turned off.
        diff_path: Previous JSON report to diff against.
        no_update: Skip the database freshness check.
        no_gnomad: Do not use the gnomAD annotator for enrichment.
        no_alphamissense: Do not use the AlphaMissense annotator for enrichment.

    Raises:
        click.ClickException: When no parser or ready annotator is available,
            the previous report cannot be loaded, or the report format cannot
            be inferred.
    """
    resolved = resolve_data_dir(data_dir)
    if not no_update:
        _maybe_refresh_databases(resolved)
    parser = _resolve_parser(file_path, fmt)

    from allelix.config import load_config

    cfg = load_config(resolved)
    _, ready, not_ready = _ready_annotators(
        data_dir,
        include_benign=include_benign,
        gwas_filter_traits=not gwas_all,
        cadd_full=cfg.cadd_full,
    )
    annotator_classes = {type(a).name: type(a) for a in ready}
    ready = [a for a in ready if cfg.is_enabled(a.name, annotator_classes)]

    if exclude_sources:
        ready = [a for a in ready if a.name not in exclude_sources]

    # gnomAD, AlphaMissense and CADD are handed to run_analysis separately
    # rather than in the main annotator list, so each is removed from ``ready``
    # whether or not it ends up being used.
    gnomad_cls = None
    if not no_gnomad:
        from allelix.annotators.gnomad import GnomadAnnotator

        gnomad_cls = GnomadAnnotator
    gnomad_annotator, ready = _split_off_annotator(ready, gnomad_cls, "gnomad")

    am_cls = None
    if not no_alphamissense:
        from allelix.annotators.alphamissense import AlphaMissenseAnnotator

        am_cls = AlphaMissenseAnnotator
    am_annotator, ready = _split_off_annotator(ready, am_cls, "alphamissense")

    from allelix.annotators.cadd import CaddAnnotator

    cadd_annotator, ready = _split_off_annotator(ready, CaddAnnotator, "cadd")

    if not_ready:
        names = [a.name for a in not_ready]
        console.print(
            f"[yellow]Skipping unready annotators: {', '.join(names)}[/yellow] "
            "(run `allelix db update` to populate)"
        )

    all_active: list[Annotator] = list(ready)
    all_active.extend(
        extra
        for extra in (gnomad_annotator, am_annotator, cadd_annotator)
        if extra is not None and extra.is_ready()
    )
    versions = ", ".join(f"{a.display_name} ({a.version() or 'unknown'})" for a in all_active)
    console.print(f"[dim]Analyzing against: {versions}[/dim]")

    counter, stderr_handler, snapshot = _wire_parser_logging()
    try:
        result = run_analysis(
            file_path,
            parser,
            ready,
            skipped_count_provider=lambda: counter.count,
            build_override=build,
            gnomad=gnomad_annotator,
            alphamissense=am_annotator,
            cadd=cadd_annotator,
        )
    finally:
        _unwire_parser_logging(counter, stderr_handler, snapshot)

    _emit_build_diagnostics(result)

    # Second pass over the file, only to pick out the high-value rsIDs.
    high_value = load_high_value_snps()
    hv_rsids = set(high_value)
    hv_variants: list[Variant] = [v for v in parser.parse(file_path) if v.rsid in hv_rsids]
    hv_warnings = scan_no_calls(hv_variants, high_value)
    if hv_warnings:
        console.print(
            f"[bold red]Warning:[/bold red] {len(hv_warnings)} high-value SNP(s) returned no-call:"
        )
        for line in format_warnings(hv_warnings):
            console.print(f"  [red]⚠[/red] {line}")

    if counter.count:
        console.print(
            f"[yellow]Note:[/yellow] {counter.count:,} malformed line(s) skipped "
            "(see warnings on stderr)."
        )

    source_floors = _source_floor_overrides(gwas_min_magnitude, snpedia_min_magnitude)
    # The same filter set feeds the diff input and every renderer.
    filters = {
        "min_magnitude": min_magnitude,
        "category": category,
        "genes": genes,
        "source_min_magnitudes": source_floors,
    }

    diff_result = None
    if diff_path is not None:
        try:
            prev = load_previous_report(diff_path)
        except ValueError as exc:
            raise click.ClickException(str(exc)) from exc
        filtered_for_diff = result.filter(**filters)
        from allelix.reports._pipeline import rollup_gwas_duplicates

        filtered_for_diff = rollup_gwas_duplicates(filtered_for_diff)
        diff_result = compute_diff(
            filtered_for_diff,
            prev["annotations"],
            prev.get("generated_at", ""),
        )

    if output is None:
        if diff_result is not None:
            render_terminal_diff(diff_result, console)
        else:
            render_terminal(result, console=console, **filters)
    else:
        chosen = _format_from_path(output, report_format)
        hv_warning_lines = format_warnings(hv_warnings) if hv_warnings else None
        if chosen == "json":
            hv_dicts = (
                [{"rsid": w.snp.rsid, "gene": w.snp.gene, "note": w.snp.note} for w in hv_warnings]
                if hv_warnings
                else None
            )
            rendered = render_json(
                result,
                output_path=output,
                diff=diff_result,
                high_value_no_calls=hv_dicts,
                **filters,
            )
        else:
            rendered = render_html(
                result,
                output_path=output,
                diff=diff_result,
                high_value_no_calls=hv_warning_lines,
                **filters,
            )
        console.print(f"[green]Wrote {rendered:,} annotation(s) to {output}[/green]")

    console.print(
        f"[dim]{len(result.annotations):,} total annotation(s) from {len(ready)} "
        f"database(s) across {result.total_variants:,} variant(s).[/dim]"
    )
395
+
396
+
397
# Root command group: the subcommands in this module attach via ``@main.command()``.
# NOTE: click shows the docstring below as the group's ``--help`` text, so
# treat it as user-facing output rather than an internal comment.
@click.group()
@click.version_option(version=__version__, prog_name="allelix")
def main() -> None:
    """Allelix: open-source genotype analysis toolkit."""
401
+
402
+
403
@main.command()
@click.argument(
    "file_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option(
    "--format",
    "fmt",
    default=None,
    help="Force a specific parser (e.g., myhappygenes). Default: auto-detect.",
)
def stats(file_path: Path, fmt: str | None) -> None:
    """Show summary statistics for a genotype file."""
    # The docstring above is the command's --help text; keep it short.
    #
    # Streams the file once, counting totals, no-calls, heterozygous and
    # homozygous calls and per-chromosome variants, then prints a summary
    # table, any high-value no-call warnings, and a per-chromosome table.
    parser = _resolve_parser(file_path, fmt)

    high_value = load_high_value_snps()
    hv_rsids = set(high_value)
    hv_variants: list[Variant] = []

    total = 0
    no_calls = 0
    het = 0
    hom = 0
    chrom_counts: dict[str, int] = {}

    # Wire logging immediately before the ``try`` so that nothing can raise
    # between mutating the parsers logger and the ``finally`` that restores it.
    counter, stderr_handler, snapshot = _wire_parser_logging()
    try:
        metadata = parser.get_metadata(file_path)
        for variant in parser.parse(file_path):
            total += 1
            if variant.rsid in hv_rsids:
                hv_variants.append(variant)
            if variant.is_no_call:
                no_calls += 1
            elif variant.is_heterozygous:
                het += 1
            else:
                hom += 1
            chrom_counts[variant.chromosome] = chrom_counts.get(variant.chromosome, 0) + 1
    finally:
        _unwire_parser_logging(counter, stderr_handler, snapshot)

    summary = Table(title=f"Genotype File Stats: {file_path.name}")
    summary.add_column("Metric", style="cyan", no_wrap=True)
    summary.add_column("Value", style="green")
    summary.add_row("Format", parser.display_name)
    summary.add_row("Sample ID", metadata["sample_id"] or "(unknown)")
    summary.add_row("Build", metadata["build"])
    summary.add_row("Total SNPs", f"{total:,}")
    summary.add_row("No-calls", f"{no_calls:,} ({_percent(no_calls, total)})")
    summary.add_row("Heterozygous", f"{het:,} ({_percent(het, total)})")
    summary.add_row("Homozygous", f"{hom:,} ({_percent(hom, total)})")
    if counter.count:
        # The counter keeps its tally after it has been detached from the logger.
        summary.add_row(
            "Skipped (malformed)",
            f"[yellow]{counter.count:,}[/yellow] (see warnings on stderr)",
        )

    hv_warnings = scan_no_calls(hv_variants, high_value)
    if hv_warnings:
        summary.add_row(
            "High-value no-calls",
            f"[red]{len(hv_warnings)}[/red]",
        )
    console.print(summary)

    if hv_warnings:
        for line in format_warnings(hv_warnings):
            console.print(f"  [red]⚠[/red] {line}")

    chrom_table = Table(title="Variants per Chromosome")
    chrom_table.add_column("Chromosome", style="cyan", no_wrap=True)
    chrom_table.add_column("Count", style="green", justify="right")
    for chrom in sorted(chrom_counts, key=_chrom_sort_key):
        chrom_table.add_row(chrom, f"{chrom_counts[chrom]:,}")
    console.print(chrom_table)
478
+
479
+
480
# Reusable click argument/option decorators. Each object is applied as a
# decorator on the command functions below (for example ``analyze``), so
# commands that share a flag also share its default and help text.

# --- Input file and parser selection ---
_FILE_ARG = click.argument(
    "file_path", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
_FORMAT_OPT = click.option(
    "--format", "fmt", default=None, help="Force a specific parser. Default: auto-detect."
)
_DATA_DIR_OPT = click.option(
    "--data-dir",
    type=click.Path(file_okay=False, path_type=Path),
    default=None,
    help="Override database cache location.",
)

# --- Filtering and report output ---
_MIN_MAG_OPT = click.option(
    "--min-magnitude",
    type=float,
    default=5.0,
    show_default=True,
    help="Filter annotations below this magnitude. Use 0 for the full unfiltered set.",
)
_OUTPUT_OPT = click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    default=None,
    help="Write a report file (.html or .json). Omit for terminal output.",
)
_REPORT_FORMAT_OPT = click.option(
    "--report-format",
    type=click.Choice(["html", "json"], case_sensitive=False),
    default=None,
    help="Override report format detection (otherwise inferred from --output extension).",
)
_INCLUDE_BENIGN_OPT = click.option(
    "--include-benign",
    is_flag=True,
    default=False,
    help="Include ClinVar Benign/Likely_benign annotations (suppressed by default).",
)

# --- Per-source magnitude floors and source inclusion/exclusion ---
_GWAS_MIN_MAG_OPT = click.option(
    "--gwas-min-magnitude",
    type=float,
    default=9.0,
    show_default=True,
    help="Magnitude floor for GWAS Catalog annotations (overrides --min-magnitude for GWAS).",
)
_SNPEDIA_MIN_MAG_OPT = click.option(
    "--snpedia-min-magnitude",
    type=float,
    default=2.0,
    show_default=True,
    help="Magnitude floor for SNPedia annotations (overrides --min-magnitude for SNPedia).",
)
_INCLUDE_GWAS_OPT = click.option(
    "--include-gwas",
    is_flag=True,
    default=False,
    help="Include GWAS Catalog annotations (excluded by default in focused reports).",
)
_EXCLUDE_SNPEDIA_OPT = click.option(
    "--exclude-snpedia",
    is_flag=True,
    default=False,
    help="Exclude SNPedia annotations. Required for commercial use (CC BY-NC-SA 3.0).",
)
_GWAS_ALL_OPT = click.option(
    "--gwas-all",
    is_flag=True,
    default=False,
    help="Include all GWAS trait categories (disables default noise filtering).",
)

# --- Diffing, freshness check and enrichment toggles ---
_DIFF_OPT = click.option(
    "--diff",
    "diff_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    default=None,
    help=(
        "Dev/QA tool: compare current output against a previous JSON report "
        "to detect regressions from code changes, database refreshes, or "
        "filter adjustments. Shows new, changed, and removed annotations. "
        "Not a monitoring tool — use for version-to-version validation."
    ),
)
_NO_UPDATE_OPT = click.option(
    "--no-update",
    is_flag=True,
    default=False,
    help="Skip the pre-analysis database freshness check.",
)
_NO_GNOMAD_OPT = click.option(
    "--no-gnomad",
    is_flag=True,
    default=False,
    help="Skip gnomAD population frequency enrichment.",
)
_NO_ALPHAMISSENSE_OPT = click.option(
    "--no-alphamissense",
    is_flag=True,
    default=False,
    help="Skip AlphaMissense variant pathogenicity enrichment.",
)

# --- Genome build selection; the raw value goes through _normalize_cli_build ---
_BUILD_OPT = click.option(
    "--build",
    type=click.Choice(["grch37", "grch38", "auto"], case_sensitive=False),
    default="auto",
    help=(
        "Genome build of the input file. 'auto' detects from position data "
        "(ADR-0021) and ignores the file header. 'grch37' / 'grch38' force a "
        "specific build, skipping detection."
    ),
)
589
+
590
+
591
def _resolve_clinvar_builds(value: str) -> tuple[str, ...]:
    """Translate a ``db update --build`` value into the builds it stands for.

    The value is stripped and lower-cased first; a falsy value means ``"both"``.

    Raises:
        click.ClickException: For any value other than both / grch37 / grch38.
    """
    builds_by_choice: dict[str, tuple[str, ...]] = {
        "both": ("GRCh37", "GRCh38"),
        "grch37": ("GRCh37",),
        "grch38": ("GRCh38",),
    }
    choice = (value or "both").strip().lower()
    builds = builds_by_choice.get(choice)
    if builds is None:
        raise click.ClickException(f"Unknown --build value {value!r}")
    return builds
601
+
602
+
603
def _normalize_cli_build(value: str | None) -> str | None:
    """Turn a ``--build`` CLI value into a canonical build name.

    ``None``, an empty/blank string and ``"auto"`` all map to ``None``;
    ``grch37`` / ``grch38`` (any case, surrounding whitespace ignored) map to
    ``"GRCh37"`` / ``"GRCh38"``.

    Raises:
        click.ClickException: For any other value.
    """
    if value is None:
        return None
    canonical: dict[str, str | None] = {
        "": None,
        "auto": None,
        "grch37": "GRCh37",
        "grch38": "GRCh38",
    }
    choice = value.strip().lower()
    if choice not in canonical:
        raise click.ClickException(f"Unknown --build value {value!r}")
    return canonical[choice]
615
+
616
+
617
def _emit_build_diagnostics(result: object) -> None:
    """Print the build banner for *result*, plus warnings where they apply.

    Does nothing when *result* has no ``build_diagnostics`` attribute (or it is
    ``None``). Otherwise prints one dim line naming the effective build and
    where it came from, a warning when the diagnostics flag a header/data
    mismatch, and a warning when the effective build is GRCh36.
    """
    diag = getattr(result, "build_diagnostics", None)
    if diag is None:
        return

    if diag.inspected_count:
        matched = f"{diag.matched_count}/{diag.inspected_count}"
    else:
        matched = "0/0"

    # First truthy field wins: override, then detection, then header.
    source = (
        "override"
        if diag.override
        else "detected"
        if diag.detected_build
        else "header (no position confirmation)"
        if diag.header_build
        else "fallback (no known SNPs matched)"
    )

    console.print(
        f"[dim]Build: {diag.effective_build} ({source}; "
        f"{matched} known-SNP positions matched)[/dim]"
    )

    if diag.mismatch:
        console.print(
            f"[yellow]Build mismatch: file header claims {diag.header_build} but "
            f"position data is {diag.detected_build}. Using {diag.detected_build}. "
            f"This is a real-world data-quality issue — your provider may have "
            f"mislabeled the build (see ADR-0021).[/yellow]"
        )

    if diag.effective_build == "GRCh36":
        console.print(
            "[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "
            "(PharmGKB, GWAS Catalog, SNPedia, gnomAD) are complete. ClinVar "
            "position-matching is skipped (no GRCh36 cache — see ADR-0025). "
            "For full ClinVar coverage, liftOver to GRCh38 first: "
            "docs/grch36-liftover.md[/yellow]"
        )
650
+
651
+
652
@main.command()
@_FILE_ARG
@_FORMAT_OPT
@_DATA_DIR_OPT
@_MIN_MAG_OPT
@click.option(
    "--category",
    type=str,
    default=None,
    help="Filter to a single bucket (clinical, pharma).",
)
@_OUTPUT_OPT
@_REPORT_FORMAT_OPT
@_BUILD_OPT
@_INCLUDE_BENIGN_OPT
@_GWAS_MIN_MAG_OPT
@_SNPEDIA_MIN_MAG_OPT
@_GWAS_ALL_OPT
@_EXCLUDE_SNPEDIA_OPT
@_DIFF_OPT
@_NO_UPDATE_OPT
@_NO_GNOMAD_OPT
@_NO_ALPHAMISSENSE_OPT
def analyze(
    file_path: Path,
    fmt: str | None,
    data_dir: Path | None,
    min_magnitude: float,
    category: str | None,
    output: Path | None,
    report_format: str | None,
    build: str,
    include_benign: bool,
    gwas_min_magnitude: float,
    snpedia_min_magnitude: float,
    gwas_all: bool,
    exclude_snpedia: bool,
    diff_path: Path | None,
    no_update: bool,
    no_gnomad: bool,
    no_alphamissense: bool,
) -> None:
    """Annotate a genotype file against all ready reference databases."""
    # Thin wrapper: translate the CLI values, then delegate to the shared
    # pipeline with no gene restriction.
    canonical_build = _normalize_cli_build(build)
    excluded = frozenset({"snpedia"}) if exclude_snpedia else None
    _run_analysis_command(
        file_path=file_path,
        fmt=fmt,
        data_dir=data_dir,
        output=output,
        report_format=report_format,
        min_magnitude=min_magnitude,
        category=category,
        genes=None,
        build=canonical_build,
        include_benign=include_benign,
        gwas_min_magnitude=gwas_min_magnitude,
        snpedia_min_magnitude=snpedia_min_magnitude,
        exclude_sources=excluded,
        gwas_all=gwas_all,
        diff_path=diff_path,
        no_update=no_update,
        no_gnomad=no_gnomad,
        no_alphamissense=no_alphamissense,
    )
715
+
716
+
717
@main.command()
@_FILE_ARG
@_FORMAT_OPT
@click.option(
    "--snps",
    required=True,
    help="Comma-separated rsIDs to extract (e.g., rs1801133,rs4680).",
)
def extract(file_path: Path, fmt: str | None, snps: str) -> None:
    """Print diploid genotypes for specific rsIDs — spot-check carrier status.

    Useful for verifying ClinVar / PharmGKB hits against the actual file
    before trusting them. The "Genotype" column shows the diploid call as
    the array (or VCF) reported it; "Het?" and "No-call?" answer the
    questions the carrier rule (ADR-0007) actually checks.
    """
    parser = _resolve_parser(file_path, fmt)
    # Blank entries ("rs1,,rs2" or trailing commas) are dropped.
    wanted = {s.strip() for s in snps.split(",") if s.strip()}
    if not wanted:
        raise click.ClickException("--snps cannot be empty.")

    counter, stderr_handler, snapshot = _wire_parser_logging()
    # Values are parsed variants; their attributes are read when building rows.
    found: dict[str, Variant] = {}
    try:
        for variant in parser.parse(file_path):
            if variant.rsid not in wanted:
                continue
            found[variant.rsid] = variant
            # ``found`` only grows on an insert, so completion is checked here.
            if len(found) == len(wanted):
                break  # streaming early-exit once we have everything
    finally:
        _unwire_parser_logging(counter, stderr_handler, snapshot)

    table = Table(title=f"Genotypes from {file_path.name}")
    table.add_column("rsID", style="cyan", no_wrap=True)
    table.add_column("Chr", no_wrap=True)
    table.add_column("Position", justify="right")
    table.add_column("Genotype", style="yellow", no_wrap=True)
    table.add_column("Het?", justify="center")
    table.add_column("No-call?", justify="center")
    for rsid in sorted(wanted):
        variant = found.get(rsid)
        if variant is None:
            # Requested but never seen in the file: emit a placeholder row.
            table.add_row(rsid, "—", "—", "[red]not in file[/red]", "—", "—")
            continue
        table.add_row(
            variant.rsid,
            variant.chromosome,
            f"{variant.position:,}",
            variant.genotype,
            "yes" if variant.is_heterozygous else "no",
            "[red]yes[/red]" if variant.is_no_call else "no",
        )
    console.print(table)
770
+
771
+
772
@main.command()
@click.argument("file1", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("file2", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option("--format1", "fmt1", default=None, help="Force parser for file 1.")
@click.option("--format2", "fmt2", default=None, help="Force parser for file 2.")
def compare(file1: Path, file2: Path, fmt1: str | None, fmt2: str | None) -> None:
    """Compare two genotype files — coverage overlap and concordance.

    Reports shared rsIDs, file-specific rsIDs, genotype agreement,
    strand-flip matches (complementary alleles on opposite strands),
    discordant calls, and strand-ambiguous positions.
    """
    from allelix.compare import compare_variants
    from allelix.utils.build_detect import detect_build

    first_parser = _resolve_parser(file1, fmt1)
    second_parser = _resolve_parser(file2, fmt2)
    # Both files are loaded fully into memory before comparison.
    first_variants = list(first_parser.parse(file1))
    second_variants = list(second_parser.parse(file2))

    first_detection = detect_build(first_variants)
    second_detection = detect_build(second_variants)
    # A detected build wins; the file's own metadata is only the fallback.
    first_build = first_detection.build
    if not first_build:
        first_build = first_parser.get_metadata(file1).get("build", "unknown")
    second_build = second_detection.build
    if not second_build:
        second_build = second_parser.get_metadata(file2).get("build", "unknown")

    result = compare_variants(
        first_variants, second_variants, build1=first_build, build2=second_build
    )

    if result.build1 != result.build2:
        console.print(
            f"[yellow]Warning: builds differ ({result.build1} vs {result.build2}). "
            "Position-based comparisons may be unreliable.[/yellow]"
        )

    coverage = Table(title="Coverage Summary")
    coverage.add_column("Metric", style="bold")
    coverage.add_column("Value", justify="right")
    coverage_rows = [
        ("File 1", f"{file1.name} ({result.file1_total:,} variants)"),
        ("File 2", f"{file2.name} ({result.file2_total:,} variants)"),
        ("Build (file 1)", result.build1),
        ("Build (file 2)", result.build2),
        ("Shared rsIDs", f"{result.shared:,}"),
        ("File 1 only", f"{result.file1_only:,}"),
        ("File 2 only", f"{result.file2_only:,}"),
    ]
    for metric, value in coverage_rows:
        coverage.add_row(metric, value)
    console.print(coverage)

    concordance = Table(title="Genotype Concordance")
    concordance.add_column("Category", style="bold")
    concordance.add_column("Count", justify="right")
    concordance.add_column("%", justify="right")
    concordance_rows = (
        ("Concordant", result.concordant),
        ("Strand-flip match", result.strand_flip_match),
        ("Discordant", result.discordant),
        ("Strand-ambiguous", result.strand_ambiguous),
        ("No-call (either file)", result.no_call),
    )
    for label, count in concordance_rows:
        # Percentages are relative to the shared rsIDs; a dash when there are none.
        if result.shared:
            share = _percent(count, result.shared)
        else:
            share = "—"
        concordance.add_row(label, f"{count:,}", share)
    console.print(concordance)

    if not result.chromosome_counts:
        return

    per_chrom = Table(title="Per-Chromosome Breakdown")
    per_chrom.add_column("Chr", style="cyan", no_wrap=True)
    for heading in ("Concordant", "Flip", "Discordant", "Ambiguous", "No-call"):
        per_chrom.add_column(heading, justify="right")
    bucket_keys = ("concordant", "strand_flip_match", "discordant", "strand_ambiguous", "no_call")
    for chrom in sorted(result.chromosome_counts, key=_chrom_sort_key):
        buckets = result.chromosome_counts[chrom]
        per_chrom.add_row(chrom, *(str(buckets.get(key, 0)) for key in bucket_keys))
    console.print(per_chrom)
851
+
852
+
853
@main.command()
@_FILE_ARG
@_FORMAT_OPT
@_DATA_DIR_OPT
@_MIN_MAG_OPT
@_OUTPUT_OPT
@_REPORT_FORMAT_OPT
@_BUILD_OPT
@_INCLUDE_BENIGN_OPT
@_GWAS_MIN_MAG_OPT
@_SNPEDIA_MIN_MAG_OPT
@_INCLUDE_GWAS_OPT
@_GWAS_ALL_OPT
@_EXCLUDE_SNPEDIA_OPT
@_DIFF_OPT
@_NO_UPDATE_OPT
@_NO_GNOMAD_OPT
@_NO_ALPHAMISSENSE_OPT
def methylation(
    file_path: Path,
    fmt: str | None,
    data_dir: Path | None,
    min_magnitude: float,
    output: Path | None,
    report_format: str | None,
    build: str,
    include_benign: bool,
    gwas_min_magnitude: float,
    snpedia_min_magnitude: float,
    include_gwas: bool,
    gwas_all: bool,
    exclude_snpedia: bool,
    diff_path: Path | None,
    no_update: bool,
    no_gnomad: bool,
    no_alphamissense: bool,
) -> None:
    """Methylation-pathway-focused report (MTHFR, MTR, MTRR, COMT, CBS, …)."""
    # Sources to leave out of this run: "gwas" unless include_gwas is set,
    # and "snpedia" when exclude_snpedia is set.
    source_switches = (("gwas", not include_gwas), ("snpedia", exclude_snpedia))
    skipped_sources = frozenset(name for name, is_skipped in source_switches if is_skipped)

    # The report is restricted by gene panel rather than by category.
    _run_analysis_command(
        file_path=file_path,
        fmt=fmt,
        data_dir=data_dir,
        output=output,
        report_format=report_format,
        min_magnitude=min_magnitude,
        category=None,
        genes=METHYLATION_PANEL_GENES,
        build=_normalize_cli_build(build),
        include_benign=include_benign,
        gwas_min_magnitude=gwas_min_magnitude,
        snpedia_min_magnitude=snpedia_min_magnitude,
        # An empty exclusion set is passed as None, not as an empty frozenset.
        exclude_sources=skipped_sources or None,
        gwas_all=gwas_all,
        diff_path=diff_path,
        no_update=no_update,
        no_gnomad=no_gnomad,
        no_alphamissense=no_alphamissense,
    )
916
+
917
+
918
@main.command()
@_FILE_ARG
@_FORMAT_OPT
@_DATA_DIR_OPT
@_MIN_MAG_OPT
@_OUTPUT_OPT
@_REPORT_FORMAT_OPT
@_BUILD_OPT
@_INCLUDE_BENIGN_OPT
@_GWAS_MIN_MAG_OPT
@_SNPEDIA_MIN_MAG_OPT
@_INCLUDE_GWAS_OPT
@_GWAS_ALL_OPT
@_EXCLUDE_SNPEDIA_OPT
@_DIFF_OPT
@_NO_UPDATE_OPT
@_NO_GNOMAD_OPT
@_NO_ALPHAMISSENSE_OPT
def pharmacogenomics(
    file_path: Path,
    fmt: str | None,
    data_dir: Path | None,
    min_magnitude: float,
    output: Path | None,
    report_format: str | None,
    build: str,
    include_benign: bool,
    gwas_min_magnitude: float,
    snpedia_min_magnitude: float,
    include_gwas: bool,
    gwas_all: bool,
    exclude_snpedia: bool,
    diff_path: Path | None,
    no_update: bool,
    no_gnomad: bool,
    no_alphamissense: bool,
) -> None:
    """Pharmacogenomics-focused report (annotations from PharmGKB-style sources)."""
    # Sources to leave out of this run: "gwas" unless include_gwas is set,
    # and "snpedia" when exclude_snpedia is set.
    source_switches = (("gwas", not include_gwas), ("snpedia", exclude_snpedia))
    skipped_sources = frozenset(name for name, is_skipped in source_switches if is_skipped)

    # The report is restricted by category ("pharma") rather than by gene list.
    _run_analysis_command(
        file_path=file_path,
        fmt=fmt,
        data_dir=data_dir,
        output=output,
        report_format=report_format,
        min_magnitude=min_magnitude,
        category="pharma",
        genes=None,
        build=_normalize_cli_build(build),
        include_benign=include_benign,
        gwas_min_magnitude=gwas_min_magnitude,
        snpedia_min_magnitude=snpedia_min_magnitude,
        # An empty exclusion set is passed as None, not as an empty frozenset.
        exclude_sources=skipped_sources or None,
        gwas_all=gwas_all,
        diff_path=diff_path,
        no_update=no_update,
        no_gnomad=no_gnomad,
        no_alphamissense=no_alphamissense,
    )
981
+
982
+
983
# Parent command group for the database subcommands registered through
# @db.command(...) in this module ("update" and "status"). The group itself
# does no work; its docstring is the help text shown for `db`.
@main.group()
def db() -> None:
    """Manage local reference database cache."""
986
+
987
+
988
def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
    """Record *signal* against *annotator*'s existing cache, downloading nothing.

    The annotator's ``_db_path`` is opened as a SQLite database, the signal is
    written via ``stamp_remote_signal`` under the annotator's name, and the
    change is committed. Annotators that have no ``_db_path`` attribute (or
    have it set to ``None``) are left untouched.
    """
    import sqlite3
    from contextlib import closing

    from allelix.databases.manager import stamp_remote_signal

    cache_file = getattr(annotator, "_db_path", None)
    if cache_file is not None:
        # closing() releases the connection handle when the block exits; the
        # explicit commit() persists the stamped signal before that happens.
        with closing(sqlite3.connect(cache_file)) as connection:
            stamp_remote_signal(connection, annotator.name, signal)
            connection.commit()
1001
+
1002
+
1003
def _confirm_cadd_license(*, license_held: bool = False) -> bool:
    """Print the CADD license notice and decide whether the download may proceed.

    When ``license_held`` is true a short acknowledgement is printed and the
    function returns ``True`` without prompting. Otherwise the full notice is
    printed and the return value is the user's answer to a yes/no prompt that
    defaults to "no".
    """
    heading = "\n[bold yellow]CADD License Notice[/bold yellow]\n"

    if license_held:
        console.print(heading + "Commercial license asserted. Proceeding with CADD download.\n")
        return True

    notice_lines = (
        "CADD scores are provided by the University of Washington.",
        "Commercial use requires a license from UW CoMotion",
        "([link=https://els2.comotion.uw.edu/product/cadd-scores]"
        "https://els2.comotion.uw.edu/product/cadd-scores[/link]).",
        "By continuing, you confirm that your use is non-commercial",
        "or that you hold a valid commercial license.",
    )
    console.print(heading + "\n".join(notice_lines) + "\n")
    return click.confirm("Continue with CADD download?", default=False)
1021
+
1022
+
1023
+ def _run_setup(annotator: Annotator) -> bool:
1024
+ """Invoke annotator.setup(). Returns True on success, False on failure."""
1025
+ try:
1026
+ annotator.setup()
1027
+ except Exception as exc:
1028
+ if hasattr(exc, "close"):
1029
+ exc.close()
1030
+ console.print(f" [red]{annotator.name}: {exc}[/red]")
1031
+ return False
1032
+ sig = getattr(annotator, "cached_remote_signal", lambda: None)()
1033
+ if sig and "cpic:unavailable" in sig:
1034
+ console.print(
1035
+ f" [yellow]{annotator.name}: updated (CPIC unavailable — "
1036
+ "non-finding filter degraded, retry later)[/yellow]"
1037
+ )
1038
+ return True
1039
+
1040
+
1041
def _setup_and_report(annotator: Annotator, outcome: str) -> None:
    """Run ``_run_setup`` and, on success, print a green confirmation line.

    ``outcome`` is the word printed after the annotator name ("ready" for a
    first download, "refreshed" for a re-download). Nothing is printed here on
    failure because ``_run_setup`` has already reported the error.
    """
    if _run_setup(annotator):
        console.print(
            f" [green]✓ {annotator.name} {outcome}[/green] "
            f"(version {annotator.version() or '(unknown)'})"
        )


@db.command("update")
@_DATA_DIR_OPT
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Re-download even if the local cache appears current.",
)
@click.option(
    "--no-gnomad",
    is_flag=True,
    default=False,
    help="Skip gnomAD population frequency database.",
)
@click.option(
    "--no-alphamissense",
    is_flag=True,
    default=False,
    help="Skip AlphaMissense pathogenicity database.",
)
@click.option(
    "--cadd",
    "include_cadd",
    is_flag=True,
    default=False,
    help="Download CADD deleteriousness scores (non-commercial use only; disabled by default).",
)
@click.option(
    "--build",
    type=click.Choice(["grch37", "grch38", "both"], case_sensitive=False),
    default="both",
    help=(
        "Which ClinVar genome build(s) to download. 'both' (default) keeps "
        "GRCh37 and GRCh38 caches in sync so `analyze` can dispatch by "
        "detected build (ADR-0021). 'grch37' / 'grch38' restrict to one to "
        "save bandwidth."
    ),
)
def db_update(
    data_dir: Path | None,
    force: bool,
    no_gnomad: bool,
    no_alphamissense: bool,
    include_cadd: bool,
    build: str,
) -> None:
    """Download or refresh reference databases.

    For each annotator:
    - no cache → download
    - --force → download
    - cache from a code-pinned source (no server-driven freshness) → skip
    - cache + remote signal matches cache → skip
    - cache + remote signal differs → download
    - cache with no stored signal (legacy v0.4.1 cache) → record the
    current remote signal without re-downloading
    - cache + remote signal can't be fetched → skip with notice (use
    --force to override)

    `--build` selects which ClinVar build(s) to manage. Default 'both'
    downloads GRCh37 and GRCh38 caches.
    """
    resolved = resolve_data_dir(data_dir)
    console.print(f"Data directory: [cyan]{resolved}[/cyan]")

    from allelix.config import load_config

    cfg = load_config(resolved)

    clinvar_builds = _resolve_clinvar_builds(build)
    for annotator in get_annotators(
        resolved, clinvar_builds=clinvar_builds, cadd_full=cfg.cadd_full
    ):
        with annotator:
            # Explicit opt-outs from the command line.
            if no_gnomad and annotator.name == "gnomad":
                console.print(f" [dim]{annotator.name}: skipped (--no-gnomad)[/dim]")
                continue
            if no_alphamissense and annotator.name == "alphamissense":
                console.print(f" [dim]{annotator.name}: skipped (--no-alphamissense)[/dim]")
                continue

            # CADD is opt-in (flag or config) and gated behind the license
            # notice whenever a download is about to happen.
            if annotator.name == "cadd":
                if not include_cadd and not cfg.is_enabled("cadd"):
                    console.print(
                        f" [dim]{annotator.name}: disabled "
                        "(enable with `allelix config set sources.cadd true` "
                        "or pass `--cadd`)[/dim]"
                    )
                    continue
                if (not annotator.is_ready() or force) and not _confirm_cadd_license(
                    license_held=cfg.license_held("cadd"),
                ):
                    console.print(f" [dim]{annotator.name}: skipped (declined)[/dim]")
                    continue

            # Annotators that need no download are only reported, never set up.
            if not annotator.requires_download:
                if annotator.is_ready():
                    console.print(
                        f" [dim]{annotator.name}: ready "
                        f"({annotator.version() or 'unknown'})[/dim]"
                    )
                continue

            if not annotator.is_ready():
                console.print(f" [bold]{annotator.name}[/bold]: downloading…")
                _setup_and_report(annotator, "ready")
                continue

            if force:
                console.print(f" [bold]{annotator.name}[/bold]: --force; refreshing…")
                _setup_and_report(annotator, "refreshed")
                continue

            # Code-driven sources (commit-pinned HF caches) are updated
            # only via code changes — no runtime freshness probe needed.
            if not annotator.server_driven_freshness:
                console.print(
                    f" [dim]{annotator.name}: already current "
                    f"(version {annotator.version() or '(unknown)'})[/dim]"
                )
                continue

            remote = annotator.fetch_remote_signal()
            if remote is None:
                console.print(
                    f" [yellow]{annotator.name}: cache present, but remote "
                    "freshness can't be verified (network error or no signal). "
                    "Pass --force to refresh anyway.[/yellow]"
                )
                continue

            cached = annotator.cached_remote_signal()
            if cached == remote:
                console.print(
                    f" [dim]{annotator.name}: already current "
                    f"(version {annotator.version() or '(unknown)'})[/dim]"
                )
                continue

            # A cache without a stored signal is adopted as-is: the current
            # remote signal is recorded and no download takes place.
            if cached is None:
                _stamp_remote_signal(annotator, remote)
                console.print(
                    f" [dim]{annotator.name}: stamped remote signal "
                    f"(version {annotator.version() or '(unknown)'})[/dim]"
                )
                continue

            console.print(f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
            _setup_and_report(annotator, "refreshed")
1200
+
1201
+
1202
@db.command("status")
@_DATA_DIR_OPT
def db_status(data_dir: Path | None) -> None:
    """Show installed reference database versions and freshness."""
    from allelix.config import load_config

    resolved = resolve_data_dir(data_dir)
    cfg = load_config(resolved)

    table = Table(title=f"Reference Databases ({resolved})")
    column_specs = (
        ("Annotator", {"style": "cyan", "no_wrap": True}),
        ("Ready", {"justify": "center"}),
        ("Version", {}),
        ("Records", {"justify": "right"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    # One row per annotator; each is opened only for the duration of its row.
    for annotator in get_annotators(resolved, cadd_full=cfg.cadd_full):
        with annotator:
            ready_marker = "[green]yes[/green]" if annotator.is_ready() else "[red]no[/red]"

            version = annotator.version() or "—"
            signal = getattr(annotator, "cached_remote_signal", lambda: None)()
            if signal and "cpic:unavailable" in signal:
                # Flag caches that were built while CPIC data was unavailable.
                version = version + " (no CPIC)"

            # record_count() is optional; a missing method or a None result
            # is shown as a dash.
            count_fn = getattr(annotator, "record_count", None)
            count = count_fn() if callable(count_fn) else None
            records = "—" if count is None else f"{count:,}"

            table.add_row(annotator.display_name, ready_marker, version, records)
    console.print(table)
1231
+
1232
+
1233
# Parent command group for the configuration subcommands registered through
# @config.command(...) in this module ("show", "get" and "set"). The group
# itself does no work; its docstring is the help text shown for `config`.
@main.group()
def config() -> None:
    """Manage persistent configuration (source toggles, license mode)."""
1236
+
1237
+
1238
@config.command("show")
@_DATA_DIR_OPT
def config_show(data_dir: Path | None) -> None:
    """Display current configuration."""
    from allelix.annotators import _ANNOTATOR_CLASSES
    from allelix.annotators.base import Permission
    from allelix.annotators.base import permission as check_permission
    from allelix.config import load_config

    resolved = resolve_data_dir(data_dir)
    cfg = load_config(resolved)

    yes_marker = "[green]yes[/green]"
    no_marker = "[red]no[/red]"

    table = Table(title=f"Configuration ({resolved / 'config.toml'})")
    table.add_column("Source", style="cyan", no_wrap=True)
    table.add_column("Enabled", justify="center")
    table.add_column("Note", style="dim")

    for name, enabled in sorted(cfg.sources.items()):
        # Start from the configured toggle; a license block overrides it below.
        marker = yes_marker if enabled else no_marker
        note = ""

        cls = _ANNOTATOR_CLASSES.get(name)
        if cls is not None:
            perm = check_permission(
                cls.license,
                commercial=cfg.commercial,
                license_held=cfg.license_held(name),
            )
            if perm is Permission.BLOCK_PURCHASABLE:
                marker = no_marker
                note = f"requires commercial license — purchase: {cls.license.purchase_url}"
            elif perm is Permission.BLOCK_FINAL:
                marker = no_marker
                note = "no commercial license is available"

        table.add_row(name, marker, note)

    console.print(table)
    if cfg.commercial:
        mode = "[yellow]commercial[/yellow]"
    else:
        mode = "[green]personal[/green]"
    console.print(f"License mode: {mode}")
1281
+
1282
+
1283
@config.command("get")
@_DATA_DIR_OPT
@click.argument("key", required=False, default=None)
def config_get(data_dir: Path | None, key: str | None) -> None:
    r"""Get a configuration value (or dump entire config).

    \b
    Keys:
    sources.<name> Show if a source is enabled
    license.commercial Show commercial mode
    license.<source> Show if a license is asserted for <source>
    options.cadd_full Show full CADD tabix mode

    \b
    Examples:
    allelix config get # dump entire config
    allelix config get sources.cadd # true
    allelix config get license.cadd # false
    allelix config get options.cadd_full # false
    """
    from allelix.config import _serialize, load_config

    resolved = resolve_data_dir(data_dir)
    cfg = load_config(resolved)

    # No key: print where the config lives, then the whole serialized config.
    if key is None:
        console.print(f"[dim]Config: {resolved / 'config.toml'}[/dim]")
        click.echo(_serialize(cfg))
        return

    # Split "<section>.<rest>" once; `dot` is empty when the key has no ".".
    section, dot, name = key.partition(".")

    if section == "sources" and dot:
        answer = cfg.sources.get(name)
        if answer is None:
            raise click.ClickException(
                f"Unknown source {name!r}. Known sources: {', '.join(sorted(cfg.sources))}"
            )
    elif section == "license" and dot:
        # "commercial" is the global mode switch; any other name is looked up
        # as a per-source license assertion.
        answer = cfg.commercial if name == "commercial" else cfg.license_held(name)
    elif key == "options.cadd_full":
        answer = cfg.cadd_full
    else:
        raise click.ClickException(
            f"Unknown key {key!r}. Use 'sources.<name>', 'license.commercial', "
            "'license.<source>', or 'options.cadd_full'."
        )

    # Values are printed in lowercase ("true"/"false" for booleans).
    click.echo(str(answer).lower())
1333
+
1334
+
1335
@config.command("set")
@_DATA_DIR_OPT
@click.argument("key")
@click.argument("value")
def config_set(data_dir: Path | None, key: str, value: str) -> None:
    r"""Set a configuration value.

    \b
    Keys:
    sources.<name> Enable/disable a source (true/false)
    license.commercial Set commercial mode (true/false)
    license.<source> Assert you hold a commercial license for <source>
    options.cadd_full Use full CADD tabix file instead of cache (true/false)

    \b
    Examples:
    allelix config set sources.snpedia false
    allelix config set license.commercial true
    allelix config set license.cadd true
    allelix config set options.cadd_full true
    """
    from allelix.config import load_config, save_config

    resolved = resolve_data_dir(data_dir)
    cfg = load_config(resolved)

    # Only the literal words "true" and "false" are accepted (any case,
    # surrounding whitespace ignored).
    val_lower = value.strip().lower()
    if val_lower not in ("true", "false"):
        raise click.ClickException(f"Value must be 'true' or 'false', got {value!r}")
    flag = val_lower == "true"

    # Split "<section>.<rest>" once; `dot` is empty when the key has no ".".
    section, dot, name = key.partition(".")

    if section == "sources" and dot:
        cfg.sources[name] = flag
    elif key == "license.commercial":
        cfg.commercial = flag
    elif section == "license" and dot:
        from allelix.annotators import get_annotator_class

        if flag:
            # Refuse to assert a license for a known source that cannot be
            # licensed commercially.
            cls = get_annotator_class(name)
            if cls is not None and not cls.license.licensable:
                raise click.ClickException(
                    f"{name} is not commercially licensable. "
                    f"This assertion has no effect and cannot be set."
                )
            cfg.license_overrides[name] = True
        else:
            # Clearing an assertion always succeeds; a warning is printed when
            # the name is neither a known annotator nor an existing override.
            if get_annotator_class(name) is None and name not in cfg.license_overrides:
                console.print(f"[yellow]Warning: unknown source {name!r}[/yellow]")
            cfg.license_overrides.pop(name, None)
    elif key == "options.cadd_full":
        cfg.cadd_full = flag
    else:
        raise click.ClickException(
            f"Unknown key {key!r}. Use 'sources.<name>', 'license.commercial', "
            "'license.<source>', or 'options.cadd_full'."
        )

    save_config(resolved, cfg)
    console.print(f"[dim]Config: {resolved / 'config.toml'}[/dim]")
    console.print(f"[green]Set {key} = {val_lower}[/green]")
1403
+
1404
+
1405
# Parent command group for the export subcommands registered through
# @export.command(...) in this module ("plink"). The group itself does no
# work; its docstring is the help text shown for `export`.
@main.group()
def export() -> None:
    """Export parsed genotype data to other formats."""
1408
+
1409
+
1410
@export.command("plink")
@_FILE_ARG
@click.option(
    "--output-prefix",
    "-o",
    type=click.Path(path_type=Path),
    default=None,
    help="Base path for .bed/.bim/.fam (default: input stem).",
)
@_FORMAT_OPT
@_BUILD_OPT
@_DATA_DIR_OPT
def export_plink_cmd(
    file_path: Path,
    output_prefix: Path | None,
    fmt: str | None,
    build: str,
    data_dir: Path | None,
) -> None:
    """Convert to PLINK1 binary format (.bed/.bim/.fam).

    Produces a single-sample, SNP-major .bed file suitable for downstream
    tools (plink2 PCA, ADMIXTURE, PRSice). Uses gnomAD ref/alt for allele
    coding when available; falls back to monomorphic (A2=0) for positions
    without gnomAD coverage.
    """
    from allelix.exporters.plink import _orient_genotype, export_plink

    parser = _resolve_parser(file_path, fmt)
    prefix = output_prefix or file_path.with_suffix("")
    build_override = _normalize_cli_build(build)
    metadata = parser.get_metadata(file_path)
    # An explicit CLI build wins; otherwise the file's own metadata, then GRCh37.
    effective_build = build_override or metadata.get("build", "GRCh37")
    resolved = resolve_data_dir(data_dir)

    variants = list(parser.parse(file_path))

    # Order by chromosome, then position, so each chromosome forms one
    # contiguous block in the .bim — PLINK1.9 rejects split chromosomes.
    # Unrecognised chromosome labels sort after the known ones (rank 99).
    chrom_rank = {str(number): number for number in range(1, 23)}
    for rank, label in enumerate(("X", "Y", "XY", "MT"), start=23):
        chrom_rank[label] = rank

    def _sort_key(variant):
        return (chrom_rank.get(variant.chromosome, 99), variant.chromosome, variant.position)

    variants.sort(key=_sort_key)

    # Called genotypes indexed by rsID; a later duplicate replaces an earlier one.
    called: dict[str, Variant] = {
        variant.rsid: variant for variant in variants if not variant.is_no_call
    }
    rsids = set(called)

    def _pick_ref_alt(variant, candidates):
        """Choose a (ref, alt) pair for *variant* from several candidates.

        The first candidate that both orients the genotype and contains both
        observed alleles wins; failing that, the first candidate that merely
        orients the genotype. Returns ``None`` when no candidate qualifies.
        """
        allele1, allele2 = variant.allele1, variant.allele2
        observed = {allele1, allele2}

        def _orientable(ref, alt):
            return _orient_genotype(allele1, allele2, ref, alt) is not None

        strict = next(
            (
                (ref, alt)
                for _, _, ref, alt in candidates
                if _orientable(ref, alt) and observed <= {ref, alt}
            ),
            None,
        )
        if strict is not None:
            return strict
        return next(
            ((ref, alt) for _, _, ref, alt in candidates if _orientable(ref, alt)),
            None,
        )

    ref_alt_map: dict[str, tuple[str, str]] = {}
    gnomad = None
    try:
        from allelix.annotators.gnomad import GnomadAnnotator

        gnomad = GnomadAnnotator(resolved)
        if gnomad.is_ready():
            for rsid, coords in gnomad.bulk_resolve_coordinates(rsids).items():
                if len(coords) == 1:
                    # A single candidate is taken as-is, without an orientation check.
                    _, _, ref, alt = coords[0]
                    ref_alt_map[rsid] = (ref, alt)
                    continue
                choice = _pick_ref_alt(called[rsid], coords)
                if choice is not None:
                    ref_alt_map[rsid] = choice
    except Exception:
        # Best effort: any failure here downgrades to fallback allele coding;
        # pairs resolved before the failure are kept.
        console.print(
            "[yellow]gnomAD coordinate resolution failed; using fallback allele coding.[/yellow]"
        )
    finally:
        if gnomad is not None:
            gnomad.close()

    written, skipped, indel_skip, mono = export_plink(
        iter(variants), prefix, effective_build, ref_alt_map or None
    )

    skip_parts = [
        f"{count:,} {label}"
        for count, label in ((skipped, "no-calls"), (indel_skip, "indels"))
        if count
    ]
    skip_msg = f" ({', '.join(skip_parts)} skipped)" if skip_parts else ""
    console.print(f"Wrote {written:,} variants to {prefix}.bed/.bim/.fam{skip_msg}")

    if mono > 0:
        pct = mono / written * 100 if written else 0
        console.print(
            f"[dim]{mono:,} markers ({pct:.0f}%) exported as monomorphic "
            f"(A2=0, ref/alt unknown or ambiguous).[/dim]"
        )
        if not ref_alt_map:
            console.print(
                "[yellow]gnomAD not available — all homozygous markers exported "
                "as monomorphic.[/yellow]"
            )
            console.print("[yellow]Run `allelix db update` first for proper allele coding.[/yellow]")
    console.print(
        "[dim]Single-sample export. Merging with other samples requires "
        "allele harmonization (--merge-mode or set-all-var-ids).[/dim]"
    )
1521
+
1522
+
1523
# Invoke the `main` CLI entry point only when this module is executed as a
# script; importing the module does not run it.
if __name__ == "__main__":
    main()