allelix 1.8.4__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {allelix-1.8.4 → allelix-2.0.0}/PKG-INFO +58 -30
  2. {allelix-1.8.4 → allelix-2.0.0}/README.md +50 -25
  3. {allelix-1.8.4 → allelix-2.0.0}/allelix/__init__.py +1 -1
  4. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/__init__.py +2 -2
  5. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/alphamissense.py +30 -1
  6. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/base.py +18 -2
  7. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/cadd.py +1 -1
  8. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/clinvar.py +203 -7
  9. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/gnomad.py +31 -1
  10. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/gwas.py +90 -1
  11. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/pharmgkb.py +116 -23
  12. {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/snpedia.py +85 -2
  13. allelix-2.0.0/allelix/cli/__init__.py +36 -0
  14. allelix-2.0.0/allelix/cli/_helpers.py +553 -0
  15. allelix-2.0.0/allelix/cli/_options.py +152 -0
  16. allelix-2.0.0/allelix/cli/analyze.py +116 -0
  17. allelix-2.0.0/allelix/cli/config.py +190 -0
  18. allelix-2.0.0/allelix/cli/db.py +253 -0
  19. allelix-2.0.0/allelix/cli/focused.py +176 -0
  20. allelix-2.0.0/allelix/cli/utility.py +530 -0
  21. {allelix-1.8.4 → allelix-2.0.0}/allelix/compare.py +1 -1
  22. {allelix-1.8.4 → allelix-2.0.0}/allelix/config.py +1 -1
  23. {allelix-1.8.4 → allelix-2.0.0}/allelix/data/__init__.py +1 -1
  24. allelix-2.0.0/allelix/data/clinvar_clnsig_snapshot.yaml +115 -0
  25. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/__init__.py +1 -1
  26. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/_versions.py +1 -1
  27. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/alphamissense_loader.py +1 -1
  28. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/cadd_loader.py +1 -1
  29. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/cpic_loader.py +5 -5
  30. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/gnomad_loader.py +1 -1
  31. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/gwas_loader.py +1 -1
  32. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/loader_utils.py +1 -1
  33. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/manager.py +2 -2
  34. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/pharmgkb_loader.py +15 -11
  35. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/schema.py +4 -2
  36. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/snpedia_loader.py +1 -1
  37. {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/snpedia_parser.py +1 -1
  38. {allelix-1.8.4 → allelix-2.0.0}/allelix/exporters/__init__.py +1 -1
  39. {allelix-1.8.4 → allelix-2.0.0}/allelix/exporters/plink.py +1 -1
  40. {allelix-1.8.4 → allelix-2.0.0}/allelix/models.py +2 -2
  41. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/__init__.py +5 -1
  42. allelix-2.0.0/allelix/parsers/_helpers.py +84 -0
  43. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/ancestrydna.py +1 -1
  44. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/base.py +1 -1
  45. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/ftdna.py +1 -1
  46. allelix-2.0.0/allelix/parsers/ftdna_illumina.py +143 -0
  47. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/livingdna.py +1 -1
  48. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/myhappygenes.py +1 -1
  49. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/myheritage.py +1 -1
  50. {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/twentythreeandme.py +1 -1
  51. allelix-2.0.0/allelix/parsers/vcf.py +535 -0
  52. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/__init__.py +2 -2
  53. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/_pipeline.py +129 -20
  54. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/diff.py +1 -1
  55. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/high_value.py +1 -1
  56. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/html.py +15 -4
  57. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/json_report.py +5 -2
  58. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/methylation.py +1 -1
  59. {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/terminal.py +3 -1
  60. {allelix-1.8.4 → allelix-2.0.0}/allelix/utils/__init__.py +1 -1
  61. {allelix-1.8.4 → allelix-2.0.0}/allelix/utils/allele.py +1 -1
  62. {allelix-1.8.4 → allelix-2.0.0}/allelix/utils/build_detect.py +1 -1
  63. {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/PKG-INFO +58 -30
  64. {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/SOURCES.txt +11 -1
  65. {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/requires.txt +3 -0
  66. {allelix-1.8.4 → allelix-2.0.0}/pyproject.toml +9 -5
  67. {allelix-1.8.4 → allelix-2.0.0}/tests/test_cli.py +587 -24
  68. {allelix-1.8.4 → allelix-2.0.0}/tests/test_cli_helpers.py +19 -14
  69. {allelix-1.8.4 → allelix-2.0.0}/tests/test_compare.py +1 -1
  70. {allelix-1.8.4 → allelix-2.0.0}/tests/test_config.py +1 -1
  71. {allelix-1.8.4 → allelix-2.0.0}/tests/test_end_to_end.py +13 -13
  72. {allelix-1.8.4 → allelix-2.0.0}/tests/test_mock_data_invariants.py +9 -9
  73. {allelix-1.8.4 → allelix-2.0.0}/tests/test_models.py +1 -1
  74. {allelix-1.8.4 → allelix-2.0.0}/tests/test_registry.py +1 -1
  75. {allelix-1.8.4 → allelix-2.0.0}/tests/test_version.py +1 -1
  76. allelix-1.8.4/allelix/cli.py +0 -1541
  77. allelix-1.8.4/allelix/parsers/_helpers.py +0 -41
  78. {allelix-1.8.4 → allelix-2.0.0}/LICENSE +0 -0
  79. {allelix-1.8.4 → allelix-2.0.0}/allelix/data/high_value_snps.yaml +0 -0
  80. {allelix-1.8.4 → allelix-2.0.0}/allelix/py.typed +0 -0
  81. {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/dependency_links.txt +0 -0
  82. {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/entry_points.txt +0 -0
  83. {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/top_level.txt +0 -0
  84. {allelix-1.8.4 → allelix-2.0.0}/setup.cfg +0 -0
@@ -1,13 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 1.8.4
3
+ Version: 2.0.0
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
- Author-email: dial481 <dial481@users.noreply.github.com>
5
+ Author: Allelix
6
+ Maintainer-email: dial481 <dial481@users.noreply.github.com>
6
7
  License-Expression: AGPL-3.0-or-later
7
8
  Project-URL: Homepage, https://allelix.io
8
- Project-URL: Source, https://github.com/dial481/allelix
9
- Project-URL: Issues, https://github.com/dial481/allelix/issues
10
- Project-URL: Changelog, https://github.com/dial481/allelix/blob/main/CHANGELOG.md
9
+ Project-URL: Source, https://github.com/allelix/allelix
10
+ Project-URL: Issues, https://github.com/allelix/allelix/issues
11
+ Project-URL: Changelog, https://github.com/allelix/allelix/blob/main/CHANGELOG.md
11
12
  Keywords: genomics,genotype,snp,bioinformatics,dna
12
13
  Classifier: Development Status :: 5 - Production/Stable
13
14
  Classifier: Intended Audience :: Science/Research
@@ -25,6 +26,8 @@ Requires-Dist: pyyaml>=6.0
25
26
  Requires-Dist: rich>=13.7
26
27
  Provides-Extra: cadd
27
28
  Requires-Dist: pysam>=0.22; extra == "cadd"
29
+ Provides-Extra: vcf-index
30
+ Requires-Dist: pysam>=0.22; extra == "vcf-index"
28
31
  Provides-Extra: dev
29
32
  Requires-Dist: pre-commit>=3.7; extra == "dev"
30
33
  Requires-Dist: pytest>=8.0; extra == "dev"
@@ -36,18 +39,20 @@ Dynamic: license-file
36
39
 
37
40
  Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
38
41
 
39
- > **Status:** Production — six parser formats, four annotators (ClinVar +
40
- > PharmGKB + GWAS Catalog + SNPedia), three enrichment sources (gnomAD
41
- > population frequencies + AlphaMissense pathogenicity + CADD
42
- > deleteriousness), licensable-source gating for commercial users,
43
- > dual-build ClinVar caches (GRCh37 + GRCh38),
42
+ > **Status:** Production — eight parser formats (including VCF + gVCF),
43
+ > four annotators (ClinVar + ClinPGx + GWAS Catalog + SNPedia), three
44
+ > enrichment sources (gnomAD population frequencies + AlphaMissense
45
+ > pathogenicity + CADD deleteriousness), licensable-source gating for
46
+ > commercial users, dual-build ClinVar caches (GRCh37 + GRCh38),
44
47
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
45
48
  > commands, report diffing, persistent config with commercial-mode
46
49
  > safety switch. Build auto-detection from position data (ADR-0021).
47
- > No regex on prose anywhere in production. **Latest: v1.8.4** —
48
- > `--no-cadd` flag for licensing exclusion parity.
50
+ > No regex on prose anywhere in production. **Latest: v2.0.0** — VCF +
51
+ > gVCF parser with multi-sample handling, batched annotation pipeline
52
+ > for WGS scale, FTDNA Illumina raw parser, R-4 ClinVar CLNSIG drift CI
53
+ > test, CLI package restructure.
49
54
  > Release notes:
50
- > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
55
+ > [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
51
56
 
52
57
  ## Quickstart
53
58
 
@@ -61,6 +66,15 @@ allelix db update
61
66
 
62
67
  # Analyze a genotype file
63
68
  allelix analyze your_genotype_file.txt --output report.html
69
+
70
+ # VCF / gVCF input — same command, auto-detected
71
+ allelix analyze your_wgs.vcf.gz --output report.html
72
+
73
+ # Multi-sample VCF — pick which sample to analyze
74
+ allelix analyze trio.vcf.gz --sample HG002 --output report.html
75
+
76
+ # Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
77
+ allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
64
78
  ```
65
79
 
66
80
  Requires Python 3.11+. See [Development](#development) for source installs and running tests.
@@ -75,16 +89,20 @@ Requires Python 3.11+. See [Development](#development) for source installs and r
75
89
  | Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
76
90
  | MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
77
91
  | Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
92
+ | FTDNA Illumina raw | ✓ | Tab-delimited variant of the FTDNA export (distinct from the CSV format above). `RSID/CHROMOSOME/POSITION/RESULT` columns. Build 37 default. |
93
+ | VCF / gVCF | ✓ | REF/ALT encoding, `0/1` genotype notation. Plain VCF: absence at a position means reference. gVCF: explicit reference blocks (lines with `<NON_REF>` ALT and `END=` INFO) are skipped — they match nothing in any annotation database. Multi-sample files require `--sample <ID>`. Streams via stdlib; `.vcf.gz` handled transparently. Optional `pip install allelix[vcf-index]` enables pysam-backed tabix random access for fast `extract --snps` on huge VCFs. |
78
94
 
79
95
  Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
80
96
 
81
- ### v2 roadmap
97
+ ### v2.1+ roadmap
82
98
 
83
- | Format | Notes |
99
+ | Feature | Notes |
84
100
  |---|---|
85
- | VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
86
101
  | Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
102
+ | Annotator-level strand awareness (R-1) | Strand-flip matching wired into every annotator's carrier check. Basic `compare` strand support shipped in v1.1; full annotator integration deferred from v2.0.0. |
103
+ | Good / Bad / Neutral repute | Per-annotation repute field. Reframes the report from "here's what's wrong" to "here's your full picture." Requires Annotation model change + renderer updates. |
87
104
  | PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
105
+ | PharmCAT integration | Wrap CPIC's PharmCAT as an optional external engine for star-allele / diplotype calling. Requires VCF input (shipped in v2.0.0). |
88
106
  | Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
89
107
 
90
108
  ## Supported Databases
@@ -92,22 +110,32 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
92
110
  | Database | Status | Notes |
93
111
  |---|---|---|
94
112
  | ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
95
- | PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
96
- | CPIC (per-allele function table) | ✓ | Internal data source for the PharmGKB filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
113
+ | ClinPGx (formerly PharmGKB) | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
114
+ | CPIC (per-allele function table) | ✓ | Internal data source for the ClinPGx filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
97
115
  | SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
98
116
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
99
117
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
100
118
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
101
119
  | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
102
120
 
103
- ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
121
+ ### Build coverage asymmetry (GRCh37 vs GRCh38)
122
+
123
+ ClinVar dispatches per-build (ADR-0021) and ships with both GRCh37 and GRCh38 caches. The two caches are essentially equivalent in coverage: 2,896,063 rows / 2,645,206 distinct rsIDs in GRCh37 vs 2,896,102 / 2,645,243 in GRCh38 — a difference of 39 rows.
124
+
125
+ Despite that equivalence, the same person's WGS data produces noticeably more annotations when supplied on GRCh37 coordinates than on GRCh38. The cause lies in the resolution step, not in the shape of the upstream data. Position-keyed rsID resolution requires exact `(chromosome, position, ref, alt)` alignment between the user's variant call and ClinVar's stored row. Lift-over between builds does not preserve that alignment perfectly: the roughly 0.4% of the genome where the reference assembly was rebuilt has different REF alleles, multi-allelic sites are split differently, and some benchmark VCF positions drop out entirely in the GRCh38 lift. Each misalignment loses one resolution, which in turn loses every rsID-keyed downstream annotation that the rsID would have driven (ClinVar's own carrier annotation, plus GWAS Catalog, SNPedia, and ClinPGx).
126
+
127
+ On the real GIAB HG002 benchmark, counting only annotations that survive the default `--min-magnitude 5.0` filter, GRCh37 surfaces 520 distinct rsIDs across all sources and GRCh38 surfaces 341. The two sets overlap on 331 rsIDs; 189 are GRCh37-only and 10 are GRCh38-only — pure asymmetric loss in the GRCh38 lift, not different upstream coverage. The unfiltered totals (65,965 vs 4,867) magnify the same pattern at lower magnitudes, mostly via GWAS-Catalog weak-association rows.
128
+
129
+ If you have a choice of build for the input, GRCh37 surfaces more annotations today on rsID-less VCFs that flow through position-keyed resolution. GRCh38 still surfaces every ClinVar carrier hit it has an exact alignment for.
130
+
131
+ ### Known ClinPGx limitation: reference-genotype rows where ClinVar and CPIC both lack data
104
132
 
105
- ADR-0022 + ADR-0023: a tiny residual of PharmGKB rows may appear in reports even when the user is homozygous reference. PharmGKB publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
133
+ ADR-0022 + ADR-0023: a tiny residual of ClinPGx rows may appear in reports even when the user is homozygous reference. ClinPGx publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
106
134
 
107
135
  - **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
108
136
  - **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
109
137
 
110
- For the rare rsID where PharmGKB has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
138
+ For the rare rsID where ClinPGx has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
111
139
 
112
140
  The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
113
141
 
@@ -164,7 +192,7 @@ Not all databases are equal in size. `allelix db update` downloads them all by d
164
192
  | Database | On disk | Download time | What it adds |
165
193
  |---|---|---|---|
166
194
  | ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
167
- | PharmGKB + CPIC | ~6MB | seconds | Drug-gene interactions. |
195
+ | ClinPGx + CPIC | ~6MB | seconds | Drug-gene interactions. |
168
196
  | GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
169
197
  | gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
170
198
  | AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
@@ -182,14 +210,14 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
182
210
  |---|---|---|---|
183
211
  | ClinVar | NCBI | Public domain | No restrictions |
184
212
  | GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
185
- | PharmGKB | pharmgkb.org | CC BY-SA 4.0 | Attribution required |
186
- | CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the PharmGKB non-finding filter (ADR-0020), not surfaced as its own annotator. |
213
+ | ClinPGx (formerly PharmGKB) | clinpgx.org | CC BY-SA 4.0 | Attribution required |
214
+ | CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the ClinPGx non-finding filter (ADR-0020), not surfaced as its own annotator. |
187
215
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
188
216
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
189
217
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
190
218
  | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
191
219
 
192
- **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
220
+ **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, ClinPGx, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
193
221
 
194
222
  ### SNPedia data download
195
223
 
@@ -218,17 +246,17 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
218
246
 
219
247
  ## Architecture & Design Decisions
220
248
 
221
- The "why" behind major design choices lives in [`docs/adr/`](https://github.com/dial481/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
249
+ The "why" behind major design choices lives in [`docs/adr/`](https://github.com/allelix/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
222
250
 
223
251
  Notable load-bearing ADRs:
224
252
 
225
253
  - **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
226
- - **ADR-0020 — CPIC API as the per-allele function source.** The PharmGKB non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
254
+ - **ADR-0020 — CPIC API as the per-allele function source.** The ClinPGx non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
227
255
  - **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar.
228
- - **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
256
+ - **ADR-0009 — ClinPGx matches the user's exact normalized diploid call.**
229
257
  - **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
230
258
 
231
- Release history: see [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
259
+ Release history: see [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
232
260
 
233
261
  ## Development
234
262
 
@@ -248,4 +276,4 @@ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit i
248
276
 
249
277
  ## License
250
278
 
251
- AGPL-3.0-or-later. See `LICENSE`.
279
+ GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later). See `LICENSE`.
@@ -2,18 +2,20 @@
2
2
 
3
3
  Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
4
4
 
5
- > **Status:** Production — six parser formats, four annotators (ClinVar +
6
- > PharmGKB + GWAS Catalog + SNPedia), three enrichment sources (gnomAD
7
- > population frequencies + AlphaMissense pathogenicity + CADD
8
- > deleteriousness), licensable-source gating for commercial users,
9
- > dual-build ClinVar caches (GRCh37 + GRCh38),
5
+ > **Status:** Production — eight parser formats (including VCF + gVCF),
6
+ > four annotators (ClinVar + ClinPGx + GWAS Catalog + SNPedia), three
7
+ > enrichment sources (gnomAD population frequencies + AlphaMissense
8
+ > pathogenicity + CADD deleteriousness), licensable-source gating for
9
+ > commercial users, dual-build ClinVar caches (GRCh37 + GRCh38),
10
10
  > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
11
11
  > commands, report diffing, persistent config with commercial-mode
12
12
  > safety switch. Build auto-detection from position data (ADR-0021).
13
- > No regex on prose anywhere in production. **Latest: v1.8.4** —
14
- > `--no-cadd` flag for licensing exclusion parity.
13
+ > No regex on prose anywhere in production. **Latest: v2.0.0** — VCF +
14
+ > gVCF parser with multi-sample handling, batched annotation pipeline
15
+ > for WGS scale, FTDNA Illumina raw parser, R-4 ClinVar CLNSIG drift CI
16
+ > test, CLI package restructure.
15
17
  > Release notes:
16
- > [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
18
+ > [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
17
19
 
18
20
  ## Quickstart
19
21
 
@@ -27,6 +29,15 @@ allelix db update
27
29
 
28
30
  # Analyze a genotype file
29
31
  allelix analyze your_genotype_file.txt --output report.html
32
+
33
+ # VCF / gVCF input — same command, auto-detected
34
+ allelix analyze your_wgs.vcf.gz --output report.html
35
+
36
+ # Multi-sample VCF — pick which sample to analyze
37
+ allelix analyze trio.vcf.gz --sample HG002 --output report.html
38
+
39
+ # Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
40
+ allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
30
41
  ```
31
42
 
32
43
  Requires Python 3.11+. See [Development](#development) for source installs and running tests.
@@ -41,16 +52,20 @@ Requires Python 3.11+. See [Development](#development) for source installs and r
41
52
  | Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
42
53
  | MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
43
54
  | Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
55
+ | FTDNA Illumina raw | ✓ | Tab-delimited variant of the FTDNA export (distinct from the CSV format above). `RSID/CHROMOSOME/POSITION/RESULT` columns. Build 37 default. |
56
+ | VCF / gVCF | ✓ | REF/ALT encoding, `0/1` genotype notation. Plain VCF: absence at a position means reference. gVCF: explicit reference blocks (lines with `<NON_REF>` ALT and `END=` INFO) are skipped — they match nothing in any annotation database. Multi-sample files require `--sample <ID>`. Streams via stdlib; `.vcf.gz` handled transparently. Optional `pip install allelix[vcf-index]` enables pysam-backed tabix random access for fast `extract --snps` on huge VCFs. |
44
57
 
45
58
  Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
46
59
 
47
- ### v2 roadmap
60
+ ### v2.1+ roadmap
48
61
 
49
- | Format | Notes |
62
+ | Feature | Notes |
50
63
  |---|---|
51
- | VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
52
64
  | Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
65
+ | Annotator-level strand awareness (R-1) | Strand-flip matching wired into every annotator's carrier check. Basic `compare` strand support shipped in v1.1; full annotator integration deferred from v2.0.0. |
66
+ | Good / Bad / Neutral repute | Per-annotation repute field. Reframes the report from "here's what's wrong" to "here's your full picture." Requires Annotation model change + renderer updates. |
53
67
  | PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
68
+ | PharmCAT integration | Wrap CPIC's PharmCAT as an optional external engine for star-allele / diplotype calling. Requires VCF input (shipped in v2.0.0). |
54
69
  | Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
55
70
 
56
71
  ## Supported Databases
@@ -58,22 +73,32 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
58
73
  | Database | Status | Notes |
59
74
  |---|---|---|
60
75
  | ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
61
- | PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
62
- | CPIC (per-allele function table) | ✓ | Internal data source for the PharmGKB filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
76
+ | ClinPGx (formerly PharmGKB) | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
77
+ | CPIC (per-allele function table) | ✓ | Internal data source for the ClinPGx filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
63
78
  | SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
64
79
  | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
65
80
  | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
66
81
  | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
67
82
  | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
68
83
 
69
- ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
84
+ ### Build coverage asymmetry (GRCh37 vs GRCh38)
85
+
86
+ ClinVar dispatches per-build (ADR-0021) and ships with both GRCh37 and GRCh38 caches. The two caches are essentially equivalent in coverage: 2,896,063 rows / 2,645,206 distinct rsIDs in GRCh37 vs 2,896,102 / 2,645,243 in GRCh38 — a difference of 39 rows.
87
+
88
+ Despite that equivalence, the same person's WGS file produces noticeably more annotations when it is analyzed on GRCh37 coordinates than when it is analyzed on GRCh38 coordinates. The mechanism is in the resolution step, not in upstream-data shape. Position-keyed rsID resolution requires exact `(chromosome, position, ref, alt)` alignment between the user's variant call and ClinVar's stored row. Lift-over between builds does not preserve that alignment perfectly: the `~0.4%` of the genome where the reference assembly was rebuilt has different REF alleles, multi-allelic sites split differently, and some benchmark VCF positions drop out entirely in the GRCh38 lift. Each misalignment loses one resolution, which in turn loses all the rsID-keyed downstream annotations that rsID would have driven (ClinVar's own carrier annotation, plus GWAS Catalog, SNPedia, and ClinPGx).
89
+
90
+ On the real GIAB HG002 benchmark, counting only results that survive the default `--min-magnitude 5.0` filter, GRCh37 surfaces 520 distinct rsIDs across all sources and GRCh38 surfaces 341. The two sets overlap on 331 rsIDs; 189 are GRCh37-only and 10 are GRCh38-only — pure asymmetric loss in the GRCh38 lift, not different upstream coverage. The unfiltered totals (65,965 for GRCh37 vs 4,867 for GRCh38) magnify the same pattern at lower magnitudes, mostly via GWAS-Catalog weak-association rows.
91
+
92
+ If you have a choice of build for the input, GRCh37 surfaces more annotations today on rsID-less VCFs that flow through position-keyed resolution. GRCh38 still surfaces every ClinVar carrier hit it has an exact alignment for.
93
+
94
+ ### Known ClinPGx limitation: reference-genotype rows where ClinVar and CPIC both lack data
70
95
 
71
- ADR-0022 + ADR-0023: a tiny residual of PharmGKB rows may appear in reports even when the user is homozygous reference. PharmGKB publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
96
+ ADR-0022 + ADR-0023: a tiny residual of ClinPGx rows may appear in reports even when the user is homozygous reference. ClinPGx publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
72
97
 
73
98
  - **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
74
99
  - **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
75
100
 
76
- For the rare rsID where PharmGKB has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
101
+ For the rare rsID where ClinPGx has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
77
102
 
78
103
  The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
79
104
 
@@ -130,7 +155,7 @@ Not all databases are equal in size. `allelix db update` downloads them all by d
130
155
  | Database | On disk | Download time | What it adds |
131
156
  |---|---|---|---|
132
157
  | ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
133
- | PharmGKB + CPIC | ~6MB | seconds | Drug-gene interactions. |
158
+ | ClinPGx + CPIC | ~6MB | seconds | Drug-gene interactions. |
134
159
  | GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
135
160
  | gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
136
161
  | AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
@@ -148,14 +173,14 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
148
173
  |---|---|---|---|
149
174
  | ClinVar | NCBI | Public domain | No restrictions |
150
175
  | GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
151
- | PharmGKB | pharmgkb.org | CC BY-SA 4.0 | Attribution required |
152
- | CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the PharmGKB non-finding filter (ADR-0020), not surfaced as its own annotator. |
176
+ | ClinPGx (formerly PharmGKB) | clinpgx.org | CC BY-SA 4.0 | Attribution required |
177
+ | CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the ClinPGx non-finding filter (ADR-0020), not surfaced as its own annotator. |
153
178
  | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
154
179
  | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
155
180
  | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
156
181
  | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
157
182
 
158
- **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
183
+ **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, ClinPGx, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
159
184
 
160
185
  ### SNPedia data download
161
186
 
@@ -184,17 +209,17 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
184
209
 
185
210
  ## Architecture & Design Decisions
186
211
 
187
- The "why" behind major design choices lives in [`docs/adr/`](https://github.com/dial481/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
212
+ The "why" behind major design choices lives in [`docs/adr/`](https://github.com/allelix/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
188
213
 
189
214
  Notable load-bearing ADRs:
190
215
 
191
216
  - **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
192
- - **ADR-0020 — CPIC API as the per-allele function source.** The PharmGKB non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
217
+ - **ADR-0020 — CPIC API as the per-allele function source.** The ClinPGx non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
193
218
  - **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar.
194
- - **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
219
+ - **ADR-0009 — ClinPGx matches the user's exact normalized diploid call.**
195
220
  - **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
196
221
 
197
- Release history: see [`CHANGELOG.md`](https://github.com/dial481/allelix/blob/main/CHANGELOG.md).
222
+ Release history: see [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
198
223
 
199
224
  ## Development
200
225
 
@@ -214,4 +239,4 @@ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit i
214
239
 
215
240
  ## License
216
241
 
217
- AGPL-3.0-or-later. See `LICENSE`.
242
+ GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later). See `LICENSE`.
@@ -1,5 +1,5 @@
1
1
  # SPDX-License-Identifier: AGPL-3.0-or-later
2
- # Copyright (C) 2026 dial481
2
+ # Copyright (C) 2026 Allelix
3
3
  """Allelix: open-source genotype analysis toolkit."""
4
4
 
5
5
  from __future__ import annotations
@@ -1,5 +1,5 @@
1
1
  # SPDX-License-Identifier: AGPL-3.0-or-later
2
- # Copyright (C) 2026 dial481
2
+ # Copyright (C) 2026 Allelix
3
3
  """Annotator registry. Unlike parsers, ALL annotators run on every variant."""
4
4
 
5
5
  from __future__ import annotations
@@ -43,7 +43,7 @@ def get_annotators(
43
43
  complete 81 GB CADD file). Requires ``pysam`` and a local copy.
44
44
 
45
45
  ADR-0023: ClinVar's `reference_for(rsid, build)` is wired into
46
- PharmGKB and SNPedia as the primary hom-ref suppression filter — the
46
+ ClinPGx and SNPedia as the primary hom-ref suppression filter — the
47
47
  REF allele lookup universally determines whether the user is
48
48
  homozygous reference (and thus a non-finding for that variant).
49
49
  """
@@ -1,5 +1,5 @@
1
1
  # SPDX-License-Identifier: AGPL-3.0-or-later
2
- # Copyright (C) 2026 dial481
2
+ # Copyright (C) 2026 Allelix
3
3
  """AlphaMissense variant pathogenicity enrichment.
4
4
 
5
5
  AlphaMissense is not a clinical annotator — it does not produce
@@ -226,3 +226,32 @@ class AlphaMissenseAnnotator(Annotator):
226
226
  if score is not None:
227
227
  result[(rsid, alt)] = (score, cls)
228
228
  return result
229
+
230
def bulk_lookup_by_position(
    self, keys: set[tuple[str, int, str, str]]
) -> dict[tuple[str, int, str, str], tuple[float, str]]:
    """Look up AlphaMissense scores by exact ``(chrom, pos, ref, alt)`` key.

    Position-keyed fallback for rsID-less VCFs whose ClinVar-resolved
    rsIDs aren't indexed in the AlphaMissense cache; the query filters
    on the ``(chrom, pos, ref, alt)`` primary-key columns of
    ``alphamissense_scores`` directly.

    Args:
        keys: Variant keys to look up. An empty set returns ``{}``
            without opening a connection.

    Returns:
        ``{(chrom, pos, ref, alt): (score, class)}`` for every row that
        matched a requested key and has a non-NULL ``am_pathogenicity``.
        Keys with no row, or with a NULL score, are absent.
    """
    if not keys:
        return {}

    conn = self._connection()
    found: dict[tuple[str, int, str, str], tuple[float, str]] = {}
    pending = list(keys)
    # Each key binds four placeholders, so a chunk holds a quarter of the
    # rsID batch size (presumably to stay within the same bound-parameter
    # budget as the rsID lookups; confirm against _BULK_BATCH_SIZE).
    chunk_len = _BULK_BATCH_SIZE // 4
    one_key = "(chrom = ? AND pos = ? AND ref = ? AND alt = ?)"

    for start in range(0, len(pending), chunk_len):
        group = pending[start : start + chunk_len]
        where = " OR ".join(one_key for _ in group)
        # Flatten [(chrom, pos, ref, alt), ...] into one positional list.
        flat_params = [field for key in group for field in key]
        cursor = conn.execute(
            "SELECT chrom, pos, ref, alt, am_pathogenicity, am_class"
            f" FROM alphamissense_scores WHERE {where}",
            flat_params,
        )
        for chrom, pos, ref, alt, score, cls in cursor.fetchall():
            if score is None:
                continue
            found[(chrom, pos, ref, alt)] = (score, cls)

    return found
@@ -1,5 +1,5 @@
1
1
  # SPDX-License-Identifier: AGPL-3.0-or-later
2
- # Copyright (C) 2026 dial481
2
+ # Copyright (C) 2026 Allelix
3
3
  """Abstract base class for reference-database annotators."""
4
4
 
5
5
  from __future__ import annotations
@@ -11,7 +11,7 @@ from enum import Enum, auto
11
11
  from typing import TYPE_CHECKING, ClassVar
12
12
 
13
13
  if TYPE_CHECKING:
14
- from collections.abc import Callable
14
+ from collections.abc import Callable, Iterable, Iterator
15
15
  from pathlib import Path
16
16
  from types import TracebackType
17
17
 
@@ -173,6 +173,22 @@ class Annotator(ABC):
173
173
  """
174
174
  ...
175
175
 
176
def batch_annotate(self, variants: Iterable[Variant]) -> Iterator[Annotation]:
    """Lazily annotate many variants, preserving their input order.

    This base implementation is a plain fallback: it calls
    ``annotate`` once per variant and forwards whatever that call
    produces, so annotators written before batching existed keep
    working with no changes.

    Subclasses backed by rsID-keyed SQLite lookups should override it
    with a chunked ``WHERE rsid IN (...)`` query, which avoids one
    round-trip per variant at WGS scale (4-6M variants per VCF).

    Having a default here keeps the pipeline single-path: callers
    always go through ``batch_annotate`` and never need to check
    whether a given annotator has a batched query path.

    Args:
        variants: Variants to annotate; consumed one at a time.

    Yields:
        Each annotation produced by ``annotate`` for each variant, in
        the order the variants arrive.
    """
    for current in variants:
        # Delegate to the per-variant path; results stream through as-is.
        yield from self.annotate(current)
191
+
176
192
  @abstractmethod
177
193
  def is_ready(self) -> bool:
178
194
  """Whether the local cache exists and is queryable."""
@@ -1,5 +1,5 @@
1
1
  # SPDX-License-Identifier: AGPL-3.0-or-later
2
- # Copyright (C) 2026 dial481
2
+ # Copyright (C) 2026 Allelix
3
3
  """CADD variant deleteriousness enrichment.
4
4
 
5
5
  CADD is not a clinical annotator — it does not produce Annotation