allelix 1.9.0__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {allelix-1.9.0 → allelix-2.0.1}/PKG-INFO +55 -30
- {allelix-1.9.0 → allelix-2.0.1}/README.md +47 -25
- {allelix-1.9.0 → allelix-2.0.1}/allelix/__init__.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/__init__.py +2 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/alphamissense.py +30 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/base.py +18 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/cadd.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/clinvar.py +224 -8
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/gnomad.py +41 -3
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/gwas.py +112 -8
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/pharmgkb.py +119 -23
- {allelix-1.9.0 → allelix-2.0.1}/allelix/annotators/snpedia.py +85 -2
- allelix-2.0.1/allelix/cli/__init__.py +36 -0
- allelix-2.0.1/allelix/cli/_helpers.py +577 -0
- allelix-2.0.1/allelix/cli/_options.py +152 -0
- allelix-2.0.1/allelix/cli/analyze.py +116 -0
- allelix-2.0.1/allelix/cli/config.py +190 -0
- allelix-2.0.1/allelix/cli/db.py +242 -0
- allelix-2.0.1/allelix/cli/focused.py +176 -0
- allelix-2.0.1/allelix/cli/utility.py +530 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/compare.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/config.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/data/__init__.py +1 -1
- allelix-2.0.1/allelix/data/clinvar_clnsig_snapshot.yaml +115 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/__init__.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/_versions.py +2 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/alphamissense_loader.py +2 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/cadd_loader.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/cpic_loader.py +5 -5
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/gnomad_loader.py +2 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/gwas_loader.py +26 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/loader_utils.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/manager.py +23 -6
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/pharmgkb_loader.py +32 -11
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/schema.py +4 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/snpedia_loader.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/databases/snpedia_parser.py +18 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/exporters/__init__.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/exporters/plink.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/models.py +21 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/__init__.py +5 -1
- allelix-2.0.1/allelix/parsers/_helpers.py +98 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/ancestrydna.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/base.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/ftdna.py +19 -2
- allelix-2.0.1/allelix/parsers/ftdna_illumina.py +143 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/livingdna.py +23 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/myhappygenes.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/myheritage.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/parsers/twentythreeandme.py +1 -1
- allelix-2.0.1/allelix/parsers/vcf.py +535 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/__init__.py +2 -2
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/_pipeline.py +183 -54
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/diff.py +44 -5
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/high_value.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/html.py +13 -4
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/json_report.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/reports/methylation.py +1 -1
- allelix-2.0.1/allelix/reports/terminal.py +241 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/utils/__init__.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix/utils/allele.py +17 -15
- {allelix-1.9.0 → allelix-2.0.1}/allelix/utils/build_detect.py +32 -10
- {allelix-1.9.0 → allelix-2.0.1}/allelix.egg-info/PKG-INFO +55 -30
- {allelix-1.9.0 → allelix-2.0.1}/allelix.egg-info/SOURCES.txt +11 -1
- {allelix-1.9.0 → allelix-2.0.1}/allelix.egg-info/requires.txt +3 -0
- {allelix-1.9.0 → allelix-2.0.1}/pyproject.toml +9 -5
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_cli.py +504 -47
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_cli_helpers.py +19 -14
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_compare.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_config.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_end_to_end.py +13 -13
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_mock_data_invariants.py +9 -9
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_models.py +34 -1
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_registry.py +1 -1
- {allelix-1.9.0 → allelix-2.0.1}/tests/test_version.py +1 -1
- allelix-1.9.0/allelix/cli.py +0 -1596
- allelix-1.9.0/allelix/parsers/_helpers.py +0 -41
- allelix-1.9.0/allelix/reports/terminal.py +0 -205
- {allelix-1.9.0 → allelix-2.0.1}/LICENSE +0 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/data/high_value_snps.yaml +0 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix/py.typed +0 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix.egg-info/dependency_links.txt +0 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix.egg-info/entry_points.txt +0 -0
- {allelix-1.9.0 → allelix-2.0.1}/allelix.egg-info/top_level.txt +0 -0
- {allelix-1.9.0 → allelix-2.0.1}/setup.cfg +0 -0
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: allelix
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.1
|
|
4
4
|
Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
5
|
-
Author
|
|
5
|
+
Author: Allelix
|
|
6
|
+
Maintainer-email: dial481 <dial481@users.noreply.github.com>
|
|
6
7
|
License-Expression: AGPL-3.0-or-later
|
|
7
8
|
Project-URL: Homepage, https://allelix.io
|
|
8
|
-
Project-URL: Source, https://github.com/
|
|
9
|
-
Project-URL: Issues, https://github.com/
|
|
10
|
-
Project-URL: Changelog, https://github.com/
|
|
9
|
+
Project-URL: Source, https://github.com/allelix/allelix
|
|
10
|
+
Project-URL: Issues, https://github.com/allelix/allelix/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/allelix/allelix/blob/main/CHANGELOG.md
|
|
11
12
|
Keywords: genomics,genotype,snp,bioinformatics,dna
|
|
12
13
|
Classifier: Development Status :: 5 - Production/Stable
|
|
13
14
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -25,6 +26,8 @@ Requires-Dist: pyyaml>=6.0
|
|
|
25
26
|
Requires-Dist: rich>=13.7
|
|
26
27
|
Provides-Extra: cadd
|
|
27
28
|
Requires-Dist: pysam>=0.22; extra == "cadd"
|
|
29
|
+
Provides-Extra: vcf-index
|
|
30
|
+
Requires-Dist: pysam>=0.22; extra == "vcf-index"
|
|
28
31
|
Provides-Extra: dev
|
|
29
32
|
Requires-Dist: pre-commit>=3.7; extra == "dev"
|
|
30
33
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
@@ -36,18 +39,20 @@ Dynamic: license-file
|
|
|
36
39
|
|
|
37
40
|
Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
38
41
|
|
|
39
|
-
> **Status:** Production —
|
|
40
|
-
>
|
|
41
|
-
> population frequencies + AlphaMissense
|
|
42
|
-
> deleteriousness), licensable-source gating for
|
|
43
|
-
> dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
42
|
+
> **Status:** Production — eight parser formats (including VCF + gVCF),
|
|
43
|
+
> four annotators (ClinVar + ClinPGx + GWAS Catalog + SNPedia), three
|
|
44
|
+
> enrichment sources (gnomAD population frequencies + AlphaMissense
|
|
45
|
+
> pathogenicity + CADD deleteriousness), licensable-source gating for
|
|
46
|
+
> commercial users, dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
44
47
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
45
48
|
> commands, report diffing, persistent config with commercial-mode
|
|
46
49
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
47
|
-
> No regex on prose anywhere in production. **Latest:
|
|
48
|
-
>
|
|
50
|
+
> No regex on prose anywhere in production. **Latest: v2.0.0** — VCF +
|
|
51
|
+
> gVCF parser with multi-sample handling, batched annotation pipeline
|
|
52
|
+
> for WGS scale, FTDNA Illumina raw parser, R-4 ClinVar CLNSIG drift CI
|
|
53
|
+
> test, CLI package restructure.
|
|
49
54
|
> Release notes:
|
|
50
|
-
> [`CHANGELOG.md`](https://github.com/
|
|
55
|
+
> [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
51
56
|
|
|
52
57
|
## Quickstart
|
|
53
58
|
|
|
@@ -62,6 +67,12 @@ allelix db update
|
|
|
62
67
|
# Analyze a genotype file
|
|
63
68
|
allelix analyze your_genotype_file.txt --output report.html
|
|
64
69
|
|
|
70
|
+
# VCF / gVCF input — same command, auto-detected
|
|
71
|
+
allelix analyze your_wgs.vcf.gz --output report.html
|
|
72
|
+
|
|
73
|
+
# Multi-sample VCF — pick which sample to analyze
|
|
74
|
+
allelix analyze trio.vcf.gz --sample HG002 --output report.html
|
|
75
|
+
|
|
65
76
|
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
66
77
|
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
67
78
|
```
|
|
@@ -78,16 +89,20 @@ Requires Python 3.11+. See [Development](#development) for source installs and r
|
|
|
78
89
|
| Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
|
|
79
90
|
| MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
|
|
80
91
|
| Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
|
|
92
|
+
| FTDNA Illumina raw | ✓ | Tab-delimited variant of the FTDNA export (distinct from the CSV format above). `RSID/CHROMOSOME/POSITION/RESULT` columns. Build 37 default. |
|
|
93
|
+
| VCF / gVCF | ✓ | REF/ALT encoding, `0/1` genotype notation. Plain VCF: absence at a position means reference. gVCF: explicit reference blocks (lines with `<NON_REF>` ALT and `END=` INFO) are skipped — they match nothing in any annotation database. Multi-sample files require `--sample <ID>`. Streams via stdlib; `.vcf.gz` handled transparently. Optional `pip install allelix[vcf-index]` enables pysam-backed tabix random access for fast `extract --snps` on huge VCFs. |
|
|
81
94
|
|
|
82
95
|
Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
|
|
83
96
|
|
|
84
|
-
### v2 roadmap
|
|
97
|
+
### v2.1+ roadmap
|
|
85
98
|
|
|
86
|
-
|
|
|
99
|
+
| Feature | Notes |
|
|
87
100
|
|---|---|
|
|
88
|
-
| VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
|
|
89
101
|
| Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
|
|
102
|
+
| Annotator-level strand awareness (R-1) | Strand-flip matching wired into every annotator's carrier check. Basic `compare` strand support shipped in v1.1; full annotator integration deferred from v2.0.0. |
|
|
103
|
+
| Good / Bad / Neutral repute | Per-annotation repute field. Reframes the report from "here's what's wrong" to "here's your full picture." Requires Annotation model change + renderer updates. |
|
|
90
104
|
| PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
|
|
105
|
+
| PharmCAT integration | Wrap CPIC's PharmCAT as an optional external engine for star-allele / diplotype calling. Requires VCF input (shipped in v2.0.0). |
|
|
91
106
|
| Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
|
|
92
107
|
|
|
93
108
|
## Supported Databases
|
|
@@ -95,22 +110,32 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
95
110
|
| Database | Status | Notes |
|
|
96
111
|
|---|---|---|
|
|
97
112
|
| ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
|
|
98
|
-
| PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
99
|
-
| CPIC (per-allele function table) | ✓ | Internal data source for the
|
|
113
|
+
| ClinPGx (formerly PharmGKB) | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
114
|
+
| CPIC (per-allele function table) | ✓ | Internal data source for the ClinPGx filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
|
|
100
115
|
| SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
|
|
101
116
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
102
117
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
103
118
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
104
119
|
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
105
120
|
|
|
106
|
-
###
|
|
121
|
+
### Build coverage asymmetry (GRCh37 vs GRCh38)
|
|
122
|
+
|
|
123
|
+
ClinVar dispatches per-build (ADR-0021) and ships with both GRCh37 and GRCh38 caches. The two caches are essentially equivalent in coverage: 2,896,063 rows / 2,645,206 distinct rsIDs in GRCh37 vs 2,896,102 / 2,645,243 in GRCh38 — a difference of 39 rows.
|
|
124
|
+
|
|
125
|
+
Despite that equivalence, the same person's WGS file produces noticeably more annotations in GRCh37 coordinates than in GRCh38 coordinates. The mechanism is in the resolution step, not in the shape of the upstream data. Position-keyed rsID resolution requires exact `(chromosome, position, ref, alt)` alignment between the user's variant call and ClinVar's stored row. Lift-over between builds does not preserve that alignment perfectly: the ~0.4% of the genome where the reference assembly was rebuilt has different REF alleles, multi-allelic sites split differently, and some benchmark VCF positions drop out entirely in the GRCh38 lift. Each misalignment loses one resolution, which in turn loses all the rsID-keyed downstream annotations that rsID would have driven (ClinVar's own carrier annotation, plus GWAS Catalog, SNPedia, and ClinPGx).
|
|
126
|
+
|
|
127
|
+
On the real GIAB HG002 benchmark, counting only results that survive the default `--min-magnitude 5.0` filter, GRCh37 surfaces 520 distinct rsIDs across all sources and GRCh38 surfaces 341. The two sets overlap on 331 rsIDs; 189 are GRCh37-only and 10 are GRCh38-only — pure asymmetric loss in the GRCh38 lift, not different upstream coverage. The unfiltered totals (65,965 vs 4,867) magnify the same pattern at lower magnitudes, mostly via GWAS-Catalog weak-association rows.
|
|
128
|
+
|
|
129
|
+
If you have a choice of build for the input, GRCh37 surfaces more annotations today on rsID-less VCFs that flow through position-keyed resolution. GRCh38 still surfaces every ClinVar carrier hit it has an exact alignment for.
|
|
130
|
+
|
|
131
|
+
### Known ClinPGx limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
107
132
|
|
|
108
|
-
ADR-0022 + ADR-0023: a tiny residual of
|
|
133
|
+
ADR-0022 + ADR-0023: a tiny residual of ClinPGx rows may appear in reports even when the user is homozygous reference. ClinPGx publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
|
|
109
134
|
|
|
110
135
|
- **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
|
|
111
136
|
- **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
|
|
112
137
|
|
|
113
|
-
For the rare rsID where
|
|
138
|
+
For the rare rsID where ClinPGx has an annotation but *neither* ClinVar nor CPIC has data, the row is emitted. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
|
|
114
139
|
|
|
115
140
|
The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
|
|
116
141
|
|
|
@@ -167,7 +192,7 @@ Not all databases are equal in size. `allelix db update` downloads them all by d
|
|
|
167
192
|
| Database | On disk | Download time | What it adds |
|
|
168
193
|
|---|---|---|---|
|
|
169
194
|
| ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
|
|
170
|
-
|
|
|
195
|
+
| ClinPGx + CPIC | ~6MB | seconds | Drug-gene interactions. |
|
|
171
196
|
| GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
|
|
172
197
|
| gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
|
|
173
198
|
| AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
|
|
@@ -185,14 +210,14 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
185
210
|
|---|---|---|---|
|
|
186
211
|
| ClinVar | NCBI | Public domain | No restrictions |
|
|
187
212
|
| GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
|
|
188
|
-
| PharmGKB |
|
|
189
|
-
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the
|
|
213
|
+
| ClinPGx (formerly PharmGKB) | clinpgx.org | CC BY-SA 4.0 | Attribution required |
|
|
214
|
+
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the ClinPGx non-finding filter (ADR-0020), not surfaced as its own annotator. |
|
|
190
215
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
191
216
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
192
217
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
193
218
|
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
194
219
|
|
|
195
|
-
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar,
|
|
220
|
+
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, ClinPGx, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
196
221
|
|
|
197
222
|
### SNPedia data download
|
|
198
223
|
|
|
@@ -221,17 +246,17 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
|
|
|
221
246
|
|
|
222
247
|
## Architecture & Design Decisions
|
|
223
248
|
|
|
224
|
-
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/
|
|
249
|
+
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/allelix/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
|
|
225
250
|
|
|
226
251
|
Notable load-bearing ADRs:
|
|
227
252
|
|
|
228
253
|
- **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
|
|
229
|
-
- **ADR-0020 — CPIC API as the per-allele function source.** The
|
|
254
|
+
- **ADR-0020 — CPIC API as the per-allele function source.** The ClinPGx non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
|
|
230
255
|
- **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar.
|
|
231
|
-
- **ADR-0009 —
|
|
256
|
+
- **ADR-0009 — ClinPGx matches the user's exact normalized diploid call.**
|
|
232
257
|
- **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
|
|
233
258
|
|
|
234
|
-
Release history: see [`CHANGELOG.md`](https://github.com/
|
|
259
|
+
Release history: see [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
235
260
|
|
|
236
261
|
## Development
|
|
237
262
|
|
|
@@ -251,4 +276,4 @@ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit i
|
|
|
251
276
|
|
|
252
277
|
## License
|
|
253
278
|
|
|
254
|
-
AGPL-3.0-or-later. See `LICENSE`.
|
|
279
|
+
GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later). See `LICENSE`.
|
|
@@ -2,18 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
4
4
|
|
|
5
|
-
> **Status:** Production —
|
|
6
|
-
>
|
|
7
|
-
> population frequencies + AlphaMissense
|
|
8
|
-
> deleteriousness), licensable-source gating for
|
|
9
|
-
> dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
5
|
+
> **Status:** Production — eight parser formats (including VCF + gVCF),
|
|
6
|
+
> four annotators (ClinVar + ClinPGx + GWAS Catalog + SNPedia), three
|
|
7
|
+
> enrichment sources (gnomAD population frequencies + AlphaMissense
|
|
8
|
+
> pathogenicity + CADD deleteriousness), licensable-source gating for
|
|
9
|
+
> commercial users, dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
10
10
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
11
11
|
> commands, report diffing, persistent config with commercial-mode
|
|
12
12
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
13
|
-
> No regex on prose anywhere in production. **Latest:
|
|
14
|
-
>
|
|
13
|
+
> No regex on prose anywhere in production. **Latest: v2.0.0** — VCF +
|
|
14
|
+
> gVCF parser with multi-sample handling, batched annotation pipeline
|
|
15
|
+
> for WGS scale, FTDNA Illumina raw parser, R-4 ClinVar CLNSIG drift CI
|
|
16
|
+
> test, CLI package restructure.
|
|
15
17
|
> Release notes:
|
|
16
|
-
> [`CHANGELOG.md`](https://github.com/
|
|
18
|
+
> [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
17
19
|
|
|
18
20
|
## Quickstart
|
|
19
21
|
|
|
@@ -28,6 +30,12 @@ allelix db update
|
|
|
28
30
|
# Analyze a genotype file
|
|
29
31
|
allelix analyze your_genotype_file.txt --output report.html
|
|
30
32
|
|
|
33
|
+
# VCF / gVCF input — same command, auto-detected
|
|
34
|
+
allelix analyze your_wgs.vcf.gz --output report.html
|
|
35
|
+
|
|
36
|
+
# Multi-sample VCF — pick which sample to analyze
|
|
37
|
+
allelix analyze trio.vcf.gz --sample HG002 --output report.html
|
|
38
|
+
|
|
31
39
|
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
32
40
|
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
33
41
|
```
|
|
@@ -44,16 +52,20 @@ Requires Python 3.11+. See [Development](#development) for source installs and r
|
|
|
44
52
|
| Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
|
|
45
53
|
| MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
|
|
46
54
|
| Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
|
|
55
|
+
| FTDNA Illumina raw | ✓ | Tab-delimited variant of the FTDNA export (distinct from the CSV format above). `RSID/CHROMOSOME/POSITION/RESULT` columns. Build 37 default. |
|
|
56
|
+
| VCF / gVCF | ✓ | REF/ALT encoding, `0/1` genotype notation. Plain VCF: absence at a position means reference. gVCF: explicit reference blocks (lines with `<NON_REF>` ALT and `END=` INFO) are skipped — they match nothing in any annotation database. Multi-sample files require `--sample <ID>`. Streams via stdlib; `.vcf.gz` handled transparently. Optional `pip install allelix[vcf-index]` enables pysam-backed tabix random access for fast `extract --snps` on huge VCFs. |
|
|
47
57
|
|
|
48
58
|
Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
|
|
49
59
|
|
|
50
|
-
### v2 roadmap
|
|
60
|
+
### v2.1+ roadmap
|
|
51
61
|
|
|
52
|
-
|
|
|
62
|
+
| Feature | Notes |
|
|
53
63
|
|---|---|
|
|
54
|
-
| VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
|
|
55
64
|
| Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
|
|
65
|
+
| Annotator-level strand awareness (R-1) | Strand-flip matching wired into every annotator's carrier check. Basic `compare` strand support shipped in v1.1; full annotator integration deferred from v2.0.0. |
|
|
66
|
+
| Good / Bad / Neutral repute | Per-annotation repute field. Reframes the report from "here's what's wrong" to "here's your full picture." Requires Annotation model change + renderer updates. |
|
|
56
67
|
| PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
|
|
68
|
+
| PharmCAT integration | Wrap CPIC's PharmCAT as an optional external engine for star-allele / diplotype calling. Requires VCF input (shipped in v2.0.0). |
|
|
57
69
|
| Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
|
|
58
70
|
|
|
59
71
|
## Supported Databases
|
|
@@ -61,22 +73,32 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
61
73
|
| Database | Status | Notes |
|
|
62
74
|
|---|---|---|
|
|
63
75
|
| ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
|
|
64
|
-
| PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
65
|
-
| CPIC (per-allele function table) | ✓ | Internal data source for the
|
|
76
|
+
| ClinPGx (formerly PharmGKB) | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
77
|
+
| CPIC (per-allele function table) | ✓ | Internal data source for the ClinPGx filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
|
|
66
78
|
| SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
|
|
67
79
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
68
80
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
69
81
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
70
82
|
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
71
83
|
|
|
72
|
-
###
|
|
84
|
+
### Build coverage asymmetry (GRCh37 vs GRCh38)
|
|
85
|
+
|
|
86
|
+
ClinVar dispatches per-build (ADR-0021) and ships with both GRCh37 and GRCh38 caches. The two caches are essentially equivalent in coverage: 2,896,063 rows / 2,645,206 distinct rsIDs in GRCh37 vs 2,896,102 / 2,645,243 in GRCh38 — a difference of 39 rows.
|
|
87
|
+
|
|
88
|
+
Despite that equivalence, the same person's WGS file produces noticeably more annotations when it is aligned to GRCh37 than when it is aligned to GRCh38. The cause lies in the resolution step, not in the shape of the upstream data. Position-keyed rsID resolution requires exact `(chromosome, position, ref, alt)` alignment between the user's variant call and ClinVar's stored row. Lift-over between builds does not preserve that alignment perfectly: the `~0.4%` of the genome where the reference assembly was rebuilt has different REF alleles, multi-allelic sites split differently, and some benchmark VCF positions drop out entirely in the GRCh38 lift. Each misalignment loses one resolution, which in turn loses all the rsID-keyed downstream annotations that rsID would have driven (ClinVar's own carrier annotation, plus GWAS Catalog, SNPedia, and ClinPGx).
|
|
89
|
+
|
|
90
|
+
On the real GIAB HG002 benchmark, counting only annotations that survive the default `--min-magnitude 5.0` filter, GRCh37 surfaces 520 distinct rsIDs across all sources and GRCh38 surfaces 341. The two sets overlap on 331 rsIDs; 189 are GRCh37-only and 10 are GRCh38-only — pure asymmetric loss in the GRCh38 lift, not different upstream coverage. The unfiltered totals (65,965 vs 4,867) magnify the same pattern at lower magnitudes, mostly via GWAS-Catalog weak-association rows.
|
|
91
|
+
|
|
92
|
+
If you have a choice of build for the input, GRCh37 surfaces more annotations today on rsID-less VCFs that flow through position-keyed resolution. GRCh38 still surfaces every ClinVar carrier hit it has an exact alignment for.
|
|
93
|
+
|
|
94
|
+
### Known ClinPGx limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
73
95
|
|
|
74
|
-
ADR-0022 + ADR-0023: a tiny residual of
|
|
96
|
+
ADR-0022 + ADR-0023: a tiny residual of ClinPGx rows may appear in reports even when the user is homozygous reference. ClinPGx publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
|
|
75
97
|
|
|
76
98
|
- **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
|
|
77
99
|
- **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
|
|
78
100
|
|
|
79
|
-
For the rare rsID where
|
|
101
|
+
For the rare rsID where ClinPGx has an annotation but *neither* ClinVar nor CPIC has data, the row is still emitted. Such rows are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
|
|
80
102
|
|
|
81
103
|
The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
|
|
82
104
|
|
|
@@ -133,7 +155,7 @@ Not all databases are equal in size. `allelix db update` downloads them all by d
|
|
|
133
155
|
| Database | On disk | Download time | What it adds |
|
|
134
156
|
|---|---|---|---|
|
|
135
157
|
| ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
|
|
136
|
-
|
|
|
158
|
+
| ClinPGx + CPIC | ~6MB | seconds | Drug-gene interactions. |
|
|
137
159
|
| GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
|
|
138
160
|
| gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
|
|
139
161
|
| AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
|
|
@@ -151,14 +173,14 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
151
173
|
|---|---|---|---|
|
|
152
174
|
| ClinVar | NCBI | Public domain | No restrictions |
|
|
153
175
|
| GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
|
|
154
|
-
| PharmGKB |
|
|
155
|
-
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the
|
|
176
|
+
| ClinPGx (formerly PharmGKB) | clinpgx.org | CC BY-SA 4.0 | Attribution required |
|
|
177
|
+
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the ClinPGx non-finding filter (ADR-0020), not surfaced as its own annotator. |
|
|
156
178
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
157
179
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
158
180
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
159
181
|
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
160
182
|
|
|
161
|
-
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar,
|
|
183
|
+
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, ClinPGx, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
162
184
|
|
|
163
185
|
### SNPedia data download
|
|
164
186
|
|
|
@@ -187,17 +209,17 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
|
|
|
187
209
|
|
|
188
210
|
## Architecture & Design Decisions
|
|
189
211
|
|
|
190
|
-
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/
|
|
212
|
+
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/allelix/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
|
|
191
213
|
|
|
192
214
|
Notable load-bearing ADRs:
|
|
193
215
|
|
|
194
216
|
- **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
|
|
195
|
-
- **ADR-0020 — CPIC API as the per-allele function source.** The
|
|
217
|
+
- **ADR-0020 — CPIC API as the per-allele function source.** The ClinPGx non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
|
|
196
218
|
- **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar.
|
|
197
|
-
- **ADR-0009 —
|
|
219
|
+
- **ADR-0009 — ClinPGx matches the user's exact normalized diploid call.**
|
|
198
220
|
- **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
|
|
199
221
|
|
|
200
|
-
Release history: see [`CHANGELOG.md`](https://github.com/
|
|
222
|
+
Release history: see [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
201
223
|
|
|
202
224
|
## Development
|
|
203
225
|
|
|
@@ -217,4 +239,4 @@ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit i
|
|
|
217
239
|
|
|
218
240
|
## License
|
|
219
241
|
|
|
220
|
-
AGPL-3.0-or-later. See `LICENSE`.
|
|
242
|
+
GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later). See `LICENSE`.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
-
# Copyright (C) 2026
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
3
|
"""Annotator registry. Unlike parsers, ALL annotators run on every variant."""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -43,7 +43,7 @@ def get_annotators(
|
|
|
43
43
|
complete 81 GB CADD file). Requires ``pysam`` and a local copy.
|
|
44
44
|
|
|
45
45
|
ADR-0023: ClinVar's `reference_for(rsid, build)` is wired into
|
|
46
|
-
|
|
46
|
+
ClinPGx and SNPedia as the primary hom-ref suppression filter — the
|
|
47
47
|
REF allele lookup universally determines whether the user is
|
|
48
48
|
homozygous reference (and thus a non-finding for that variant).
|
|
49
49
|
"""
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
-
# Copyright (C) 2026
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
3
|
"""AlphaMissense variant pathogenicity enrichment.
|
|
4
4
|
|
|
5
5
|
AlphaMissense is not a clinical annotator — it does not produce
|
|
@@ -226,3 +226,32 @@ class AlphaMissenseAnnotator(Annotator):
|
|
|
226
226
|
if score is not None:
|
|
227
227
|
result[(rsid, alt)] = (score, cls)
|
|
228
228
|
return result
|
|
229
|
+
|
|
230
|
+
def bulk_lookup_by_position(
    self, keys: set[tuple[str, int, str, str]]
) -> dict[tuple[str, int, str, str], tuple[float, str]]:
    """Return ``{(chrom, pos, ref, alt): (score, class)}`` via PK lookup.

    Position-keyed fallback for rsID-less VCFs whose ClinVar-resolved
    rsIDs aren't indexed in the AlphaMissense cache. Hits the
    ``(chrom, pos, ref, alt)`` primary key directly.

    Args:
        keys: ``(chrom, pos, ref, alt)`` tuples to look up.

    Returns:
        A mapping from each requested key that has a row with a non-NULL
        ``am_pathogenicity`` to ``(am_pathogenicity, am_class)``. Keys
        with no matching row, or a NULL score, are absent. Empty input
        returns ``{}`` without opening a connection.
    """
    if not keys:
        return {}
    conn = self._connection()
    result: dict[tuple[str, int, str, str], tuple[float, str]] = {}
    key_list = list(keys)
    # Every key binds four parameters, hence the division by four
    # (presumably _BULK_BATCH_SIZE is a per-statement parameter budget —
    # confirm against its definition). Clamp to 1: a budget below four
    # would otherwise give a zero step and make range() raise ValueError.
    batch_size = max(1, _BULK_BATCH_SIZE // 4)
    key_clause = "(chrom = ? AND pos = ? AND ref = ? AND alt = ?)"
    for i in range(0, len(key_list), batch_size):
        batch = key_list[i : i + batch_size]
        clauses = " OR ".join([key_clause] * len(batch))
        # Flatten the 4-tuples in the same order as the placeholders.
        params = [v for k in batch for v in k]
        rows = conn.execute(
            f"SELECT chrom, pos, ref, alt, am_pathogenicity, am_class"
            f" FROM alphamissense_scores WHERE {clauses}",
            params,
        ).fetchall()
        for chrom, pos, ref, alt, score, cls in rows:
            # Rows without a score carry no usable prediction; skip them.
            if score is not None:
                result[(chrom, pos, ref, alt)] = (score, cls)
    return result
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
-
# Copyright (C) 2026
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
3
|
"""Abstract base class for reference-database annotators."""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -11,7 +11,7 @@ from enum import Enum, auto
|
|
|
11
11
|
from typing import TYPE_CHECKING, ClassVar
|
|
12
12
|
|
|
13
13
|
if TYPE_CHECKING:
|
|
14
|
-
from collections.abc import Callable
|
|
14
|
+
from collections.abc import Callable, Iterable, Iterator
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from types import TracebackType
|
|
17
17
|
|
|
@@ -173,6 +173,22 @@ class Annotator(ABC):
|
|
|
173
173
|
"""
|
|
174
174
|
...
|
|
175
175
|
|
|
176
|
+
def batch_annotate(self, variants: Iterable[Variant]) -> Iterator[Annotation]:
    """Lazily annotate many variants, preserving input order.

    This base implementation simply feeds each variant through
    ``annotate`` and re-yields whatever it produces, so every existing
    annotator participates in the batch pipeline without modification.

    Annotators backed by rsID-keyed SQLite lookups are expected to
    override it with a chunked ``WHERE rsid IN (...)`` query, since one
    round-trip per variant does not scale to WGS input (4-6M variants
    per VCF).

    Callers should always go through ``batch_annotate``; the per-variant
    loop here is only the compatibility fallback for annotators that
    have no batched query path yet.
    """
    for item in variants:
        for annotation in self.annotate(item):
            yield annotation
|
|
191
|
+
|
|
176
192
|
@abstractmethod
|
|
177
193
|
def is_ready(self) -> bool:
|
|
178
194
|
"""Whether the local cache exists and is queryable."""
|