allelix 1.8.4__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {allelix-1.8.4 → allelix-2.0.0}/PKG-INFO +58 -30
- {allelix-1.8.4 → allelix-2.0.0}/README.md +50 -25
- {allelix-1.8.4 → allelix-2.0.0}/allelix/__init__.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/__init__.py +2 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/alphamissense.py +30 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/base.py +18 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/cadd.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/clinvar.py +203 -7
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/gnomad.py +31 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/gwas.py +90 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/pharmgkb.py +116 -23
- {allelix-1.8.4 → allelix-2.0.0}/allelix/annotators/snpedia.py +85 -2
- allelix-2.0.0/allelix/cli/__init__.py +36 -0
- allelix-2.0.0/allelix/cli/_helpers.py +553 -0
- allelix-2.0.0/allelix/cli/_options.py +152 -0
- allelix-2.0.0/allelix/cli/analyze.py +116 -0
- allelix-2.0.0/allelix/cli/config.py +190 -0
- allelix-2.0.0/allelix/cli/db.py +253 -0
- allelix-2.0.0/allelix/cli/focused.py +176 -0
- allelix-2.0.0/allelix/cli/utility.py +530 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/compare.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/config.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/data/__init__.py +1 -1
- allelix-2.0.0/allelix/data/clinvar_clnsig_snapshot.yaml +115 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/__init__.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/_versions.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/alphamissense_loader.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/cadd_loader.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/cpic_loader.py +5 -5
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/gnomad_loader.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/gwas_loader.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/loader_utils.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/manager.py +2 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/pharmgkb_loader.py +15 -11
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/schema.py +4 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/snpedia_loader.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/databases/snpedia_parser.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/exporters/__init__.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/exporters/plink.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/models.py +2 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/__init__.py +5 -1
- allelix-2.0.0/allelix/parsers/_helpers.py +84 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/ancestrydna.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/base.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/ftdna.py +1 -1
- allelix-2.0.0/allelix/parsers/ftdna_illumina.py +143 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/livingdna.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/myhappygenes.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/myheritage.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/parsers/twentythreeandme.py +1 -1
- allelix-2.0.0/allelix/parsers/vcf.py +535 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/__init__.py +2 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/_pipeline.py +129 -20
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/diff.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/high_value.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/html.py +15 -4
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/json_report.py +5 -2
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/methylation.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/reports/terminal.py +3 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/utils/__init__.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/utils/allele.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix/utils/build_detect.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/PKG-INFO +58 -30
- {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/SOURCES.txt +11 -1
- {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/requires.txt +3 -0
- {allelix-1.8.4 → allelix-2.0.0}/pyproject.toml +9 -5
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_cli.py +587 -24
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_cli_helpers.py +19 -14
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_compare.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_config.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_end_to_end.py +13 -13
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_mock_data_invariants.py +9 -9
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_models.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_registry.py +1 -1
- {allelix-1.8.4 → allelix-2.0.0}/tests/test_version.py +1 -1
- allelix-1.8.4/allelix/cli.py +0 -1541
- allelix-1.8.4/allelix/parsers/_helpers.py +0 -41
- {allelix-1.8.4 → allelix-2.0.0}/LICENSE +0 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/data/high_value_snps.yaml +0 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix/py.typed +0 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/dependency_links.txt +0 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/entry_points.txt +0 -0
- {allelix-1.8.4 → allelix-2.0.0}/allelix.egg-info/top_level.txt +0 -0
- {allelix-1.8.4 → allelix-2.0.0}/setup.cfg +0 -0
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: allelix
|
|
3
|
-
Version: 1.8.4
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
5
|
-
Author
|
|
5
|
+
Author: Allelix
|
|
6
|
+
Maintainer-email: dial481 <dial481@users.noreply.github.com>
|
|
6
7
|
License-Expression: AGPL-3.0-or-later
|
|
7
8
|
Project-URL: Homepage, https://allelix.io
|
|
8
|
-
Project-URL: Source, https://github.com/
|
|
9
|
-
Project-URL: Issues, https://github.com/
|
|
10
|
-
Project-URL: Changelog, https://github.com/
|
|
9
|
+
Project-URL: Source, https://github.com/allelix/allelix
|
|
10
|
+
Project-URL: Issues, https://github.com/allelix/allelix/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/allelix/allelix/blob/main/CHANGELOG.md
|
|
11
12
|
Keywords: genomics,genotype,snp,bioinformatics,dna
|
|
12
13
|
Classifier: Development Status :: 5 - Production/Stable
|
|
13
14
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -25,6 +26,8 @@ Requires-Dist: pyyaml>=6.0
|
|
|
25
26
|
Requires-Dist: rich>=13.7
|
|
26
27
|
Provides-Extra: cadd
|
|
27
28
|
Requires-Dist: pysam>=0.22; extra == "cadd"
|
|
29
|
+
Provides-Extra: vcf-index
|
|
30
|
+
Requires-Dist: pysam>=0.22; extra == "vcf-index"
|
|
28
31
|
Provides-Extra: dev
|
|
29
32
|
Requires-Dist: pre-commit>=3.7; extra == "dev"
|
|
30
33
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
@@ -36,18 +39,20 @@ Dynamic: license-file
|
|
|
36
39
|
|
|
37
40
|
Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
38
41
|
|
|
39
|
-
> **Status:** Production —
|
|
40
|
-
>
|
|
41
|
-
> population frequencies + AlphaMissense
|
|
42
|
-
> deleteriousness), licensable-source gating for
|
|
43
|
-
> dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
42
|
+
> **Status:** Production — eight parser formats (including VCF + gVCF),
|
|
43
|
+
> four annotators (ClinVar + ClinPGx + GWAS Catalog + SNPedia), three
|
|
44
|
+
> enrichment sources (gnomAD population frequencies + AlphaMissense
|
|
45
|
+
> pathogenicity + CADD deleteriousness), licensable-source gating for
|
|
46
|
+
> commercial users, dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
44
47
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
45
48
|
> commands, report diffing, persistent config with commercial-mode
|
|
46
49
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
47
|
-
> No regex on prose anywhere in production. **Latest:
|
|
48
|
-
>
|
|
50
|
+
> No regex on prose anywhere in production. **Latest: v2.0.0** — VCF +
|
|
51
|
+
> gVCF parser with multi-sample handling, batched annotation pipeline
|
|
52
|
+
> for WGS scale, FTDNA Illumina raw parser, R-4 ClinVar CLNSIG drift CI
|
|
53
|
+
> test, CLI package restructure.
|
|
49
54
|
> Release notes:
|
|
50
|
-
> [`CHANGELOG.md`](https://github.com/
|
|
55
|
+
> [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
51
56
|
|
|
52
57
|
## Quickstart
|
|
53
58
|
|
|
@@ -61,6 +66,15 @@ allelix db update
|
|
|
61
66
|
|
|
62
67
|
# Analyze a genotype file
|
|
63
68
|
allelix analyze your_genotype_file.txt --output report.html
|
|
69
|
+
|
|
70
|
+
# VCF / gVCF input — same command, auto-detected
|
|
71
|
+
allelix analyze your_wgs.vcf.gz --output report.html
|
|
72
|
+
|
|
73
|
+
# Multi-sample VCF — pick which sample to analyze
|
|
74
|
+
allelix analyze trio.vcf.gz --sample HG002 --output report.html
|
|
75
|
+
|
|
76
|
+
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
77
|
+
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
64
78
|
```
|
|
65
79
|
|
|
66
80
|
Requires Python 3.11+. See [Development](#development) for source installs and running tests.
|
|
@@ -75,16 +89,20 @@ Requires Python 3.11+. See [Development](#development) for source installs and r
|
|
|
75
89
|
| Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
|
|
76
90
|
| MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
|
|
77
91
|
| Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
|
|
92
|
+
| FTDNA Illumina raw | ✓ | Tab-delimited variant of the FTDNA export (distinct from the CSV format above). `RSID/CHROMOSOME/POSITION/RESULT` columns. Build 37 default. |
|
|
93
|
+
| VCF / gVCF | ✓ | REF/ALT encoding, `0/1` genotype notation. Plain VCF: absence at a position means reference. gVCF: explicit reference blocks (lines with `<NON_REF>` ALT and `END=` INFO) are skipped — they match nothing in any annotation database. Multi-sample files require `--sample <ID>`. Streams via stdlib; `.vcf.gz` handled transparently. Optional `pip install allelix[vcf-index]` enables pysam-backed tabix random access for fast `extract --snps` on huge VCFs. |
|
|
78
94
|
|
|
79
95
|
Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
|
|
80
96
|
|
|
81
|
-
### v2 roadmap
|
|
97
|
+
### v2.1+ roadmap
|
|
82
98
|
|
|
83
|
-
|
|
|
99
|
+
| Feature | Notes |
|
|
84
100
|
|---|---|
|
|
85
|
-
| VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
|
|
86
101
|
| Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
|
|
102
|
+
| Annotator-level strand awareness (R-1) | Strand-flip matching wired into every annotator's carrier check. Basic `compare` strand support shipped in v1.1; full annotator integration deferred from v2.0.0. |
|
|
103
|
+
| Good / Bad / Neutral repute | Per-annotation repute field. Reframes the report from "here's what's wrong" to "here's your full picture." Requires Annotation model change + renderer updates. |
|
|
87
104
|
| PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
|
|
105
|
+
| PharmCAT integration | Wrap CPIC's PharmCAT as an optional external engine for star-allele / diplotype calling. Requires VCF input (shipped in v2.0.0). |
|
|
88
106
|
| Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
|
|
89
107
|
|
|
90
108
|
## Supported Databases
|
|
@@ -92,22 +110,32 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
92
110
|
| Database | Status | Notes |
|
|
93
111
|
|---|---|---|
|
|
94
112
|
| ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
|
|
95
|
-
| PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
96
|
-
| CPIC (per-allele function table) | ✓ | Internal data source for the
|
|
113
|
+
| ClinPGx (formerly PharmGKB) | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
114
|
+
| CPIC (per-allele function table) | ✓ | Internal data source for the ClinPGx filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
|
|
97
115
|
| SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
|
|
98
116
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
99
117
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
100
118
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
101
119
|
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
102
120
|
|
|
103
|
-
###
|
|
121
|
+
### Build coverage asymmetry (GRCh37 vs GRCh38)
|
|
122
|
+
|
|
123
|
+
ClinVar dispatches per-build (ADR-0021) and ships with both GRCh37 and GRCh38 caches. The two caches are essentially equivalent in coverage: 2,896,063 rows / 2,645,206 distinct rsIDs in GRCh37 vs 2,896,102 / 2,645,243 in GRCh38 — a difference of 39 rows.
|
|
124
|
+
|
|
125
|
+
Despite that equivalence, the same person's WGS file produces noticeably more annotations when called against GRCh37 than against GRCh38. The cause is in the resolution step, not in the shape of the upstream data. Position-keyed rsID resolution requires an exact `(chromosome, position, ref, alt)` match between the user's variant call and ClinVar's stored row. Lift-over between builds does not preserve that match perfectly: the ~0.4% of the genome where the reference assembly was rebuilt has different REF alleles, multi-allelic sites are split differently, and some benchmark VCF positions drop out entirely in the GRCh38 lift. Each mismatch loses one rsID resolution, which in turn loses every rsID-keyed downstream annotation that rsID would have driven (ClinVar's own carrier annotation, plus GWAS Catalog, SNPedia, and ClinPGx).
|
|
126
|
+
|
|
127
|
+
On the real GIAB HG002 benchmark, counting only annotations that survive the default `--min-magnitude 5.0` filter, GRCh37 surfaces 520 distinct rsIDs across all sources and GRCh38 surfaces 341. The two sets overlap on 331 rsIDs; 189 are GRCh37-only and 10 are GRCh38-only — pure asymmetric loss in the GRCh38 lift, not different upstream coverage. The unfiltered totals (65,965 vs 4,867) magnify the same pattern at lower magnitudes, mostly via GWAS-Catalog weak-association rows.
|
|
128
|
+
|
|
129
|
+
If you have a choice of build for the input, GRCh37 surfaces more annotations today on rsID-less VCFs that flow through position-keyed resolution. GRCh38 still surfaces every ClinVar carrier hit it has an exact alignment for.
|
|
130
|
+
|
|
131
|
+
### Known ClinPGx limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
104
132
|
|
|
105
|
-
ADR-0022 + ADR-0023: a tiny residual of
|
|
133
|
+
ADR-0022 + ADR-0023: a tiny residual of ClinPGx rows may appear in reports even when the user is homozygous reference. ClinPGx publishes one annotation per genotype, including the reference homozygote. To suppress the reference-homozygote row, Allelix needs structured data on the variant from either:
|
|
106
134
|
|
|
107
135
|
- **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
|
|
108
136
|
- **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
|
|
109
137
|
|
|
110
|
-
For the rare rsID where
|
|
138
|
+
For the rare rsID where ClinPGx has an annotation but *neither* ClinVar nor CPIC has data, the row is still emitted. These rows are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
|
|
111
139
|
|
|
112
140
|
The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
|
|
113
141
|
|
|
@@ -164,7 +192,7 @@ Not all databases are equal in size. `allelix db update` downloads them all by d
|
|
|
164
192
|
| Database | On disk | Download time | What it adds |
|
|
165
193
|
|---|---|---|---|
|
|
166
194
|
| ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
|
|
167
|
-
|
|
|
195
|
+
| ClinPGx + CPIC | ~6MB | seconds | Drug-gene interactions. |
|
|
168
196
|
| GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
|
|
169
197
|
| gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
|
|
170
198
|
| AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
|
|
@@ -182,14 +210,14 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
182
210
|
|---|---|---|---|
|
|
183
211
|
| ClinVar | NCBI | Public domain | No restrictions |
|
|
184
212
|
| GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
|
|
185
|
-
| PharmGKB |
|
|
186
|
-
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the
|
|
213
|
+
| ClinPGx (formerly PharmGKB) | clinpgx.org | CC BY-SA 4.0 | Attribution required |
|
|
214
|
+
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the ClinPGx non-finding filter (ADR-0020), not surfaced as its own annotator. |
|
|
187
215
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
188
216
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
189
217
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
190
218
|
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
191
219
|
|
|
192
|
-
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar,
|
|
220
|
+
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, ClinPGx, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
193
221
|
|
|
194
222
|
### SNPedia data download
|
|
195
223
|
|
|
@@ -218,17 +246,17 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
|
|
|
218
246
|
|
|
219
247
|
## Architecture & Design Decisions
|
|
220
248
|
|
|
221
|
-
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/
|
|
249
|
+
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/allelix/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
|
|
222
250
|
|
|
223
251
|
Notable load-bearing ADRs:
|
|
224
252
|
|
|
225
253
|
- **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
|
|
226
|
-
- **ADR-0020 — CPIC API as the per-allele function source.** The
|
|
254
|
+
- **ADR-0020 — CPIC API as the per-allele function source.** The ClinPGx non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
|
|
227
255
|
- **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar.
|
|
228
|
-
- **ADR-0009 —
|
|
256
|
+
- **ADR-0009 — ClinPGx matches the user's exact normalized diploid call.**
|
|
229
257
|
- **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
|
|
230
258
|
|
|
231
|
-
Release history: see [`CHANGELOG.md`](https://github.com/
|
|
259
|
+
Release history: see [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
232
260
|
|
|
233
261
|
## Development
|
|
234
262
|
|
|
@@ -248,4 +276,4 @@ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit i
|
|
|
248
276
|
|
|
249
277
|
## License
|
|
250
278
|
|
|
251
|
-
AGPL-3.0-or-later. See `LICENSE`.
|
|
279
|
+
GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later). See `LICENSE`.
|
|
@@ -2,18 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
4
4
|
|
|
5
|
-
> **Status:** Production —
|
|
6
|
-
>
|
|
7
|
-
> population frequencies + AlphaMissense
|
|
8
|
-
> deleteriousness), licensable-source gating for
|
|
9
|
-
> dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
5
|
+
> **Status:** Production — eight parser formats (including VCF + gVCF),
|
|
6
|
+
> four annotators (ClinVar + ClinPGx + GWAS Catalog + SNPedia), three
|
|
7
|
+
> enrichment sources (gnomAD population frequencies + AlphaMissense
|
|
8
|
+
> pathogenicity + CADD deleteriousness), licensable-source gating for
|
|
9
|
+
> commercial users, dual-build ClinVar caches (GRCh37 + GRCh38),
|
|
10
10
|
> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
|
|
11
11
|
> commands, report diffing, persistent config with commercial-mode
|
|
12
12
|
> safety switch. Build auto-detection from position data (ADR-0021).
|
|
13
|
-
> No regex on prose anywhere in production. **Latest:
|
|
14
|
-
>
|
|
13
|
+
> No regex on prose anywhere in production. **Latest: v2.0.0** — VCF +
|
|
14
|
+
> gVCF parser with multi-sample handling, batched annotation pipeline
|
|
15
|
+
> for WGS scale, FTDNA Illumina raw parser, R-4 ClinVar CLNSIG drift CI
|
|
16
|
+
> test, CLI package restructure.
|
|
15
17
|
> Release notes:
|
|
16
|
-
> [`CHANGELOG.md`](https://github.com/
|
|
18
|
+
> [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
17
19
|
|
|
18
20
|
## Quickstart
|
|
19
21
|
|
|
@@ -27,6 +29,15 @@ allelix db update
|
|
|
27
29
|
|
|
28
30
|
# Analyze a genotype file
|
|
29
31
|
allelix analyze your_genotype_file.txt --output report.html
|
|
32
|
+
|
|
33
|
+
# VCF / gVCF input — same command, auto-detected
|
|
34
|
+
allelix analyze your_wgs.vcf.gz --output report.html
|
|
35
|
+
|
|
36
|
+
# Multi-sample VCF — pick which sample to analyze
|
|
37
|
+
allelix analyze trio.vcf.gz --sample HG002 --output report.html
|
|
38
|
+
|
|
39
|
+
# Filter to a custom panel (rsIDs + gene names, one per line; '#' comments and blank lines ignored)
|
|
40
|
+
allelix analyze your_genotype_file.txt --filter-file my_panel.txt --output report.html
|
|
30
41
|
```
|
|
31
42
|
|
|
32
43
|
Requires Python 3.11+. See [Development](#development) for source installs and running tests.
|
|
@@ -41,16 +52,20 @@ Requires Python 3.11+. See [Development](#development) for source installs and r
|
|
|
41
52
|
| Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
|
|
42
53
|
| MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
|
|
43
54
|
| Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
|
|
55
|
+
| FTDNA Illumina raw | ✓ | Tab-delimited variant of the FTDNA export (distinct from the CSV format above). `RSID/CHROMOSOME/POSITION/RESULT` columns. Build 37 default. |
|
|
56
|
+
| VCF / gVCF | ✓ | REF/ALT encoding, `0/1` genotype notation. Plain VCF: absence at a position means reference. gVCF: explicit reference blocks (lines with `<NON_REF>` ALT and `END=` INFO) are skipped — they match nothing in any annotation database. Multi-sample files require `--sample <ID>`. Streams via stdlib; `.vcf.gz` handled transparently. Optional `pip install allelix[vcf-index]` enables pysam-backed tabix random access for fast `extract --snps` on huge VCFs. |
|
|
44
57
|
|
|
45
58
|
Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
|
|
46
59
|
|
|
47
|
-
### v2 roadmap
|
|
60
|
+
### v2.1+ roadmap
|
|
48
61
|
|
|
49
|
-
|
|
|
62
|
+
| Feature | Notes |
|
|
50
63
|
|---|---|
|
|
51
|
-
| VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
|
|
52
64
|
| Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
|
|
65
|
+
| Annotator-level strand awareness (R-1) | Strand-flip matching wired into every annotator's carrier check. Basic `compare` strand support shipped in v1.1; full annotator integration deferred from v2.0.0. |
|
|
66
|
+
| Good / Bad / Neutral repute | Per-annotation repute field. Reframes the report from "here's what's wrong" to "here's your full picture." Requires Annotation model change + renderer updates. |
|
|
53
67
|
| PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
|
|
68
|
+
| PharmCAT integration | Wrap CPIC's PharmCAT as an optional external engine for star-allele / diplotype calling. Requires VCF input (shipped in v2.0.0). |
|
|
54
69
|
| Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
|
|
55
70
|
|
|
56
71
|
## Supported Databases
|
|
@@ -58,22 +73,32 @@ Adding a new format means adding one file to `allelix/parsers/` and registering
|
|
|
58
73
|
| Database | Status | Notes |
|
|
59
74
|
|---|---|---|
|
|
60
75
|
| ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
|
|
61
|
-
| PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
62
|
-
| CPIC (per-allele function table) | ✓ | Internal data source for the
|
|
76
|
+
| ClinPGx (formerly PharmGKB) | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
|
|
77
|
+
| CPIC (per-allele function table) | ✓ | Internal data source for the ClinPGx filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
|
|
63
78
|
| SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
|
|
64
79
|
| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
|
|
65
80
|
| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
|
|
66
81
|
| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
|
|
67
82
|
| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Use `--no-cadd` to skip enrichment for a single run. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
|
|
68
83
|
|
|
69
|
-
###
|
|
84
|
+
### Build coverage asymmetry (GRCh37 vs GRCh38)
|
|
85
|
+
|
|
86
|
+
ClinVar dispatches per-build (ADR-0021) and ships with both GRCh37 and GRCh38 caches. The two caches are essentially equivalent in coverage: 2,896,063 rows / 2,645,206 distinct rsIDs in GRCh37 vs 2,896,102 / 2,645,243 in GRCh38 — a difference of 39 rows.
|
|
87
|
+
|
|
88
|
+
Despite that equivalence, the same person's WGS file produces noticeably more annotations when supplied in GRCh37 coordinates than in GRCh38 coordinates. The mechanism is in the resolution step, not in the shape of the upstream data. Position-keyed rsID resolution requires exact `(chromosome, position, ref, alt)` alignment between the user's variant call and ClinVar's stored row. Lift-over between builds does not preserve that alignment perfectly: in the ~0.4% of the genome where the reference assembly was rebuilt, REF alleles differ, multi-allelic sites split differently, and some benchmark VCF positions drop out entirely in the GRCh38 lift. Each misalignment loses one resolution, which in turn loses every rsID-keyed downstream annotation that rsID would have driven (ClinVar's own carrier annotation, plus GWAS Catalog, SNPedia, and ClinPGx).
|
|
89
|
+
|
|
90
|
+
On the real GIAB HG002 benchmark, counting only annotations that survive the default `--min-magnitude 5.0` filter, GRCh37 surfaces 520 distinct rsIDs across all sources and GRCh38 surfaces 341. The two sets overlap on 331 rsIDs; 189 are GRCh37-only and 10 are GRCh38-only — pure asymmetric loss in the GRCh38 lift, not different upstream coverage. The unfiltered totals (65,965 vs 4,867) magnify the same pattern at lower magnitudes, mostly via GWAS-Catalog weak-association rows.
|
|
91
|
+
|
|
92
|
+
If you have a choice of build for the input, GRCh37 surfaces more annotations today on rsID-less VCFs that flow through position-keyed resolution. GRCh38 still surfaces every ClinVar carrier hit it has an exact alignment for.
|
|
93
|
+
|
|
94
|
+
### Known ClinPGx limitation: reference-genotype rows where ClinVar and CPIC both lack data
|
|
70
95
|
|
|
71
|
-
ADR-0022 + ADR-0023: a tiny residual of
|
|
96
|
+
ADR-0022 + ADR-0023: a tiny residual of ClinPGx rows may appear in reports even when the user is homozygous reference. ClinPGx publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
|
|
72
97
|
|
|
73
98
|
- **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
|
|
74
99
|
- **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
|
|
75
100
|
|
|
76
|
-
For the rare rsID where
|
|
101
|
+
For the rare rsID where ClinPGx has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
|
|
77
102
|
|
|
78
103
|
The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
|
|
79
104
|
|
|
@@ -130,7 +155,7 @@ Not all databases are equal in size. `allelix db update` downloads them all by d
|
|
|
130
155
|
| Database | On disk | Download time | What it adds |
|
|
131
156
|
|---|---|---|---|
|
|
132
157
|
| ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
|
|
133
|
-
|
|
|
158
|
+
| ClinPGx + CPIC | ~6MB | seconds | Drug-gene interactions. |
|
|
134
159
|
| GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
|
|
135
160
|
| gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
|
|
136
161
|
| AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
|
|
@@ -148,14 +173,14 @@ Allelix source code is licensed under the **GNU Affero General Public License v3
|
|
|
148
173
|
|---|---|---|---|
|
|
149
174
|
| ClinVar | NCBI | Public domain | No restrictions |
|
|
150
175
|
| GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
|
|
151
|
-
| PharmGKB |
|
|
152
|
-
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the
|
|
176
|
+
| ClinPGx (formerly PharmGKB) | clinpgx.org | CC BY-SA 4.0 | Attribution required |
|
|
177
|
+
| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the ClinPGx non-finding filter (ADR-0020), not surfaced as its own annotator. |
|
|
153
178
|
| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
|
|
154
179
|
| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
|
|
155
180
|
| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
|
|
156
181
|
| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. Use `--no-cadd` to omit. |
|
|
157
182
|
|
|
158
|
-
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar,
|
|
183
|
+
**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, ClinPGx, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
|
|
159
184
|
|
|
160
185
|
### SNPedia data download
|
|
161
186
|
|
|
@@ -184,17 +209,17 @@ None of these are scraping errors. They are editorial inconsistencies on the sou
|
|
|
184
209
|
|
|
185
210
|
## Architecture & Design Decisions
|
|
186
211
|
|
|
187
|
-
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/
|
|
212
|
+
The "why" behind major design choices lives in [`docs/adr/`](https://github.com/allelix/allelix/blob/main/docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
|
|
188
213
|
|
|
189
214
|
Notable load-bearing ADRs:
|
|
190
215
|
|
|
191
216
|
- **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
|
|
192
|
-
- **ADR-0020 — CPIC API as the per-allele function source.** The
|
|
217
|
+
- **ADR-0020 — CPIC API as the per-allele function source.** The ClinPGx non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
|
|
193
218
|
- **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar.
|
|
194
|
-
- **ADR-0009 —
|
|
219
|
+
- **ADR-0009 — ClinPGx matches the user's exact normalized diploid call.**
|
|
195
220
|
- **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
|
|
196
221
|
|
|
197
|
-
Release history: see [`CHANGELOG.md`](https://github.com/
|
|
222
|
+
Release history: see [`CHANGELOG.md`](https://github.com/allelix/allelix/blob/main/CHANGELOG.md).
|
|
198
223
|
|
|
199
224
|
## Development
|
|
200
225
|
|
|
@@ -214,4 +239,4 @@ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit i
|
|
|
214
239
|
|
|
215
240
|
## License
|
|
216
241
|
|
|
217
|
-
AGPL-3.0-or-later. See `LICENSE`.
|
|
242
|
+
GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later). See `LICENSE`.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
-
# Copyright (C) 2026
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
3
|
"""Annotator registry. Unlike parsers, ALL annotators run on every variant."""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -43,7 +43,7 @@ def get_annotators(
|
|
|
43
43
|
complete 81 GB CADD file). Requires ``pysam`` and a local copy.
|
|
44
44
|
|
|
45
45
|
ADR-0023: ClinVar's `reference_for(rsid, build)` is wired into
|
|
46
|
-
|
|
46
|
+
ClinPGx and SNPedia as the primary hom-ref suppression filter — the
|
|
47
47
|
REF allele lookup universally determines whether the user is
|
|
48
48
|
homozygous reference (and thus a non-finding for that variant).
|
|
49
49
|
"""
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
-
# Copyright (C) 2026
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
3
|
"""AlphaMissense variant pathogenicity enrichment.
|
|
4
4
|
|
|
5
5
|
AlphaMissense is not a clinical annotator — it does not produce
|
|
@@ -226,3 +226,32 @@ class AlphaMissenseAnnotator(Annotator):
|
|
|
226
226
|
if score is not None:
|
|
227
227
|
result[(rsid, alt)] = (score, cls)
|
|
228
228
|
return result
|
|
229
|
+
|
|
230
|
+
def bulk_lookup_by_position(
    self, keys: set[tuple[str, int, str, str]]
) -> dict[tuple[str, int, str, str], tuple[float, str]]:
    """Return ``{(chrom, pos, ref, alt): (score, class)}`` via PK lookup.

    Position-keyed fallback for rsID-less VCFs whose ClinVar-resolved
    rsIDs aren't indexed in the AlphaMissense cache. Hits the
    ``(chrom, pos, ref, alt)`` primary key directly.

    Args:
        keys: ``(chrom, pos, ref, alt)`` tuples to look up. An empty set
            short-circuits before the connection is opened.

    Returns:
        A mapping from each key found in ``alphamissense_scores`` to its
        ``(am_pathogenicity, am_class)`` pair. Keys with no matching row,
        or whose ``am_pathogenicity`` is NULL, are omitted.
    """
    if not keys:
        return {}
    conn = self._connection()
    result: dict[tuple[str, int, str, str], tuple[float, str]] = {}
    key_list = list(keys)
    # Every key binds four placeholders, hence the division by four
    # (assumes _BULK_BATCH_SIZE is sized in bound parameters - confirm).
    # Clamp to 1 so a constant below 4 cannot produce a zero step, which
    # would make range() raise ValueError.
    batch_size = max(1, _BULK_BATCH_SIZE // 4)
    for start in range(0, len(key_list), batch_size):
        batch = key_list[start : start + batch_size]
        # Only fixed placeholder text is interpolated into the SQL; all
        # values travel as bound parameters.
        clauses = " OR ".join(["(chrom = ? AND pos = ? AND ref = ? AND alt = ?)"] * len(batch))
        params = [value for key in batch for value in key]
        rows = conn.execute(
            f"SELECT chrom, pos, ref, alt, am_pathogenicity, am_class"
            f" FROM alphamissense_scores WHERE {clauses}",
            params,
        ).fetchall()
        for chrom, pos, ref, alt, score, cls in rows:
            if score is not None:
                result[(chrom, pos, ref, alt)] = (score, cls)
    return result
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
-
# Copyright (C) 2026
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
3
|
"""Abstract base class for reference-database annotators."""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
@@ -11,7 +11,7 @@ from enum import Enum, auto
|
|
|
11
11
|
from typing import TYPE_CHECKING, ClassVar
|
|
12
12
|
|
|
13
13
|
if TYPE_CHECKING:
|
|
14
|
-
from collections.abc import Callable
|
|
14
|
+
from collections.abc import Callable, Iterable, Iterator
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from types import TracebackType
|
|
17
17
|
|
|
@@ -173,6 +173,22 @@ class Annotator(ABC):
|
|
|
173
173
|
"""
|
|
174
174
|
...
|
|
175
175
|
|
|
176
|
+
def batch_annotate(self, variants: Iterable[Variant]) -> Iterator[Annotation]:
    """Yield the annotations for each variant, in the order variants arrive.

    This default simply calls ``annotate(variant)`` once per variant and
    relays whatever it produces, so every existing annotator works
    without modification. Subclasses backed by rsID-keyed SQLite lookups
    should override it with a chunked ``WHERE rsid IN (...)`` query so
    that WGS-scale inputs (4-6M variants per VCF) do not pay one
    round-trip per variant.

    Keeping a default here lets the pipeline stay single-path: callers
    always go through ``batch_annotate``, and annotators that have not
    yet grown a batched query path fall back to this loop.
    """
    for item in variants:
        produced = self.annotate(item)
        yield from produced
|
|
191
|
+
|
|
176
192
|
@abstractmethod
|
|
177
193
|
def is_ready(self) -> bool:
|
|
178
194
|
"""Whether the local cache exists and is queryable."""
|