allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,203 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Genome build detection from position data.
4
+
5
+ ADR-0021: Allelix detects the build of an input genotype file from a
6
+ handful of well-known SNP positions rather than trusting the file header.
7
+ A real-world MyHappyGenes/Tempus file was confirmed to label its build
8
+ as "37.1" while shipping GRCh38 coordinates; cross-build REF/ALT
9
+ comparison produced a false-positive pathogenic call on NIPA1.
10
+
11
+ The detection table holds authoritative (chromosome, 1-based position)
12
+ pairs for all three builds (GRCh36, GRCh37, GRCh38) across ~10 SNPs
13
+ spread over chromosomes 1, 10, 11, 12, 17, 19, and 22. Each entry's positions
14
+ differ by tens of thousands to millions of bases — there is no
15
+ ambiguity. A single matched rsID indicates the build, but a result is
16
+ only reported as confident after three concordant matches (`is_confident`).
17
+
18
+ Position data is normative; headers are not.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import TYPE_CHECKING, NamedTuple
24
+
25
+ if TYPE_CHECKING:
26
+ from collections.abc import Iterable
27
+
28
+ from allelix.models import Variant
29
+
30
# Canonical build labels. These strings are the keys of every per-SNP
# mapping in KNOWN_SNP_POSITIONS and the values reported by detection.
BUILD_GRCH36 = "GRCh36"
BUILD_GRCH37 = "GRCh37"
BUILD_GRCH38 = "GRCh38"

# Authoritative 1-based positions per NCBI dbSNP / Variation API. Each
# entry was cross-checked against the API's SPDI (0-based) + 1 and the
# correct NC accession version for each build. SNPs were chosen for:
#   - presence on virtually every consumer array
#   - clinical or pharmacogenomic relevance (so coverage is high)
#   - distribution across chromosomes so partial-coverage files still
#     hit at least one entry
#
# If the API ever returns inverted labels (mine did for chr11/12/19 due
# to NC accession version quirks), THIS table is the source of truth.
# Verify against dbSNP's web view before editing.
#
# Shape: rsID -> build label -> (chromosome, 1-based position).
# Chromosomes are stored as strings without a "chr" prefix.
#
# NOTE(review): on chromosome 10 the GRCh36 -> GRCh37 shift is 9,599 bp
# for rs1799853 and rs4244285 but 10,010 bp for rs1057910. The three
# SNPs sit within ~200 kb of each other, so the differing shift may be
# genuine or may indicate a transcription error in the GRCh36 column —
# confirm against dbSNP. A wrong entry cannot cause a vote for the wrong
# build; it only prevents that rsID from voting at all.
KNOWN_SNP_POSITIONS: dict[str, dict[str, tuple[str, int]]] = {
    # MTHFR — methylation pathway, chromosome 1 short arm
    "rs1801133": {
        BUILD_GRCH36: ("1", 11778965),
        BUILD_GRCH37: ("1", 11856378),
        BUILD_GRCH38: ("1", 11796321),
    },
    "rs1801131": {
        BUILD_GRCH36: ("1", 11777063),
        BUILD_GRCH37: ("1", 11854476),
        BUILD_GRCH38: ("1", 11794419),
    },
    # CYP2C9 / CYP2C19 cluster — chromosome 10 long arm
    "rs1799853": {
        BUILD_GRCH36: ("10", 96692448),
        BUILD_GRCH37: ("10", 96702047),
        BUILD_GRCH38: ("10", 94942290),
    },
    "rs1057910": {
        BUILD_GRCH36: ("10", 96731043),
        BUILD_GRCH37: ("10", 96741053),
        BUILD_GRCH38: ("10", 94981296),
    },
    "rs4244285": {
        BUILD_GRCH36: ("10", 96532017),
        BUILD_GRCH37: ("10", 96541616),
        BUILD_GRCH38: ("10", 94781859),
    },
    # SLCO1B1 — statin myopathy, chromosome 12
    "rs4149056": {
        BUILD_GRCH36: ("12", 21222816),
        BUILD_GRCH37: ("12", 21331549),
        BUILD_GRCH38: ("12", 21178615),
    },
    # DRD2/ANKK1 — chromosome 11
    "rs1800497": {
        BUILD_GRCH36: ("11", 112776038),
        BUILD_GRCH37: ("11", 113270828),
        BUILD_GRCH38: ("11", 113400106),
    },
    # BRCA1 — hereditary cancer, chromosome 17
    "rs80357906": {
        BUILD_GRCH36: ("17", 38449327),
        BUILD_GRCH37: ("17", 41209080),
        BUILD_GRCH38: ("17", 43057063),
    },
    # APOE — chromosome 19, near telomere
    "rs429358": {
        BUILD_GRCH36: ("19", 50103781),
        BUILD_GRCH37: ("19", 45411941),
        BUILD_GRCH38: ("19", 44908684),
    },
    "rs7412": {
        BUILD_GRCH36: ("19", 50103919),
        BUILD_GRCH37: ("19", 45412079),
        BUILD_GRCH38: ("19", 44908822),
    },
    # COMT — chromosome 22
    "rs4680": {
        BUILD_GRCH36: ("22", 18331271),
        BUILD_GRCH37: ("22", 19951271),
        BUILD_GRCH38: ("22", 19963748),
    },
}
109
+
110
+
111
# Minimum number of concordant table hits before a result is trusted.
_MIN_CONFIDENT_MATCHES = 3


class BuildDetectionResult(NamedTuple):
    """Summary of one build-detection pass over an input file.

    Fields:
        build: ``"GRCh36"``, ``"GRCh37"``, ``"GRCh38"``, or ``None`` when
            no build could be chosen.
        matched: number of table entries whose position agreed with the
            winning build.
        inspected: number of table entries found in the input at all,
            whichever build (if any) their position agreed with.

    ``matched < inspected`` means the input is internally inconsistent
    (for example one rsID sits at its GRCh37 position and another at its
    GRCh38 position). Callers should warn in that case; the majority
    build is still reported.
    """

    build: str | None
    matched: int
    inspected: int

    @property
    def is_confident(self) -> bool:
        """Return True only for a sufficiently supported, unanimous result.

        A lone matching SNP might just be an error in the lookup table,
        so at least ``_MIN_CONFIDENT_MATCHES`` (3) hits are required, and
        every inspected entry must have agreed with the winning build.
        """
        if self.matched < _MIN_CONFIDENT_MATCHES:
            return False
        return self.matched == self.inspected
140
+
141
+
142
def detect_build(variants: Iterable[Variant]) -> BuildDetectionResult:
    """Infer the genome build from the positions of well-known SNPs.

    Each record's ``rsid`` is looked up in ``KNOWN_SNP_POSITIONS``. The
    first occurrence of a known rsID is counted as inspected, and if its
    ``(chromosome, position)`` equals one build's coordinates that build
    receives a vote. Later duplicates of the same rsID are ignored.

    The input is consumed lazily and iteration stops as soon as every
    table entry has been seen, so the full variant list is never held
    in memory.

    Returns:
        A ``BuildDetectionResult``. ``build`` is ``None`` (with
        ``matched=0``) when no known SNP voted for any build or when the
        two highest vote counts are equal; ``inspected`` still reports
        how many table entries were found.
    """
    tally: dict[str, int] = dict.fromkeys((BUILD_GRCH36, BUILD_GRCH37, BUILD_GRCH38), 0)
    unseen = set(KNOWN_SNP_POSITIONS)
    seen_count = 0

    for record in variants:
        rsid = record.rsid
        if rsid not in unseen:
            continue
        unseen.remove(rsid)
        seen_count += 1
        # At most one build can claim this record: take the first whose
        # coordinates equal the record's.
        voted_for = next(
            (
                name
                for name, (chrom, pos) in KNOWN_SNP_POSITIONS[rsid].items()
                if record.chromosome == chrom and record.position == pos
            ),
            None,
        )
        if voted_for is not None:
            tally[voted_for] += 1
        if not unseen:
            break

    # Rank builds by votes; the sort is stable, so a unique maximum is
    # always first. No votes at all, or a tie for first place, yields no
    # winner.
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    best_build, best_votes = ranked[0]
    runner_up_votes = ranked[1][1]
    if best_votes == 0 or best_votes == runner_up_votes:
        return BuildDetectionResult(build=None, matched=0, inspected=seen_count)
    return BuildDetectionResult(build=best_build, matched=best_votes, inspected=seen_count)
178
+
179
+
180
def normalize_build_label(label: str | None) -> str | None:
    """Map a human-written build label to canonical `GRCh36`, `GRCh37`, or `GRCh38`.

    Examples that map to GRCh36: `"GRCh36"`, `"hg18"`, `"build 36"`,
    `"NCBI 36"`. Examples for GRCh37: `"GRCh37"`, `"grch37"`, `"hg19"`,
    `"37.1"`, `"build 37.1"`, `"NCBI 37"`. Examples for GRCh38: `"GRCh38"`,
    `"hg38"`, `"38"`. Unrecognized labels return None.

    Used to compare a file's header-claimed build against the detected
    build. The label space is informal and provider-specific; this
    function only recognizes well-known aliases.

    A build number is recognized only when it stands alone as a complete
    run of digits: `"37.1"` and `"GRCh37"` match, while `"1370"` or
    `"2036"` do not. If several builds are mentioned, the lowest build
    number wins (36, then 37, then 38).

    Args:
        label: Free-form build label, or None.

    Returns:
        The canonical build constant, or None when `label` is empty,
        whitespace-only, or not recognized.
    """
    if not label:
        return None
    text = label.strip().lower()
    if not text:
        return None
    # Break the label into maximal runs of ASCII digits so a build number
    # only matches as a whole number, never as part of a longer one.
    # Done with plain string operations rather than a regex.
    digit_runs = set("".join(ch if ch in "0123456789" else " " for ch in text).split())
    if "36" in digit_runs or "hg18" in text:
        return BUILD_GRCH36
    if "37" in digit_runs or "hg19" in text:
        return BUILD_GRCH37
    if "38" in digit_runs or "hg38" in text:
        return BUILD_GRCH38
    return None
@@ -0,0 +1,276 @@
1
+ Metadata-Version: 2.4
2
+ Name: allelix
3
+ Version: 1.8.1
4
+ Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
+ Author-email: dial481 <dial481@users.noreply.github.com>
6
+ License-Expression: AGPL-3.0-or-later
7
+ Project-URL: Homepage, https://github.com/dial481/allelix
8
+ Project-URL: Issues, https://github.com/dial481/allelix/issues
9
+ Keywords: genomics,genotype,snp,bioinformatics,dna
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: click>=8.2
21
+ Requires-Dist: mwparserfromhell>=0.6
22
+ Requires-Dist: pyyaml>=6.0
23
+ Requires-Dist: rich>=13.7
24
+ Provides-Extra: cadd
25
+ Requires-Dist: pysam>=0.22; extra == "cadd"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pre-commit>=3.7; extra == "dev"
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-cov<8,>=7; extra == "dev"
30
+ Requires-Dist: ruff>=0.6; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # Allelix
34
+
35
+ Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
36
+
37
+ > **Status:** Production — six parser formats, four annotators (ClinVar +
38
+ > PharmGKB + GWAS Catalog + SNPedia), three enrichment sources (gnomAD
39
+ > population frequencies + AlphaMissense pathogenicity + CADD
40
+ > deleteriousness), licensable-source gating for commercial users,
41
+ > dual-build ClinVar caches (GRCh37 + GRCh38),
42
+ > HTML/JSON/terminal reports, methylation + pharmacogenomics focused
43
+ > commands, report diffing, persistent config with commercial-mode
44
+ > safety switch. Build auto-detection from position data (ADR-0021).
45
+ > No regex on prose anywhere in production. **Latest: v1.7.0** — PLINK
46
+ > export, magnitude scoring formalization (ADR-0034). Release notes:
47
+ > [`CHANGELOG.md`](CHANGELOG.md).
48
+
49
+ ## Quickstart
50
+
51
+ Requires Python 3.11+.
52
+
53
+ ```bash
54
+ git clone https://github.com/dial481/allelix
55
+ cd allelix
56
+ python -m venv .venv
57
+ source .venv/bin/activate
58
+ pip install -e ".[dev]"
59
+
60
+ # Generate a synthetic test fixture
61
+ python tests/generate_mock_data.py
62
+
63
+ # Show summary statistics for a genotype file
64
+ allelix stats tests/fixtures/mock_myhappygenes.txt
65
+
66
+ # Download reference databases. First run downloads all sources (~15GB
67
+ # on disk with gnomAD + AlphaMissense). Use --no-gnomad / --no-alphamissense
68
+ # to skip the large enrichment databases. Re-runs skip unchanged sources.
69
+ # CADD is opt-in: allelix db update --cadd
70
+ allelix db update
71
+ allelix db status # see what's cached
72
+
73
+ # Analyze a genotype file against all ready databases
74
+ allelix analyze tests/fixtures/mock_myhappygenes.txt --min-magnitude 5
75
+
76
+ # Same data, focused subsets
77
+ allelix methylation tests/fixtures/mock_myhappygenes.txt
78
+ allelix pharmacogenomics tests/fixtures/mock_myhappygenes.txt
79
+
80
+ # Compare two genotype files (coverage, concordance, strand-flip detection)
81
+ allelix compare file1.txt file2.txt
82
+
83
+ # Export to PLINK1 binary format (.bed/.bim/.fam) for plink2, ADMIXTURE, PRSice
84
+ # Expect ~60% monomorphic markers (A2=0) — genotyping chips probe many
85
+ # intronic/intergenic sites outside gnomAD's exome coverage.
86
+ allelix export plink genotype_file.txt -o output_prefix --build grch37
87
+
88
+ # Output to a self-contained HTML or JSON report
89
+ allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.html
90
+ allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.json
91
+ ```
92
+
93
+ ## Supported Formats
94
+
95
+ | Format | Status | Notes |
96
+ |---|---|---|
97
+ | MyHappyGenes (Tempus) | ✓ | Tab-delimited, 5 columns. **Build is auto-detected** — real-world MHG exports mislabel the header as "build 37.1" while shipping GRCh38 coordinates. Allelix detects from position data and warns on header/data disagreement (ADR-0021). |
98
+ | 23andMe | ✓ | Tab-delimited, 4 columns, concatenated genotype. Supports build 36/37/38 from header. I-prefixed probe IDs passed through. |
99
+ | AncestryDNA | ✓ | Tab-delimited, 5 columns. Chromosome mapping: 23→X, 24→Y, 25→X (PAR), 26→MT. V1 and V2 chip layouts. |
100
+ | Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
101
+ | MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
102
+ | Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
103
+
104
+ Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
105
+
106
+ ### v2 roadmap
107
+
108
+ | Item | Notes |
109
+ |---|---|
110
+ | VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
111
+ | Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
112
+ | PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
113
+ | Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
114
+
115
+ ## Supported Databases
116
+
117
+ | Database | Status | Notes |
118
+ |---|---|---|
119
+ | ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
120
+ | PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
121
+ | CPIC (per-allele function table) | ✓ | Internal data source for the PharmGKB filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
122
+ | SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
123
+ | GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
124
+ | gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
125
+ | AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
126
+ | CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
127
+
128
+ ### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
129
+
130
+ ADR-0022 + ADR-0023: a tiny residual of PharmGKB rows may appear in reports even when the user is homozygous reference. PharmGKB publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
131
+
132
+ - **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
133
+ - **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
134
+
135
+ For the rare rsID where PharmGKB has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
136
+
137
+ The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
138
+
139
+ ### Known ClinVar upstream data quality issues
140
+
141
+ Two ClinVar rows in real-world reports are known upstream artifacts, not Allelix bugs:
142
+
143
+ - **PKD1 rs199476100 GG (Pathogenic/Likely pathogenic, magnitude 8.5).** This is a stop-gained variant with a gnomAD frequency of 0.0005% (7 observations in 1.38 million chromosomes). Homozygosity for this variant is biologically implausible — PKD1 is autosomal dominant and the nonsense variant would be embryonic-lethal or devastating in homozygous state. The chip genotyping call is almost certainly a probe artifact. The code correctly reports what ClinVar says and what the chip reads; the error is upstream of Allelix. Future work: population-frequency filtering could flag ultra-rare variants where the chip call is likely unreliable.
144
+
145
+ - **IL10 rs1800896 CT (Pathogenic, magnitude 9.0).** This is a common polymorphism (MAF ~20–40%) in the IL-10 promoter. ClinVar's Pathogenic classification comes from a single submitter for hepatitis C susceptibility; a second submitter classifies the same allele as "Uncertain risk allele" for leprosy susceptibility. The ClinVar VCF aggregates across conditions, so the report may pair the Pathogenic classification with the wrong condition. Future work: ClinVar review-status weighting (number of submitters, star rating) could down-weight single-submitter classifications on common variants.
146
+
147
+ Neither issue affects Allelix's filter logic. Both are inherent to ClinVar's aggregation model and the limitations of array-based genotyping chips.
148
+
149
+ ## Regulatory Posture
150
+
151
+ Allelix is an informational research tool. It reports classifications made by external databases. It does not independently classify variants, diagnose conditions, or make health recommendations. All variant significance is attributed to its source — Allelix says "ClinVar classifies this variant as pathogenic," never "this variant is pathogenic."
152
+
153
+ This is not a disclaimer afterthought. It is a design constraint that affects model naming, report wording, and category labeling throughout the codebase.
154
+
155
+ ## Privacy
156
+
157
+ - No data leaves your machine. No telemetry. No uploads. No analytics.
158
+ - Reference databases are downloaded via `allelix db update` and cached locally.
159
+ - Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
160
+
161
+ ## Configuration
162
+
163
+ Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.
164
+
165
+ ```bash
166
+ # View current config (annotated with license notes)
167
+ allelix config show
168
+
169
+ # Read a single key
170
+ allelix config get sources.cadd
171
+ allelix config get license.commercial
172
+
173
+ # Disable a source permanently
174
+ allelix config set sources.gnomad false
175
+
176
+ # Enable commercial mode (auto-disables non-commercial sources)
177
+ allelix config set license.commercial true
178
+
179
+ # Assert that you hold a commercial CADD license
180
+ allelix config set license.cadd true
181
+ ```
182
+
183
+ CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
184
+
185
+ ### Database sizes and download times
186
+
187
+ Not all databases are equal in size. `allelix db update` downloads them all by default, but you can skip the large ones if disk space or bandwidth is a concern:
188
+
189
+ | Database | On disk | Download time | What it adds |
190
+ |---|---|---|---|
191
+ | ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
192
+ | PharmGKB + CPIC | ~6MB | seconds | Drug-gene interactions. |
193
+ | GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
194
+ | gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
195
+ | AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
196
+ | CADD (opt-in) | ~5GB | 5–15 min | Variant deleteriousness scores (how damaging is this variant?). Enable with `--cadd`. |
197
+
198
+ gnomAD and AlphaMissense are the largest but add the most interpretive context. gnomAD answers "is this variant rare or common?" — a pathogenic variant carried by 35% of the population reads very differently from one seen in 3 people. AlphaMissense answers "does this missense change likely damage the protein?" — especially valuable for the thousands of variants ClinVar hasn't reviewed yet.
199
+
200
+ To skip either during download: `allelix db update --no-gnomad --no-alphamissense`. To disable permanently: `allelix config set sources.gnomad false`.
201
+
202
+ ## Data Sources & Licensing
203
+
204
+ Allelix source code is licensed under the **GNU Affero General Public License v3.0 or later** (AGPL-3.0-or-later). Allelix ships with **zero third-party data**. All reference databases are downloaded by the user at runtime via `allelix db update`. Each database retains its original license on the user's machine:
205
+
206
+ | Database | Source | License | Usage |
207
+ |---|---|---|---|
208
+ | ClinVar | NCBI | Public domain | No restrictions |
209
+ | GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
210
+ | PharmGKB | pharmgkb.org | CC BY-SA 4.0 | Attribution required |
211
+ | CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the PharmGKB non-finding filter (ADR-0020), not surfaced as its own annotator. |
212
+ | SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
213
+ | gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
214
+ | AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
215
+ | CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
216
+
217
+ **Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
218
+
219
+ ### SNPedia data download
220
+
221
+ SNPedia data is downloaded automatically by `allelix db update` from a pre-built cache. If the SNPedia database is not present, `allelix analyze` runs normally using all other databases and prints a note that SNPedia data is not available.
222
+
223
+ To rebuild the cache from source (not normally needed):
224
+
225
+ ```bash
226
+ python scripts/scrape_snpedia.py # scrape 216K pages from bots.snpedia.com (1-4 hours)
227
+ python scripts/parse_snpedia.py # parse raw wiki markup into structured genotype rows
228
+ ```
229
+
230
+ ### Known SNPedia source data quality notes
231
+
232
+ SNPedia appears frozen — no edits have been observed since mid-2023. The data below reflects the state of the wiki at scrape time (May 2026) and is unlikely to change.
233
+
234
+ Of the 104,806 genotype pages in the archive:
235
+
236
+ - **103 pages have empty or missing allele fields.** These are incomplete entries on the source wiki — the `{{Genotype}}` template was created but the `allele1`/`allele2` fields were never filled in (e.g., `Rs1131692198(;)` with `|allele1=\n|allele2=\n`). All 103 were verified against the live site on 2026-05-21; every one matches the source exactly. The annotator silently skips these — they cannot match any user genotype.
237
+
238
+ - **1 page has no `{{Genotype}}` template at all.** `Rs1799853(T)` is a malformed single-allele page (`{{is a|genotype}}` instead of a proper genotype template). Skipped by the parser.
239
+
240
+ - **2 pages have a space before the parenthesis in the title** (`Rs52820871 (G;G)` and `Rs52820871 (G;T)` instead of the standard `Rs52820871(G;G)` format). The annotator handles both title styles.
241
+
242
+ None of these are scraping errors. They are editorial inconsistencies on the source wiki. The annotator handles all of them correctly: incomplete entries are skipped, variant title formats are matched, and no false annotations are produced.
243
+
244
+ ## Architecture & Design Decisions
245
+
246
+ The "why" behind major design choices lives in [`docs/adr/`](docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
247
+
248
+ Notable load-bearing ADRs:
249
+
250
+ - **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
251
+ - **ADR-0020 — CPIC API as the per-allele function source.** The PharmGKB non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
252
+ - **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar; the GWAS Catalog annotator applies the same carrier rule to the risk allele.
253
+ - **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
254
+ - **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
255
+
256
+ Release history: see [`CHANGELOG.md`](CHANGELOG.md).
257
+
258
+ ## Development
259
+
260
+ ```bash
261
+ source .venv/bin/activate
262
+ pip install -e ".[dev]"
263
+
264
+ # One-time: install pre-commit hooks
265
+ pre-commit install --hook-type pre-commit
266
+
267
+ ruff check .
268
+ ruff format --check .
269
+ pytest
270
+ ```
271
+
272
+ The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit is blocked, fix the underlying problem rather than skipping the hook.
273
+
274
+ ## License
275
+
276
+ AGPL-3.0-or-later. See `LICENSE`.
@@ -0,0 +1,58 @@
1
+ allelix/__init__.py,sha256=ZORQwa3PR3CO1oqL_SgpZWTYkjSaYVgf5HBWNXNjfdk,330
2
+ allelix/cli.py,sha256=xAKGAX1bdX6VufZ1hMz1Yn1eBIGNpreAQmcREl-zHfQ,52846
3
+ allelix/compare.py,sha256=wVprVPoxwHbdVzjKgu2Yeadb-W9luwyo2f9liVOpaxg,4505
4
+ allelix/config.py,sha256=7e7VKX7vBPHdkAMeKCxMMhNqESF1Km7IiMIVqrG5u8Q,4598
5
+ allelix/models.py,sha256=_NLleKp0dYannVJdv_DO0tXKvRZhVQMqNAvRCQpjtzQ,4379
6
+ allelix/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ allelix/annotators/__init__.py,sha256=u2UezbC0dmhv1hHsfdtBvmya2ONQClUgzau3D4gNz7k,3240
8
+ allelix/annotators/alphamissense.py,sha256=JJ8Qui1ZrHRgVzcm5yaIaevuYFPqT9YId5GlcGFgx5c,8831
9
+ allelix/annotators/base.py,sha256=ymVFs8fF80BtLj6UJDQOSIaeOgjbeuQ_i4wPHbZVV2g,7373
10
+ allelix/annotators/cadd.py,sha256=wTZawo8X_FjsoOmt3yXSNFGGbLDiMwbxIb4Kq2-dP7o,10911
11
+ allelix/annotators/clinvar.py,sha256=aem3rHyQ_e8F82D3uwt4n8Cn69OUVI5XdlZUCECxt-4,16052
12
+ allelix/annotators/gnomad.py,sha256=lLMsPE5jRuZxUrsyx1Lc7REwSMWqQe7zZ0b13a8g0GY,7973
13
+ allelix/annotators/gwas.py,sha256=ZOZw68M_fRda7zP_ACsCkydLH3yKUvH8FX6ANlhLq4o,12878
14
+ allelix/annotators/pharmgkb.py,sha256=dLrU7PZKuIFxiQTajyDptHOWdfywczoDS0P4rpSG1Tw,15762
15
+ allelix/annotators/snpedia.py,sha256=Fly_JBu31t2RBymOBn03DHHwOc7ACY_27nlTm3bGi8I,9799
16
+ allelix/data/__init__.py,sha256=UvPduQUaJ8VJyyyOvkdfnaew6qDBFyNTr7u6qstrP_A,112
17
+ allelix/data/high_value_snps.yaml,sha256=otCp9DSa3TAyTvKc0a40vfUf9nDfJ9eYG00nNE8dzVo,1493
18
+ allelix/databases/__init__.py,sha256=0wY_4MoIAsa1JGtbh_hUVecdDhDLeZLk5XkL6dAf3Ew,966
19
+ allelix/databases/_versions.py,sha256=DFVgLow6BBRYbelaDghEGVv6MATKxdA-RfjiVBMps0c,625
20
+ allelix/databases/alphamissense_loader.py,sha256=h-1NH85eu9TeyU_jYtJdDgzpfvmmN8l-bUbW0sjxZSE,1467
21
+ allelix/databases/cadd_loader.py,sha256=r1qZAIXxC8qMsbnX0227RZQwXM9n0ujboItQNkoB9Hk,1431
22
+ allelix/databases/cpic_loader.py,sha256=isKPYr7mtt2ybAe77nO5BScJ5BeCnEKPuU6izRgAvUc,9618
23
+ allelix/databases/gnomad_loader.py,sha256=oG-UJbUSkYXMY-s0SRdVuAymdNqU0LWx1YW7fzeUgRk,1434
24
+ allelix/databases/gwas_loader.py,sha256=7txpNK91PTvoAI0PLZBFSxrIHxRczanDZPaezxUgN94,14993
25
+ allelix/databases/loader_utils.py,sha256=Zm-I3xAsUylvVgiRhoT0Sw0cYS5ri-j6niwjvwLuhHs,2524
26
+ allelix/databases/manager.py,sha256=dPLH2PGNvRFruiPBWtuNs-A7Z5cXrf421biNZwOdIWE,19225
27
+ allelix/databases/pharmgkb_loader.py,sha256=KXD4jwgeBtBbxS6qwqW5BwU-G-F7BMf4vOXeDuPPvbY,17202
28
+ allelix/databases/schema.py,sha256=gbpm1A1w0qjgSiBHgHTpFDKCGlO3kGzLrJK36x_QsJw,4062
29
+ allelix/databases/snpedia_loader.py,sha256=Cj4PMuLkU_NEP0ot-IOZWU9Phvr-jIlaQ5krzJZqjIo,1195
30
+ allelix/databases/snpedia_parser.py,sha256=B7Imp0dbeBvpheDz5AUoZZ6OhoKdVp0yf_GH-Au5s4s,11372
31
+ allelix/exporters/__init__.py,sha256=byh4VaPpL4E3mKWN7a78cjLybRK40UZM_ULLyAEpOk8,123
32
+ allelix/exporters/plink.py,sha256=HOjL3H2kMdw3pBX7j7rx4qIq7pbM9MlrYx8dCfp9csM,4350
33
+ allelix/parsers/__init__.py,sha256=Tf1NyLdR321X_jv-RCSVJQlfIExgStN7MX7tpvy8Ba8,2077
34
+ allelix/parsers/_helpers.py,sha256=2kwwb-bkUynAlW60_b6r1fdfPKgCnhsVYdSzqCRoLNI,1338
35
+ allelix/parsers/ancestrydna.py,sha256=pC717KyeK9U3m_GZb91kLMKmy4yBtgnlweaiXoc-o44,4497
36
+ allelix/parsers/base.py,sha256=_YEM1BR-43WT8Ppg7jzZ6dfXmYqb3R1hj7LqmMey72c,3047
37
+ allelix/parsers/ftdna.py,sha256=YAFSjK5z0MmedpRIcWt2TNgLcxlygqI3HO96nE824Ug,4702
38
+ allelix/parsers/livingdna.py,sha256=GQHh_mfejVZI2RmWDg6_pAPEDkikfegg1nOht7WWVzA,4409
39
+ allelix/parsers/myhappygenes.py,sha256=XV56ifbWm0mKLXp9sCAGBDHZyckHQn29G8ix6BDnnsk,4911
40
+ allelix/parsers/myheritage.py,sha256=Yan5bdUOuGQcslNV4uYHMaj8Rhoy1pHi9bmg_LAh1wM,4311
41
+ allelix/parsers/twentythreeandme.py,sha256=tGXi0T99GaPf7khX1UIBkLfZ3j5MA83yzgVeocj93j0,5608
42
+ allelix/reports/__init__.py,sha256=nk0YHkKrHkUe2GHXvC9S4qZnDzkToHcjAVXUwHmxT9c,1430
43
+ allelix/reports/_pipeline.py,sha256=JPzrGqwRfzS6X7pZifs62MU2MsDbmE7ibb56sXY7v_8,18273
44
+ allelix/reports/diff.py,sha256=5TSa88ibYtK-kSgqAs4r-dfyCgjnoriAEVQY1DL78HY,5534
45
+ allelix/reports/high_value.py,sha256=dAqE9RYZ1E8gjg9XeG7lTooTUxkBVlOpWwPgMFcrfBA,4392
46
+ allelix/reports/html.py,sha256=P0ZGX-uKuNQhezaXyp2J6ntuU1K_hWVyFnMyHSt6F7I,41659
47
+ allelix/reports/json_report.py,sha256=tbWWRHISkUZiNc1SQ3XUhXe-NHWJR8hojU4U63O1AwM,5192
48
+ allelix/reports/methylation.py,sha256=tN0OIZ6WfvRk-XgEYP4We7iUmfCLweDUuJxkiCs4sbA,1081
49
+ allelix/reports/terminal.py,sha256=aKXh_J_iVmvQh5Qa9ObmdPytCPtT-0IRU899o5w4wfc,7233
50
+ allelix/utils/__init__.py,sha256=xjWQJxJ5kXynPxmk_przVjNXDEnFp8-K3uFegVubslk,138
51
+ allelix/utils/allele.py,sha256=Ei4mDPh-WKNPth42la1C2XecLXUDW8XYmnpBBi4cD7Q,3318
52
+ allelix/utils/build_detect.py,sha256=4rDLPq0L2uqp9tiBIPkmYB_OD4UTfqqxLOk0U7_kn4c,7499
53
+ allelix-1.8.1.dist-info/licenses/LICENSE,sha256=QWRhgdtPnw-Xay85rhILM_cxYofYh91NfTOWwI_AOQI,34841
54
+ allelix-1.8.1.dist-info/METADATA,sha256=hlmHN7VUG7cBWtAN8CtluiPG-n-un3xqtEpmh7ZxndY,21040
55
+ allelix-1.8.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
56
+ allelix-1.8.1.dist-info/entry_points.txt,sha256=n75_TahJixgNPt1Xhk2V1wDR3RNCiKq4xAKa4_skYNI,45
57
+ allelix-1.8.1.dist-info/top_level.txt,sha256=2SoebaRt4WWVq8Ynk-5uNN1ATE_dJtOqsgj39oCHVbk,8
58
+ allelix-1.8.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ allelix = allelix.cli:main