PyPI - allelix - Versions diffs - 1.8.1__py3-none-any.whl - Mend

allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

allelix/__init__.py +12 -0
allelix/annotators/__init__.py +90 -0
allelix/annotators/alphamissense.py +228 -0
allelix/annotators/base.py +214 -0
allelix/annotators/cadd.py +283 -0
allelix/annotators/clinvar.py +404 -0
allelix/annotators/gnomad.py +212 -0
allelix/annotators/gwas.py +354 -0
allelix/annotators/pharmgkb.py +406 -0
allelix/annotators/snpedia.py +276 -0
allelix/cli.py +1524 -0
allelix/compare.py +149 -0
allelix/config.py +143 -0
allelix/data/__init__.py +3 -0
allelix/data/high_value_snps.yaml +64 -0
allelix/databases/__init__.py +30 -0
allelix/databases/_versions.py +16 -0
allelix/databases/alphamissense_loader.py +48 -0
allelix/databases/cadd_loader.py +49 -0
allelix/databases/cpic_loader.py +234 -0
allelix/databases/gnomad_loader.py +49 -0
allelix/databases/gwas_loader.py +546 -0
allelix/databases/loader_utils.py +80 -0
allelix/databases/manager.py +515 -0
allelix/databases/pharmgkb_loader.py +437 -0
allelix/databases/schema.py +165 -0
allelix/databases/snpedia_loader.py +44 -0
allelix/databases/snpedia_parser.py +342 -0
allelix/exporters/__init__.py +3 -0
allelix/exporters/plink.py +144 -0
allelix/models.py +117 -0
allelix/parsers/__init__.py +73 -0
allelix/parsers/_helpers.py +41 -0
allelix/parsers/ancestrydna.py +130 -0
allelix/parsers/base.py +97 -0
allelix/parsers/ftdna.py +129 -0
allelix/parsers/livingdna.py +121 -0
allelix/parsers/myhappygenes.py +135 -0
allelix/parsers/myheritage.py +118 -0
allelix/parsers/twentythreeandme.py +150 -0
allelix/py.typed +0 -0
allelix/reports/__init__.py +40 -0
allelix/reports/_pipeline.py +497 -0
allelix/reports/diff.py +169 -0
allelix/reports/high_value.py +133 -0
allelix/reports/html.py +1130 -0
allelix/reports/json_report.py +163 -0
allelix/reports/methylation.py +50 -0
allelix/reports/terminal.py +203 -0
allelix/utils/__init__.py +3 -0
allelix/utils/allele.py +87 -0
allelix/utils/build_detect.py +203 -0
allelix-1.8.1.dist-info/METADATA +276 -0
allelix-1.8.1.dist-info/RECORD +58 -0
allelix-1.8.1.dist-info/WHEEL +5 -0
allelix-1.8.1.dist-info/entry_points.txt +2 -0
allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
allelix-1.8.1.dist-info/top_level.txt +1 -0

allelix/utils/build_detect.py ADDED Viewed

@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2026 dial481
+"""Genome build detection from position data.
+ADR-0021: Allelix detects the build of an input genotype file from a
+handful of well-known SNP positions rather than trusting the file header.
+A real-world MyHappyGenes/Tempus file was confirmed to label its build
+as "37.1" while shipping GRCh38 coordinates; cross-build REF/ALT
+comparison produced a false-positive pathogenic call on NIPA1.
+The detection table holds authoritative (chromosome, 1-based position)
+pairs for all three builds (GRCh36, GRCh37, GRCh38) across 11 SNPs
+spread over chromosomes 1, 10, 11, 12, 17, 19, and 22. Each entry's positions
+differ between builds by roughly ten thousand to millions of bases, so a
+given position can match at most one build. A single matched rsID
+identifies the build; additional matches are confirmatory, and the result
+is only flagged as confident once at least three rsIDs match and agree.
+Position data is normative; headers are not.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, NamedTuple
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+    from allelix.models import Variant
+BUILD_GRCH36 = "GRCh36"  # canonical label; "hg18" / "NCBI 36" normalize to this
+BUILD_GRCH37 = "GRCh37"  # canonical label; "hg19" / "37.1" normalize to this
+BUILD_GRCH38 = "GRCh38"  # canonical label; "hg38" normalizes to this
+# Authoritative 1-based positions per NCBI dbSNP / Variation API. Each
+# entry was cross-checked against the API's SPDI (0-based) + 1 and the
+# correct NC accession version for each build. SNPs were chosen for:
+#   - presence on virtually every consumer array
+#   - clinical or pharmacogenomic relevance (so coverage is high)
+#   - distribution across chromosomes so partial-coverage files still
+#     hit at least one entry
+#
+# If the API ever returns inverted labels (mine did for chr11/12/19 due
+# to NC accession version quirks), THIS table is the source of truth.
+# Verify against dbSNP's web view before editing.
+KNOWN_SNP_POSITIONS: dict[str, dict[str, tuple[str, int]]] = {  # rsID -> build label -> (chromosome, 1-based position)
+    # MTHFR — methylation pathway, chromosome 1 short arm
+    "rs1801133": {
+        BUILD_GRCH36: ("1", 11778965),
+        BUILD_GRCH37: ("1", 11856378),
+        BUILD_GRCH38: ("1", 11796321),
+    },
+    "rs1801131": {
+        BUILD_GRCH36: ("1", 11777063),
+        BUILD_GRCH37: ("1", 11854476),
+        BUILD_GRCH38: ("1", 11794419),
+    },
+    # CYP2C9 / CYP2C19 cluster — chromosome 10 long arm
+    "rs1799853": {
+        BUILD_GRCH36: ("10", 96692448),
+        BUILD_GRCH37: ("10", 96702047),
+        BUILD_GRCH38: ("10", 94942290),
+    },
+    "rs1057910": {
+        BUILD_GRCH36: ("10", 96731043),
+        BUILD_GRCH37: ("10", 96741053),
+        BUILD_GRCH38: ("10", 94981296),
+    },
+    "rs4244285": {
+        BUILD_GRCH36: ("10", 96532017),
+        BUILD_GRCH37: ("10", 96541616),
+        BUILD_GRCH38: ("10", 94781859),
+    },
+    # SLCO1B1 — statin myopathy, chromosome 12
+    "rs4149056": {
+        BUILD_GRCH36: ("12", 21222816),
+        BUILD_GRCH37: ("12", 21331549),
+        BUILD_GRCH38: ("12", 21178615),
+    },
+    # DRD2/ANKK1 — chromosome 11
+    "rs1800497": {
+        BUILD_GRCH36: ("11", 112776038),
+        BUILD_GRCH37: ("11", 113270828),
+        BUILD_GRCH38: ("11", 113400106),
+    },
+    # BRCA1 — hereditary cancer, chromosome 17
+    "rs80357906": {
+        BUILD_GRCH36: ("17", 38449327),
+        BUILD_GRCH37: ("17", 41209080),
+        BUILD_GRCH38: ("17", 43057063),
+    },
+    # APOE — chromosome 19, near telomere
+    "rs429358": {
+        BUILD_GRCH36: ("19", 50103781),
+        BUILD_GRCH37: ("19", 45411941),
+        BUILD_GRCH38: ("19", 44908684),
+    },
+    "rs7412": {
+        BUILD_GRCH36: ("19", 50103919),
+        BUILD_GRCH37: ("19", 45412079),
+        BUILD_GRCH38: ("19", 44908822),
+    },
+    # COMT — chromosome 22
+    "rs4680": {
+        BUILD_GRCH36: ("22", 18331271),
+        BUILD_GRCH37: ("22", 19951271),
+        BUILD_GRCH38: ("22", 19963748),
+    },
+}
+_MIN_CONFIDENT_MATCHES = 3  # minimum concordant rsID matches before a detection counts as confident
+class BuildDetectionResult(NamedTuple):
+    """Outcome of build detection on an input file.
+    `build` is `"GRCh36"`, `"GRCh37"`, `"GRCh38"`, or None if no known SNPs were
+    found in the input. `matched` counts how many table entries matched
+    the winning build; `inspected` counts how many table entries were
+    found in the input (regardless of which build their positions
+    matched). When `matched < inspected` the file is internally
+    inconsistent (e.g., one rsID matches GRCh37, another matches
+    GRCh38) — surface a warning but pick the majority.
+    """
+    build: str | None
+    matched: int
+    inspected: int
+    @property
+    def is_confident(self) -> bool:
+        """True iff enough rsIDs matched and all matches agreed.
+        Requires at least ``_MIN_CONFIDENT_MATCHES`` (3) concordant
+        positions before declaring confident. A single-SNP match
+        could be a table error; three concordant matches across
+        different chromosomes eliminates that risk.
+        """
+        return self.matched >= _MIN_CONFIDENT_MATCHES and self.matched == self.inspected
+def detect_build(variants: Iterable[Variant]) -> BuildDetectionResult:
+    """Detect the genome build of an iterable of `Variant` records.
+    Iterates the input, looking for any rsID in `KNOWN_SNP_POSITIONS`,
+    and tallies which build's (chromosome, position) each match votes
+    for. Returns when every entry in the table has been seen OR the
+    input is exhausted. Streaming-friendly — does not materialize the
+    full variant list.
+    """
+    votes: dict[str, int] = {BUILD_GRCH36: 0, BUILD_GRCH37: 0, BUILD_GRCH38: 0}
+    inspected = 0
+    remaining = set(KNOWN_SNP_POSITIONS)
+    for variant in variants:
+        if variant.rsid not in remaining:
+            continue
+        entry = KNOWN_SNP_POSITIONS[variant.rsid]
+        remaining.discard(variant.rsid)
+        inspected += 1
+        for build, (chrom, pos) in entry.items():
+            if variant.chromosome == chrom and variant.position == pos:
+                votes[build] += 1
+                break
+        if not remaining:
+            break
+    if inspected == 0:
+        return BuildDetectionResult(build=None, matched=0, inspected=0)
+    winner = max(votes, key=votes.__getitem__)
+    if votes[winner] == 0:
+        return BuildDetectionResult(build=None, matched=0, inspected=inspected)
+    # Tie between two builds with equal non-zero votes — don't pick.
+    top_counts = sorted(votes.values(), reverse=True)
+    if top_counts[0] == top_counts[1]:
+        return BuildDetectionResult(build=None, matched=0, inspected=inspected)
+    return BuildDetectionResult(build=winner, matched=votes[winner], inspected=inspected)
+def normalize_build_label(label: str | None) -> str | None:
+    """Map a human-written build label to canonical `GRCh36`, `GRCh37`, or `GRCh38`.
+    Examples that map to GRCh36: `"GRCh36"`, `"hg18"`, `"build 36"`,
+    `"NCBI 36"`. Examples for GRCh37: `"GRCh37"`, `"grch37"`, `"hg19"`,
+    `"37.1"`, `"build 37.1"`, `"NCBI 37"`. Examples for GRCh38: `"GRCh38"`,
+    `"hg38"`, `"38"`. Unrecognized labels return None.
+    Used to compare a file's header-claimed build against the detected
+    build. The label space is informal and provider-specific; this
+    function only recognizes well-known aliases.
+    """
+    if not label:
+        return None
+    s = label.strip().lower()
+    if not s:
+        return None
+    if "36" in s or "hg18" in s or "ncbi36" in s or "ncbi 36" in s:
+        return BUILD_GRCH36
+    if "37" in s or "hg19" in s or "ncbi37" in s or "ncbi 37" in s:
+        return BUILD_GRCH37
+    if "38" in s or "hg38" in s or "ncbi38" in s or "ncbi 38" in s:
+        return BUILD_GRCH38
+    return None

allelix-1.8.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,276 @@
+Metadata-Version: 2.4
+Name: allelix
+Version: 1.8.1
+Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
+Author-email: dial481 <dial481@users.noreply.github.com>
+License-Expression: AGPL-3.0-or-later
+Project-URL: Homepage, https://github.com/dial481/allelix
+Project-URL: Issues, https://github.com/dial481/allelix/issues
+Keywords: genomics,genotype,snp,bioinformatics,dna
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: click>=8.2
+Requires-Dist: mwparserfromhell>=0.6
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: rich>=13.7
+Provides-Extra: cadd
+Requires-Dist: pysam>=0.22; extra == "cadd"
+Provides-Extra: dev
+Requires-Dist: pre-commit>=3.7; extra == "dev"
+Requires-Dist: pytest>=8.0; extra == "dev"
+Requires-Dist: pytest-cov<8,>=7; extra == "dev"
+Requires-Dist: ruff>=0.6; extra == "dev"
+Dynamic: license-file
+# Allelix
+Open-source command-line toolkit for analyzing raw genotype files from consumer DNA testing services. Format-agnostic ingestion, database-agnostic annotation, offline-first.
+> **Status:** Production — six parser formats, four annotators (ClinVar +
+> PharmGKB + GWAS Catalog + SNPedia), three enrichment sources (gnomAD
+> population frequencies + AlphaMissense pathogenicity + CADD
+> deleteriousness), licensable-source gating for commercial users,
+> dual-build ClinVar caches (GRCh37 + GRCh38),
+> HTML/JSON/terminal reports, methylation + pharmacogenomics focused
+> commands, report diffing, persistent config with commercial-mode
+> safety switch. Build auto-detection from position data (ADR-0021).
+> No regex on prose anywhere in production. **Latest: v1.7.0** — PLINK
+> export, magnitude scoring formalization (ADR-0034). Release notes:
+> [`CHANGELOG.md`](CHANGELOG.md).
+## Quickstart
+Requires Python 3.11+.
+```bash
+git clone https://github.com/dial481/allelix
+cd allelix
+python -m venv .venv
+source .venv/bin/activate
+pip install -e ".[dev]"
+# Generate a synthetic test fixture
+python tests/generate_mock_data.py
+# Show summary statistics for a genotype file
+allelix stats tests/fixtures/mock_myhappygenes.txt
+# Download reference databases. First run downloads all sources (~15GB
+# on disk with gnomAD + AlphaMissense). Use --no-gnomad / --no-alphamissense
+# to skip the large enrichment databases. Re-runs skip unchanged sources.
+# CADD is opt-in: allelix db update --cadd
+allelix db update
+allelix db status   # see what's cached
+# Analyze a genotype file against all ready databases
+allelix analyze tests/fixtures/mock_myhappygenes.txt --min-magnitude 5
+# Same data, focused subsets
+allelix methylation tests/fixtures/mock_myhappygenes.txt
+allelix pharmacogenomics tests/fixtures/mock_myhappygenes.txt
+# Compare two genotype files (coverage, concordance, strand-flip detection)
+allelix compare file1.txt file2.txt
+# Export to PLINK1 binary format (.bed/.bim/.fam) for plink2, ADMIXTURE, PRSice
+# Expect ~60% monomorphic markers (A2=0) — genotyping chips probe many
+# intronic/intergenic sites outside gnomAD's exome coverage.
+allelix export plink genotype_file.txt -o output_prefix --build grch37
+# Output to a self-contained HTML or JSON report
+allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.html
+allelix analyze tests/fixtures/mock_myhappygenes.txt --output report.json
+```
+## Supported Formats
+| Format | Status | Notes |
+|---|---|---|
+| MyHappyGenes (Tempus) | ✓ | Tab-delimited, 5 columns. **Build is auto-detected** — real-world MHG exports mislabel the header as "build 37.1" while shipping GRCh38 coordinates. Allelix detects from position data and warns on header/data disagreement (ADR-0021). |
+| 23andMe | ✓ | Tab-delimited, 4 columns, concatenated genotype. Supports build 36/37/38 from header. I-prefixed probe IDs passed through. |
+| AncestryDNA | ✓ | Tab-delimited, 5 columns. Chromosome mapping: 23→X, 24→Y, 25→X (PAR), 26→MT. V1 and V2 chip layouts. |
+| Family Tree DNA | ✓ | CSV, double-quoted fields, concatenated genotype. Build 37 default. |
+| MyHeritage DNA | ✓ | CSV, same structure as FTDNA. Detected by "MyHeritage" in comment header. Handles double-double-quoted field variant. |
+| Living DNA | ✓ | Tab-delimited despite `.csv` extension. Handles AX-, AFFX-prefixed and CHR:POS positional SNP IDs. |
+Adding a new format means adding one file to `allelix/parsers/` and registering an instance in the `PARSERS` list in `allelix/parsers/__init__.py`.
+### v2 roadmap
+| Format | Notes |
+|---|---|
+| VCF | REF/ALT encoding, `0/1` genotype notation, absence-means-reference semantics. Architecturally different from array parsers — 4-6M variants per file, streaming + batch SQL required. |
+| Per-source scoring | Magnitude breakdown by database. Users see which source drove the composite score. |
+| PLINK import | Read .bed/.bim/.fam as an input format (complement to the v1.7.0 export). |
+| Genome Watchtower | Real-time variant monitoring via database delta feeds. Privacy-preserving: server publishes universal feed, matching happens locally against your deviation set. Replaces full re-analysis with millisecond set intersection. |
+## Supported Databases
+| Database | Status | Notes |
+|---|---|---|
+| ClinVar (GRCh37 + GRCh38) | ✓ | Public domain (NCBI). SNVs + indels + multi-allelic sites. **Both builds cached**; `analyze` dispatches by detected build (ADR-0021). Carrier rule (ADR-0007) requires the user to carry the ALT allele. Indel-anchor protection (ADR-0011) prevents single-base array readouts from matching anchor-base indels. |
+| PharmGKB | ✓ | CC BY-SA 4.0. Clinical annotations only — single-rsid SNVs; star alleles and haplotypes deferred (ADR-0009). **Primary non-finding filter is the ClinVar REF carrier rule (ADR-0023):** if ClinVar publishes a single-base REF for the rsid and the user is homozygous for it, the row is suppressed. CPIC's `(rsid, base) → function_class` join (ADR-0020) survives as a secondary tier for rsids ClinVar doesn't catalog. Earlier prose tiers (ADR-0013, ADR-0017, ADR-0018) are superseded. |
+| CPIC (per-allele function table) | ✓ | Internal data source for the PharmGKB filter. Fetched from `api.cpicpgx.org` at `db update` time. Used to populate the `pharmgkb_allele_function` table — not surfaced to end users as its own annotator. |
+| SNPedia | ✓ | CC BY-NC-SA 3.0 US. Pre-built cache downloaded via `db update` (~216K wiki pages, ~105K genotype rows). If the SNPedia database is absent, analysis runs without it. For commercial use, pass `--exclude-snpedia` — `analyze` runs using all other databases and omits SNPedia annotations. The cache can also be rebuilt from source via `scripts/scrape_snpedia.py` + `scripts/parse_snpedia.py`. |
+| GWAS Catalog | ✓ | Public domain (EBI/NHGRI). Trait–SNP associations with p-values and effect sizes. Carrier rule (ADR-0007) requires the user to carry the risk allele. P-value magnitude scoring (ADR-0024) maps continuous p-values to the 0–10 scale; unknown-risk-allele entries fire on rsID match alone but are capped at 3.0. |
+| gnomAD | ✓ | ODbL v1.0. **Enrichment annotator** — adds population allele frequency context to existing annotations. Shows how common each variant is in the general population (~16M exome variants from 730K individuals). A pathogenic variant that 35% of people carry reads very differently from one seen in 0.001%. Pre-built cache downloaded via `db update` (~6GB on disk). Use `--no-gnomad` to skip. |
+| AlphaMissense | ✓ | CC BY 4.0. **Enrichment annotator** — adds DeepMind's protein-structure-based pathogenicity predictions to existing annotations. Scores 71M missense variants on a 0–1 scale: <0.34 = likely benign, >0.564 = likely pathogenic. Complements ClinVar's expert classifications with computational predictions — especially valuable for variants ClinVar hasn't reviewed yet. Pre-built cache downloaded via `db update` (~8GB on disk). Use `--no-alphamissense` to skip. |
+| CADD | ✓ | LicenseRef-CADD (non-commercial). **Enrichment annotator** — adds PHRED-scaled deleteriousness scores from CADD v1.7. Ranks how deleterious any single-nucleotide variant is using 100+ annotation tracks (coding, non-coding, regulatory). PHRED 10 = top 10% most deleterious, 20 = top 1%, 30 = top 0.1%. **Opt-in** — disabled by default (`sources.cadd = false`). Enable via `allelix db update --cadd` or `allelix config set sources.cadd true`. Pre-built cache (~5 GB on disk, ~120M variant keys). Full mode available via pysam for GRCh38 data (`options.cadd_full = true`). Cache mode covers the large majority of variants present in gnomAD, AlphaMissense, and ClinVar — nearly every position allelix can annotate from its other databases. For genotyping chip data (23andMe, AncestryDNA, MyHappyGenes, etc.), cache and full mode produce effectively identical results because chip probes overwhelmingly target known, cataloged variants. Full mode adds coverage for novel or private variants that appear only in whole-genome or whole-exome sequencing data and are not in any pre-computed database. If your input is a genotyping chip file, cache mode is all you need. |
+### Known PharmGKB limitation: reference-genotype rows where ClinVar and CPIC both lack data
+ADR-0022 + ADR-0023: a tiny residual of PharmGKB rows may appear in reports even when the user is homozygous reference. PharmGKB publishes one annotation per genotype including the reference homozygote, and for the reference-homozygote row to be suppressed Allelix needs structured data on the variant from either:
+- **ClinVar's REF allele** (the primary filter — see ADR-0023). Covers any rsID ClinVar catalogs.
+- **CPIC's per-allele function table** (the secondary fallback — see ADR-0020). Covers rsIDs CPIC has classified.
+For the rare rsID where PharmGKB has an annotation but *neither* ClinVar nor CPIC has data, the row emits. These are identifiable by a homozygous-reference genotype combined with "decreased risk," "may have a typical response," or similar comparative language. They are an upstream data gap, not an Allelix bug — we surface them honestly rather than hide them behind a curated exclusion list (which would recreate the maintenance trap the v0.5–v0.7 prose filters were trying to escape).
+The CFTR × ivacaftor leak (~30+ rows on real data, pre-v0.7.3) is fixed by the ADR-0023 ClinVar REF check: CPIC's CFTR vocabulary (`"ivacaftor responsive"`) doesn't match the four-class enum the secondary tier expects, but ClinVar publishes REF for every CFTR rsID, so the primary tier catches them universally.
+### Known ClinVar upstream data quality issues
+Two ClinVar rows in real-world reports are known upstream artifacts, not Allelix bugs:
+- **PKD1 rs199476100 GG (Pathogenic/Likely pathogenic, magnitude 8.5).** This is a stop-gained variant with a gnomAD frequency of 0.0005% (7 observations in 1.38 million chromosomes). Homozygosity for this variant is biologically implausible — PKD1 is autosomal dominant and the nonsense variant would be embryonic-lethal or devastating in homozygous state. The chip genotyping call is almost certainly a probe artifact. The code correctly reports what ClinVar says and what the chip reads; the error is upstream of Allelix. Future work: population-frequency filtering could flag ultra-rare variants where the chip call is likely unreliable.
+- **IL10 rs1800896 CT (Pathogenic, magnitude 9.0).** This is a common polymorphism (MAF ~20–40%) in the IL-10 promoter. ClinVar's Pathogenic classification comes from a single submitter for hepatitis C susceptibility; a second submitter classifies the same allele as "Uncertain risk allele" for leprosy susceptibility. The ClinVar VCF aggregates across conditions, so the report may pair the Pathogenic classification with the wrong condition. Future work: ClinVar review-status weighting (number of submitters, star rating) could down-weight single-submitter classifications on common variants.
+Neither issue affects Allelix's filter logic. Both are inherent to ClinVar's aggregation model and the limitations of array-based genotyping chips.
+## Regulatory Posture
+Allelix is an informational research tool. It reports classifications made by external databases. It does not independently classify variants, diagnose conditions, or make health recommendations. All variant significance is attributed to its source — Allelix says "ClinVar classifies this variant as pathogenic," never "this variant is pathogenic."
+This is not a disclaimer afterthought. It is a design constraint that affects model naming, report wording, and category labeling throughout the codebase.
+## Privacy
+- No data leaves your machine. No telemetry. No uploads. No analytics.
+- Reference databases are downloaded via `allelix db update` and cached locally.
+- Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
+## Configuration
+Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.
+```bash
+# View current config (annotated with license notes)
+allelix config show
+# Read a single key
+allelix config get sources.cadd
+allelix config get license.commercial
+# Disable a source permanently
+allelix config set sources.gnomad false
+# Enable commercial mode (auto-disables non-commercial sources)
+allelix config set license.commercial true
+# Assert that you hold a commercial CADD license
+allelix config set license.cadd true
+```
+CLI flags (`--no-gnomad`, `--no-alphamissense`, `--exclude-snpedia`, `--cadd`) override the config for a single run. The config sets the baseline; flags override per-invocation.
+### Database sizes and download times
+Not all databases are equal in size. `allelix db update` downloads them all by default, but you can skip the large ones if disk space or bandwidth is a concern:
+| Database | On disk | Download time | What it adds |
+|---|---|---|---|
+| ClinVar (GRCh37 + GRCh38) | ~900MB | 1–2 min | Core clinical variant classifications. Required. |
+| PharmGKB + CPIC | ~6MB | seconds | Drug-gene interactions. |
+| GWAS Catalog | ~200MB | 1–2 min | Trait-SNP associations from genome-wide studies. |
+| gnomAD | ~6GB | 5–15 min | Population allele frequencies (how common is this variant?). |
+| AlphaMissense | ~8GB | 5–15 min | Missense pathogenicity predictions (how likely to break protein function?). |
+| CADD (opt-in) | ~5GB | 5–15 min | Variant deleteriousness scores (how damaging is this variant?). Enable with `--cadd`. |
+gnomAD and AlphaMissense are the largest but add the most interpretive context. gnomAD answers "is this variant rare or common?" — a pathogenic variant carried by 35% of the population reads very differently from one seen in 3 people. AlphaMissense answers "does this missense change likely damage the protein?" — especially valuable for the thousands of variants ClinVar hasn't reviewed yet.
+To skip either during download: `allelix db update --no-gnomad --no-alphamissense`. To disable permanently: `allelix config set sources.gnomad false`.
+## Data Sources & Licensing
+Allelix source code is licensed under the **GNU Affero General Public License v3.0 or later** (AGPL-3.0-or-later). Allelix ships with **zero third-party data**. All reference databases are downloaded by the user at runtime via `allelix db update`. Each database retains its original license on the user's machine:
+| Database | Source | License | Usage |
+|---|---|---|---|
+| ClinVar | NCBI | Public domain | No restrictions |
+| GWAS Catalog | EBI/NHGRI | Public domain | No restrictions |
+| PharmGKB | pharmgkb.org | CC BY-SA 4.0 | Attribution required |
+| CPIC | cpicpgx.org | CC BY-SA 4.0 | Attribution required. Per-allele function data fetched from `api.cpicpgx.org` at `db update` time; used internally for the PharmGKB non-finding filter (ADR-0020), not surfaced as its own annotator. |
+| SNPedia | snpedia.com | CC BY-NC-SA 3.0 US | Attribution required, **non-commercial only**. Use `--exclude-snpedia` to omit. |
+| gnomAD | gnomad.broadinstitute.org | ODbL v1.0 | Attribution required. Population allele frequencies for context; not a clinical annotator. Use `--no-gnomad` to omit. |
+| AlphaMissense | zenodo.org/records/10813168 | CC BY 4.0 | Attribution required. Cheng et al., Science 2023. Missense variant pathogenicity predictions. Use `--no-alphamissense` to omit. |
+| CADD | cadd.gs.washington.edu | LicenseRef-CADD | Attribution required, **non-commercial by default**. Commercial licenses available from UW CoMotion. Opt-in via `allelix db update --cadd`. |
+**Commercial users:** When `license.commercial = true`, non-commercial sources are gated by a three-state permission model. SNPedia is permanently blocked (no commercial license is available). CADD is blocked by default but can be unlocked — the University of Washington offers commercial licenses at `https://els2.comotion.uw.edu/product/cadd-scores`; after purchasing, assert your license with `allelix config set license.cadd true` to re-enable CADD in commercial mode. All other databases (ClinVar, PharmGKB, GWAS Catalog, gnomAD, AlphaMissense) are compatible with commercial use. `allelix config show` displays the permission state for each source.
+### SNPedia data download
+SNPedia data is downloaded automatically by `allelix db update` from a pre-built cache. If the SNPedia database is not present, `allelix analyze` runs normally using all other databases and prints a note that SNPedia data is not available.
+To rebuild the cache from source (not normally needed):
+```bash
+python scripts/scrape_snpedia.py   # scrape 216K pages from bots.snpedia.com (1-4 hours)
+python scripts/parse_snpedia.py    # parse raw wiki markup into structured genotype rows
+```
+### Known SNPedia source data quality notes
+SNPedia appears frozen — no edits have been observed since mid-2023. The data below reflects the state of the wiki at scrape time (May 2026) and is unlikely to change.
+Of the 104,806 genotype pages in the archive:
+- **103 pages have empty or missing allele fields.** These are incomplete entries on the source wiki — the `{{Genotype}}` template was created but the `allele1`/`allele2` fields were never filled in (e.g., `Rs1131692198(;)` with `|allele1=\n|allele2=\n`). All 103 were verified against the live site on 2026-05-21; every one matches the source exactly. The annotator silently skips these — they cannot match any user genotype.
+- **1 page has no `{{Genotype}}` template at all.** `Rs1799853(T)` is a malformed single-allele page (`{{is a|genotype}}` instead of a proper genotype template). Skipped by the parser.
+- **2 pages have a space before the parenthesis in the title** (`Rs52820871 (G;G)` and `Rs52820871 (G;T)` instead of the standard `Rs52820871(G;G)` format). The annotator handles both title styles.
+None of these are scraping errors. They are editorial inconsistencies on the source wiki. The annotator handles all of them correctly: incomplete entries are skipped, variant title formats are matched, and no false annotations are produced.
+## Architecture & Design Decisions
+The "why" behind major design choices lives in [`docs/adr/`](docs/adr/README.md) as Architecture Decision Records. Read these before proposing changes that touch the parser/annotator interfaces, the regulatory posture, or the data-handling model.
+Notable load-bearing ADRs:
+- **ADR-0016 — Data Classification Principle.** Classification reads structured fields only. Regex on prose is forbidden in production code.
+- **ADR-0020 — CPIC API as the per-allele function source.** The PharmGKB non-finding filter is a table join keyed on `(rsid, base) → clinicalfunctionalstatus`, sourced from CPIC's structured API. Supersedes the prose-extraction tiers from earlier versions (ADR-0017, ADR-0018).
+- **ADR-0007 — Genotype matching requires the user to carry the ALT allele.** Applies to ClinVar; the GWAS Catalog annotator applies the same carrier rule to the risk allele.
+- **ADR-0009 — PharmGKB matches the user's exact normalized diploid call.**
+- **ADR-0015 — Mock data generators are the contract.** Fixture shape must mirror real data shape; invariants tested.
+Release history: see [`CHANGELOG.md`](CHANGELOG.md).
+## Development
+```bash
+source .venv/bin/activate
+pip install -e ".[dev]"
+# One-time: install pre-commit hooks
+pre-commit install --hook-type pre-commit
+ruff check .
+ruff format --check .
+pytest
+```
+The pre-commit hook enforces `ruff check` + `ruff format --check`. If a commit is blocked, fix the underlying problem rather than skipping the hook.
+## License
+AGPL-3.0-or-later. See `LICENSE`.

allelix-1.8.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,58 @@
+allelix/__init__.py,sha256=ZORQwa3PR3CO1oqL_SgpZWTYkjSaYVgf5HBWNXNjfdk,330
+allelix/cli.py,sha256=xAKGAX1bdX6VufZ1hMz1Yn1eBIGNpreAQmcREl-zHfQ,52846
+allelix/compare.py,sha256=wVprVPoxwHbdVzjKgu2Yeadb-W9luwyo2f9liVOpaxg,4505
+allelix/config.py,sha256=7e7VKX7vBPHdkAMeKCxMMhNqESF1Km7IiMIVqrG5u8Q,4598
+allelix/models.py,sha256=_NLleKp0dYannVJdv_DO0tXKvRZhVQMqNAvRCQpjtzQ,4379
+allelix/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+allelix/annotators/__init__.py,sha256=u2UezbC0dmhv1hHsfdtBvmya2ONQClUgzau3D4gNz7k,3240
+allelix/annotators/alphamissense.py,sha256=JJ8Qui1ZrHRgVzcm5yaIaevuYFPqT9YId5GlcGFgx5c,8831
+allelix/annotators/base.py,sha256=ymVFs8fF80BtLj6UJDQOSIaeOgjbeuQ_i4wPHbZVV2g,7373
+allelix/annotators/cadd.py,sha256=wTZawo8X_FjsoOmt3yXSNFGGbLDiMwbxIb4Kq2-dP7o,10911
+allelix/annotators/clinvar.py,sha256=aem3rHyQ_e8F82D3uwt4n8Cn69OUVI5XdlZUCECxt-4,16052
+allelix/annotators/gnomad.py,sha256=lLMsPE5jRuZxUrsyx1Lc7REwSMWqQe7zZ0b13a8g0GY,7973
+allelix/annotators/gwas.py,sha256=ZOZw68M_fRda7zP_ACsCkydLH3yKUvH8FX6ANlhLq4o,12878
+allelix/annotators/pharmgkb.py,sha256=dLrU7PZKuIFxiQTajyDptHOWdfywczoDS0P4rpSG1Tw,15762
+allelix/annotators/snpedia.py,sha256=Fly_JBu31t2RBymOBn03DHHwOc7ACY_27nlTm3bGi8I,9799
+allelix/data/__init__.py,sha256=UvPduQUaJ8VJyyyOvkdfnaew6qDBFyNTr7u6qstrP_A,112
+allelix/data/high_value_snps.yaml,sha256=otCp9DSa3TAyTvKc0a40vfUf9nDfJ9eYG00nNE8dzVo,1493
+allelix/databases/__init__.py,sha256=0wY_4MoIAsa1JGtbh_hUVecdDhDLeZLk5XkL6dAf3Ew,966
+allelix/databases/_versions.py,sha256=DFVgLow6BBRYbelaDghEGVv6MATKxdA-RfjiVBMps0c,625
+allelix/databases/alphamissense_loader.py,sha256=h-1NH85eu9TeyU_jYtJdDgzpfvmmN8l-bUbW0sjxZSE,1467
+allelix/databases/cadd_loader.py,sha256=r1qZAIXxC8qMsbnX0227RZQwXM9n0ujboItQNkoB9Hk,1431
+allelix/databases/cpic_loader.py,sha256=isKPYr7mtt2ybAe77nO5BScJ5BeCnEKPuU6izRgAvUc,9618
+allelix/databases/gnomad_loader.py,sha256=oG-UJbUSkYXMY-s0SRdVuAymdNqU0LWx1YW7fzeUgRk,1434
+allelix/databases/gwas_loader.py,sha256=7txpNK91PTvoAI0PLZBFSxrIHxRczanDZPaezxUgN94,14993
+allelix/databases/loader_utils.py,sha256=Zm-I3xAsUylvVgiRhoT0Sw0cYS5ri-j6niwjvwLuhHs,2524
+allelix/databases/manager.py,sha256=dPLH2PGNvRFruiPBWtuNs-A7Z5cXrf421biNZwOdIWE,19225
+allelix/databases/pharmgkb_loader.py,sha256=KXD4jwgeBtBbxS6qwqW5BwU-G-F7BMf4vOXeDuPPvbY,17202
+allelix/databases/schema.py,sha256=gbpm1A1w0qjgSiBHgHTpFDKCGlO3kGzLrJK36x_QsJw,4062
+allelix/databases/snpedia_loader.py,sha256=Cj4PMuLkU_NEP0ot-IOZWU9Phvr-jIlaQ5krzJZqjIo,1195
+allelix/databases/snpedia_parser.py,sha256=B7Imp0dbeBvpheDz5AUoZZ6OhoKdVp0yf_GH-Au5s4s,11372
+allelix/exporters/__init__.py,sha256=byh4VaPpL4E3mKWN7a78cjLybRK40UZM_ULLyAEpOk8,123
+allelix/exporters/plink.py,sha256=HOjL3H2kMdw3pBX7j7rx4qIq7pbM9MlrYx8dCfp9csM,4350
+allelix/parsers/__init__.py,sha256=Tf1NyLdR321X_jv-RCSVJQlfIExgStN7MX7tpvy8Ba8,2077
+allelix/parsers/_helpers.py,sha256=2kwwb-bkUynAlW60_b6r1fdfPKgCnhsVYdSzqCRoLNI,1338
+allelix/parsers/ancestrydna.py,sha256=pC717KyeK9U3m_GZb91kLMKmy4yBtgnlweaiXoc-o44,4497
+allelix/parsers/base.py,sha256=_YEM1BR-43WT8Ppg7jzZ6dfXmYqb3R1hj7LqmMey72c,3047
+allelix/parsers/ftdna.py,sha256=YAFSjK5z0MmedpRIcWt2TNgLcxlygqI3HO96nE824Ug,4702
+allelix/parsers/livingdna.py,sha256=GQHh_mfejVZI2RmWDg6_pAPEDkikfegg1nOht7WWVzA,4409
+allelix/parsers/myhappygenes.py,sha256=XV56ifbWm0mKLXp9sCAGBDHZyckHQn29G8ix6BDnnsk,4911
+allelix/parsers/myheritage.py,sha256=Yan5bdUOuGQcslNV4uYHMaj8Rhoy1pHi9bmg_LAh1wM,4311
+allelix/parsers/twentythreeandme.py,sha256=tGXi0T99GaPf7khX1UIBkLfZ3j5MA83yzgVeocj93j0,5608
+allelix/reports/__init__.py,sha256=nk0YHkKrHkUe2GHXvC9S4qZnDzkToHcjAVXUwHmxT9c,1430
+allelix/reports/_pipeline.py,sha256=JPzrGqwRfzS6X7pZifs62MU2MsDbmE7ibb56sXY7v_8,18273
+allelix/reports/diff.py,sha256=5TSa88ibYtK-kSgqAs4r-dfyCgjnoriAEVQY1DL78HY,5534
+allelix/reports/high_value.py,sha256=dAqE9RYZ1E8gjg9XeG7lTooTUxkBVlOpWwPgMFcrfBA,4392
+allelix/reports/html.py,sha256=P0ZGX-uKuNQhezaXyp2J6ntuU1K_hWVyFnMyHSt6F7I,41659
+allelix/reports/json_report.py,sha256=tbWWRHISkUZiNc1SQ3XUhXe-NHWJR8hojU4U63O1AwM,5192
+allelix/reports/methylation.py,sha256=tN0OIZ6WfvRk-XgEYP4We7iUmfCLweDUuJxkiCs4sbA,1081
+allelix/reports/terminal.py,sha256=aKXh_J_iVmvQh5Qa9ObmdPytCPtT-0IRU899o5w4wfc,7233
+allelix/utils/__init__.py,sha256=xjWQJxJ5kXynPxmk_przVjNXDEnFp8-K3uFegVubslk,138
+allelix/utils/allele.py,sha256=Ei4mDPh-WKNPth42la1C2XecLXUDW8XYmnpBBi4cD7Q,3318
+allelix/utils/build_detect.py,sha256=4rDLPq0L2uqp9tiBIPkmYB_OD4UTfqqxLOk0U7_kn4c,7499
+allelix-1.8.1.dist-info/licenses/LICENSE,sha256=QWRhgdtPnw-Xay85rhILM_cxYofYh91NfTOWwI_AOQI,34841
+allelix-1.8.1.dist-info/METADATA,sha256=hlmHN7VUG7cBWtAN8CtluiPG-n-un3xqtEpmh7ZxndY,21040
+allelix-1.8.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+allelix-1.8.1.dist-info/entry_points.txt,sha256=n75_TahJixgNPt1Xhk2V1wDR3RNCiKq4xAKa4_skYNI,45
+allelix-1.8.1.dist-info/top_level.txt,sha256=2SoebaRt4WWVq8Ynk-5uNN1ATE_dJtOqsgj39oCHVbk,8
+allelix-1.8.1.dist-info/RECORD,,

allelix-1.8.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

allelix-1.8.1.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ allelix = allelix.cli:main