allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,437 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """PharmGKB clinical-annotation download, parse, and load into SQLite.
4
+
5
+ PharmGKB publishes a `clinicalAnnotations.zip` containing two TSVs:
6
+
7
+ - `clinical_annotations.tsv`: one row per clinical annotation
8
+ (id, variant/haplotypes, gene, drug(s), phenotype(s), level of evidence,
9
+ score, phenotype category, …)
10
+ - `clinical_ann_alleles.tsv`: per-genotype rows for each annotation
11
+ (annotation id, genotype/allele, annotation text, allele function)
12
+
13
+ This loader joins the two on annotation id and emits one record per
14
+ (rsid, genotype) pair. Star alleles, multi-rsid composites, and indel
15
+ genotypes are skipped — they require haplotype reconstruction.
16
+
17
+ See ADR-0009 for the genotype-matching rationale.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import contextlib
23
+ import csv
24
+ import logging
25
+ import os
26
+ import re
27
+ import sqlite3
28
+ import tempfile
29
+ import zipfile
30
+ from datetime import UTC, datetime
31
+ from pathlib import Path
32
+ from typing import TYPE_CHECKING
33
+
34
+ from allelix.databases.schema import PHARMGKB_SCHEMA
35
+
36
+ if TYPE_CHECKING:
37
+ from collections.abc import Iterator
38
+
39
logger = logging.getLogger(__name__)

# Where the clinical-annotation archive is downloaded from, and the file name
# of the SQLite cache built from it.
PHARMGKB_CLINICAL_URL = "https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip"
PHARMGKB_DB_FILENAME = "pharmgkb.sqlite"

# Annotation rows are written to SQLite in chunks of this many records.
INSERT_BATCH_SIZE = 5_000

# The two TSV members the loader requires from the archive (or directory).
CLINICAL_ANN_FILENAME = "clinical_annotations.tsv"
CLINICAL_ANN_ALLELES_FILENAME = "clinical_ann_alleles.tsv"

# Shape checks only. ADR-0016 allows regex for ID-format and structural
# validation, and forbids it as an input to any classification decision.
_RSID_RE = re.compile(r"^rs\d+$")
_TWO_LETTER_GENOTYPE_RE = re.compile(r"^[ACGT]{2}$")

# Function-class enumeration, mirroring CPIC's structured field.
#
# ADR-0020 (v0.9.0): per-allele function comes from the structured CPIC API,
# is stored in `pharmgkb_allele_function` at db-update time, and is consulted
# as a join. For a user's `(rsid, genotype)` each base is looked up; the row
# is a non-finding only when every base maps to Normal function. No regex,
# no prose parsing, no description classification. Any value outside this
# set is treated as not-Normal (variant), so the row is emitted.
FUNCTION_CLASS_NORMAL = "normal"
FUNCTION_CLASS_DECREASED = "decreased"
FUNCTION_CLASS_NO_FUNCTION = "no_function"
FUNCTION_CLASS_INCREASED = "increased"
FUNCTION_CLASS_UNKNOWN = "unknown"

# Schema-migration guards. A v0.5.x cache has no `function_class` column and
# a v0.6.x cache has no `pharmgkb_allele_function` table; `schema_is_current()`
# reports False for both, so `db update` rebuilds them into the v0.9.0 schema.
_REQUIRED_PHARMGKB_COLUMNS = frozenset(
    (
        "rsid",
        "genotype",
        "gene",
        "drugs",
        "phenotype",
        "phenotype_category",
        "annotation_text",
        "level_of_evidence",
        "score",
        "pgkb_annotation_id",
        "allele_function",
        "function_class",
        "is_nonfinding",
    )
)
_REQUIRED_PHARMGKB_TABLES = frozenset(("pharmgkb_annotations", "pharmgkb_allele_function"))
90
+
91
+
92
def classify_function(allele_function: str | None) -> str:
    """Translate PharmGKB's `Allele Function` text into a function-class constant.

    The value is stripped, lower-cased, and tested for keywords by substring
    match in a fixed order: "no function", "decreased", "increased",
    "normal". The first keyword present decides the result.

    An empty or None field, or text containing none of the keywords, yields
    `FUNCTION_CLASS_UNKNOWN`. The structured field is authoritative
    (ADR-0016), so nothing is guessed from prose when it is blank — the row
    is surfaced and the user decides what to do with it.
    """
    if not allele_function:
        return FUNCTION_CLASS_UNKNOWN

    lowered = allele_function.strip().lower()
    # Tuple order is the match precedence; do not reorder.
    ordered_keywords = (
        ("no function", FUNCTION_CLASS_NO_FUNCTION),
        ("decreased", FUNCTION_CLASS_DECREASED),
        ("increased", FUNCTION_CLASS_INCREASED),
        ("normal", FUNCTION_CLASS_NORMAL),
    )
    for keyword, function_class in ordered_keywords:
        if keyword in lowered:
            return function_class
    return FUNCTION_CLASS_UNKNOWN
111
+
112
+
113
def is_nonfinding_for_row(
    allele_function: str | None,
    annotation_text: str | None = None,  # kept for back-compat; unused.
    *,
    rsid: str | None = None,
    genotype: str | None = None,
    allele_function_lookup: dict[tuple[str, str], str] | None = None,
) -> bool:
    """Return True when a row should be treated as a non-finding (ADR-0020, v0.9.0).

    The decision is a two-tier structured lookup, never a text classifier:

    1. **PharmGKB's `Allele Function` column** (ADR-0016). When
       `classify_function` recognises the value, it is final: the row is a
       non-finding iff the class is Normal.

    2. **CPIC per-allele lookup** (ADR-0020). Consulted only when tier 1 is
       unknown and `rsid`, `genotype`, and `allele_function_lookup` are all
       supplied. The verdict of `is_nonfinding_by_allele_lookup` is used
       unless it is None (no data for this rsid).

    With no structured evidence from either tier the row is emitted
    (returns False); rows are never silently suppressed.

    `annotation_text` is accepted for backward compatibility and ignored.
    """
    structured_class = classify_function(allele_function)
    if structured_class == FUNCTION_CLASS_NORMAL:
        return True
    if structured_class != FUNCTION_CLASS_UNKNOWN:
        # A recognised, non-Normal class always emits the row.
        return False

    # Tier 2 needs all three inputs; otherwise there is nothing to join on.
    if allele_function_lookup is None or not rsid or not genotype:
        return False

    verdict = is_nonfinding_by_allele_lookup(rsid, genotype, allele_function_lookup)
    # None means the lookup had nothing for this rsid: emit the row.
    return False if verdict is None else verdict
149
+
150
+
151
def is_nonfinding(function_class: str) -> bool:
    """Structured-only non-finding check, kept as a back-compat shim for tests.

    True iff `function_class` equals `FUNCTION_CLASS_NORMAL`. Production
    code should call `is_nonfinding_for_row()`, which additionally consults
    the per-allele function lookup when the structured field is empty.
    """
    is_normal = function_class == FUNCTION_CLASS_NORMAL
    return is_normal
159
+
160
+
161
def schema_is_current(db_path: Path) -> bool:
    """True iff the cache at `db_path` has the PharmGKB schema this loader needs.

    Both conditions must hold:
      - every table in `_REQUIRED_PHARMGKB_TABLES` exists
      - `pharmgkb_annotations` has every column in `_REQUIRED_PHARMGKB_COLUMNS`

    A missing file, or any `sqlite3.DatabaseError` while opening or
    inspecting it (for example, the file is not a SQLite database), counts
    as "not current" and returns False rather than raising.
    """
    if not db_path.exists():
        return False

    try:
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            present_tables = {
                entry[0]
                for entry in conn.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
            }
            if not present_tables >= _REQUIRED_PHARMGKB_TABLES:
                return False
            # Column name is the second field of each table_info row.
            present_columns = {
                info[1] for info in conn.execute("PRAGMA table_info(pharmgkb_annotations)")
            }
    except sqlite3.DatabaseError:
        return False

    return present_columns >= _REQUIRED_PHARMGKB_COLUMNS
185
+
186
+
187
def is_nonfinding_by_allele_lookup(
    rsid: str,
    genotype: str,
    allele_function_lookup: dict[tuple[str, str], str],
) -> bool | None:
    """Per-allele structured classifier over the `(rsid, allele)` lookup.

    Returns:
      - None when `genotype` is not exactly two characters, or when the
        lookup holds no entry at all for `rsid` (no structured evidence).
      - True when every distinct base of the upper-cased genotype maps to
        `FUNCTION_CLASS_NORMAL`.
      - False when any base maps to a different class, or is missing from
        the lookup at an rsid that otherwise has entries.
    """
    if len(genotype) != 2:
        return None

    if not any(key[0] == rsid for key in allele_function_lookup):
        return None

    # Under ADR-0020 the CPIC source classifies every allele PharmGKB cares
    # about (Normal for reference, non-Normal for variant). A base missing
    # from the lookup at an rsid that has entries is uncharacterized, so
    # `.get()` yields None, the comparison fails, and the row is not
    # suppressed.
    carried_bases = set(genotype.upper())
    return all(
        allele_function_lookup.get((rsid, base)) == FUNCTION_CLASS_NORMAL
        for base in carried_bases
    )
214
+
215
+
216
def _normalize_genotype(raw: str) -> str | None:
    """Canonicalise a diploid SNV genotype, or return None if it is not one.

    The separators ":", ";" and "/" are removed, surrounding whitespace is
    stripped, and the text is upper-cased. If the result is exactly two
    letters from A/C/G/T, the two letters are returned in sorted order;
    anything else yields None.
    """
    without_separators = "".join(ch for ch in raw if ch not in ":;/")
    candidate = without_separators.strip().upper()
    if _TWO_LETTER_GENOTYPE_RE.match(candidate) is None:
        return None
    first, second = sorted(candidate)
    return first + second
222
+
223
+
224
def _is_single_rsid(variant_field: str) -> bool:
    """Report whether the Variant/Haplotypes value is exactly one rsid.

    Surrounding whitespace is ignored; the remainder must match `_RSID_RE`.
    """
    trimmed = variant_field.strip()
    return _RSID_RE.match(trimmed) is not None
227
+
228
+
229
def _open_directory(zip_or_dir: Path) -> tuple[Path, tempfile.TemporaryDirectory | None]:
    """Return a directory path containing the TSVs.

    Args:
        zip_or_dir: either a directory that already holds the TSVs, or a
            ZIP archive to extract.

    Returns:
        `(directory, handle)`. For a directory input, `handle` is None and
        no cleanup is needed. For a ZIP, the archive is extracted into a
        fresh temporary directory and `handle` is the
        `tempfile.TemporaryDirectory` the caller must `cleanup()`.

    Raises:
        Whatever `zipfile.ZipFile` / `extractall` raise (e.g.
        `zipfile.BadZipFile`, `OSError`). The temporary directory is
        removed before the exception propagates, so a failed extraction
        leaves nothing behind.
    """
    if zip_or_dir.is_dir():
        return zip_or_dir, None

    tmp = tempfile.TemporaryDirectory(prefix="allelix-pharmgkb-")
    try:
        # zipfile strips absolute paths and ".." components from member
        # names on extraction, so members cannot escape `tmp.name`.
        with zipfile.ZipFile(zip_or_dir) as zf:
            zf.extractall(tmp.name)
    except BaseException:
        # The caller never receives the handle on failure, so clean up
        # here instead of relying on the object's finalizer.
        tmp.cleanup()
        raise
    return Path(tmp.name), tmp
243
+
244
+
245
def _tsv_field(row: dict, column: str) -> str:
    """Return the stripped text of `column` in a DictReader row, or "" if absent.

    `csv.DictReader` fills the missing trailing cells of a short row with
    None, so a plain `row.get(column, "")` can still return None; this
    helper treats both a missing key and a None value as empty.
    """
    return (row.get(column) or "").strip()


def iter_pharmgkb_records(
    zip_or_dir: Path,
    allele_function_lookup: dict[tuple[str, str], str] | None = None,
) -> Iterator[dict[str, object]]:
    """Yield one record per (rsid, genotype) pair from a clinical annotations dump.

    Args:
        zip_or_dir: the PharmGKB ZIP archive, or a directory that already
            contains the two TSVs.
        allele_function_lookup: optional `(rsid, base) -> function_class`
            mapping passed through to `is_nonfinding_for_row`.

    Yields:
        Dicts with the keys `rsid`, `genotype`, `gene`, `drugs`,
        `phenotype`, `phenotype_category`, `annotation_text`,
        `level_of_evidence`, `score`, `pgkb_annotation_id`,
        `allele_function`, `function_class`, `is_nonfinding`.

    Skips:
      - rows whose Variant/Haplotypes is not a single rsid (star alleles,
        multi-variant composites)
      - per-allele rows whose Genotype/Allele is not a 2-letter SNV genotype
        (indels, star alleles)
      - per-allele rows whose annotation id was not retained from the
        annotations TSV

    Raises:
        FileNotFoundError: if either required TSV is missing.

    Truncated rows (fewer cells than the header) are read as having empty
    values for the missing columns instead of raising. Any temporary
    extraction directory is removed when the generator finishes or is
    closed.
    """
    dir_path, tmp = _open_directory(zip_or_dir)
    try:
        ann_tsv = dir_path / CLINICAL_ANN_FILENAME
        alleles_tsv = dir_path / CLINICAL_ANN_ALLELES_FILENAME
        if not ann_tsv.exists() or not alleles_tsv.exists():
            raise FileNotFoundError(
                f"PharmGKB dump missing required TSVs in {dir_path}: "
                f"need {CLINICAL_ANN_FILENAME} + {CLINICAL_ANN_ALLELES_FILENAME}"
            )

        # Pass 1: per-annotation metadata, keyed by annotation id, for
        # single-rsid annotations only.
        annotations: dict[str, dict[str, str]] = {}
        with ann_tsv.open("r", encoding="utf-8", newline="") as fh:
            for row in csv.DictReader(fh, delimiter="\t"):
                ann_id = _tsv_field(row, "Clinical Annotation ID")
                variant = _tsv_field(row, "Variant/Haplotypes")
                if not ann_id or not _is_single_rsid(variant):
                    continue
                annotations[ann_id] = {
                    "rsid": variant,
                    "gene": _tsv_field(row, "Gene"),
                    "drugs": _tsv_field(row, "Drug(s)"),
                    "phenotype": _tsv_field(row, "Phenotype(s)"),
                    "phenotype_category": _tsv_field(row, "Phenotype Category"),
                    "level_of_evidence": _tsv_field(row, "Level of Evidence"),
                    "score": _tsv_field(row, "Score"),
                }

        # Pass 2: per-genotype rows, joined to the metadata on annotation id.
        with alleles_tsv.open("r", encoding="utf-8", newline="") as fh:
            for row in csv.DictReader(fh, delimiter="\t"):
                ann_id = _tsv_field(row, "Clinical Annotation ID")
                if ann_id not in annotations:
                    continue
                normalized = _normalize_genotype(_tsv_field(row, "Genotype/Allele"))
                if normalized is None:
                    continue
                meta = annotations[ann_id]
                allele_function = _tsv_field(row, "Allele Function")
                function_class = classify_function(allele_function)
                annotation_text = _tsv_field(row, "Annotation Text")
                yield {
                    "rsid": meta["rsid"],
                    "genotype": normalized,
                    "gene": meta["gene"],
                    "drugs": meta["drugs"],
                    "phenotype": meta["phenotype"],
                    "phenotype_category": meta["phenotype_category"],
                    "annotation_text": annotation_text,
                    "level_of_evidence": meta["level_of_evidence"],
                    "score": _safe_float(meta["score"]),
                    "pgkb_annotation_id": ann_id,
                    "allele_function": allele_function,
                    "function_class": function_class,
                    "is_nonfinding": is_nonfinding_for_row(
                        allele_function,
                        annotation_text,
                        rsid=meta["rsid"],
                        genotype=normalized,
                        allele_function_lookup=allele_function_lookup,
                    ),
                }
    finally:
        if tmp is not None:
            tmp.cleanup()
322
+
323
+
324
def _safe_float(value: str) -> float | None:
    """Parse `value` as a float; return None when it is empty or not numeric."""
    if value:
        # A ValueError from float() falls through to the None below.
        with contextlib.suppress(ValueError):
            return float(value)
    return None
331
+
332
+
333
def load_pharmgkb_tsv(
    zip_or_dir: Path,
    db_path: Path,
    source_url: str = "",
    version: str = "",
    remote_signal: str | None = None,
    allele_function_lookup: dict[tuple[str, str], str] | None = None,
) -> int:
    """Load a PharmGKB clinical-annotations dump into a fresh SQLite cache atomically.

    The database is built at `<db_path>.tmp` and moved onto `db_path` with
    `os.replace` only after everything has been committed and the
    connection closed. A failure part-way through leaves the previous
    cache (if any) intact and removes the temp file.

    Args:
        zip_or_dir: PharmGKB ZIP archive or a directory containing the TSVs.
        db_path: final location of the SQLite cache.
        source_url: recorded in `database_versions.source_url`.
        version: recorded as the dataset version; when empty, today's UTC
            date (`YYYY-MM-DD`) is used.
        remote_signal: the value `fetch_remote_signal` returned at download
            time; stored (as "" when None) so the next `db update` can
            detect remote changes without re-downloading.
        allele_function_lookup: the structured
            `(rsid, base) -> function_class` table that drives the
            non-finding filter (ADR-0020). Production fetches it from
            CPIC's API; tests inject a synthetic dict. None or empty means
            an empty lookup, so every row emits.

    Returns:
        The number of annotation rows inserted.

    Raises:
        Re-raises any exception from parsing or SQLite after attempting to
        delete the temp database.
    """
    tmp_path = db_path.parent / f"{db_path.name}.tmp"
    # Drop any stale temp DB from an earlier aborted run. `missing_ok`
    # avoids the check-then-delete race of a separate exists() test.
    tmp_path.unlink(missing_ok=True)

    resolved_version = version or datetime.now(UTC).strftime("%Y-%m-%d")
    lookup = allele_function_lookup or {}

    try:
        with contextlib.closing(sqlite3.connect(tmp_path)) as conn:
            conn.executescript(PHARMGKB_SCHEMA)

            # Populate the per-allele function table (ADR-0020) first. The
            # lookup arrives pre-built from
            # cpic_loader.fetch_cpic_allele_functions (production) or a
            # test fixture (unit tests).
            af_insert_sql = (
                "INSERT INTO pharmgkb_allele_function "
                "(rsid, allele, function_class, source) "
                "VALUES (?, ?, ?, 'cpic_api')"
            )
            conn.executemany(
                af_insert_sql,
                (
                    (rsid, allele, function_class)
                    for (rsid, allele), function_class in lookup.items()
                ),
            )

            insert_sql = (
                "INSERT INTO pharmgkb_annotations "
                "(rsid, genotype, gene, drugs, phenotype, phenotype_category, "
                "annotation_text, level_of_evidence, score, pgkb_annotation_id, "
                "allele_function, function_class, is_nonfinding) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            )
            batch: list[tuple] = []
            count = 0
            for record in iter_pharmgkb_records(zip_or_dir, lookup):
                batch.append(
                    (
                        record["rsid"],
                        record["genotype"],
                        record["gene"],
                        record["drugs"],
                        record["phenotype"],
                        record["phenotype_category"],
                        record["annotation_text"],
                        record["level_of_evidence"],
                        record["score"],
                        record["pgkb_annotation_id"],
                        record["allele_function"],
                        record["function_class"],
                        # SQLite has no boolean type; store 0/1.
                        int(bool(record["is_nonfinding"])),
                    )
                )
                if len(batch) >= INSERT_BATCH_SIZE:
                    conn.executemany(insert_sql, batch)
                    count += len(batch)
                    batch.clear()
            if batch:
                # Flush the final partial batch.
                conn.executemany(insert_sql, batch)
                count += len(batch)

            # Function-scope import, kept local as in the original.
            from allelix.databases._versions import PHARMGKB_INTERPRETER_VERSION

            conn.execute(
                "INSERT INTO database_versions "
                "(name, source_url, version, downloaded_at, record_count, "
                "remote_signal, local_version_tag) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    "pharmgkb",
                    source_url,
                    resolved_version,
                    datetime.now(UTC).isoformat(),
                    count,
                    remote_signal or "",
                    f"iv:{PHARMGKB_INTERPRETER_VERSION}",
                ),
            )
            conn.commit()
        # Swap the finished file into place only after the connection is
        # closed, so the replacement never targets an open database.
        os.replace(tmp_path, db_path)
        return count
    except Exception:
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            logger.warning("Could not remove failed temp DB %s", tmp_path)
        raise
@@ -0,0 +1,165 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """SQLite schemas for cached reference databases.
4
+
5
+ Each annotator owns its own SQLite file (e.g. `clinvar.sqlite`, `pharmgkb.sqlite`).
6
+ Every per-annotator schema embeds the shared `database_versions` table so that
7
+ `get_database_info(db_path, name)` works uniformly across them.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ _DATABASE_VERSIONS_TABLE = """
13
+ CREATE TABLE IF NOT EXISTS database_versions (
14
+ name TEXT PRIMARY KEY,
15
+ source_url TEXT NOT NULL,
16
+ version TEXT,
17
+ downloaded_at TEXT NOT NULL,
18
+ record_count INTEGER NOT NULL,
19
+ remote_signal TEXT,
20
+ local_version_tag TEXT
21
+ );
22
+ """
23
+
24
+ CLINVAR_SCHEMA = (
25
+ """
26
+ CREATE TABLE IF NOT EXISTS clinvar_variants (
27
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
28
+ rsid TEXT NOT NULL,
29
+ chromosome TEXT NOT NULL,
30
+ position INTEGER NOT NULL,
31
+ ref TEXT NOT NULL,
32
+ alt TEXT NOT NULL,
33
+ clinical_significance TEXT,
34
+ condition TEXT,
35
+ gene TEXT,
36
+ review_status TEXT,
37
+ allele_id INTEGER
38
+ );
39
+
40
+ CREATE INDEX IF NOT EXISTS idx_clinvar_rsid ON clinvar_variants(rsid);
41
+ """
42
+ + _DATABASE_VERSIONS_TABLE
43
+ )
44
+
45
+ PHARMGKB_SCHEMA = (
46
+ """
47
+ CREATE TABLE IF NOT EXISTS pharmgkb_annotations (
48
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
49
+ rsid TEXT NOT NULL,
50
+ genotype TEXT NOT NULL,
51
+ gene TEXT,
52
+ drugs TEXT,
53
+ phenotype TEXT,
54
+ phenotype_category TEXT,
55
+ annotation_text TEXT,
56
+ level_of_evidence TEXT,
57
+ score REAL,
58
+ pgkb_annotation_id TEXT,
59
+ allele_function TEXT,
60
+ function_class TEXT NOT NULL,
61
+ is_nonfinding INTEGER NOT NULL
62
+ );
63
+
64
+ CREATE INDEX IF NOT EXISTS idx_pharmgkb_rsid ON pharmgkb_annotations(rsid);
65
+
66
+ -- ADR-0018: per-allele function extracted from PharmGKB's canonical CPIC
67
+ -- template sentence ("The {allele} allele of {rsid} is assigned {function}
68
+ -- function by CPIC."). Populated at load time by a pre-pass over the
69
+ -- annotation rows. Drives is_nonfinding classification for SNV rows where
70
+ -- the `Allele Function` column is empty (i.e., every in-scope row).
71
+ CREATE TABLE IF NOT EXISTS pharmgkb_allele_function (
72
+ rsid TEXT NOT NULL,
73
+ allele TEXT NOT NULL,
74
+ function_class TEXT NOT NULL,
75
+ source TEXT NOT NULL DEFAULT 'cpic_template',
76
+ PRIMARY KEY (rsid, allele)
77
+ );
78
+ """
79
+ + _DATABASE_VERSIONS_TABLE
80
+ )
81
+
82
+ GWAS_SCHEMA = (
83
+ """
84
+ CREATE TABLE IF NOT EXISTS gwas_associations (
85
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
86
+ rsid TEXT NOT NULL,
87
+ risk_allele TEXT,
88
+ trait TEXT NOT NULL,
89
+ p_value REAL,
90
+ or_beta REAL,
91
+ ci_text TEXT,
92
+ gene TEXT,
93
+ study_accession TEXT,
94
+ pubmed_id TEXT,
95
+ risk_allele_frequency REAL,
96
+ context TEXT,
97
+ mapped_trait_uri TEXT,
98
+ trait_category TEXT
99
+ );
100
+
101
+ CREATE INDEX IF NOT EXISTS idx_gwas_rsid ON gwas_associations(rsid);
102
+ """
103
+ + _DATABASE_VERSIONS_TABLE
104
+ )
105
+
106
+ GNOMAD_SCHEMA = (
107
+ """
108
+ CREATE TABLE IF NOT EXISTS gnomad_frequencies (
109
+ chrom TEXT NOT NULL,
110
+ pos INTEGER NOT NULL,
111
+ ref TEXT NOT NULL,
112
+ alt TEXT NOT NULL,
113
+ rsid TEXT,
114
+ af REAL,
115
+ af_popmax REAL,
116
+ popmax TEXT,
117
+ af_afr REAL,
118
+ af_amr REAL,
119
+ af_asj REAL,
120
+ af_eas REAL,
121
+ af_fin REAL,
122
+ af_nfe REAL,
123
+ af_sas REAL,
124
+ PRIMARY KEY (chrom, pos, ref, alt)
125
+ );
126
+
127
+ CREATE INDEX IF NOT EXISTS idx_gnomad_rsid ON gnomad_frequencies(rsid);
128
+ """
129
+ + _DATABASE_VERSIONS_TABLE
130
+ )
131
+
132
+ ALPHAMISSENSE_SCHEMA = (
133
+ """
134
+ CREATE TABLE IF NOT EXISTS alphamissense_scores (
135
+ chrom TEXT NOT NULL,
136
+ pos INTEGER NOT NULL,
137
+ ref TEXT NOT NULL,
138
+ alt TEXT NOT NULL,
139
+ rsid TEXT,
140
+ uniprot_id TEXT,
141
+ transcript_id TEXT,
142
+ protein_variant TEXT,
143
+ am_pathogenicity REAL NOT NULL,
144
+ am_class TEXT NOT NULL,
145
+ PRIMARY KEY (chrom, pos, ref, alt)
146
+ );
147
+
148
+ CREATE INDEX IF NOT EXISTS idx_am_rsid ON alphamissense_scores(rsid);
149
+ """
150
+ + _DATABASE_VERSIONS_TABLE
151
+ )
152
+
153
+ CADD_SCHEMA = (
154
+ """
155
+ CREATE TABLE IF NOT EXISTS cadd_scores (
156
+ chrom TEXT NOT NULL,
157
+ pos INTEGER NOT NULL,
158
+ ref TEXT NOT NULL,
159
+ alt TEXT NOT NULL,
160
+ phred REAL NOT NULL,
161
+ PRIMARY KEY (chrom, pos, ref, alt)
162
+ );
163
+ """
164
+ + _DATABASE_VERSIONS_TABLE
165
+ )
@@ -0,0 +1,44 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """SNPedia pre-built cache loader.
4
+
5
+ The pre-built SQLite cache is downloaded from HuggingFace during
6
+ ``db update``. Contains ~216K raw wiki pages and ~105K parsed genotype
7
+ rows.
8
+
9
+ The cache can also be built locally via ``scripts/scrape_snpedia.py``
10
+ followed by ``scripts/parse_snpedia.py``.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from allelix.databases.loader_utils import install_prebuilt_gz_cache
18
+
19
+ if TYPE_CHECKING:
20
+ from pathlib import Path
21
+
22
# Download location of the pre-built cache, pinned to a specific dataset
# revision, and the SHA-256 digest expected for that file.
SNPEDIA_CACHE_URL = (
    "https://huggingface.co/datasets/genomics-commons/snpedia"
    "/resolve/69a745401a0d63acb71fc759b9e79f6d5da79dd9/snpedia.sqlite.gz"
)

SNPEDIA_EXPECTED_SHA256 = "bd940b624143d03427baf9b2572da07257631bd6fb8b584b5ed0961f07cad104"


def install_prebuilt_cache(
    gz_path: Path,
    db_path: Path,
    *,
    source_url: str = "",
    remote_signal: str | None = None,
) -> None:
    """Install a gzipped pre-built SNPedia SQLite cache at `db_path`.

    Thin wrapper: delegates to `install_prebuilt_gz_cache` with the
    database name fixed to "snpedia", forwarding `source_url` and
    `remote_signal` unchanged.
    """
    provenance = {"source_url": source_url, "remote_signal": remote_signal}
    install_prebuilt_gz_cache(gz_path, db_path, "snpedia", **provenance)