allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""PharmGKB clinical-annotation download, parse, and load into SQLite.
|
|
4
|
+
|
|
5
|
+
PharmGKB publishes a `clinicalAnnotations.zip` containing two TSVs:
|
|
6
|
+
|
|
7
|
+
- `clinical_annotations.tsv`: one row per clinical annotation
|
|
8
|
+
(id, variant/haplotypes, gene, drug(s), phenotype(s), level of evidence,
|
|
9
|
+
score, phenotype category, …)
|
|
10
|
+
- `clinical_ann_alleles.tsv`: per-genotype rows for each annotation
|
|
11
|
+
(annotation id, genotype/allele, annotation text, allele function)
|
|
12
|
+
|
|
13
|
+
This loader joins the two on annotation id and emits one record per
|
|
14
|
+
(rsid, genotype) pair. Star alleles, multi-rsid composites, and indel
|
|
15
|
+
genotypes are skipped — they require haplotype reconstruction.
|
|
16
|
+
|
|
17
|
+
See ADR-0009 for the genotype-matching rationale.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import contextlib
|
|
23
|
+
import csv
|
|
24
|
+
import logging
|
|
25
|
+
import os
|
|
26
|
+
import re
|
|
27
|
+
import sqlite3
|
|
28
|
+
import tempfile
|
|
29
|
+
import zipfile
|
|
30
|
+
from datetime import UTC, datetime
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import TYPE_CHECKING
|
|
33
|
+
|
|
34
|
+
from allelix.databases.schema import PHARMGKB_SCHEMA
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from collections.abc import Iterator
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
PHARMGKB_CLINICAL_URL = "https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip"
|
|
42
|
+
PHARMGKB_DB_FILENAME = "pharmgkb.sqlite"
|
|
43
|
+
|
|
44
|
+
INSERT_BATCH_SIZE = 5_000
|
|
45
|
+
|
|
46
|
+
CLINICAL_ANN_FILENAME = "clinical_annotations.tsv"
|
|
47
|
+
CLINICAL_ANN_ALLELES_FILENAME = "clinical_ann_alleles.tsv"
|
|
48
|
+
|
|
49
|
+
# Structural format validation only — NOT prose classification.
|
|
50
|
+
# Per ADR-0016, regex is permitted for ID format checking and shape
|
|
51
|
+
# validation; it's forbidden as input to any classification decision.
|
|
52
|
+
_RSID_RE = re.compile(r"^rs\d+$")
|
|
53
|
+
_TWO_LETTER_GENOTYPE_RE = re.compile(r"^[ACGT]{2}$")
|
|
54
|
+
|
|
55
|
+
# ADR-0020 (v0.9.0): per-allele function lives in the structured CPIC API,
|
|
56
|
+
# fetched into `pharmgkb_allele_function` at db-update time and queried as
|
|
57
|
+
# a join. The filter is: for the user's `(rsid, genotype)`, look up each
|
|
58
|
+
# base in the lookup; if every base maps to Normal function, the row is a
|
|
59
|
+
# non-finding. No regex, no prose parsing, no description classification.
|
|
60
|
+
#
|
|
61
|
+
# Function class enumeration mirrors CPIC's structured field. Values
|
|
62
|
+
# outside this set are treated as not-Normal (variant) and emit the row.
|
|
63
|
+
FUNCTION_CLASS_NORMAL = "normal"
|
|
64
|
+
FUNCTION_CLASS_DECREASED = "decreased"
|
|
65
|
+
FUNCTION_CLASS_NO_FUNCTION = "no_function"
|
|
66
|
+
FUNCTION_CLASS_INCREASED = "increased"
|
|
67
|
+
FUNCTION_CLASS_UNKNOWN = "unknown"
|
|
68
|
+
|
|
69
|
+
# Schema migration. v0.5.x lacks `function_class`; v0.6.x lacks the
|
|
70
|
+
# `pharmgkb_allele_function` table. `schema_is_current()` returns False on
|
|
71
|
+
# either, so `db update` automatically refreshes into the v0.9.0 schema.
|
|
72
|
+
_REQUIRED_PHARMGKB_COLUMNS = frozenset(
|
|
73
|
+
{
|
|
74
|
+
"rsid",
|
|
75
|
+
"genotype",
|
|
76
|
+
"gene",
|
|
77
|
+
"drugs",
|
|
78
|
+
"phenotype",
|
|
79
|
+
"phenotype_category",
|
|
80
|
+
"annotation_text",
|
|
81
|
+
"level_of_evidence",
|
|
82
|
+
"score",
|
|
83
|
+
"pgkb_annotation_id",
|
|
84
|
+
"allele_function",
|
|
85
|
+
"function_class",
|
|
86
|
+
"is_nonfinding",
|
|
87
|
+
}
|
|
88
|
+
)
|
|
89
|
+
_REQUIRED_PHARMGKB_TABLES = frozenset({"pharmgkb_annotations", "pharmgkb_allele_function"})
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def classify_function(allele_function: str | None) -> str:
    """Translate PharmGKB's structured `Allele Function` value into a function class.

    Only the structured field is consulted (ADR-0016). The value is trimmed,
    lower-cased, and scanned for marker phrases in a fixed priority order;
    the first phrase it contains decides the class. A missing or empty
    value, or one containing none of the phrases, yields
    `FUNCTION_CLASS_UNKNOWN` instead of a guess.
    """
    if not allele_function:
        return FUNCTION_CLASS_UNKNOWN

    lowered = allele_function.strip().lower()
    # Order is significant: the first phrase found in the value wins.
    ordered_markers = (
        ("no function", FUNCTION_CLASS_NO_FUNCTION),
        ("decreased", FUNCTION_CLASS_DECREASED),
        ("increased", FUNCTION_CLASS_INCREASED),
        ("normal", FUNCTION_CLASS_NORMAL),
    )
    for marker, function_class in ordered_markers:
        if marker in lowered:
            return function_class
    return FUNCTION_CLASS_UNKNOWN
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def is_nonfinding_for_row(
    allele_function: str | None,
    annotation_text: str | None = None,  # kept for back-compat; unused.
    *,
    rsid: str | None = None,
    genotype: str | None = None,
    allele_function_lookup: dict[tuple[str, str], str] | None = None,
) -> bool:
    """Return True when a row should be recorded as a non-finding (ADR-0020).

    Two structured sources are consulted, in order:

    1. The row's own `Allele Function` value, via `classify_function`
       (ADR-0016). Whenever it classifies to anything other than
       `unknown`, it alone decides: True for `normal`, False otherwise.

    2. The per-allele lookup, via `is_nonfinding_by_allele_lookup`. It is
       consulted only when `rsid`, `genotype` and `allele_function_lookup`
       are all supplied; its True/False answer is returned as-is.

    When neither source produces an answer the row is a finding (False),
    so no row is suppressed without structured evidence.
    `annotation_text` is accepted for backward compatibility and ignored.
    """
    structured_class = classify_function(allele_function)
    if structured_class != FUNCTION_CLASS_UNKNOWN:
        return structured_class == FUNCTION_CLASS_NORMAL

    lookup_usable = bool(rsid) and bool(genotype) and allele_function_lookup is not None
    if not lookup_usable:
        return False

    verdict = is_nonfinding_by_allele_lookup(rsid, genotype, allele_function_lookup)
    # None means the lookup had nothing to say about this rsid: emit the row.
    return False if verdict is None else verdict
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def is_nonfinding(function_class: str) -> bool:
    """Structured-only non-finding check (back-compat shim for tests).

    True exactly when `function_class` equals `FUNCTION_CLASS_NORMAL`.
    Production code should call `is_nonfinding_for_row()`, which also
    consults the per-allele function lookup when the structured field
    does not classify.
    """
    is_normal = function_class == FUNCTION_CLASS_NORMAL
    return is_normal
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def schema_is_current(db_path: Path) -> bool:
    """Report whether the cache at `db_path` already has the expected PharmGKB layout.

    The answer is True only when all of the following hold:

    - the file exists and can be queried as a SQLite database
    - it contains every table named in `_REQUIRED_PHARMGKB_TABLES`
    - `pharmgkb_annotations` has every column in `_REQUIRED_PHARMGKB_COLUMNS`

    A missing file, or any `sqlite3.DatabaseError` raised while opening or
    inspecting it, yields False.
    """
    if not db_path.exists():
        return False

    try:
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            table_names = {
                name
                for (name,) in conn.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
            }
            if not _REQUIRED_PHARMGKB_TABLES <= table_names:
                return False
            # Column 1 of each PRAGMA table_info row is the column name.
            column_names = {
                info[1] for info in conn.execute("PRAGMA table_info(pharmgkb_annotations)")
            }
            return _REQUIRED_PHARMGKB_COLUMNS <= column_names
    except sqlite3.DatabaseError:
        return False
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def is_nonfinding_by_allele_lookup(
|
|
188
|
+
rsid: str,
|
|
189
|
+
genotype: str,
|
|
190
|
+
allele_function_lookup: dict[tuple[str, str], str],
|
|
191
|
+
) -> bool | None:
|
|
192
|
+
"""Per-allele structured classifier (ADR-0018).
|
|
193
|
+
|
|
194
|
+
Returns True if every allele in the user's genotype is either absent
|
|
195
|
+
from the CPIC lookup (Normal-by-absence) or explicitly classified as
|
|
196
|
+
Normal function. Returns False if ANY allele has a flagged non-Normal
|
|
197
|
+
function. Returns None if the lookup has no entries for this rsid at
|
|
198
|
+
all (callers fall back to prose).
|
|
199
|
+
"""
|
|
200
|
+
if len(genotype) != 2:
|
|
201
|
+
return None
|
|
202
|
+
rsid_has_entries = any(k[0] == rsid for k in allele_function_lookup)
|
|
203
|
+
if not rsid_has_entries:
|
|
204
|
+
return None
|
|
205
|
+
for allele in set(genotype.upper()):
|
|
206
|
+
function = allele_function_lookup.get((rsid, allele))
|
|
207
|
+
# Under ADR-0020, the CPIC source classifies every allele PharmGKB
|
|
208
|
+
# cares about — Normal for reference, non-Normal for variant. An
|
|
209
|
+
# allele MISSING from the lookup at an rsid that otherwise has
|
|
210
|
+
# entries is an uncharacterized base; never silently suppressed.
|
|
211
|
+
if function != FUNCTION_CLASS_NORMAL:
|
|
212
|
+
return False
|
|
213
|
+
return True
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _normalize_genotype(raw: str) -> str | None:
    """Canonicalize a genotype string to two sorted bases, or None if it is not one.

    The separators `:`, `;` and `/` are removed, surrounding whitespace is
    trimmed and the text is upper-cased. Anything that is not then exactly
    two letters from A/C/G/T is rejected with None; otherwise the two
    letters are returned in sorted order.
    """
    without_separators = raw.translate(str.maketrans("", "", ":;/"))
    candidate = without_separators.strip().upper()
    if _TWO_LETTER_GENOTYPE_RE.match(candidate) is None:
        return None
    return "".join(sorted(candidate))
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _is_single_rsid(variant_field: str) -> bool:
    """Tell whether the Variant/Haplotypes value, once trimmed, is exactly one rsid."""
    trimmed = variant_field.strip()
    return _RSID_RE.match(trimmed) is not None
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _open_directory(zip_or_dir: Path) -> tuple[Path, tempfile.TemporaryDirectory | None]:
|
|
230
|
+
"""Return a directory path containing the TSVs.
|
|
231
|
+
|
|
232
|
+
If `zip_or_dir` is a directory, return it (no cleanup needed).
|
|
233
|
+
If it's a ZIP, extract to a temp dir and return (path, tempdir to clean).
|
|
234
|
+
"""
|
|
235
|
+
if zip_or_dir.is_dir():
|
|
236
|
+
return zip_or_dir, None
|
|
237
|
+
tmp = tempfile.TemporaryDirectory(prefix="allelix-pharmgkb-")
|
|
238
|
+
# Python 3.11+ zipfile.extractall sanitizes "../" and absolute paths in
|
|
239
|
+
# member names. The project pins requires-python >= 3.11 (pyproject.toml).
|
|
240
|
+
with zipfile.ZipFile(zip_or_dir) as zf:
|
|
241
|
+
zf.extractall(tmp.name)
|
|
242
|
+
return Path(tmp.name), tmp
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _tsv_field(row: dict[str, str | None], column: str) -> str:
    """Return the stripped text of `column`, or "" when it is absent or empty."""
    # csv.DictReader fills columns missing from a short row with None (its
    # default `restval`), so `row.get(column, "")` alone can still yield None.
    return (row.get(column) or "").strip()


def iter_pharmgkb_records(
    zip_or_dir: Path,
    allele_function_lookup: dict[tuple[str, str], str] | None = None,
) -> Iterator[dict[str, object]]:
    """Yield one record per (rsid, genotype) pair from a clinical annotations dump.

    `zip_or_dir` is either a directory holding the two TSVs or a ZIP archive
    of them. `clinical_annotations.tsv` is read fully first to collect
    per-annotation metadata keyed by annotation id; `clinical_ann_alleles.tsv`
    is then streamed and joined against it. `allele_function_lookup` is
    passed through to `is_nonfinding_for_row` for the `is_nonfinding` field.

    Skips:
    - rows whose Variant/Haplotypes is not a single rsid (star alleles,
      multi-variant composites)
    - per-allele rows whose Genotype/Allele is not a 2-letter SNV genotype
      (indels, star alleles)
    - per-allele rows whose annotation id was not kept from the first file

    Truncated TSV rows (fewer cells than the header) are tolerated: their
    missing cells read as empty strings.

    Raises FileNotFoundError when either TSV is missing. Any temporary
    extraction directory is removed when the generator finishes or is closed.
    """
    dir_path, tmp = _open_directory(zip_or_dir)
    try:
        ann_tsv = dir_path / CLINICAL_ANN_FILENAME
        alleles_tsv = dir_path / CLINICAL_ANN_ALLELES_FILENAME
        if not ann_tsv.exists() or not alleles_tsv.exists():
            raise FileNotFoundError(
                f"PharmGKB dump missing required TSVs in {dir_path}: "
                f"need {CLINICAL_ANN_FILENAME} + {CLINICAL_ANN_ALLELES_FILENAME}"
            )

        # Pass 1: per-annotation metadata for single-rsid annotations only.
        annotations: dict[str, dict[str, str]] = {}
        with ann_tsv.open("r", encoding="utf-8", newline="") as fh:
            for row in csv.DictReader(fh, delimiter="\t"):
                ann_id = _tsv_field(row, "Clinical Annotation ID")
                variant = _tsv_field(row, "Variant/Haplotypes")
                if not ann_id or not _is_single_rsid(variant):
                    continue
                annotations[ann_id] = {
                    "rsid": variant,
                    "gene": _tsv_field(row, "Gene"),
                    "drugs": _tsv_field(row, "Drug(s)"),
                    "phenotype": _tsv_field(row, "Phenotype(s)"),
                    "phenotype_category": _tsv_field(row, "Phenotype Category"),
                    "level_of_evidence": _tsv_field(row, "Level of Evidence"),
                    "score": _tsv_field(row, "Score"),
                }

        # Pass 2: per-genotype rows, joined to the metadata by annotation id.
        with alleles_tsv.open("r", encoding="utf-8", newline="") as fh:
            for row in csv.DictReader(fh, delimiter="\t"):
                ann_id = _tsv_field(row, "Clinical Annotation ID")
                if ann_id not in annotations:
                    continue
                normalized = _normalize_genotype(_tsv_field(row, "Genotype/Allele"))
                if normalized is None:
                    continue
                meta = annotations[ann_id]
                allele_function = _tsv_field(row, "Allele Function")
                function_class = classify_function(allele_function)
                annotation_text = _tsv_field(row, "Annotation Text")
                yield {
                    "rsid": meta["rsid"],
                    "genotype": normalized,
                    "gene": meta["gene"],
                    "drugs": meta["drugs"],
                    "phenotype": meta["phenotype"],
                    "phenotype_category": meta["phenotype_category"],
                    "annotation_text": annotation_text,
                    "level_of_evidence": meta["level_of_evidence"],
                    "score": _safe_float(meta["score"]),
                    "pgkb_annotation_id": ann_id,
                    "allele_function": allele_function,
                    "function_class": function_class,
                    "is_nonfinding": is_nonfinding_for_row(
                        allele_function,
                        annotation_text,
                        rsid=meta["rsid"],
                        genotype=normalized,
                        allele_function_lookup=allele_function_lookup,
                    ),
                }
    finally:
        if tmp is not None:
            tmp.cleanup()
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _safe_float(value: str) -> float | None:
|
|
325
|
+
if not value:
|
|
326
|
+
return None
|
|
327
|
+
try:
|
|
328
|
+
return float(value)
|
|
329
|
+
except ValueError:
|
|
330
|
+
return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def load_pharmgkb_tsv(
    zip_or_dir: Path,
    db_path: Path,
    source_url: str = "",
    version: str = "",
    remote_signal: str | None = None,
    allele_function_lookup: dict[tuple[str, str], str] | None = None,
) -> int:
    """Load a PharmGKB clinical-annotations dump into a fresh SQLite cache atomically.

    Everything is written to `<db_path>.tmp`, which is moved onto `db_path`
    with `os.replace` only after the load has committed. If anything fails
    the temp file is removed (best effort) and the exception is re-raised,
    leaving the previous cache (if any) intact.

    `allele_function_lookup` is the structured `(rsid, base) -> function_class`
    table that drives the non-finding filter (ADR-0020). Its entries are
    stored in `pharmgkb_allele_function` with source `cpic_api` and it is
    passed to `iter_pharmgkb_records`. None or an empty dict means an empty
    lookup.

    `version` defaults to today's UTC date (`YYYY-MM-DD`) when empty.
    `remote_signal` is stored in `database_versions` (as "" when None) so a
    later `db update` can detect remote changes without re-downloading.

    Returns the number of rows inserted into `pharmgkb_annotations`.
    """
    tmp_path = db_path.parent / f"{db_path.name}.tmp"
    # Drop a stale temp file from an interrupted run; missing_ok avoids the
    # exists()/unlink() race.
    tmp_path.unlink(missing_ok=True)

    resolved_version = version or datetime.now(UTC).strftime("%Y-%m-%d")
    lookup = allele_function_lookup or {}

    try:
        with contextlib.closing(sqlite3.connect(tmp_path)) as conn:
            conn.executescript(PHARMGKB_SCHEMA)

            # Populate the per-allele function table (ADR-0020) first.
            # The lookup arrives pre-built from cpic_loader.fetch_cpic_allele_functions
            # (production) or a test fixture (unit tests).
            af_insert_sql = (
                "INSERT INTO pharmgkb_allele_function "
                "(rsid, allele, function_class, source) "
                "VALUES (?, ?, ?, 'cpic_api')"
            )
            conn.executemany(
                af_insert_sql,
                (
                    (rsid, allele, function_class)
                    for (rsid, allele), function_class in lookup.items()
                ),
            )

            insert_sql = (
                "INSERT INTO pharmgkb_annotations "
                "(rsid, genotype, gene, drugs, phenotype, phenotype_category, "
                "annotation_text, level_of_evidence, score, pgkb_annotation_id, "
                "allele_function, function_class, is_nonfinding) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            )
            batch: list[tuple] = []
            count = 0
            for record in iter_pharmgkb_records(zip_or_dir, lookup):
                batch.append(
                    (
                        record["rsid"],
                        record["genotype"],
                        record["gene"],
                        record["drugs"],
                        record["phenotype"],
                        record["phenotype_category"],
                        record["annotation_text"],
                        record["level_of_evidence"],
                        record["score"],
                        record["pgkb_annotation_id"],
                        record["allele_function"],
                        record["function_class"],
                        # Stored as 0/1 in the INTEGER column.
                        int(bool(record["is_nonfinding"])),
                    )
                )
                # Flush in fixed-size batches to bound memory use.
                if len(batch) >= INSERT_BATCH_SIZE:
                    conn.executemany(insert_sql, batch)
                    count += len(batch)
                    batch.clear()
            if batch:
                conn.executemany(insert_sql, batch)
                count += len(batch)

            from allelix.databases._versions import PHARMGKB_INTERPRETER_VERSION

            conn.execute(
                "INSERT INTO database_versions "
                "(name, source_url, version, downloaded_at, record_count, "
                "remote_signal, local_version_tag) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    "pharmgkb",
                    source_url,
                    resolved_version,
                    datetime.now(UTC).isoformat(),
                    count,
                    remote_signal or "",
                    f"iv:{PHARMGKB_INTERPRETER_VERSION}",
                ),
            )
            conn.commit()
        # Swap into place only after the connection has been closed.
        os.replace(tmp_path, db_path)
        return count
    except Exception:
        # Best-effort removal of the partial temp DB; the original error is
        # re-raised either way.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            logger.warning("Could not remove failed temp DB %s", tmp_path)
        raise
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""SQLite schemas for cached reference databases.
|
|
4
|
+
|
|
5
|
+
Each annotator owns its own SQLite file (e.g. `clinvar.sqlite`, `pharmgkb.sqlite`).
|
|
6
|
+
Every per-annotator schema embeds the shared `database_versions` table so that
|
|
7
|
+
`get_database_info(db_path, name)` works uniformly across them.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
_DATABASE_VERSIONS_TABLE = """
|
|
13
|
+
CREATE TABLE IF NOT EXISTS database_versions (
|
|
14
|
+
name TEXT PRIMARY KEY,
|
|
15
|
+
source_url TEXT NOT NULL,
|
|
16
|
+
version TEXT,
|
|
17
|
+
downloaded_at TEXT NOT NULL,
|
|
18
|
+
record_count INTEGER NOT NULL,
|
|
19
|
+
remote_signal TEXT,
|
|
20
|
+
local_version_tag TEXT
|
|
21
|
+
);
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
CLINVAR_SCHEMA = (
|
|
25
|
+
"""
|
|
26
|
+
CREATE TABLE IF NOT EXISTS clinvar_variants (
|
|
27
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
28
|
+
rsid TEXT NOT NULL,
|
|
29
|
+
chromosome TEXT NOT NULL,
|
|
30
|
+
position INTEGER NOT NULL,
|
|
31
|
+
ref TEXT NOT NULL,
|
|
32
|
+
alt TEXT NOT NULL,
|
|
33
|
+
clinical_significance TEXT,
|
|
34
|
+
condition TEXT,
|
|
35
|
+
gene TEXT,
|
|
36
|
+
review_status TEXT,
|
|
37
|
+
allele_id INTEGER
|
|
38
|
+
);
|
|
39
|
+
|
|
40
|
+
CREATE INDEX IF NOT EXISTS idx_clinvar_rsid ON clinvar_variants(rsid);
|
|
41
|
+
"""
|
|
42
|
+
+ _DATABASE_VERSIONS_TABLE
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
PHARMGKB_SCHEMA = (
|
|
46
|
+
"""
|
|
47
|
+
CREATE TABLE IF NOT EXISTS pharmgkb_annotations (
|
|
48
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
49
|
+
rsid TEXT NOT NULL,
|
|
50
|
+
genotype TEXT NOT NULL,
|
|
51
|
+
gene TEXT,
|
|
52
|
+
drugs TEXT,
|
|
53
|
+
phenotype TEXT,
|
|
54
|
+
phenotype_category TEXT,
|
|
55
|
+
annotation_text TEXT,
|
|
56
|
+
level_of_evidence TEXT,
|
|
57
|
+
score REAL,
|
|
58
|
+
pgkb_annotation_id TEXT,
|
|
59
|
+
allele_function TEXT,
|
|
60
|
+
function_class TEXT NOT NULL,
|
|
61
|
+
is_nonfinding INTEGER NOT NULL
|
|
62
|
+
);
|
|
63
|
+
|
|
64
|
+
CREATE INDEX IF NOT EXISTS idx_pharmgkb_rsid ON pharmgkb_annotations(rsid);
|
|
65
|
+
|
|
66
|
+
-- ADR-0018: per-allele function extracted from PharmGKB's canonical CPIC
|
|
67
|
+
-- template sentence ("The {allele} allele of {rsid} is assigned {function}
|
|
68
|
+
-- function by CPIC."). Populated at load time by a pre-pass over the
|
|
69
|
+
-- annotation rows. Drives is_nonfinding classification for SNV rows where
|
|
70
|
+
-- the `Allele Function` column is empty (i.e., every in-scope row).
|
|
71
|
+
CREATE TABLE IF NOT EXISTS pharmgkb_allele_function (
|
|
72
|
+
rsid TEXT NOT NULL,
|
|
73
|
+
allele TEXT NOT NULL,
|
|
74
|
+
function_class TEXT NOT NULL,
|
|
75
|
+
source TEXT NOT NULL DEFAULT 'cpic_template',
|
|
76
|
+
PRIMARY KEY (rsid, allele)
|
|
77
|
+
);
|
|
78
|
+
"""
|
|
79
|
+
+ _DATABASE_VERSIONS_TABLE
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
GWAS_SCHEMA = (
|
|
83
|
+
"""
|
|
84
|
+
CREATE TABLE IF NOT EXISTS gwas_associations (
|
|
85
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
86
|
+
rsid TEXT NOT NULL,
|
|
87
|
+
risk_allele TEXT,
|
|
88
|
+
trait TEXT NOT NULL,
|
|
89
|
+
p_value REAL,
|
|
90
|
+
or_beta REAL,
|
|
91
|
+
ci_text TEXT,
|
|
92
|
+
gene TEXT,
|
|
93
|
+
study_accession TEXT,
|
|
94
|
+
pubmed_id TEXT,
|
|
95
|
+
risk_allele_frequency REAL,
|
|
96
|
+
context TEXT,
|
|
97
|
+
mapped_trait_uri TEXT,
|
|
98
|
+
trait_category TEXT
|
|
99
|
+
);
|
|
100
|
+
|
|
101
|
+
CREATE INDEX IF NOT EXISTS idx_gwas_rsid ON gwas_associations(rsid);
|
|
102
|
+
"""
|
|
103
|
+
+ _DATABASE_VERSIONS_TABLE
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
GNOMAD_SCHEMA = (
|
|
107
|
+
"""
|
|
108
|
+
CREATE TABLE IF NOT EXISTS gnomad_frequencies (
|
|
109
|
+
chrom TEXT NOT NULL,
|
|
110
|
+
pos INTEGER NOT NULL,
|
|
111
|
+
ref TEXT NOT NULL,
|
|
112
|
+
alt TEXT NOT NULL,
|
|
113
|
+
rsid TEXT,
|
|
114
|
+
af REAL,
|
|
115
|
+
af_popmax REAL,
|
|
116
|
+
popmax TEXT,
|
|
117
|
+
af_afr REAL,
|
|
118
|
+
af_amr REAL,
|
|
119
|
+
af_asj REAL,
|
|
120
|
+
af_eas REAL,
|
|
121
|
+
af_fin REAL,
|
|
122
|
+
af_nfe REAL,
|
|
123
|
+
af_sas REAL,
|
|
124
|
+
PRIMARY KEY (chrom, pos, ref, alt)
|
|
125
|
+
);
|
|
126
|
+
|
|
127
|
+
CREATE INDEX IF NOT EXISTS idx_gnomad_rsid ON gnomad_frequencies(rsid);
|
|
128
|
+
"""
|
|
129
|
+
+ _DATABASE_VERSIONS_TABLE
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
ALPHAMISSENSE_SCHEMA = (
|
|
133
|
+
"""
|
|
134
|
+
CREATE TABLE IF NOT EXISTS alphamissense_scores (
|
|
135
|
+
chrom TEXT NOT NULL,
|
|
136
|
+
pos INTEGER NOT NULL,
|
|
137
|
+
ref TEXT NOT NULL,
|
|
138
|
+
alt TEXT NOT NULL,
|
|
139
|
+
rsid TEXT,
|
|
140
|
+
uniprot_id TEXT,
|
|
141
|
+
transcript_id TEXT,
|
|
142
|
+
protein_variant TEXT,
|
|
143
|
+
am_pathogenicity REAL NOT NULL,
|
|
144
|
+
am_class TEXT NOT NULL,
|
|
145
|
+
PRIMARY KEY (chrom, pos, ref, alt)
|
|
146
|
+
);
|
|
147
|
+
|
|
148
|
+
CREATE INDEX IF NOT EXISTS idx_am_rsid ON alphamissense_scores(rsid);
|
|
149
|
+
"""
|
|
150
|
+
+ _DATABASE_VERSIONS_TABLE
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
CADD_SCHEMA = (
|
|
154
|
+
"""
|
|
155
|
+
CREATE TABLE IF NOT EXISTS cadd_scores (
|
|
156
|
+
chrom TEXT NOT NULL,
|
|
157
|
+
pos INTEGER NOT NULL,
|
|
158
|
+
ref TEXT NOT NULL,
|
|
159
|
+
alt TEXT NOT NULL,
|
|
160
|
+
phred REAL NOT NULL,
|
|
161
|
+
PRIMARY KEY (chrom, pos, ref, alt)
|
|
162
|
+
);
|
|
163
|
+
"""
|
|
164
|
+
+ _DATABASE_VERSIONS_TABLE
|
|
165
|
+
)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""SNPedia pre-built cache loader.
|
|
4
|
+
|
|
5
|
+
The pre-built SQLite cache is downloaded from HuggingFace during
|
|
6
|
+
``db update``. Contains ~216K raw wiki pages and ~105K parsed genotype
|
|
7
|
+
rows.
|
|
8
|
+
|
|
9
|
+
The cache can also be built locally via ``scripts/scrape_snpedia.py``
|
|
10
|
+
followed by ``scripts/parse_snpedia.py``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
from allelix.databases.loader_utils import install_prebuilt_gz_cache
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
SNPEDIA_CACHE_URL = (
|
|
23
|
+
"https://huggingface.co/datasets/genomics-commons/snpedia"
|
|
24
|
+
"/resolve/69a745401a0d63acb71fc759b9e79f6d5da79dd9/snpedia.sqlite.gz"
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
SNPEDIA_EXPECTED_SHA256 = "bd940b624143d03427baf9b2572da07257631bd6fb8b584b5ed0961f07cad104"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def install_prebuilt_cache(
    gz_path: Path,
    db_path: Path,
    *,
    source_url: str = "",
    remote_signal: str | None = None,
) -> None:
    """Decompress a gzipped pre-built SNPedia SQLite cache into place.

    Thin wrapper: forwards `gz_path`, `db_path`, `source_url` and
    `remote_signal` unchanged to `install_prebuilt_gz_cache`, with the
    database name fixed to "snpedia". All actual work happens in that
    shared helper.

    NOTE(review): presumably `source_url` and `remote_signal` end up in the
    cache's `database_versions` row -- confirm in `loader_utils`.
    """
    install_prebuilt_gz_cache(
        gz_path,
        db_path,
        "snpedia",  # database name passed to the shared installer
        source_url=source_url,
        remote_signal=remote_signal,
    )
|