allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""PharmGKB annotator. Source-attributed pharmacogenomic annotations (ADR-0003)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import sqlite3
|
|
10
|
+
import urllib.error
|
|
11
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
12
|
+
|
|
13
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor, is_clinvar_homref
|
|
14
|
+
from allelix.databases._versions import PHARMGKB_INTERPRETER_VERSION
|
|
15
|
+
from allelix.databases.cpic_loader import (
|
|
16
|
+
fetch_cpic_allele_functions,
|
|
17
|
+
fetch_cpic_remote_signal,
|
|
18
|
+
)
|
|
19
|
+
from allelix.databases.manager import (
|
|
20
|
+
download,
|
|
21
|
+
get_database_info,
|
|
22
|
+
head_request_headers,
|
|
23
|
+
)
|
|
24
|
+
from allelix.databases.pharmgkb_loader import (
|
|
25
|
+
PHARMGKB_CLINICAL_URL,
|
|
26
|
+
PHARMGKB_DB_FILENAME,
|
|
27
|
+
_normalize_genotype,
|
|
28
|
+
load_pharmgkb_tsv,
|
|
29
|
+
schema_is_current,
|
|
30
|
+
)
|
|
31
|
+
from allelix.models import Annotation
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from collections.abc import Callable
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
from allelix.models import Variant
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
# Allelix-derived magnitude scoring from PharmGKB Level of Evidence. See ADR-0008.
# 1A is the strongest evidence (CPIC guideline-backed); 4 is the weakest.
# Keys are lower-case; `_magnitude` normalizes its input before the lookup.
_LOE_MAGNITUDE: dict[str, float] = {
    "1a": 9.0,
    "1b": 8.0,
    "2a": 7.0,
    "2b": 6.0,
    "3": 4.0,
    "4": 2.0,
}

# Score used when the level of evidence is missing or not in the table above.
_DEFAULT_LOE_MAGNITUDE: float = 5.0


def _magnitude(level_of_evidence: str | None) -> float:
    """Map a PharmGKB Level of Evidence label to an Allelix magnitude score.

    The label is matched case-insensitively and with surrounding whitespace
    ignored. A label that is None, blank, or not present in
    `_LOE_MAGNITUDE` yields `_DEFAULT_LOE_MAGNITUDE` instead of raising.

    Args:
        level_of_evidence: Label such as ``"1A"`` or ``"3"``; None is
            tolerated and treated like an unknown label.

    Returns:
        The magnitude score for the label.
    """
    # `or ""` keeps a missing value from raising AttributeError on `.strip()`.
    key = (level_of_evidence or "").strip().lower()
    return _LOE_MAGNITUDE.get(key, _DEFAULT_LOE_MAGNITUDE)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PharmGKBAnnotator(Annotator):
|
|
58
|
+
"""Annotates variants with PharmGKB's curated drug-gene-variant associations."""
|
|
59
|
+
|
|
60
|
+
name: ClassVar[str] = "pharmgkb"
|
|
61
|
+
display_name: ClassVar[str] = "PharmGKB"
|
|
62
|
+
attribution: ClassVar[str] = "PharmGKB"
|
|
63
|
+
requires_download: ClassVar[bool] = True
|
|
64
|
+
license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
|
|
65
|
+
spdx="CC-BY-SA-4.0",
|
|
66
|
+
license_url="https://creativecommons.org/licenses/by-sa/4.0/",
|
|
67
|
+
attribution_text=(
|
|
68
|
+
"Pharmacogenomic annotations sourced from PharmGKB, used under CC BY-SA 4.0."
|
|
69
|
+
),
|
|
70
|
+
source_url="https://www.pharmgkb.org",
|
|
71
|
+
commercial_ok=True,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
def __init__(
    self,
    data_dir: Path,
    clinvar_ref_provider: Callable[[str, str], str | None] | None = None,
) -> None:
    """Bind the annotator to `data_dir` and remember the optional REF lookup.

    The SQLite cache path is `data_dir / PHARMGKB_DB_FILENAME`; no
    connection is opened here.

    `clinvar_ref_provider` is a `(rsid, build) -> ref_base | None` callable
    consumed by the primary non-finding filter (ADR-0023). Production code
    wires it to `ClinVarAnnotator.reference_for`. Passing None turns the
    REF check off, leaving the cache's CPIC-based `is_nonfinding` flag as
    the only suppression mechanism — the v0.7.1 behavior.
    """
    super().__init__(data_dir)
    # Connection starts unset; `_connection()` opens it on first use.
    self._conn: sqlite3.Connection | None = None
    self._clinvar_ref_provider = clinvar_ref_provider
    self._db_path = data_dir / PHARMGKB_DB_FILENAME
|
|
91
|
+
|
|
92
|
+
def _connection(self) -> sqlite3.Connection:
    """Return the cached SQLite connection, opening it on first use."""
    conn = self._conn
    if conn is None:
        conn = sqlite3.connect(self._db_path)
        self._conn = conn
    return conn
|
|
96
|
+
|
|
97
|
+
def setup(self) -> None:
    """Fetch PharmGKB and CPIC data and build the SQLite cache from them.

    Two sources are used:

    * PharmGKB's `clinicalAnnotations.zip` — the annotation rows plus the
      per-genotype rows.
    * CPIC's API — the structured per-allele function table (ADR-0020).

    The ClinVar REF check (ADR-0023) is the primary non-finding filter;
    CPIC's per-allele function table is the secondary fallback for rsids
    ClinVar doesn't catalog.

    The ZIP stays on disk so ``is_ready()`` can auto-reingest after an
    interpreter version bump — mirroring the GWAS TSV retention pattern.

    Raises:
        RuntimeError: if no remote freshness signal can be obtained. The
            check runs before the download, so nothing is fetched then.
    """
    remote_signal = self.fetch_remote_signal()
    if remote_signal is None:
        raise RuntimeError(
            "pharmgkb: cannot verify remote freshness signal. "
            "Refresh aborted to avoid persisting an incomplete cache stamp. "
            "Retry, or pass --force if you accept that next `db update` "
            "will re-download to re-establish the signal."
        )

    archive_path = self.data_dir / "clinicalAnnotations.zip"
    # No content-hash verification: PharmGKB publishes no checksum and
    # the content is mutable, so there is nothing to pin or fetch.
    # TLS + Content-Length truncation guard only. See ADR-0029.
    download(PHARMGKB_CLINICAL_URL, archive_path)

    # A CPIC outage is tolerated: ingest continues with an empty table.
    try:
        allele_functions = fetch_cpic_allele_functions()
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError) as exc:
        logger.warning(
            "CPIC API unavailable (%s) -- proceeding without "
            "allele function data. Non-finding filter degraded.",
            exc,
        )
        allele_functions = {}

    load_pharmgkb_tsv(
        archive_path,
        self._db_path,
        source_url=PHARMGKB_CLINICAL_URL,
        remote_signal=remote_signal,
        allele_function_lookup=allele_functions,
    )
|
|
142
|
+
|
|
143
|
+
def is_ready(self) -> bool:
    """Report whether the PharmGKB cache is usable, healing it when possible.

    The cache is ready when its `database_versions` record exists, the
    schema is current, and `local_version_tag` equals the current
    interpreter stamp (``iv:<PHARMGKB_INTERPRETER_VERSION>``).

    Two repair paths run when only the stamp is off:

    * Tag missing (pre-mechanism caches, including those with the tag
      baked into ``remote_signal``): a one-shot stamp update.
    * Tag present but different: auto-reingest from the ZIP retained
      since the last ``db update``, reusing the CPIC allele-function data
      already in the cache — mirroring the GWAS auto-reingest pattern.

    Returns:
        True if the cache is current, or was brought current by one of
        the repair paths; False otherwise.
    """
    info = get_database_info(self._db_path, "pharmgkb")
    if info is None or not schema_is_current(self._db_path):
        return False

    expected_tag = f"iv:{PHARMGKB_INTERPRETER_VERSION}"
    stamped_tag = info.get("local_version_tag") or ""
    if stamped_tag == expected_tag:
        return True
    if stamped_tag:
        # Stamped by a different interpreter version: rebuild from the ZIP.
        return _reingest_pharmgkb_from_cached_zip(self._db_path, self.data_dir)
    # No stamp at all: migrate the existing cache in place.
    return _stamp_existing_pharmgkb_cache(self._db_path)
|
|
165
|
+
|
|
166
|
+
def version(self) -> str | None:
    """Version recorded for the cache (download date, or version supplied to load).

    Returns None when the cache has no `pharmgkb` record.
    """
    info = get_database_info(self._db_path, "pharmgkb")
    if not info:
        return None
    return info["version"]
|
|
170
|
+
|
|
171
|
+
def record_count(self) -> int | None:
    """Count of (rsid, genotype) annotation rows recorded for the cache.

    Returns None when the cache has no `pharmgkb` record.
    """
    info = get_database_info(self._db_path, "pharmgkb")
    if info:
        return info["record_count"]
    return None
|
|
175
|
+
|
|
176
|
+
def close(self) -> None:
    """Release the SQLite connection, if one is open. Repeat calls are no-ops."""
    if self._conn is None:
        return
    self._conn.close()
    self._conn = None
|
|
181
|
+
|
|
182
|
+
def fetch_remote_signal(self) -> str | None:
    """Build the combined PharmGKB + CPIC freshness signal (M-2, ADR-0020).

    Format: `pgkb:<pgkb-signal>|cpic:<cpic-signal>`.

    * `<pgkb-signal>` is `etag:<ETag>` when the HEAD response carries an
      ETag, otherwise `lm:<Last-Modified>` (per ADR-0012).
    * `<cpic-signal>` is the value returned by `fetch_cpic_remote_signal()`
      (the latest `change_log` date from CPIC's API), or the literal
      ``unavailable`` when that probe returns None.

    Returns None when the HEAD request yields no headers, or when neither
    header is present. A CPIC failure is non-fatal: the signal then
    carries ``cpic:unavailable`` so the cache is still refreshable (and
    the mismatch when CPIC recovers triggers a re-download automatically).
    """
    response_headers = head_request_headers(PHARMGKB_CLINICAL_URL)
    if response_headers is None:
        return None

    # Two spellings are tried for each header name.
    etag = response_headers.get("ETag") or response_headers.get("Etag")
    last_modified = response_headers.get("Last-Modified") or response_headers.get(
        "Last-modified"
    )
    if not etag and not last_modified:
        return None
    # ETag wins when both headers are present.
    pgkb_signal = f"etag:{etag.strip()}" if etag else f"lm:{last_modified.strip()}"

    cpic_signal = fetch_cpic_remote_signal()
    if cpic_signal is not None:
        return f"pgkb:{pgkb_signal}|cpic:{cpic_signal}"
    return f"pgkb:{pgkb_signal}|cpic:unavailable"
|
|
213
|
+
|
|
214
|
+
def cached_remote_signal(self) -> str | None:
    """Remote signal stored with the cache at the last successful download.

    Returns None when there is no `pharmgkb` record or the stored signal
    is empty.
    """
    info = get_database_info(self._db_path, "pharmgkb")
    if not info:
        return None
    stored_signal = info["remote_signal"]
    return stored_signal if stored_signal else None
|
|
220
|
+
|
|
221
|
+
def annotate(self, variant: Variant) -> list[Annotation]:
    """Return PharmGKB annotations for variants the user actually carries.

    Non-finding suppression has two independent signals; either one
    is sufficient to suppress a row:

    1. **ClinVar REF carrier rule (ADR-0023).** If ClinVar has a
       single-base REF for this rsid and the user is homozygous
       for it → suppress before hitting the database.

    2. **CPIC per-allele function (ADR-0020).** The pre-computed
       `is_nonfinding` flag in the cache — set at load time when
       CPIC classifies every user-carried base as Normal function.
       Applied via `AND is_nonfinding = 0` on every query.

    The two checks are additive: ClinVar REF catches genes CPIC
    doesn't cover; CPIC catches rows where both alleles are Normal
    function even when the user isn't homozygous reference per ClinVar
    (e.g. rs1801265 GG in DPYD).

    No-calls and indels are filtered out by `_normalize_genotype()`
    returning None — array-based parsers don't call indels (ADR-0011).

    Args:
        variant: The user's call; `rsid`, `allele1`, `allele2` and
            `is_no_call` are read.

    Returns:
        One `Annotation` per matching cache row; an empty list when the
        variant is suppressed or nothing matches.
    """
    if variant.is_no_call:
        return []
    user_geno = _normalize_genotype(variant.allele1 + variant.allele2)
    if user_geno is None:
        return []

    # Signal 1: hom-ref per ClinVar, checked before touching the database.
    if is_clinvar_homref(variant, self._clinvar_ref_provider):
        return []

    # Signal 2 is baked into the query via `is_nonfinding = 0`.
    sql = (
        "SELECT genotype, gene, drugs, phenotype, phenotype_category, "
        "annotation_text, level_of_evidence, score, pgkb_annotation_id "
        "FROM pharmgkb_annotations "
        "WHERE rsid = ? AND genotype = ? AND is_nonfinding = 0"
    )
    rows = self._connection().execute(sql, (variant.rsid, user_geno)).fetchall()

    user_diploid = _user_diploid(variant)
    annotations: list[Annotation] = []
    for (
        _geno,
        gene,
        drugs,
        phenotype,
        _phenotype_category,
        annotation_text,
        level_of_evidence,
        _score,
        pgkb_annotation_id,
    ) in rows:
        # Normalize once. `or ""` guards against a NULL column value, which
        # would otherwise raise AttributeError and abort the whole lookup.
        # NOTE(review): column nullability is not visible here — confirm
        # against the loader schema.
        loe = (level_of_evidence or "").strip().lower()
        sig_label = loe or "unknown"

        # Description is "PharmGKB[: drugs] — phenotype — annotation text",
        # skipping whichever parts are empty.
        description_parts = [f"PharmGKB: {drugs}"] if drugs else ["PharmGKB"]
        if phenotype:
            description_parts.append(phenotype)
        if annotation_text:
            description_parts.append(annotation_text)

        references = (
            [f"pharmgkb:annotation/{pgkb_annotation_id}"] if pgkb_annotation_id else []
        )
        annotations.append(
            Annotation(
                source=self.name,
                rsid=variant.rsid,
                significance=f"pharmgkb_loe_{sig_label}",
                category="pharma",
                magnitude=_magnitude(loe),
                description=" — ".join(description_parts),
                attribution=self.attribution,
                genotype_match=user_diploid,
                references=references,
                condition=phenotype or "",
                gene=gene or "",
            )
        )
    return annotations
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _user_diploid(variant: Variant) -> str:
    """Render the user's genotype for reporting.

    When both alleles are exactly one character long the two letters are
    returned sorted and joined (``"CT"``); anything else is passed through
    verbatim as ``allele1/allele2``.

    ADR-0023: report the user's actual genotype consistently across
    annotators. Mirrors `allelix.annotators.clinvar._user_diploid`
    (kept here to avoid a cross-annotator import dependency).
    """
    first, second = variant.allele1, variant.allele2
    is_snv = len(first) == 1 == len(second)
    if not is_snv:
        return f"{first}/{second}"
    return "".join(sorted((first, second)))
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _stamp_existing_pharmgkb_cache(db_path: Path) -> bool:
    """One-shot migration that writes ``local_version_tag`` onto a PharmGKB cache.

    Legacy caches carry an ``|iv:N`` suffix inside ``remote_signal``; the
    suffix is cut off the stored signal and the current interpreter tag is
    written to ``local_version_tag`` instead.

    Returns:
        True if the current interpreter version is stamped afterwards
        (including when it already was); False if the file or its
        `pharmgkb` record is missing, or on any SQLite error.
    """
    import contextlib

    from allelix.databases.manager import _ensure_local_version_tag_column

    if not db_path.exists():
        return False

    current_tag = f"iv:{PHARMGKB_INTERPRETER_VERSION}"
    try:
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            _ensure_local_version_tag_column(conn)
            record = conn.execute(
                "SELECT remote_signal, local_version_tag "
                "FROM database_versions WHERE name='pharmgkb'"
            ).fetchone()
            if not record:
                return False

            stored_signal, stored_tag = record
            if stored_tag != current_tag:
                # Keep only the part of the signal before any "|iv:" suffix.
                signal_without_tag = (stored_signal or "").split("|iv:")[0]
                conn.execute(
                    "UPDATE database_versions "
                    "SET remote_signal = ?, local_version_tag = ? "
                    "WHERE name = 'pharmgkb'",
                    (signal_without_tag, current_tag),
                )
                conn.commit()
            return True
    except (sqlite3.OperationalError, sqlite3.DatabaseError):
        return False
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _read_cached_cpic_lookup(db_path: Path) -> dict[tuple[str, str], str]:
    """Extract the CPIC allele-function table from an existing PharmGKB cache.

    Args:
        db_path: Path to the PharmGKB SQLite cache.

    Returns:
        A mapping ``(rsid, allele) -> function_class`` built from the
        `pharmgkb_allele_function` table. An empty dict is returned when
        the table cannot be read; the failure is logged as a warning, not
        raised, so callers can continue without allele-function data.
    """
    import contextlib

    try:
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            rows = conn.execute(
                "SELECT rsid, allele, function_class FROM pharmgkb_allele_function"
            ).fetchall()
    except sqlite3.DatabaseError as exc:
        # sqlite3.OperationalError (e.g. "no such table") is a subclass of
        # DatabaseError, so one clause covers both. Still best-effort, but
        # no longer silent: the caller proceeds with an empty lookup.
        logging.getLogger(__name__).warning(
            "Could not read cached CPIC allele functions from %s (%s) -- "
            "proceeding without allele function data.",
            db_path,
            exc,
        )
        return {}
    return {(rsid, allele): function_class for rsid, allele, function_class in rows}
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
    """Rebuild the PharmGKB cache from the retained ZIP after an interpreter bump.

    The CPIC allele-function data, source URL, version and remote signal
    are read out of the current cache before it is replaced, so the
    reingest needs no network access and keeps the original provenance.

    Returns:
        True if the rebuilt cache carries the current interpreter tag.
        False if the ZIP or the existing `pharmgkb` record is missing, or
        if the load fails (the failure is logged with its traceback).
    """
    zip_path = data_dir / "clinicalAnnotations.zip"
    if not zip_path.exists():
        return False

    previous = get_database_info(db_path, "pharmgkb")
    if previous is None:
        return False

    # Everything carried over must be captured before the load replaces the cache.
    carried_over = {
        "source_url": previous.get("source_url") or "",
        "version": previous.get("version") or "",
        "remote_signal": previous.get("remote_signal") or "",
        "allele_function_lookup": _read_cached_cpic_lookup(db_path),
    }

    logger.info("PharmGKB interpreter changed — re-ingesting from cached ZIP...")
    try:
        load_pharmgkb_tsv(zip_path, db_path, **carried_over)
    except Exception:
        logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)
        return False

    refreshed = get_database_info(db_path, "pharmgkb")
    if refreshed is None:
        return False
    expected_tag = f"iv:{PHARMGKB_INTERPRETER_VERSION}"
    return (refreshed.get("local_version_tag") or "") == expected_tag
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""SNPedia annotator. Structured SQL lookups against pre-parsed genotype data.
|
|
4
|
+
|
|
5
|
+
Reads from the ``snpedia_genotypes`` table in the SNPedia SQLite archive.
|
|
6
|
+
The pre-built cache is downloaded from HuggingFace during ``db update``.
|
|
7
|
+
It can also be built locally via ``scripts/scrape_snpedia.py`` followed
|
|
8
|
+
by ``scripts/parse_snpedia.py``.
|
|
9
|
+
|
|
10
|
+
SNPedia content is CC-BY-NC-SA 3.0 US. Attribution is required in all
|
|
11
|
+
reports.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import contextlib
|
|
17
|
+
import logging
|
|
18
|
+
import sqlite3
|
|
19
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
20
|
+
|
|
21
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor, is_clinvar_homref
|
|
22
|
+
from allelix.databases.manager import (
|
|
23
|
+
download,
|
|
24
|
+
verify_file_hash,
|
|
25
|
+
)
|
|
26
|
+
from allelix.databases.snpedia_loader import (
|
|
27
|
+
SNPEDIA_CACHE_URL,
|
|
28
|
+
SNPEDIA_EXPECTED_SHA256,
|
|
29
|
+
install_prebuilt_cache,
|
|
30
|
+
)
|
|
31
|
+
from allelix.models import Annotation
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from collections.abc import Callable
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
from allelix.models import Variant
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
# File name of the SNPedia SQLite archive inside the data directory.
SNPEDIA_DB_FILENAME = "snpedia.sqlite"
# Key of this source's row in the `database_versions` table.
SNPEDIA_RECORD_NAME = "snpedia"

# Maps a lower-cased SNPedia "repute" value to an annotation category.
# Values not listed here fall back to "trait" at the lookup site.
_REPUTE_CATEGORY: dict[str, str] = {
    "good": "trait",
    "bad": "clinical",
    "not set": "trait",
    "": "trait",
}

# Lower-case substrings; `annotate` skips any genotype row whose summary
# contains one of them. They all describe strand/orientation problems —
# presumably unreliable entries; confirm the intent against the parser.
_SUMMARY_SUPPRESS_SUBSTRINGS: tuple[str, ...] = (
    "mis-oriented",
    "mis-orientation",
    "wrong strand",
    "orientation uncertain",
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class SNPediaAnnotator(Annotator):
|
|
60
|
+
"""Annotates variants with SNPedia genotype data via structured SQL lookups."""
|
|
61
|
+
|
|
62
|
+
name: ClassVar[str] = "snpedia"
|
|
63
|
+
display_name: ClassVar[str] = "SNPedia"
|
|
64
|
+
attribution: ClassVar[str] = "SNPedia"
|
|
65
|
+
requires_download: ClassVar[bool] = True
|
|
66
|
+
server_driven_freshness: ClassVar[bool] = False
|
|
67
|
+
license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
|
|
68
|
+
spdx="CC-BY-NC-SA-3.0-US",
|
|
69
|
+
license_url="https://creativecommons.org/licenses/by-nc-sa/3.0/us/",
|
|
70
|
+
attribution_text=(
|
|
71
|
+
"SNPedia annotations sourced from SNPedia, used under CC BY-NC-SA 3.0 US."
|
|
72
|
+
),
|
|
73
|
+
source_url="https://www.snpedia.com",
|
|
74
|
+
commercial_ok=False,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
def __init__(
    self,
    data_dir: Path,
    clinvar_ref_provider: Callable[[str, str], str | None] | None = None,
) -> None:
    """Bind the annotator to the data directory.

    The SQLite archive path is ``data_dir / SNPEDIA_DB_FILENAME``; no
    connection is opened here.

    ``clinvar_ref_provider`` is a ``(rsid, build) -> ref_base | None``
    callable used by the ADR-0023 hom-ref check. In production it is
    wired to ``ClinVarAnnotator.reference_for``. ``None`` disables the
    check (tests, standalone use).
    """
    super().__init__(data_dir)
    # Connection starts unset; `_connection()` opens it on first use.
    self._conn: sqlite3.Connection | None = None
    self._clinvar_ref_provider = clinvar_ref_provider
    self._db_path = data_dir / SNPEDIA_DB_FILENAME
|
|
93
|
+
|
|
94
|
+
def _connection(self) -> sqlite3.Connection:
    """Return the shared SQLite connection, creating it on the first call."""
    if self._conn is not None:
        return self._conn
    self._conn = sqlite3.connect(self._db_path)
    return self._conn
|
|
98
|
+
|
|
99
|
+
def setup(self) -> None:
    """Download, verify and install the pre-built SNPedia cache from HuggingFace.

    The HuggingFace asset contains raw wiki markup (third-party,
    CC-BY-NC-SA). Once it is installed, the closing ``is_ready()`` call
    triggers the client-side parse into structured genotype rows and
    stamps ``database_versions`` with proper version metadata.
    """
    staged_archive = self.data_dir / "snpedia.sqlite.gz"
    download(SNPEDIA_CACHE_URL, staged_archive)
    # The archive's SHA-256 is checked before it is installed.
    verify_file_hash(staged_archive, "sha256", SNPEDIA_EXPECTED_SHA256)
    install_prebuilt_cache(
        staged_archive,
        self._db_path,
        source_url=SNPEDIA_CACHE_URL,
    )

    # Removing the staged download is best-effort; a leftover file is only logged.
    try:
        staged_archive.unlink()
    except OSError:
        logger.warning("Could not remove staged file at %s", staged_archive)

    # Called for its side effect; the boolean result is intentionally ignored.
    self.is_ready()
|
|
120
|
+
|
|
121
|
+
def is_ready(self) -> bool:
    """Return True when the parsed SNPedia genotype table exists and has data.

    If raw pages exist but the structured table is missing or empty, or
    the table was produced by an outdated parser, the raw markup is
    parsed automatically. The progress message printed before the parse
    estimates ~5 min.

    Returns:
        True when the structured table has rows from the current parser,
        or when a parse triggered here produced at least one row. False
        when the database file or its raw table is missing, or on any
        SQLite error.
    """
    if not self._db_path.exists():
        return False
    try:
        # Imported here rather than at module import time.
        from allelix.databases.snpedia_parser import (
            detect_raw_table,
            parse_raw_pages,
            parser_is_current,
        )

        with contextlib.closing(sqlite3.connect(self._db_path)) as conn:
            # A missing `snpedia_genotypes` table raises OperationalError,
            # which is suppressed so `has_rows` simply stays False.
            has_rows = False
            with contextlib.suppress(sqlite3.OperationalError):
                has_rows = (
                    conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0] > 0
                )

            # Existing rows from an outdated parser are re-parsed, not trusted.
            needs_reparse = has_rows and not parser_is_current(conn)
            if has_rows and not needs_reparse:
                return True

            # Without raw pages there is nothing to parse from.
            raw_table = detect_raw_table(conn)
            if raw_table is None:
                return False

            # `raw_table` comes from `detect_raw_table`, not from user input.
            snp_count = conn.execute(
                f"SELECT COUNT(*) FROM {raw_table} WHERE category='snp'"
            ).fetchone()[0]
            genotype_count = conn.execute(
                f"SELECT COUNT(*) FROM {raw_table} WHERE category='genotype'"
            ).fetchone()[0]

        # NOTE(review): nesting reconstructed from a flattened listing —
        # confirm the parse runs after the probe connection is closed.
        reason = "parser version changed" if needs_reparse else "one-time"
        print(
            f"Parsing {snp_count} SNP pages + {genotype_count} genotype pages"
            f" into structured table ({reason}, ~5 min)...",
            flush=True,
        )
        parsed = parse_raw_pages(str(self._db_path))
        print(f"Parsed {parsed} SNPedia genotype rows.", flush=True)
        return parsed > 0
    except (sqlite3.OperationalError, sqlite3.DatabaseError):
        return False
|
|
169
|
+
|
|
170
|
+
def version(self) -> str | None:
    """Look up this source's version string in the ``database_versions`` table.

    Returns None when the database file is missing, the row is absent or
    has an empty version, or the query fails with a SQLite error.
    """
    if not self._db_path.exists():
        return None
    try:
        with contextlib.closing(sqlite3.connect(self._db_path)) as conn:
            row = conn.execute(
                "SELECT version FROM database_versions WHERE name = ?",
                (SNPEDIA_RECORD_NAME,),
            ).fetchone()
    except (sqlite3.OperationalError, sqlite3.DatabaseError):
        return None
    if not row:
        return None
    return row[0] if row[0] else None
|
|
185
|
+
|
|
186
|
+
def record_count(self) -> int | None:
    """Count the rows in the structured ``snpedia_genotypes`` table.

    Returns None when the database file is missing or the count query
    fails with a SQLite error (for example, the table does not exist).
    """
    if not self._db_path.exists():
        return None
    try:
        with contextlib.closing(sqlite3.connect(self._db_path)) as conn:
            (total,) = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()
    except (sqlite3.OperationalError, sqlite3.DatabaseError):
        return None
    return total
|
|
196
|
+
|
|
197
|
+
def close(self) -> None:
    """Close the SQLite connection if one is open; otherwise do nothing."""
    if self._conn is None:
        return
    self._conn.close()
    self._conn = None
|
|
202
|
+
|
|
203
|
+
def fetch_remote_signal(self) -> str | None:
    """Always None: this source is code-driven and is never probed at runtime (ADR-0030)."""
    return None
|
|
206
|
+
|
|
207
|
+
def cached_remote_signal(self) -> str | None:
    """Always None: a code-driven source stores no signal to compare against (ADR-0030)."""
    return None
|
|
210
|
+
|
|
211
|
+
def annotate(self, variant: Variant) -> list[Annotation]:
    """Return SNPedia annotations matching the user's genotype.

    Only ids of the form ``rs<digits>`` or ``i<digits>`` (case-insensitive)
    are looked up; anything else, and any no-call, yields an empty list.
    For ``rs`` ids the ClinVar hom-ref check runs before the database is
    queried. Alleles are upper-cased and ordered before matching against
    ``snpedia_genotypes``. Rows with an empty summary, or a summary
    containing one of `_SUMMARY_SUPPRESS_SUBSTRINGS`, are skipped.
    """
    if variant.is_no_call:
        return []

    snp_id = variant.rsid.lower()
    is_rs_id = snp_id.startswith("rs")
    if is_rs_id:
        snp_num = snp_id[2:]
        snp_url_path = f"Rs{snp_num}"
    elif snp_id.startswith("i"):
        snp_num = snp_id[1:]
        snp_url_path = f"I{snp_num}"
    else:
        return []

    # The part after the prefix must be a non-empty run of digits.
    if not (snp_num and snp_num.isdigit()):
        return []

    if is_rs_id and is_clinvar_homref(variant, self._clinvar_ref_provider):
        return []

    # Order the upper-cased alleles so the smaller one comes first.
    low, high = variant.allele1.upper(), variant.allele2.upper()
    if low > high:
        low, high = high, low

    rows = (
        self._connection()
        .execute(
            "SELECT allele1, allele2, magnitude, repute, summary, gene "
            "FROM snpedia_genotypes "
            "WHERE rsid = ? AND allele1 = ? AND allele2 = ?",
            (snp_id, low, high),
        )
        .fetchall()
    )

    annotations: list[Annotation] = []
    for allele1, allele2, magnitude, repute, summary, gene in rows:
        if not summary:
            continue
        lowered_summary = summary.lower()
        if any(marker in lowered_summary for marker in _SUMMARY_SUPPRESS_SUBSTRINGS):
            continue

        repute_lower = (repute or "").strip().lower()
        annotations.append(
            Annotation(
                source=self.name,
                rsid=variant.rsid,
                significance=f"snpedia_{repute_lower}" if repute_lower else "snpedia_genotype",
                category=_REPUTE_CATEGORY.get(repute_lower, "trait"),
                # A NULL magnitude is reported as 0.0.
                magnitude=0.0 if magnitude is None else magnitude,
                description=f"SNPedia: {summary}",
                attribution=self.attribution,
                genotype_match=f"{allele1}{allele2}",
                references=[f"https://www.snpedia.com/index.php/{snp_url_path}"],
                gene=gene or "",
            )
        )

    return annotations
|