allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,406 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """PharmGKB annotator. Source-attributed pharmacogenomic annotations (ADR-0003)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import logging
9
+ import sqlite3
10
+ import urllib.error
11
+ from typing import TYPE_CHECKING, ClassVar
12
+
13
+ from allelix.annotators.base import Annotator, LicenseDescriptor, is_clinvar_homref
14
+ from allelix.databases._versions import PHARMGKB_INTERPRETER_VERSION
15
+ from allelix.databases.cpic_loader import (
16
+ fetch_cpic_allele_functions,
17
+ fetch_cpic_remote_signal,
18
+ )
19
+ from allelix.databases.manager import (
20
+ download,
21
+ get_database_info,
22
+ head_request_headers,
23
+ )
24
+ from allelix.databases.pharmgkb_loader import (
25
+ PHARMGKB_CLINICAL_URL,
26
+ PHARMGKB_DB_FILENAME,
27
+ _normalize_genotype,
28
+ load_pharmgkb_tsv,
29
+ schema_is_current,
30
+ )
31
+ from allelix.models import Annotation
32
+
33
+ if TYPE_CHECKING:
34
+ from collections.abc import Callable
35
+ from pathlib import Path
36
+
37
+ from allelix.models import Variant
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
# Allelix-derived magnitude scoring from PharmGKB Level of Evidence. See ADR-0008.
# 1A is the strongest evidence (CPIC guideline-backed); 4 is the weakest.
_LOE_MAGNITUDE: dict[str, float] = {
    "1a": 9.0,
    "1b": 8.0,
    "2a": 7.0,
    "2b": 6.0,
    "3": 4.0,
    "4": 2.0,
}

# Magnitude used when the level of evidence is missing or not in the table above.
_DEFAULT_LOE_MAGNITUDE: float = 5.0


def _magnitude(level_of_evidence: str | None) -> float:
    """Map a PharmGKB level-of-evidence label to an Allelix magnitude score.

    The label is matched case-insensitively after trimming surrounding
    whitespace. A missing (``None``), empty, or unrecognized label yields
    the neutral default of 5.0 instead of raising.
    """
    if not level_of_evidence:
        # A NULL column value arrives as None; calling .strip() on it would raise.
        return _DEFAULT_LOE_MAGNITUDE
    key = level_of_evidence.strip().lower()
    return _LOE_MAGNITUDE.get(key, _DEFAULT_LOE_MAGNITUDE)
55
+
56
+
57
class PharmGKBAnnotator(Annotator):
    """Annotates variants with PharmGKB's curated drug-gene-variant associations.

    Lookups run against a SQLite cache stored at
    ``data_dir / PHARMGKB_DB_FILENAME``. The connection used by
    ``annotate()`` is opened lazily and released by ``close()``.
    """

    name: ClassVar[str] = "pharmgkb"
    display_name: ClassVar[str] = "PharmGKB"
    attribution: ClassVar[str] = "PharmGKB"
    requires_download: ClassVar[bool] = True
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="CC-BY-SA-4.0",
        license_url="https://creativecommons.org/licenses/by-sa/4.0/",
        attribution_text=(
            "Pharmacogenomic annotations sourced from PharmGKB, used under CC BY-SA 4.0."
        ),
        source_url="https://www.pharmgkb.org",
        commercial_ok=True,
    )

    def __init__(
        self,
        data_dir: Path,
        clinvar_ref_provider: Callable[[str, str], str | None] | None = None,
    ) -> None:
        """Resolve the PharmGKB SQLite cache path within `data_dir`.

        `clinvar_ref_provider` is a `(rsid, build) -> ref_base | None` callable
        used by the primary non-finding filter (ADR-0023). In production it's
        wired to `ClinVarAnnotator.reference_for`. None disables the REF check
        and falls back to the cache's CPIC-based `is_nonfinding` flag for all
        suppression — the v0.7.1 behavior.
        """
        super().__init__(data_dir)
        self._db_path = data_dir / PHARMGKB_DB_FILENAME
        self._conn: sqlite3.Connection | None = None
        self._clinvar_ref_provider = clinvar_ref_provider

    def _connection(self) -> sqlite3.Connection:
        """Return the shared SQLite connection, opening it on first use."""
        if self._conn is None:
            self._conn = sqlite3.connect(self._db_path)
        return self._conn

    def setup(self) -> None:
        """Download PharmGKB clinical annotations + CPIC allele functions, ingest atomically.

        Two sources are fetched: PharmGKB's `clinicalAnnotations.zip`
        (the annotation rows + per-genotype rows) and CPIC's API (the
        structured per-allele function table per ADR-0020). The primary
        non-finding filter is the ClinVar REF check (ADR-0023); CPIC's
        per-allele function table is the secondary fallback for rsids
        ClinVar doesn't catalog.

        The ZIP is retained on disk so ``is_ready()`` can auto-reingest
        when the interpreter version bumps — mirroring the GWAS TSV
        retention pattern.

        Raises:
            RuntimeError: if the remote freshness signal cannot be obtained;
                nothing is downloaded in that case.
        """
        url = PHARMGKB_CLINICAL_URL
        signal = self.fetch_remote_signal()
        if signal is None:
            msg = (
                "pharmgkb: cannot verify remote freshness signal. "
                "Refresh aborted to avoid persisting an incomplete cache stamp. "
                "Retry, or pass --force if you accept that next `db update` "
                "will re-download to re-establish the signal."
            )
            raise RuntimeError(msg)
        zip_path = self.data_dir / "clinicalAnnotations.zip"
        # No content-hash verification: PharmGKB publishes no checksum and
        # the content is mutable, so there is nothing to pin or fetch.
        # TLS + Content-Length truncation guard only. See ADR-0029.
        download(url, zip_path)
        try:
            cpic_lookup = fetch_cpic_allele_functions()
        except (urllib.error.URLError, TimeoutError, json.JSONDecodeError) as exc:
            # CPIC is a secondary signal: degrade the filter rather than abort.
            logger.warning(
                "CPIC API unavailable (%s) -- proceeding without "
                "allele function data. Non-finding filter degraded.",
                exc,
            )
            cpic_lookup = {}
        load_pharmgkb_tsv(
            zip_path,
            self._db_path,
            source_url=url,
            remote_signal=signal,
            allele_function_lookup=cpic_lookup,
        )

    def is_ready(self) -> bool:
        """True iff a PharmGKB SQLite cache exists with current schema and interpreter stamp.

        When the interpreter version has bumped and the raw ZIP is still
        on disk (retained since the last ``db update``), auto-reingests
        from the cached ZIP using the existing CPIC allele-function data —
        mirroring the GWAS auto-reingest pattern.

        Pre-mechanism caches (tag missing or baked into ``remote_signal``)
        are self-healed with a one-shot stamp update.
        """
        info = get_database_info(self._db_path, "pharmgkb")
        if info is None:
            return False
        if not schema_is_current(self._db_path):
            return False
        tag = info.get("local_version_tag") or ""
        if tag == f"iv:{PHARMGKB_INTERPRETER_VERSION}":
            return True
        if not tag:
            # No tag at all: legacy cache, stamp it in place.
            return _stamp_existing_pharmgkb_cache(self._db_path)
        # A tag for a different interpreter version: rebuild from the retained ZIP.
        return _reingest_pharmgkb_from_cached_zip(self._db_path, self.data_dir)

    def version(self) -> str | None:
        """Cached database version (download date, or version supplied to load)."""
        info = get_database_info(self._db_path, "pharmgkb")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Number of (rsid, genotype) annotation rows in the cache."""
        info = get_database_info(self._db_path, "pharmgkb")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection if open. Safe to call multiple times."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Composite freshness signal for PharmGKB + CPIC (M-2, ADR-0020).

        The signal format is `pgkb:<pgkb-signal>|cpic:<cpic-signal>`.

        - PharmGKB portion: ETag if available, else Last-Modified
          (per ADR-0012).
        - CPIC portion: latest `change_log` date from CPIC's API,
          or ``unavailable`` if the CPIC probe fails.

        Returns None when the PharmGKB HEAD probe yields no headers, or
        yields neither an ETag nor a Last-Modified value. CPIC failure is
        non-fatal: the signal carries ``cpic:unavailable`` so the cache
        is still refreshable (and the mismatch when CPIC recovers
        triggers a re-download automatically).
        """
        headers = head_request_headers(PHARMGKB_CLINICAL_URL)
        if headers is None:
            return None
        etag = headers.get("ETag") or headers.get("Etag")
        last_modified = headers.get("Last-Modified") or headers.get("Last-modified")
        if etag:
            pgkb_signal = f"etag:{etag.strip()}"
        elif last_modified:
            pgkb_signal = f"lm:{last_modified.strip()}"
        else:
            return None

        cpic_signal = fetch_cpic_remote_signal()
        if cpic_signal is None:
            return f"pgkb:{pgkb_signal}|cpic:unavailable"
        return f"pgkb:{pgkb_signal}|cpic:{cpic_signal}"

    def cached_remote_signal(self) -> str | None:
        """Return the remote signal stored at last successful download.

        Returns None when there is no cache record or the stored signal is empty.
        """
        info = get_database_info(self._db_path, "pharmgkb")
        if not info:
            return None
        return info["remote_signal"] or None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Return PharmGKB annotations for variants the user actually carries.

        Non-finding suppression has two independent signals; either one
        is sufficient to suppress a row:

        1. **ClinVar REF carrier rule (ADR-0023).** If ClinVar has a
           single-base REF for this rsid and the user is homozygous
           for it → suppress before hitting the database.

        2. **CPIC per-allele function (ADR-0020).** The pre-computed
           `is_nonfinding` flag in the cache — set at load time when
           CPIC classifies every user-carried base as Normal function.
           Applied via `AND is_nonfinding = 0` on every query.

        The two checks are additive: ClinVar REF catches genes CPIC
        doesn't cover; CPIC catches rows where both alleles are Normal
        function even when the user isn't homozygous reference per ClinVar
        (e.g. rs1801265 GG in DPYD).

        No-calls and indels are filtered out by `_normalize_genotype()`
        returning None — array-based parsers don't call indels (ADR-0011).

        A row whose `level_of_evidence` is NULL or blank is reported with
        significance ``pharmgkb_loe_unknown`` and the default magnitude
        rather than raising.
        """
        if variant.is_no_call:
            return []
        user_geno = _normalize_genotype(variant.allele1 + variant.allele2)
        if user_geno is None:
            return []

        if is_clinvar_homref(variant, self._clinvar_ref_provider):
            return []

        sql = (
            "SELECT genotype, gene, drugs, phenotype, phenotype_category, "
            "annotation_text, level_of_evidence, score, pgkb_annotation_id "
            "FROM pharmgkb_annotations "
            "WHERE rsid = ? AND genotype = ? AND is_nonfinding = 0"
        )
        params = (variant.rsid, user_geno)

        rows = self._connection().execute(sql, params).fetchall()
        annotations: list[Annotation] = []
        user_diploid = _user_diploid(variant)
        for row in rows:
            (
                _geno,
                gene,
                drugs,
                phenotype,
                _phenotype_category,
                annotation_text,
                level_of_evidence,
                _score,
                pgkb_annotation_id,
            ) = row
            # SQLite NULL arrives as None; normalize once so neither the label
            # nor the magnitude lookup can fail on a missing value.
            loe = (level_of_evidence or "").strip()
            sig_label = loe.lower() or "unknown"
            description_parts = [f"PharmGKB: {drugs}"] if drugs else ["PharmGKB"]
            if phenotype:
                description_parts.append(phenotype)
            if annotation_text:
                description_parts.append(annotation_text)
            description = " — ".join(description_parts)
            references = (
                [f"pharmgkb:annotation/{pgkb_annotation_id}"] if pgkb_annotation_id else []
            )
            annotations.append(
                Annotation(
                    source=self.name,
                    rsid=variant.rsid,
                    significance=f"pharmgkb_loe_{sig_label}",
                    category="pharma",
                    magnitude=_magnitude(loe),
                    description=description,
                    attribution=self.attribution,
                    genotype_match=user_diploid,
                    references=references,
                    condition=phenotype or "",
                    gene=gene or "",
                )
            )
        return annotations
302
+
303
+
304
def _user_diploid(variant: Variant) -> str:
    """Render the user's genotype as a display string.

    When both alleles are single characters the result is the two letters
    in sorted order (e.g. ``G`` + ``A`` -> ``AG``). Anything else is
    passed through verbatim as ``allele1/allele2``.

    ADR-0023: report the user's actual genotype consistently across
    annotators. Mirrors `allelix.annotators.clinvar._user_diploid`
    (kept here to avoid a cross-annotator import dependency).
    """
    first = variant.allele1
    second = variant.allele2
    is_snv = len(first) == 1 and len(second) == 1
    if not is_snv:
        return f"{first}/{second}"
    if first <= second:
        return first + second
    return second + first
315
+
316
+
317
def _stamp_existing_pharmgkb_cache(db_path: Path) -> bool:
    """One-shot migration: stamp ``local_version_tag`` on a PharmGKB cache.

    Handles legacy caches with ``|iv:N`` baked into ``remote_signal``
    by moving the tag into its own column and storing only the part of
    the signal before ``|iv:``.

    Returns True if the ``pharmgkb`` row carries the current interpreter
    tag when the function finishes. Returns False if the file is missing,
    the row is absent, or SQLite reports a database error.
    """
    if not db_path.exists():
        return False

    from contextlib import closing

    from allelix.databases.manager import _ensure_local_version_tag_column

    wanted_tag = f"iv:{PHARMGKB_INTERPRETER_VERSION}"
    try:
        with closing(sqlite3.connect(db_path)) as conn:
            _ensure_local_version_tag_column(conn)
            record = conn.execute(
                "SELECT remote_signal, local_version_tag "
                "FROM database_versions WHERE name='pharmgkb'"
            ).fetchone()
            if not record:
                return False
            stored_signal, stored_tag = record
            if stored_tag != wanted_tag:
                # Keep only the part of the signal before any legacy "|iv:" suffix.
                signal_without_tag = (stored_signal or "").split("|iv:")[0]
                conn.execute(
                    "UPDATE database_versions "
                    "SET remote_signal = ?, local_version_tag = ? "
                    "WHERE name = 'pharmgkb'",
                    (signal_without_tag, wanted_tag),
                )
                conn.commit()
            return True
    except sqlite3.DatabaseError:
        # OperationalError is a DatabaseError subclass, so this covers both.
        return False
354
+
355
+
356
def _read_cached_cpic_lookup(db_path: Path) -> dict[tuple[str, str], str]:
    """Extract the CPIC allele-function table from an existing PharmGKB cache.

    Returns a mapping of ``(rsid, allele)`` to ``function_class`` read from
    the ``pharmgkb_allele_function`` table. Best-effort: any SQLite error
    (for example, the table not existing) yields an empty mapping.
    """
    from contextlib import closing

    query = "SELECT rsid, allele, function_class FROM pharmgkb_allele_function"
    try:
        with closing(sqlite3.connect(db_path)) as conn:
            fetched = conn.execute(query).fetchall()
    except sqlite3.DatabaseError:
        # OperationalError is a DatabaseError subclass, so this covers both.
        return {}
    return {(rsid, allele): function_class for rsid, allele, function_class in fetched}
371
+
372
+
373
def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
    """Re-ingest PharmGKB from the retained ZIP when the interpreter version bumps.

    The previous cache's source URL, version, remote signal and CPIC
    allele-function rows are read first and passed back into the loader,
    so the rebuild reuses them instead of fetching anything new.

    Returns True only if, after the reload, the cache record carries the
    current interpreter tag. Returns False when the ZIP or the cache
    record is missing, or when the loader raises (the failure is logged).
    """
    retained_zip = data_dir / "clinicalAnnotations.zip"
    if not retained_zip.exists():
        return False

    previous = get_database_info(db_path, "pharmgkb")
    if previous is None:
        return False

    # Capture everything needed from the old cache before the loader replaces it.
    carried_over = {
        "source_url": previous.get("source_url") or "",
        "version": previous.get("version") or "",
        "remote_signal": previous.get("remote_signal") or "",
        "allele_function_lookup": _read_cached_cpic_lookup(db_path),
    }

    logger.info("PharmGKB interpreter changed — re-ingesting from cached ZIP...")
    try:
        load_pharmgkb_tsv(retained_zip, db_path, **carried_over)
    except Exception:
        # Deliberately broad: a failed rebuild must report "not ready", not crash.
        logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)
        return False

    refreshed = get_database_info(db_path, "pharmgkb")
    if refreshed is None:
        return False
    stamped_tag = refreshed.get("local_version_tag") or ""
    return stamped_tag == f"iv:{PHARMGKB_INTERPRETER_VERSION}"
@@ -0,0 +1,276 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """SNPedia annotator. Structured SQL lookups against pre-parsed genotype data.
4
+
5
+ Reads from the ``snpedia_genotypes`` table in the SNPedia SQLite archive.
6
+ The pre-built cache is downloaded from HuggingFace during ``db update``.
7
+ It can also be built locally via ``scripts/scrape_snpedia.py`` followed
8
+ by ``scripts/parse_snpedia.py``.
9
+
10
+ SNPedia content is CC-BY-NC-SA 3.0 US. Attribution is required in all
11
+ reports.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import contextlib
17
+ import logging
18
+ import sqlite3
19
+ from typing import TYPE_CHECKING, ClassVar
20
+
21
+ from allelix.annotators.base import Annotator, LicenseDescriptor, is_clinvar_homref
22
+ from allelix.databases.manager import (
23
+ download,
24
+ verify_file_hash,
25
+ )
26
+ from allelix.databases.snpedia_loader import (
27
+ SNPEDIA_CACHE_URL,
28
+ SNPEDIA_EXPECTED_SHA256,
29
+ install_prebuilt_cache,
30
+ )
31
+ from allelix.models import Annotation
32
+
33
+ if TYPE_CHECKING:
34
+ from collections.abc import Callable
35
+ from pathlib import Path
36
+
37
+ from allelix.models import Variant
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
# File name of the SNPedia SQLite archive inside the data directory.
SNPEDIA_DB_FILENAME = "snpedia.sqlite"
# Row name used for SNPedia in the ``database_versions`` table.
SNPEDIA_RECORD_NAME = "snpedia"

# Maps a lower-cased SNPedia "repute" value to an annotation category.
# Only "bad" is treated as clinical; everything else is a trait.
_REPUTE_CATEGORY: dict[str, str] = {
    "good": "trait",
    "bad": "clinical",
    "not set": "trait",
    "": "trait",
}

# Genotype rows whose lower-cased summary contains any of these substrings
# are dropped by the annotator.
_SUMMARY_SUPPRESS_SUBSTRINGS: tuple[str, ...] = (
    "mis-oriented",
    "mis-orientation",
    "wrong strand",
    "orientation uncertain",
)
57
+
58
+
59
class SNPediaAnnotator(Annotator):
    """Annotates variants with SNPedia genotype data via structured SQL lookups.

    Reads the ``snpedia_genotypes`` table of the SQLite file at
    ``data_dir / SNPEDIA_DB_FILENAME``. The lookup connection is opened
    lazily and released by ``close()``.
    """

    name: ClassVar[str] = "snpedia"
    display_name: ClassVar[str] = "SNPedia"
    attribution: ClassVar[str] = "SNPedia"
    requires_download: ClassVar[bool] = True
    server_driven_freshness: ClassVar[bool] = False
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="CC-BY-NC-SA-3.0-US",
        license_url="https://creativecommons.org/licenses/by-nc-sa/3.0/us/",
        attribution_text=(
            "SNPedia annotations sourced from SNPedia, used under CC BY-NC-SA 3.0 US."
        ),
        source_url="https://www.snpedia.com",
        commercial_ok=False,
    )

    def __init__(
        self,
        data_dir: Path,
        clinvar_ref_provider: Callable[[str, str], str | None] | None = None,
    ) -> None:
        """Record the cache location and the optional ClinVar reference lookup.

        ``clinvar_ref_provider`` is a ``(rsid, build) -> ref_base | None``
        callable used by the ADR-0023 hom-ref check. In production it is
        wired to ``ClinVarAnnotator.reference_for``. ``None`` disables the
        check (tests, standalone use).
        """
        super().__init__(data_dir)
        self._db_path = data_dir / SNPEDIA_DB_FILENAME
        self._conn: sqlite3.Connection | None = None
        self._clinvar_ref_provider = clinvar_ref_provider

    def _connection(self) -> sqlite3.Connection:
        """Return the shared SQLite connection, opening it on first use."""
        conn = self._conn
        if conn is None:
            conn = self._conn = sqlite3.connect(self._db_path)
        return conn

    def setup(self) -> None:
        """Download the pre-built SNPedia cache from HuggingFace.

        The HuggingFace asset contains raw wiki markup (third-party,
        CC-BY-NC-SA). The archive is downloaded, checked against the
        pinned SHA-256, and installed; the staged ``.gz`` is then removed
        (a failure to remove it is only logged). Finally ``is_ready()``
        is called, which triggers the client-side parse into structured
        genotype rows.
        """
        staged_archive = self.data_dir / "snpedia.sqlite.gz"
        download(SNPEDIA_CACHE_URL, staged_archive)
        verify_file_hash(staged_archive, "sha256", SNPEDIA_EXPECTED_SHA256)
        install_prebuilt_cache(staged_archive, self._db_path, source_url=SNPEDIA_CACHE_URL)
        try:
            staged_archive.unlink()
        except OSError:
            logger.warning("Could not remove staged file at %s", staged_archive)
        self.is_ready()

    def is_ready(self) -> bool:
        """Return True when the parsed SNPedia genotype table exists and has data.

        If the structured table is empty or missing, or the parser version
        is no longer current, and a raw-page table is present, the raw
        markup is parsed into the structured table first (this can take
        several minutes; progress is printed to stdout). Any SQLite error
        results in False.
        """
        if not self._db_path.exists():
            return False
        try:
            from allelix.databases.snpedia_parser import (
                detect_raw_table,
                parse_raw_pages,
                parser_is_current,
            )

            with contextlib.closing(sqlite3.connect(self._db_path)) as probe:
                try:
                    populated = (
                        probe.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0] > 0
                    )
                except sqlite3.OperationalError:
                    # Treated as "no structured rows yet" (e.g. table missing).
                    populated = False

                stale_parser = populated and not parser_is_current(probe)
                if populated and not stale_parser:
                    return True

                raw_table = detect_raw_table(probe)
                if raw_table is None:
                    return False

                # The table name comes from detect_raw_table(), not from user input.
                snp_pages = probe.execute(
                    f"SELECT COUNT(*) FROM {raw_table} WHERE category='snp'"
                ).fetchone()[0]
                genotype_pages = probe.execute(
                    f"SELECT COUNT(*) FROM {raw_table} WHERE category='genotype'"
                ).fetchone()[0]

            reason = "parser version changed" if stale_parser else "one-time"
            print(
                f"Parsing {snp_pages} SNP pages + {genotype_pages} genotype pages"
                f" into structured table ({reason}, ~5 min)...",
                flush=True,
            )
            parsed = parse_raw_pages(str(self._db_path))
            print(f"Parsed {parsed} SNPedia genotype rows.", flush=True)
            return parsed > 0
        except sqlite3.DatabaseError:
            # OperationalError is a DatabaseError subclass, so this covers both.
            return False

    def version(self) -> str | None:
        """Return the version stored for SNPedia in ``database_versions``.

        None when the file is missing, the row is absent or empty, or
        SQLite reports an error.
        """
        if not self._db_path.exists():
            return None
        try:
            with contextlib.closing(sqlite3.connect(self._db_path)) as conn:
                row = conn.execute(
                    "SELECT version FROM database_versions WHERE name = ?",
                    (SNPEDIA_RECORD_NAME,),
                ).fetchone()
        except sqlite3.DatabaseError:
            return None
        return row[0] if row and row[0] else None

    def record_count(self) -> int | None:
        """Return the number of genotype rows in the structured table.

        None when the file is missing or SQLite reports an error.
        """
        if not self._db_path.exists():
            return None
        try:
            with contextlib.closing(sqlite3.connect(self._db_path)) as conn:
                total = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]
        except sqlite3.DatabaseError:
            return None
        return total

    def close(self) -> None:
        """Close the SQLite connection if one is open."""
        if self._conn is None:
            return
        self._conn.close()
        self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Code-driven source — no runtime freshness probe (ADR-0030)."""
        return None

    def cached_remote_signal(self) -> str | None:
        """Code-driven source — no cached signal to compare (ADR-0030)."""
        return None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Return SNPedia annotations matching the user's genotype.

        Only ids of the form ``rs<digits>`` or ``i<digits>`` (matched
        case-insensitively) are looked up. No-calls return nothing, and
        ``rs`` ids that pass the ClinVar hom-ref check are suppressed.
        Alleles are upper-cased and sorted before the lookup. Rows with an
        empty summary, or a summary containing one of the suppression
        substrings, are skipped.
        """
        if variant.is_no_call:
            return []

        snp_id = variant.rsid.lower()
        is_rs = snp_id.startswith("rs")
        if is_rs:
            digits, page_prefix = snp_id[2:], "Rs"
        elif snp_id.startswith("i"):
            digits, page_prefix = snp_id[1:], "I"
        else:
            return []

        if not digits or not digits.isdigit():
            return []

        if is_rs and is_clinvar_homref(variant, self._clinvar_ref_provider):
            return []

        low, high = sorted((variant.allele1.upper(), variant.allele2.upper()))

        matches = (
            self._connection()
            .execute(
                "SELECT allele1, allele2, magnitude, repute, summary, gene "
                "FROM snpedia_genotypes "
                "WHERE rsid = ? AND allele1 = ? AND allele2 = ?",
                (snp_id, low, high),
            )
            .fetchall()
        )

        page_url = f"https://www.snpedia.com/index.php/{page_prefix}{digits}"
        found: list[Annotation] = []
        for row_a1, row_a2, magnitude, repute, summary, gene in matches:
            if not summary:
                continue
            lowered = summary.lower()
            if any(marker in lowered for marker in _SUMMARY_SUPPRESS_SUBSTRINGS):
                continue

            repute_key = (repute or "").strip().lower()
            found.append(
                Annotation(
                    source=self.name,
                    rsid=variant.rsid,
                    significance=f"snpedia_{repute_key}" if repute_key else "snpedia_genotype",
                    category=_REPUTE_CATEGORY.get(repute_key, "trait"),
                    magnitude=0.0 if magnitude is None else magnitude,
                    description=f"SNPedia: {summary}",
                    attribution=self.attribution,
                    genotype_match=f"{row_a1}{row_a2}",
                    references=[page_url],
                    gene=gene or "",
                )
            )

        return found