allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""CADD variant deleteriousness enrichment.
|
|
4
|
+
|
|
5
|
+
CADD is not a clinical annotator — it does not produce Annotation
|
|
6
|
+
objects. It enriches existing annotations with PHRED-scaled
|
|
7
|
+
deleteriousness scores. The pipeline calls ``bulk_lookup()`` after all
|
|
8
|
+
annotators have run, and stamps each annotation's ``cadd_phred`` field.
|
|
9
|
+
|
|
10
|
+
Two modes:
|
|
11
|
+
|
|
12
|
+
* **Cache mode** (default): pre-built SQLite database from HuggingFace
|
|
13
|
+
containing exome-region CADD scores. Fast, compact (~1 GB).
|
|
14
|
+
* **Full mode** (``options.cadd_full = true``): queries the complete
|
|
15
|
+
CADD v1.7 tabix file (``whole_genome_SNVs.tsv.gz``, ~81 GB). Covers
|
|
16
|
+
every scored position in the genome. Requires ``pysam`` and a local
|
|
17
|
+
copy of the tabix file + index. **GRCh38 only.**
|
|
18
|
+
|
|
19
|
+
License: LicenseRef-CADD — free for non-commercial use only. Commercial
|
|
20
|
+
use requires a separate license from University of Washington (CoMotion).
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import logging
|
|
26
|
+
import sqlite3
|
|
27
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
28
|
+
|
|
29
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor
|
|
30
|
+
from allelix.databases._versions import CADD_SCHEMA_VERSION
|
|
31
|
+
from allelix.databases.cadd_loader import (
|
|
32
|
+
CADD_CACHE_URL,
|
|
33
|
+
CADD_DB_FILENAME,
|
|
34
|
+
CADD_EXPECTED_SHA256,
|
|
35
|
+
install_prebuilt_cache,
|
|
36
|
+
)
|
|
37
|
+
from allelix.databases.manager import (
|
|
38
|
+
download,
|
|
39
|
+
get_database_info,
|
|
40
|
+
verify_file_hash,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
|
|
46
|
+
from allelix.models import Annotation, Variant
|
|
47
|
+
|
|
48
|
+
logger = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
# Budget of bound SQL parameters per batched statement. ``bulk_lookup``
# divides this by four because every key binds chrom, pos, ref and alt.
_BULK_BATCH_SIZE = 900

# File names looked up inside the data directory in full (tabix) mode.
CADD_FULL_FILENAME = "whole_genome_SNVs.tsv.gz"
CADD_INDEL_FILENAME = "gnomad.genomes.r4.0.indel.tsv.gz"


class CaddAnnotator(Annotator):
    """PHRED-scaled deleteriousness enrichment from CADD.

    Subclasses Annotator for ``db update`` / ``db status`` / ``is_ready()``
    integration. ``annotate()`` always returns ``[]`` — CADD does not
    participate in the per-variant annotation loop. Scores are obtained
    through ``lookup()`` / ``bulk_lookup()`` instead, backed either by the
    pre-built SQLite cache or, in full mode, by local tabix files.
    """

    name: ClassVar[str] = "cadd"
    display_name: ClassVar[str] = "CADD"
    attribution: ClassVar[str] = "CADD"
    requires_download: ClassVar[bool] = True
    server_driven_freshness: ClassVar[bool] = False
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="LicenseRef-CADD",
        license_url="https://cadd.gs.washington.edu/license",
        attribution_text="CADD scores provided by the University of Washington.",
        source_url="https://cadd.gs.washington.edu/",
        citation="Schubach et al., Nucleic Acids Research 2024",
        commercial_ok=False,
        licensable=True,
        purchase_url="https://els2.comotion.uw.edu/product/cadd-scores",
    )

    def __init__(self, data_dir: Path, *, full_mode: bool = False) -> None:
        """Bind to the data directory.

        When ``full_mode`` is True the annotator queries a local tabix
        file instead of the pre-built SQLite cache. The tabix file must
        be placed at ``<data_dir>/whole_genome_SNVs.tsv.gz`` with its
        ``.tbi`` index alongside it.

        No file or connection is opened here; every backend handle is
        created lazily on first use.
        """
        super().__init__(data_dir)
        self._db_path = data_dir / CADD_DB_FILENAME
        self._conn: sqlite3.Connection | None = None
        self._full_mode = full_mode
        self._tabix_path = data_dir / CADD_FULL_FILENAME
        self._tabix: object | None = None
        self._indel_tabix_path = data_dir / CADD_INDEL_FILENAME
        self._indel_tabix: object | None = None

    def _connection(self) -> sqlite3.Connection:
        """Return the lazily opened SQLite connection to the cache.

        Raises:
            FileNotFoundError: if the cache file does not exist yet.
        """
        if self._conn is None:
            if not self._db_path.exists():
                raise FileNotFoundError(
                    f"CADD cache not found at {self._db_path}. "
                    "Run `allelix db update --cadd` first."
                )
            self._conn = sqlite3.connect(self._db_path)
        return self._conn

    def _open_tabix(self) -> object:
        """Open (once) and return the SNV tabix file for full-mode queries.

        Raises:
            ImportError: if ``pysam`` is not installed.
            FileNotFoundError: if the tabix file is missing.
        """
        if self._tabix is None:
            # pysam is an optional dependency, so it is imported on demand.
            try:
                import pysam  # type: ignore[import-untyped]
            except ImportError:
                raise ImportError(
                    "Full CADD mode requires pysam. Install with: pip install 'allelix[cadd]'"
                ) from None
            if not self._tabix_path.exists():
                raise FileNotFoundError(
                    f"CADD tabix file not found at {self._tabix_path}. "
                    "Download whole_genome_SNVs.tsv.gz and its .tbi index from "
                    "https://cadd.gs.washington.edu/download"
                )
            self._tabix = pysam.TabixFile(str(self._tabix_path))
        return self._tabix

    def _open_indel_tabix(self) -> object | None:
        """Open (once) and return the indel tabix file.

        Unlike the SNV file this one is optional: ``None`` is returned,
        without raising, when the file, its ``.tbi`` index, or ``pysam``
        itself is unavailable.
        """
        if self._indel_tabix is None:
            if not self._indel_tabix_path.exists():
                return None
            tbi = self._indel_tabix_path.parent / (CADD_INDEL_FILENAME + ".tbi")
            if not tbi.exists():
                return None
            try:
                import pysam  # type: ignore[import-untyped]
            except ImportError:
                return None
            self._indel_tabix = pysam.TabixFile(str(self._indel_tabix_path))
        return self._indel_tabix

    def setup(self) -> None:
        """Download the pre-built CADD cache from HuggingFace.

        The archive is staged in the data directory, checked against the
        expected SHA-256, and installed as the SQLite cache. The staged
        archive is removed afterwards whether or not verification and
        installation succeeded, so a rejected download is not left behind.
        A failure to remove it is only logged.
        """
        gz_path = self.data_dir / "cadd.sqlite.gz"
        download(CADD_CACHE_URL, gz_path)
        try:
            verify_file_hash(gz_path, "sha256", CADD_EXPECTED_SHA256)
            install_prebuilt_cache(
                gz_path,
                self._db_path,
                source_url=CADD_CACHE_URL,
            )
        finally:
            try:
                # missing_ok: the archive may already be gone; that is not
                # worth a warning.
                gz_path.unlink(missing_ok=True)
            except OSError:
                logger.warning("Could not remove staged file at %s", gz_path)

    def is_ready(self) -> bool:
        """True when the active backend is available.

        In full mode, checks for the tabix file and its ``.tbi`` index.
        In cache mode, checks the SQLite database record: it must exist
        and its ``local_version_tag`` must either be empty or match the
        current schema version.
        """
        if self._full_mode:
            return (
                self._tabix_path.exists()
                and (self._tabix_path.parent / (CADD_FULL_FILENAME + ".tbi")).exists()
            )
        info = get_database_info(self._db_path, "cadd")
        if info is None:
            return False
        tag = info.get("local_version_tag") or ""
        return tag == f"sv:{CADD_SCHEMA_VERSION}" or not tag

    def version(self) -> str | None:
        """Return the cached database version, or None.

        Full mode reports a fixed label once the tabix files are present.
        """
        if self._full_mode:
            return "v1.7 (full)" if self.is_ready() else None
        info = get_database_info(self._db_path, "cadd")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Return the number of variants in the cache, or None.

        Always None in full mode, where no count is tracked.
        """
        if self._full_mode:
            return None
        info = get_database_info(self._db_path, "cadd")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection and tabix files if open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None
        if self._tabix is not None:
            self._tabix.close()
            self._tabix = None
        if self._indel_tabix is not None:
            self._indel_tabix.close()
            self._indel_tabix = None

    def fetch_remote_signal(self) -> str | None:
        """Code-driven source — no runtime freshness probe (ADR-0030)."""
        return None

    def cached_remote_signal(self) -> str | None:
        """Code-driven source — no cached signal to compare (ADR-0030)."""
        return None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Not used — CADD enriches, does not annotate. Always returns []."""
        return []

    def _tabix_lookup(self, chrom: str, pos: int, ref: str, alt: str) -> float | None:
        """Query the tabix file for a single variant.

        SNVs (single-base ref and alt) query the SNV tabix file.
        Indels route to the indel tabix file if available.

        Returns the value of the sixth tab-separated column of the first
        row whose third and fourth columns equal ``ref`` and ``alt``, or
        None when nothing matches or the query fails.
        """
        # A leading "chr" is stripped before querying the index.
        query_chrom = chrom.removeprefix("chr")
        is_snv = len(ref) == 1 and len(alt) == 1

        tbx = self._open_tabix() if is_snv else self._open_indel_tabix()
        if tbx is None:
            return None

        try:
            # The interval (pos - 1, pos) selects the single 1-based
            # position ``pos``.
            for row in tbx.fetch(query_chrom, pos - 1, pos):
                fields = row.split("\t")
                if len(fields) >= 6 and fields[2] == ref and fields[3] == alt:
                    return float(fields[5])
        except (ValueError, KeyError) as exc:
            # Best effort: a failed query or an unparsable score counts
            # as "no score" — presumably this includes contigs absent
            # from the index (TODO confirm against pysam).
            logger.debug("CADD tabix lookup failed for %s:%s: %s", chrom, pos, exc)
        return None

    def lookup(self, chrom: str, pos: int, ref: str, alt: str) -> float | None:
        """Return CADD PHRED score for a single variant, or None."""
        if self._full_mode:
            return self._tabix_lookup(chrom, pos, ref, alt)
        conn = self._connection()
        row = conn.execute(
            "SELECT phred FROM cadd_scores WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ?",
            (chrom, pos, ref, alt),
        ).fetchone()
        return row[0] if row else None

    def bulk_lookup(
        self, keys: set[tuple[str, int, str, str]]
    ) -> dict[tuple[str, int, str, str], float]:
        """Return ``{(chrom, pos, ref, alt): phred}`` for all matches.

        In cache mode, batches SQL queries. In full mode, iterates tabix
        lookups (I/O bound on the tabix index, not CPU). Keys without a
        score are simply absent from the result.
        """
        if not keys:
            return {}
        if self._full_mode:
            return self._tabix_bulk_lookup(keys)
        conn = self._connection()
        result: dict[tuple[str, int, str, str], float] = {}
        key_list = list(keys)
        # Four bound parameters per key, so a quarter of the budget.
        batch_size = _BULK_BATCH_SIZE // 4
        for start in range(0, len(key_list), batch_size):
            batch = key_list[start : start + batch_size]
            clauses = " OR ".join(["(chrom = ? AND pos = ? AND ref = ? AND alt = ?)"] * len(batch))
            # Flatten the keys in order so values line up with the "?" marks.
            params: list[str | int] = [value for key in batch for value in key]
            rows = conn.execute(
                f"SELECT chrom, pos, ref, alt, phred FROM cadd_scores WHERE {clauses}",
                params,
            ).fetchall()
            for chrom, pos, ref, alt, phred in rows:
                result[(chrom, pos, ref, alt)] = phred
        return result

    def _tabix_bulk_lookup(
        self, keys: set[tuple[str, int, str, str]]
    ) -> dict[tuple[str, int, str, str], float]:
        """Batch tabix lookups for full mode (one query per key)."""
        result: dict[tuple[str, int, str, str], float] = {}
        for chrom, pos, ref, alt in keys:
            score = self._tabix_lookup(chrom, pos, ref, alt)
            if score is not None:
                result[(chrom, pos, ref, alt)] = score
        return result
|
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""ClinVar annotator. Source-attributed pathogenicity calls (ADR-0003).
|
|
4
|
+
|
|
5
|
+
ADR-0021: per-build SQLite caches. ClinVar publishes separate VCFs for
|
|
6
|
+
GRCh37 and GRCh38, and the strand orientation of REF/ALT can invert
|
|
7
|
+
between builds for the ~0.4% of the genome where the reference
|
|
8
|
+
assembly was rebuilt. Carrier-rule matches (ADR-0007) MUST be done
|
|
9
|
+
against the same build the user's data is on. The annotator holds one
|
|
10
|
+
SQLite cache per build (`clinvar.GRCh37.sqlite`, `clinvar.GRCh38.sqlite`)
|
|
11
|
+
and dispatches per-variant by `variant.build`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import sqlite3
|
|
18
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
19
|
+
|
|
20
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor
|
|
21
|
+
from allelix.databases import manager as _manager_module
|
|
22
|
+
from allelix.databases._versions import CLINVAR_INTERPRETER_VERSION
|
|
23
|
+
from allelix.databases.manager import (
|
|
24
|
+
download,
|
|
25
|
+
fetch_remote_text,
|
|
26
|
+
get_database_info,
|
|
27
|
+
load_clinvar_vcf,
|
|
28
|
+
stamp_existing_clinvar_cache,
|
|
29
|
+
verify_file_hash,
|
|
30
|
+
)
|
|
31
|
+
from allelix.models import Annotation
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
|
|
36
|
+
from allelix.models import Variant
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
# Genome builds for which a per-build ClinVar cache can be managed.
CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")


def clinvar_db_filename(build: str) -> str:
    """Return the SQLite cache filename for ``build``.

    Each build gets its own file, so several caches can coexist in one
    data directory.
    """
    return "clinvar.{}.sqlite".format(build)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def clinvar_record_name(build: str) -> str:
    """Return the `database_versions` row name used for ``build``."""
    return "clinvar.{}".format(build)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Allelix-derived magnitude scoring from ClinVar's CLNSIG. See ADR-0008.
# Keys are labels in the form produced by ``_normalize_clnsig``.
_CLNSIG_MAGNITUDE: dict[str, float] = {
    "pathogenic": 9.0,
    "pathogenic/likely_pathogenic": 8.5,
    "likely_pathogenic": 7.0,
    "drug_response": 6.5,
    "risk_factor": 6.0,
    "uncertain_significance": 4.0,
    "conflicting_interpretations_of_pathogenicity": 4.0,
    "conflicting_classifications_of_pathogenicity": 4.0,
    "not_provided": 2.0,
    "no_classification_for_the_single_variant": 2.0,
    "likely_benign": 2.0,
    "benign/likely_benign": 1.5,
    "benign": 1.0,
}


# Normalized labels that count as benign.
_BENIGN_CLNSIGS = frozenset({"benign", "likely_benign", "benign/likely_benign"})


def _normalize_clnsig(value: str) -> str:
    """Canonicalize a CLNSIG label: trimmed, lower-cased, spaces -> underscores."""
    return value.strip().lower().replace(" ", "_")


def _magnitude(clnsig: str | None) -> float:
    """Return the magnitude score for a ClinVar significance label.

    Labels are normalized before the table lookup. A label that is not
    in the table scores 5.0. A missing label (``None``) is treated like
    an empty one and also scores 5.0 instead of raising.
    """
    if not clnsig:
        return 5.0
    return _CLNSIG_MAGNITUDE.get(_normalize_clnsig(clnsig), 5.0)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _vcf_filename_for_url(url: str) -> str:
    """Choose the local staging filename for a ClinVar VCF download.

    URLs ending in ``.gz`` keep the ``.gz`` suffix; anything else is
    staged as a plain ``.vcf``.
    """
    if url.endswith(".gz"):
        return "clinvar.vcf.gz"
    return "clinvar.vcf"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class ClinVarAnnotator(Annotator):
    """Annotates variants with ClinVar's clinical significance classifications.

    Per-build aware (ADR-0021). At `setup()` time, downloads each
    requested build's VCF (default: both). At `annotate()` time,
    dispatches to the cache matching `variant.build`. If the matching
    cache is missing, the variant yields no annotations (db update
    needed).
    """

    name: ClassVar[str] = "clinvar"
    display_name: ClassVar[str] = "ClinVar"
    attribution: ClassVar[str] = "ClinVar"
    requires_download: ClassVar[bool] = True
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="custom-clinvar",
        license_url="https://www.ncbi.nlm.nih.gov/clinvar/docs/maintenance_use/",
        attribution_text="ClinVar variant classifications from NCBI.",
        source_url="https://www.ncbi.nlm.nih.gov/clinvar/",
        commercial_ok=True,
    )

    def __init__(
        self,
        data_dir: Path,
        builds: tuple[str, ...] = CLINVAR_SUPPORTED_BUILDS,
        *,
        include_benign: bool = False,
    ) -> None:
        """Resolve per-build SQLite cache paths within `data_dir`.

        `builds` selects which builds this annotator instance manages.
        Default is both GRCh37 and GRCh38. Passing a single-element
        tuple (e.g. `("GRCh38",)`) restricts setup/refresh to that
        build — used by the CLI's `--build` flag.

        `include_benign` controls whether Benign/Likely_benign annotations
        are emitted. Default False suppresses them (ADR-0008 amendment).

        Raises:
            ValueError: if `builds` names a build outside
                `CLINVAR_SUPPORTED_BUILDS`.
        """
        super().__init__(data_dir)
        self._builds = tuple(builds)
        self._include_benign = include_benign
        for build in self._builds:
            if build not in CLINVAR_SUPPORTED_BUILDS:
                raise ValueError(
                    f"Unsupported ClinVar build {build!r}; expected one of "
                    f"{CLINVAR_SUPPORTED_BUILDS}"
                )
        self._db_paths: dict[str, Path] = {
            build: data_dir / clinvar_db_filename(build) for build in self._builds
        }
        # Connections are opened lazily, one per build, by _connection().
        self._conns: dict[str, sqlite3.Connection] = {}
        # ADR-0023: per-build (rsid -> single-base REF) cache. PharmGKB
        # consults this as its primary non-finding filter. Built lazily
        # on first lookup per build.
        self._ref_lookups: dict[str, dict[str, str]] = {}

    def _connection(self, build: str) -> sqlite3.Connection | None:
        """Return a lazy connection to the per-build cache, or None if missing.

        None is returned both for builds this instance does not manage
        and for managed builds whose cache file does not exist.
        """
        if build not in self._db_paths:
            return None
        if build not in self._conns:
            db_path = self._db_paths[build]
            if not db_path.exists():
                return None
            self._conns[build] = sqlite3.connect(db_path)
        return self._conns[build]

    def setup(self) -> None:
        """Download each managed build's ClinVar VCF and ingest atomically."""
        for build in self._builds:
            self._setup_one(build)

    def _setup_one(self, build: str) -> None:
        """Refresh the cache for a single build.

        The remote MD5 signal is fetched first; without it the refresh is
        aborted. The VCF is then downloaded, verified against that MD5,
        and loaded into the build's cache. The staged VCF is removed
        afterwards whether or not verification and loading succeeded.

        Raises:
            RuntimeError: if the remote freshness signal cannot be read.
        """
        url = _manager_module.CLINVAR_URL_BY_BUILD[build]
        signal = self._fetch_remote_signal_for(build)
        if signal is None:
            msg = (
                f"clinvar ({build}): cannot verify remote freshness signal. "
                "Refresh aborted to avoid persisting an incomplete cache stamp. "
                "Retry, or pass --force if you accept that next `db update` "
                "will re-download to re-establish the signal."
            )
            raise RuntimeError(msg)
        vcf_path = self.data_dir / _vcf_filename_for_url(url)
        download(url, vcf_path)
        try:
            # The signal has the form "md5:<hex>"; the hash check wants the hex.
            verify_file_hash(vcf_path, "md5", signal.removeprefix("md5:"))
            load_clinvar_vcf(
                vcf_path,
                self._db_paths[build],
                source_url=url,
                remote_signal=signal,
                record_name=clinvar_record_name(build),
            )
        finally:
            try:
                vcf_path.unlink()
            except FileNotFoundError:
                # Already gone; nothing to clean up.
                pass
            except OSError:
                logger.warning("Could not remove staged VCF at %s", vcf_path)

    def is_ready(self) -> bool:
        """True iff EVERY managed build has a populated, version-stamped cache.

        Checks ``local_version_tag`` for the current interpreter version.
        Pre-mechanism caches (tag missing or baked into ``remote_signal``)
        are self-healed once via ``stamp_existing_clinvar_cache``.
        """
        for build in self._builds:
            info = get_database_info(self._db_paths[build], clinvar_record_name(build))
            if info is None:
                return False
            tag = info.get("local_version_tag") or ""
            if tag == f"iv:{CLINVAR_INTERPRETER_VERSION}":
                continue
            # Tag absent or different: try to stamp the existing cache;
            # the build is not ready if that does not succeed.
            if stamp_existing_clinvar_cache(self._db_paths[build]):
                continue
            return False
        return True

    def version(self) -> str | None:
        """Composite version string across managed builds.

        Format: `"GRCh37:<v>; GRCh38:<v>"` when both present, or a
        single `<build>:<v>` when only one is managed. None if none.
        """
        parts: list[str] = []
        for build in self._builds:
            info = get_database_info(self._db_paths[build], clinvar_record_name(build))
            if info is not None:
                parts.append(f"{build}:{info['version']}")
        return "; ".join(parts) if parts else None

    def record_count(self) -> int | None:
        """Total record count across managed build caches, or None if none cached."""
        total = 0
        any_present = False
        for build in self._builds:
            info = get_database_info(self._db_paths[build], clinvar_record_name(build))
            if info is not None:
                any_present = True
                total += info["record_count"]
        return total if any_present else None

    def close(self) -> None:
        """Close all open per-build connections. Safe to call repeatedly.

        The in-memory REF lookups are dropped as well.
        """
        for conn in self._conns.values():
            conn.close()
        self._conns.clear()
        self._ref_lookups.clear()

    def reference_for(self, rsid: str, build: str) -> str | None:
        """Return ClinVar's single-base REF allele for `rsid` in `build`, or None.

        ADR-0023: PharmGKB's primary non-finding filter calls this. If the
        return value matches both of the user's alleles, the user is
        homozygous reference and the PharmGKB annotation is a non-finding.

        Lazily builds an in-memory `(rsid -> REF)` map per build on first
        call so subsequent lookups are O(1). Multi-base REFs (indels) are
        skipped — array-based parsers can't call indels, so a multi-base
        REF can't validly suppress a single-base genotype.

        Returns None when ClinVar has no data for the rsid in this build
        (or has only indel REFs). Callers fall through to secondary tiers.
        """
        if build not in self._db_paths:
            return None
        if build not in self._ref_lookups:
            self._ref_lookups[build] = self._load_ref_lookup(build)
        return self._ref_lookups[build].get(rsid)

    def _load_ref_lookup(self, build: str) -> dict[str, str]:
        """Read the per-build cache once and build the `(rsid -> REF)` map.

        Returns an empty map when the build's cache is unavailable.
        """
        conn = self._connection(build)
        if conn is None:
            return {}
        # Single-base REFs only: indel anchor-base encoding (REF=CTT, etc.)
        # can't suppress a single-base array readout. The per-build cache
        # may have BOTH SNV and indel rows for the same rsid; the WHERE
        # filters those out so we keep only the SNV REF.
        rows = conn.execute(
            "SELECT DISTINCT rsid, ref FROM clinvar_variants WHERE length(ref) = 1"
        ).fetchall()
        out: dict[str, str] = {}
        for rsid, ref in rows:
            # If a rsid has multiple single-base REFs (shouldn't happen at
            # one position but defending against future data shapes), keep
            # the first.
            out.setdefault(rsid, ref)
        return out

    def fetch_remote_signal(self) -> str | None:
        """Composite freshness signal across managed builds.

        Format: `"GRCh37:md5:<hex>|GRCh38:md5:<hex>"`. Returns None if
        ANY managed build's signal probe fails — the CLI then prints
        "can't verify" and skips refresh per ADR-0012's policy.
        """
        parts: list[str] = []
        for build in self._builds:
            sig = self._fetch_remote_signal_for(build)
            if sig is None:
                return None
            parts.append(f"{build}:{sig}")
        return "|".join(parts) if parts else None

    @staticmethod
    def _fetch_remote_signal_for(build: str) -> str | None:
        """Fetch the `.md5` sidecar for `build`'s VCF URL.

        Returns `"md5:<token>"` where `<token>` is the first
        whitespace-delimited token of the response body, or None when the
        body is empty or blank.
        """
        body = fetch_remote_text(_manager_module.CLINVAR_URL_BY_BUILD[build] + ".md5")
        if not body:
            return None
        tokens = body.split()
        if not tokens:
            return None
        return f"md5:{tokens[0]}"

    def cached_remote_signal(self) -> str | None:
        """Composite cached signal across managed builds. None if any missing."""
        parts: list[str] = []
        for build in self._builds:
            info = get_database_info(self._db_paths[build], clinvar_record_name(build))
            sig = info["remote_signal"] if info is not None else None
            # A None or empty signal for any build invalidates the composite.
            if not sig:
                return None
            parts.append(f"{build}:{sig}")
        return "|".join(parts) if parts else None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Return ClinVar annotations whose REF/ALT matches the user's genotype.

        ADR-0007 carrier rule: an entry triggers only if `variant.allele1`
        or `variant.allele2` equals the entry's ALT allele.
        ADR-0011 indel-anchor protection: array-based parsers report
        single-base genotypes; ClinVar's anchor-base indel encoding
        does not match those by string equality.
        ADR-0021: dispatch by `variant.build`. If the matching cache is
        absent, the variant is skipped silently — the user already saw
        the analyze-time build warning.

        No-calls produce no annotations. Unless `include_benign` was set,
        benign-class entries are dropped. An entry without a clinical
        significance is reported as `clinvar_unknown` and scored like an
        empty label.
        """
        if variant.is_no_call:
            return []
        conn = self._connection(variant.build)
        if conn is None:
            return []
        rows = conn.execute(
            "SELECT chromosome, position, ref, alt, clinical_significance, "
            "condition, gene, review_status, allele_id "
            "FROM clinvar_variants WHERE rsid = ?",
            (variant.rsid,),
        ).fetchall()
        annotations: list[Annotation] = []
        carrier_alleles = {variant.allele1, variant.allele2}
        user_is_multibase = len(variant.allele1) > 1 or len(variant.allele2) > 1
        # ADR-0023: report the user's actual diploid call consistently
        # across annotators, not the matched ALT base alone.
        user_diploid = _user_diploid(variant)
        for row in rows:
            (
                _chrom,
                _pos,
                ref,
                alt,
                clnsig,
                condition,
                gene,
                review_status,
                allele_id,
            ) = row
            clinvar_is_indel = len(ref) > 1 or len(alt) > 1
            if clinvar_is_indel and not user_is_multibase:
                continue
            if alt not in carrier_alleles:
                continue
            sig_label = _normalize_clnsig(clnsig) if clnsig else "unknown"
            if not self._include_benign and sig_label in _BENIGN_CLNSIGS:
                continue
            description = (
                f"ClinVar classifies this allele as "
                f"{clnsig.replace('_', ' ') if clnsig else 'unknown significance'}"
            )
            references = [f"clinvar:allele/{allele_id}"] if allele_id else []
            annotations.append(
                Annotation(
                    source=self.name,
                    rsid=variant.rsid,
                    significance=f"clinvar_{sig_label}",
                    category="clinical",
                    # A missing significance is scored like an empty
                    # label instead of being passed on as None.
                    magnitude=_magnitude(clnsig or ""),
                    description=description,
                    attribution=self.attribution,
                    genotype_match=user_diploid,
                    references=references,
                    # "." is treated as "no condition recorded".
                    condition="" if not condition or condition == "." else condition,
                    gene=gene or "",
                    review_status=review_status or "",
                    alt=alt,
                )
            )
        return annotations
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _user_diploid(variant: Variant) -> str:
    """Format the user's two alleles as a single genotype string.

    Shared by ClinVar and PharmGKB so the report's "Genotype" column has
    one shape for every annotation, whatever its source (ADR-0023).
    When both alleles are single bases they are sorted and concatenated:
    `("G", "A") -> "AG"`. Otherwise the alleles are passed through
    unchanged, in their original order, joined by a slash.
    """
    first, second = variant.allele1, variant.allele2
    if len(first) != 1 or len(second) != 1:
        return f"{first}/{second}"
    low, high = sorted((first, second))
    return low + high
|