allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,283 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """CADD variant deleteriousness enrichment.
4
+
5
+ CADD is not a clinical annotator — it does not produce Annotation
6
+ objects. It enriches existing annotations with PHRED-scaled
7
+ deleteriousness scores. The pipeline calls ``bulk_lookup()`` after all
8
+ annotators have run, and stamps each annotation's ``cadd_phred`` field.
9
+
10
+ Two modes:
11
+
12
+ * **Cache mode** (default): pre-built SQLite database from HuggingFace
13
+ containing exome-region CADD scores. Fast, compact (~1 GB).
14
+ * **Full mode** (``options.cadd_full = true``): queries the complete
15
+ CADD v1.7 tabix file (``whole_genome_SNVs.tsv.gz``, ~81 GB). Covers
16
+ every scored position in the genome. Requires ``pysam`` and a local
17
+ copy of the tabix file + index. **GRCh38 only.**
18
+
19
+ License: LicenseRef-CADD — free for non-commercial use only. Commercial
20
+ use requires a separate license from University of Washington (CoMotion).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ import sqlite3
27
+ from typing import TYPE_CHECKING, ClassVar
28
+
29
+ from allelix.annotators.base import Annotator, LicenseDescriptor
30
+ from allelix.databases._versions import CADD_SCHEMA_VERSION
31
+ from allelix.databases.cadd_loader import (
32
+ CADD_CACHE_URL,
33
+ CADD_DB_FILENAME,
34
+ CADD_EXPECTED_SHA256,
35
+ install_prebuilt_cache,
36
+ )
37
+ from allelix.databases.manager import (
38
+ download,
39
+ get_database_info,
40
+ verify_file_hash,
41
+ )
42
+
43
+ if TYPE_CHECKING:
44
+ from pathlib import Path
45
+
46
+ from allelix.models import Annotation, Variant
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
# Budget of bound SQL parameters per statement. ``bulk_lookup`` divides this
# by four (one placeholder per key column) to get the number of keys per query.
_BULK_BATCH_SIZE = 900

# File names resolved inside the data directory for full (tabix) mode:
# the genome-wide SNV scores and the optional indel scores.
CADD_FULL_FILENAME = "whole_genome_SNVs.tsv.gz"
CADD_INDEL_FILENAME = "gnomad.genomes.r4.0.indel.tsv.gz"
54
+
55
+
56
+ class CaddAnnotator(Annotator):
57
+ """PHRED-scaled deleteriousness enrichment from CADD.
58
+
59
+ Subclasses Annotator for ``db update`` / ``db status`` / ``is_ready()``
60
+ integration. ``annotate()`` always returns ``[]`` — CADD does not
61
+ participate in the per-variant annotation loop.
62
+ """
63
+
64
+ name: ClassVar[str] = "cadd"
65
+ display_name: ClassVar[str] = "CADD"
66
+ attribution: ClassVar[str] = "CADD"
67
+ requires_download: ClassVar[bool] = True
68
+ server_driven_freshness: ClassVar[bool] = False
69
+ license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
70
+ spdx="LicenseRef-CADD",
71
+ license_url="https://cadd.gs.washington.edu/license",
72
+ attribution_text="CADD scores provided by the University of Washington.",
73
+ source_url="https://cadd.gs.washington.edu/",
74
+ citation="Schubach et al., Nucleic Acids Research 2024",
75
+ commercial_ok=False,
76
+ licensable=True,
77
+ purchase_url="https://els2.comotion.uw.edu/product/cadd-scores",
78
+ )
79
+
80
def __init__(self, data_dir: Path, *, full_mode: bool = False) -> None:
    """Record the backend mode and derive every file location from ``data_dir``.

    Nothing is opened here: the SQLite connection and both tabix handles
    start out as ``None`` and are created by the ``_connection`` /
    ``_open_*`` helpers on first use.

    Args:
        data_dir: Directory that holds the SQLite cache and, for full
            mode, ``whole_genome_SNVs.tsv.gz`` (plus the optional indel
            file) together with their ``.tbi`` indexes.
        full_mode: When True, lookups query the local tabix file instead
            of the pre-built SQLite cache.
    """
    super().__init__(data_dir)
    self._full_mode = full_mode

    # Cache-mode backend (SQLite).
    self._db_path = data_dir / CADD_DB_FILENAME
    self._conn: sqlite3.Connection | None = None

    # Full-mode backends (tabix): SNV file and optional indel file.
    self._tabix_path = data_dir / CADD_FULL_FILENAME
    self._indel_tabix_path = data_dir / CADD_INDEL_FILENAME
    self._tabix: object | None = None
    self._indel_tabix: object | None = None
96
+
97
def _connection(self) -> sqlite3.Connection:
    """Return the SQLite connection to the CADD cache, opening it on first use.

    Raises:
        FileNotFoundError: The cache file does not exist yet.
    """
    if self._conn is not None:
        return self._conn
    if not self._db_path.exists():
        message = (
            f"CADD cache not found at {self._db_path}. "
            "Run `allelix db update --cadd` first."
        )
        raise FileNotFoundError(message)
    self._conn = sqlite3.connect(self._db_path)
    return self._conn
106
+
107
def _open_tabix(self) -> object:
    """Return the SNV tabix handle for full-mode queries, opening it on first use.

    Raises:
        ImportError: ``pysam`` is not installed.
        FileNotFoundError: The tabix data file is not present.
    """
    if self._tabix is not None:
        return self._tabix

    # pysam is an optional dependency, so it is imported only when needed.
    try:
        import pysam  # type: ignore[import-untyped]
    except ImportError:
        raise ImportError(
            "Full CADD mode requires pysam. Install with: pip install 'allelix[cadd]'"
        ) from None

    if not self._tabix_path.exists():
        message = (
            f"CADD tabix file not found at {self._tabix_path}. "
            "Download whole_genome_SNVs.tsv.gz and its .tbi index from "
            "https://cadd.gs.washington.edu/download"
        )
        raise FileNotFoundError(message)

    self._tabix = pysam.TabixFile(str(self._tabix_path))
    return self._tabix
124
+
125
def _open_indel_tabix(self) -> object | None:
    """Return the indel tabix handle, or None when indel scoring is unavailable.

    Unlike the SNV file, the indel file is optional: a missing data file,
    a missing ``.tbi`` index, or a missing ``pysam`` install all yield
    ``None`` instead of raising. Only a successfully opened handle is
    cached on the instance.
    """
    if self._indel_tabix is not None:
        return self._indel_tabix

    data_file = self._indel_tabix_path
    if not data_file.exists():
        return None
    index_file = data_file.parent / f"{CADD_INDEL_FILENAME}.tbi"
    if not index_file.exists():
        return None

    try:
        import pysam  # type: ignore[import-untyped]
    except ImportError:
        return None

    self._indel_tabix = pysam.TabixFile(str(data_file))
    return self._indel_tabix
139
+
140
def setup(self) -> None:
    """Download, verify and install the pre-built CADD cache from HuggingFace.

    The compressed cache is staged as ``cadd.sqlite.gz`` in the data
    directory, checked against ``CADD_EXPECTED_SHA256`` and handed to
    ``install_prebuilt_cache``. The staged archive is removed afterwards
    whether or not verification/installation succeeded, so a corrupt
    download is never left behind (same cleanup policy as the ClinVar
    annotator's staged VCF).

    Exceptions raised by the download, hash check or install propagate
    to the caller unchanged.
    """
    gz_path = self.data_dir / "cadd.sqlite.gz"
    download(CADD_CACHE_URL, gz_path)
    try:
        verify_file_hash(gz_path, "sha256", CADD_EXPECTED_SHA256)
        install_prebuilt_cache(
            gz_path,
            self._db_path,
            source_url=CADD_CACHE_URL,
        )
    finally:
        try:
            gz_path.unlink()
        except FileNotFoundError:
            # Already gone: nothing to clean up, and not worth a warning.
            pass
        except OSError:
            logger.warning("Could not remove staged file at %s", gz_path)
154
+
155
def is_ready(self) -> bool:
    """Report whether the backend selected at construction can serve lookups.

    Full mode: both the tabix data file and its ``.tbi`` index must exist.
    Cache mode: the SQLite cache must have a ``cadd`` database record
    whose ``local_version_tag`` is either empty/missing or equal to the
    current ``sv:<CADD_SCHEMA_VERSION>`` tag.
    """
    if self._full_mode:
        if not self._tabix_path.exists():
            return False
        index_path = self._tabix_path.parent / f"{CADD_FULL_FILENAME}.tbi"
        return index_path.exists()

    info = get_database_info(self._db_path, "cadd")
    if info is None:
        return False
    tag = info.get("local_version_tag") or ""
    # An untagged cache is accepted; a tagged one must match this schema version.
    return not tag or tag == f"sv:{CADD_SCHEMA_VERSION}"
171
+
172
def version(self) -> str | None:
    """Return a version label for the active backend, or None if unavailable.

    Full mode reports the fixed label ``"v1.7 (full)"`` once the tabix
    files are in place; cache mode reports the ``version`` field of the
    cache's ``cadd`` database record.
    """
    if self._full_mode:
        if self.is_ready():
            return "v1.7 (full)"
        return None
    info = get_database_info(self._db_path, "cadd")
    if not info:
        return None
    return info["version"]
178
+
179
def record_count(self) -> int | None:
    """Return the cache's recorded variant count, or None when unknown.

    Full mode never reports a count; cache mode returns the
    ``record_count`` field of the ``cadd`` database record, if any.
    """
    if self._full_mode:
        return None
    info = get_database_info(self._db_path, "cadd")
    if not info:
        return None
    return info["record_count"]
185
+
186
def close(self) -> None:
    """Close whichever backends are open and reset them to None.

    Handles are visited in a fixed order (SQLite connection, SNV tabix,
    indel tabix); ones that were never opened are skipped, so calling
    this repeatedly is harmless.
    """
    for attr in ("_conn", "_tabix", "_indel_tabix"):
        handle = getattr(self, attr)
        if handle is None:
            continue
        handle.close()
        setattr(self, attr, None)
197
+
198
def fetch_remote_signal(self) -> str | None:
    """Return None: CADD is a code-driven source with no runtime freshness probe (ADR-0030)."""
    return None
201
+
202
def cached_remote_signal(self) -> str | None:
    """Return None: CADD is a code-driven source, so no cached signal exists to compare (ADR-0030)."""
    return None
205
+
206
def annotate(self, variant: Variant) -> list[Annotation]:
    """Return an empty list for every variant.

    CADD enriches annotations produced by other sources rather than
    emitting its own, so ``variant`` is ignored.
    """
    return []
209
+
210
def _tabix_lookup(self, chrom: str, pos: int, ref: str, alt: str) -> float | None:
    """Look one variant up in the tabix files and return its PHRED score.

    A variant whose ``ref`` and ``alt`` are both one base long is looked
    up in the SNV file; anything else goes to the indel file, which may
    be unavailable (then the result is None). A leading ``chr`` on the
    chromosome name is dropped before querying. The score is read from
    the sixth tab-separated column of the first row whose third and
    fourth columns equal ``ref`` and ``alt``.

    Returns:
        The score as a float, or None when nothing matches or the query
        raises ``ValueError``/``KeyError``.
    """
    contig = chrom.removeprefix("chr")

    if len(ref) == 1 and len(alt) == 1:
        handle = self._open_tabix()
    else:
        handle = self._open_indel_tabix()
    if handle is None:
        return None

    try:
        # The queried interval is (pos - 1, pos), i.e. exactly one position.
        for line in handle.fetch(contig, pos - 1, pos):
            columns = line.split("\t")
            if len(columns) < 6:
                continue
            if columns[2] == ref and columns[3] == alt:
                return float(columns[5])
    except (ValueError, KeyError):
        # Best effort: a failed query or unparsable score counts as "no score".
        return None
    return None
232
+
233
def lookup(self, chrom: str, pos: int, ref: str, alt: str) -> float | None:
    """Return the CADD PHRED score for one variant, or None when it is not scored.

    Full mode delegates to the tabix lookup; cache mode runs a single
    parameterized query against the ``cadd_scores`` table.
    """
    if self._full_mode:
        return self._tabix_lookup(chrom, pos, ref, alt)

    cursor = self._connection().execute(
        "SELECT phred FROM cadd_scores WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ?",
        (chrom, pos, ref, alt),
    )
    hit = cursor.fetchone()
    if not hit:
        return None
    return hit[0]
243
+
244
def bulk_lookup(
    self, keys: set[tuple[str, int, str, str]]
) -> dict[tuple[str, int, str, str], float]:
    """Map every scored ``(chrom, pos, ref, alt)`` key to its PHRED score.

    Keys without a score are simply absent from the result. Full mode
    hands the whole set to the tabix lookup loop. Cache mode queries
    SQLite in batches: each key contributes four bound parameters, so a
    batch holds ``_BULK_BATCH_SIZE // 4`` keys.
    """
    if not keys:
        return {}
    if self._full_mode:
        return self._tabix_bulk_lookup(keys)

    conn = self._connection()
    one_key = "(chrom = ? AND pos = ? AND ref = ? AND alt = ?)"
    keys_per_query = _BULK_BATCH_SIZE // 4
    pending = list(keys)
    found: dict[tuple[str, int, str, str], float] = {}

    for start in range(0, len(pending), keys_per_query):
        chunk = pending[start : start + keys_per_query]
        where = " OR ".join([one_key] * len(chunk))
        # Flatten the chunk into one parameter list, four values per key.
        params: list[str | int] = [
            value
            for chrom, pos, ref, alt in chunk
            for value in (chrom, pos, ref, alt)
        ]
        cursor = conn.execute(
            f"SELECT chrom, pos, ref, alt, phred FROM cadd_scores WHERE {where}",
            params,
        )
        for chrom, pos, ref, alt, phred in cursor.fetchall():
            found[(chrom, pos, ref, alt)] = phred
    return found
273
+
274
def _tabix_bulk_lookup(
    self, keys: set[tuple[str, int, str, str]]
) -> dict[tuple[str, int, str, str], float]:
    """Run one tabix lookup per key and keep only the keys that have a score."""
    return {
        (chrom, pos, ref, alt): score
        for chrom, pos, ref, alt in keys
        if (score := self._tabix_lookup(chrom, pos, ref, alt)) is not None
    }
@@ -0,0 +1,404 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """ClinVar annotator. Source-attributed pathogenicity calls (ADR-0003).
4
+
5
+ ADR-0021: per-build SQLite caches. ClinVar publishes separate VCFs for
6
+ GRCh37 and GRCh38, and the strand orientation of REF/ALT can invert
7
+ between builds for the ~0.4% of the genome where the reference
8
+ assembly was rebuilt. Carrier-rule matches (ADR-0007) MUST be done
9
+ against the same build the user's data is on. The annotator holds one
10
+ SQLite cache per build (`clinvar.GRCh37.sqlite`, `clinvar.GRCh38.sqlite`)
11
+ and dispatches per-variant by `variant.build`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ import sqlite3
18
+ from typing import TYPE_CHECKING, ClassVar
19
+
20
+ from allelix.annotators.base import Annotator, LicenseDescriptor
21
+ from allelix.databases import manager as _manager_module
22
+ from allelix.databases._versions import CLINVAR_INTERPRETER_VERSION
23
+ from allelix.databases.manager import (
24
+ download,
25
+ fetch_remote_text,
26
+ get_database_info,
27
+ load_clinvar_vcf,
28
+ stamp_existing_clinvar_cache,
29
+ verify_file_hash,
30
+ )
31
+ from allelix.models import Annotation
32
+
33
+ if TYPE_CHECKING:
34
+ from pathlib import Path
35
+
36
+ from allelix.models import Variant
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
41
+
42
+
43
def clinvar_db_filename(build: str) -> str:
    """Return the SQLite cache file name for ``build``.

    Each build gets its own file, so caches for different builds can
    coexist in one data directory.
    """
    return "clinvar.{}.sqlite".format(build)
46
+
47
+
48
def clinvar_record_name(build: str) -> str:
    """Return the `database_versions` row name used for ``build``."""
    return "clinvar.{}".format(build)
51
+
52
+
53
# Allelix-derived magnitude scoring from ClinVar's CLNSIG. See ADR-0008.
# Keys are CLNSIG labels in the form produced by ``_normalize_clnsig``
# (lower-case, underscores instead of spaces).
_CLNSIG_MAGNITUDE: dict[str, float] = {
    "pathogenic": 9.0,
    "pathogenic/likely_pathogenic": 8.5,
    "likely_pathogenic": 7.0,
    "drug_response": 6.5,
    "risk_factor": 6.0,
    "uncertain_significance": 4.0,
    "conflicting_interpretations_of_pathogenicity": 4.0,
    "conflicting_classifications_of_pathogenicity": 4.0,
    "not_provided": 2.0,
    "no_classification_for_the_single_variant": 2.0,
    "likely_benign": 2.0,
    "benign/likely_benign": 1.5,
    "benign": 1.0,
}

# Magnitude used when a CLNSIG label is missing or not listed above.
_DEFAULT_MAGNITUDE = 5.0


# Normalized labels treated as benign.
_BENIGN_CLNSIGS = frozenset({"benign", "likely_benign", "benign/likely_benign"})


def _normalize_clnsig(value: str | None) -> str:
    """Return ``value`` trimmed, lower-cased and with spaces turned into underscores.

    A missing (None) or empty value normalizes to the empty string
    instead of raising, so callers can pass a nullable database column
    straight through.
    """
    if not value:
        return ""
    return value.strip().lower().replace(" ", "_")


def _magnitude(clnsig: str | None) -> float:
    """Return the magnitude score for a raw CLNSIG value.

    The value is normalized and looked up in ``_CLNSIG_MAGNITUDE``;
    unlisted labels, compound labels, empty strings and None all fall
    back to ``_DEFAULT_MAGNITUDE``.
    """
    return _CLNSIG_MAGNITUDE.get(_normalize_clnsig(clnsig), _DEFAULT_MAGNITUDE)
80
+
81
+
82
def _vcf_filename_for_url(url: str) -> str:
    """Return the local staging file name matching the URL's compression.

    URLs ending in ``.gz`` are staged as ``clinvar.vcf.gz``; everything
    else is staged as plain ``clinvar.vcf``.
    """
    if url.endswith(".gz"):
        return "clinvar.vcf.gz"
    return "clinvar.vcf"
85
+
86
+
87
+ class ClinVarAnnotator(Annotator):
88
+ """Annotates variants with ClinVar's clinical significance classifications.
89
+
90
+ Per-build aware (ADR-0021). At `setup()` time, downloads each
91
+ requested build's VCF (default: both). At `annotate()` time,
92
+ dispatches to the cache matching `variant.build`. If the matching
93
+ cache is missing, the variant is skipped and a warning logged
94
+ (db update needed).
95
+ """
96
+
97
+ name: ClassVar[str] = "clinvar"
98
+ display_name: ClassVar[str] = "ClinVar"
99
+ attribution: ClassVar[str] = "ClinVar"
100
+ requires_download: ClassVar[bool] = True
101
+ license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
102
+ spdx="custom-clinvar",
103
+ license_url="https://www.ncbi.nlm.nih.gov/clinvar/docs/maintenance_use/",
104
+ attribution_text="ClinVar variant classifications from NCBI.",
105
+ source_url="https://www.ncbi.nlm.nih.gov/clinvar/",
106
+ commercial_ok=True,
107
+ )
108
+
109
def __init__(
    self,
    data_dir: Path,
    builds: tuple[str, ...] = CLINVAR_SUPPORTED_BUILDS,
    *,
    include_benign: bool = False,
) -> None:
    """Validate the requested builds and work out one cache path per build.

    Args:
        data_dir: Directory that holds the per-build SQLite caches.
        builds: Builds this instance manages. Defaults to both GRCh37
            and GRCh38; a single-element tuple such as `("GRCh38",)`
            restricts setup/refresh to that build (used by the CLI's
            `--build` flag).
        include_benign: Whether Benign/Likely_benign annotations are
            emitted. The default False suppresses them (ADR-0008
            amendment).

    Raises:
        ValueError: A requested build is not in `CLINVAR_SUPPORTED_BUILDS`.
    """
    super().__init__(data_dir)
    self._builds = tuple(builds)
    self._include_benign = include_benign

    for candidate in self._builds:
        if candidate in CLINVAR_SUPPORTED_BUILDS:
            continue
        raise ValueError(
            f"Unsupported ClinVar build {candidate!r}; expected one of "
            f"{CLINVAR_SUPPORTED_BUILDS}"
        )

    cache_paths: dict[str, Path] = {}
    for managed in self._builds:
        cache_paths[managed] = data_dir / clinvar_db_filename(managed)
    self._db_paths: dict[str, Path] = cache_paths

    # Connections are opened lazily, one per build, by `_connection`.
    self._conns: dict[str, sqlite3.Connection] = {}
    # ADR-0023: per-build (rsid -> single-base REF) cache, consulted by
    # PharmGKB as its primary non-finding filter. Filled lazily on the
    # first lookup for each build.
    self._ref_lookups: dict[str, dict[str, str]] = {}
143
+
144
def _connection(self, build: str) -> sqlite3.Connection | None:
    """Return the connection to ``build``'s cache, opening it on first use.

    Returns None, without raising, when the build is not managed by
    this instance or its cache file does not exist yet.
    """
    if build not in self._db_paths:
        return None

    cached = self._conns.get(build)
    if cached is not None:
        return cached

    db_path = self._db_paths[build]
    if not db_path.exists():
        return None
    opened = sqlite3.connect(db_path)
    self._conns[build] = opened
    return opened
154
+
155
+ def setup(self) -> None:
156
+ """Download each managed build's ClinVar VCF and ingest atomically."""
157
+ for build in self._builds:
158
+ self._setup_one(build)
159
+
160
def _setup_one(self, build: str) -> None:
    """Refresh one build's cache: probe the checksum, download, verify, ingest.

    The remote ``.md5`` signal is fetched first and doubles as the
    expected hash of the download. The VCF is staged in the data
    directory and removed again once verification and ingest have
    finished, whether they succeeded or not.

    Raises:
        RuntimeError: The remote signal could not be obtained; nothing
            is downloaded in that case.
    """
    url = _manager_module.CLINVAR_URL_BY_BUILD[build]
    signal = self._fetch_remote_signal_for(build)
    if signal is None:
        raise RuntimeError(
            f"clinvar ({build}): cannot verify remote freshness signal. "
            "Refresh aborted to avoid persisting an incomplete cache stamp. "
            "Retry, or pass --force if you accept that next `db update` "
            "will re-download to re-establish the signal."
        )

    staged_vcf = self.data_dir / _vcf_filename_for_url(url)
    download(url, staged_vcf)
    try:
        # The signal has the form "md5:<hex>"; the hash check wants the bare hex.
        expected_md5 = signal.removeprefix("md5:")
        verify_file_hash(staged_vcf, "md5", expected_md5)
        load_clinvar_vcf(
            staged_vcf,
            self._db_paths[build],
            source_url=url,
            remote_signal=signal,
            record_name=clinvar_record_name(build),
        )
    finally:
        # Best-effort cleanup of the staged download.
        try:
            staged_vcf.unlink()
        except FileNotFoundError:
            pass
        except OSError:
            logger.warning("Could not remove staged VCF at %s", staged_vcf)
189
+
190
def is_ready(self) -> bool:
    """Return True only when every managed build has a usable, stamped cache.

    A build passes when its database record exists and its
    ``local_version_tag`` equals ``iv:<CLINVAR_INTERPRETER_VERSION>``.
    A cache without the current tag gets one chance to be self-healed
    through ``stamp_existing_clinvar_cache``; if that reports failure
    the annotator is not ready.
    """
    for build in self._builds:
        db_path = self._db_paths[build]
        info = get_database_info(db_path, clinvar_record_name(build))
        if info is None:
            return False
        current_tag = info.get("local_version_tag") or ""
        up_to_date = current_tag == f"iv:{CLINVAR_INTERPRETER_VERSION}"
        # Stamping is attempted only for caches that lack the current tag.
        if not up_to_date and not stamp_existing_clinvar_cache(db_path):
            return False
    return True
208
+
209
def version(self) -> str | None:
    """Return one version string covering every cached managed build.

    Each cached build contributes `<build>:<version>`; entries are
    joined with `"; "`, e.g. `"GRCh37:<v>; GRCh38:<v>"`. Builds without
    a database record are left out, and None is returned when no build
    is cached at all.
    """
    lookups = (
        (build, get_database_info(self._db_paths[build], clinvar_record_name(build)))
        for build in self._builds
    )
    parts = [f"{build}:{info['version']}" for build, info in lookups if info is not None]
    if not parts:
        return None
    return "; ".join(parts)
221
+
222
def record_count(self) -> int | None:
    """Return the summed record count of all cached managed builds.

    Builds without a database record are ignored; None is returned when
    no managed build is cached.
    """
    counts = []
    for build in self._builds:
        info = get_database_info(self._db_paths[build], clinvar_record_name(build))
        if info is not None:
            counts.append(info["record_count"])
    return sum(counts) if counts else None
232
+
233
def close(self) -> None:
    """Close every open per-build connection and drop the cached REF maps.

    Both registries are emptied afterwards, so calling this again is a
    no-op and later lookups reopen and rebuild on demand.
    """
    for build in list(self._conns):
        self._conns[build].close()
    self._conns.clear()
    self._ref_lookups.clear()
239
+
240
def reference_for(self, rsid: str, build: str) -> str | None:
    """Return ClinVar's single-base REF allele for `rsid` in `build`, or None.

    ADR-0023: PharmGKB's primary non-finding filter calls this. When the
    returned base equals both of the user's alleles the user is
    homozygous reference, so the PharmGKB annotation is a non-finding.

    The `(rsid -> REF)` map for a build is loaded on the first call and
    kept in memory, making later lookups plain dictionary reads.

    Returns None when the build is not managed here or the map has no
    entry for the rsid (the map only holds single-base REFs). Callers
    fall through to secondary tiers.
    """
    if build not in self._db_paths:
        return None
    ref_by_rsid = self._ref_lookups.get(build)
    if ref_by_rsid is None:
        ref_by_rsid = self._load_ref_lookup(build)
        self._ref_lookups[build] = ref_by_rsid
    return ref_by_rsid.get(rsid)
260
+
261
def _load_ref_lookup(self, build: str) -> dict[str, str]:
    """Read ``build``'s cache once and return its `(rsid -> REF)` map.

    Returns an empty map when the build has no usable cache connection.
    """
    conn = self._connection(build)
    if conn is None:
        return {}

    # Only single-base REFs are selected: an indel's anchor-base REF
    # (REF=CTT, etc.) cannot suppress a single-base array readout, and
    # one rsid may have both SNV and indel rows in the cache.
    cursor = conn.execute(
        "SELECT DISTINCT rsid, ref FROM clinvar_variants WHERE length(ref) = 1"
    )
    ref_by_rsid: dict[str, str] = {}
    for rsid, ref in cursor.fetchall():
        # Should an rsid carry several single-base REFs, the first row wins.
        ref_by_rsid.setdefault(rsid, ref)
    return ref_by_rsid
281
+
282
def fetch_remote_signal(self) -> str | None:
    """Probe every managed build and combine the results into one signal.

    The combined form is `"GRCh37:md5:<hex>|GRCh38:md5:<hex>"`. If the
    probe fails for ANY managed build the whole result is None, and the
    remaining builds are not probed — the CLI then prints "can't verify"
    and skips refresh per ADR-0012's policy.
    """
    signals: list[str] = []
    for build in self._builds:
        probe = self._fetch_remote_signal_for(build)
        if probe is None:
            return None
        signals.append(f"{build}:{probe}")
    if not signals:
        return None
    return "|".join(signals)
296
+
297
@staticmethod
def _fetch_remote_signal_for(build: str) -> str | None:
    """Fetch ``build``'s remote ``.md5`` file and return it as ``md5:<token>``.

    The URL is the build's VCF URL with ``.md5`` appended. Only the
    first whitespace-separated token of the response is used. Returns
    None when the response is empty or contains only whitespace.
    """
    body = fetch_remote_text(_manager_module.CLINVAR_URL_BY_BUILD[build] + ".md5")
    if not body:
        return None
    tokens = body.split()
    if not tokens:
        return None
    return f"md5:{tokens[0]}"
306
+
307
def cached_remote_signal(self) -> str | None:
    """Combine the stored remote signals of all managed builds.

    Each build contributes `<build>:<signal>`, joined with `|`. The
    result is None as soon as one build has no database record or no
    (or an empty) stored signal, and also when no builds are managed.
    """
    stored: list[str] = []
    for build in self._builds:
        info = get_database_info(self._db_paths[build], clinvar_record_name(build))
        signal = None if info is None else info["remote_signal"]
        if not signal:
            return None
        stored.append(f"{build}:{signal}")
    if not stored:
        return None
    return "|".join(stored)
319
+
320
def annotate(self, variant: Variant) -> list[Annotation]:
    """Build ClinVar annotations for the entries the user actually carries.

    Rules applied to every cache row found for `variant.rsid`:

    * ADR-0021: rows come from the cache matching `variant.build`. With
      no such cache the result is empty and nothing is logged — the
      user already saw the analyze-time build warning.
    * ADR-0011 indel-anchor protection: a row whose REF or ALT is longer
      than one base is skipped when both user alleles are single-base,
      because array genotypes cannot match anchor-base indel encoding
      by string equality.
    * ADR-0007 carrier rule: a row counts only if its ALT equals
      `variant.allele1` or `variant.allele2`.
    * Benign-class rows are dropped unless `include_benign` was set.

    No-call variants yield an empty list.
    """
    if variant.is_no_call:
        return []
    conn = self._connection(variant.build)
    if conn is None:
        return []

    query = (
        "SELECT chromosome, position, ref, alt, clinical_significance, "
        "condition, gene, review_status, allele_id "
        "FROM clinvar_variants WHERE rsid = ?"
    )
    matches = conn.execute(query, (variant.rsid,)).fetchall()

    first, second = variant.allele1, variant.allele2
    carried = {first, second}
    single_base_call = len(first) <= 1 and len(second) <= 1
    # ADR-0023: every annotation reports the user's actual diploid call,
    # not just the ALT base that matched.
    genotype = _user_diploid(variant)

    found: list[Annotation] = []
    for _chrom, _pos, ref, alt, clnsig, condition, gene, review_status, allele_id in matches:
        indel_record = len(ref) > 1 or len(alt) > 1
        if indel_record and single_base_call:
            continue
        if alt not in carried:
            continue

        if clnsig:
            sig_label = _normalize_clnsig(clnsig)
            shown_significance = clnsig.replace("_", " ")
        else:
            sig_label = "unknown"
            shown_significance = "unknown significance"
        if sig_label in _BENIGN_CLNSIGS and not self._include_benign:
            continue

        description = f"ClinVar classifies this allele as {shown_significance}"
        references = [f"clinvar:allele/{allele_id}"] if allele_id else []
        found.append(
            Annotation(
                source=self.name,
                rsid=variant.rsid,
                significance=f"clinvar_{sig_label}",
                category="clinical",
                magnitude=_magnitude(clnsig),
                description=description,
                attribution=self.attribution,
                genotype_match=genotype,
                references=references,
                # "." is treated the same as a missing condition.
                condition=condition if condition and condition != "." else "",
                gene=gene or "",
                review_status=review_status or "",
                alt=alt,
            )
        )
    return found
392
+
393
+
394
def _user_diploid(variant: Variant) -> str:
    """Format the user's two alleles as one genotype string.

    ClinVar and PharmGKB both use this so the report's "Genotype" column
    has the same shape whatever the source (ADR-0023). Two single-base
    alleles are sorted and concatenated, `("G", "A") -> "AG"`; anything
    else is passed through verbatim as `allele1/allele2`.
    """
    alleles = (variant.allele1, variant.allele2)
    if all(len(allele) == 1 for allele in alleles):
        return "".join(sorted(alleles))
    return "{}/{}".format(*alleles)