allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
allelix/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Allelix: open-source genotype analysis toolkit."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
8
|
+
|
|
9
|
+
# Read the version from the installed "allelix" distribution metadata.
# When that distribution cannot be found, fall back to a local placeholder
# so that importing the package still succeeds.
try:
    __version__ = version("allelix")
except PackageNotFoundError:
    __version__ = "0.0.0+local"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Annotator registry. Unlike parsers, ALL annotators run on every variant."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from allelix.annotators.alphamissense import AlphaMissenseAnnotator
|
|
10
|
+
from allelix.annotators.base import Annotator
|
|
11
|
+
from allelix.annotators.cadd import CaddAnnotator
|
|
12
|
+
from allelix.annotators.clinvar import CLINVAR_SUPPORTED_BUILDS, ClinVarAnnotator
|
|
13
|
+
from allelix.annotators.gnomad import GnomadAnnotator
|
|
14
|
+
from allelix.annotators.gwas import GWASCatalogAnnotator
|
|
15
|
+
from allelix.annotators.pharmgkb import PharmGKBAnnotator
|
|
16
|
+
from allelix.annotators.snpedia import SNPediaAnnotator
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_annotators(
    data_dir: Path,
    clinvar_builds: tuple[str, ...] = CLINVAR_SUPPORTED_BUILDS,
    *,
    include_benign: bool = False,
    gwas_filter_traits: bool = True,
    cadd_full: bool = False,
) -> list[Annotator]:
    """Build one instance of every registered annotator for ``data_dir``.

    Args:
        data_dir: Data directory handed to each annotator constructor.
        clinvar_builds: ClinVar builds managed by this process. Defaults to
            both GRCh37 and GRCh38 (per ADR-0021); the CLI narrows it via
            `db update --build grch37|grch38`.
        include_benign: Forwarded to ClinVarAnnotator. The default False
            suppresses Benign/Likely_benign annotations (ADR-0008 amendment).
        gwas_filter_traits: Forwarded to GWASCatalogAnnotator. The default
            True excludes common-trait noise categories (ADR-0024 amendment).
        cadd_full: Enables CADD full mode (tabix queries against the
            complete 81 GB CADD file). Requires ``pysam`` and a local copy.

    Returns:
        The annotators in the order ClinVar, PharmGKB, GWAS Catalog,
        SNPedia, gnomAD, AlphaMissense, CADD.

    ADR-0023: ClinVar's `reference_for(rsid, build)` is handed to PharmGKB
    and SNPedia as the primary hom-ref suppression filter. The REF allele
    lookup determines whether the user is homozygous reference, and thus a
    non-finding for that variant.
    """
    # ClinVar must exist first: two later annotators borrow its REF lookup.
    clinvar = ClinVarAnnotator(data_dir, builds=clinvar_builds, include_benign=include_benign)
    return [
        clinvar,
        PharmGKBAnnotator(data_dir, clinvar_ref_provider=clinvar.reference_for),
        GWASCatalogAnnotator(data_dir, filter_traits=gwas_filter_traits),
        SNPediaAnnotator(data_dir, clinvar_ref_provider=clinvar.reference_for),
        GnomadAnnotator(data_dir),
        AlphaMissenseAnnotator(data_dir),
        CaddAnnotator(data_dir, full_mode=cadd_full),
    ]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Lookup table from each annotator's ``name`` ClassVar to its class, in
# registration order.
_ANNOTATOR_CLASSES: dict[str, type[Annotator]] = {
    annotator_cls.name: annotator_cls
    for annotator_cls in (
        ClinVarAnnotator,
        PharmGKBAnnotator,
        GWASCatalogAnnotator,
        SNPediaAnnotator,
        GnomadAnnotator,
        AlphaMissenseAnnotator,
        CaddAnnotator,
    )
}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_annotator_class(name: str) -> type[Annotator] | None:
    """Look up a registered annotator class by its source name.

    Returns ``None`` when no annotator is registered under ``name``.
    """
    try:
        return _ANNOTATOR_CLASSES[name]
    except KeyError:
        return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# Names exported by ``from allelix.annotators import *``: the annotator base
# class, every concrete annotator, and the two registry helpers.
__all__ = [
    "AlphaMissenseAnnotator",
    "Annotator",
    "CaddAnnotator",
    "ClinVarAnnotator",
    "GWASCatalogAnnotator",
    "GnomadAnnotator",
    "PharmGKBAnnotator",
    "SNPediaAnnotator",
    "get_annotator_class",
    "get_annotators",
]
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""AlphaMissense variant pathogenicity enrichment.
|
|
4
|
+
|
|
5
|
+
AlphaMissense is not a clinical annotator — it does not produce
|
|
6
|
+
Annotation objects. It enriches existing annotations with missense
|
|
7
|
+
variant pathogenicity predictions. The pipeline calls
|
|
8
|
+
``bulk_lookup()`` after all annotators have run, and stamps each
|
|
9
|
+
annotation's ``am_pathogenicity`` and ``am_class`` fields.
|
|
10
|
+
|
|
11
|
+
License: CC BY 4.0. Attribution: Cheng et al., Science 2023
|
|
12
|
+
(doi:10.1126/science.adg7492).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import sqlite3
|
|
19
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
20
|
+
|
|
21
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor
|
|
22
|
+
from allelix.databases._versions import ALPHAMISSENSE_SCHEMA_VERSION
|
|
23
|
+
from allelix.databases.alphamissense_loader import (
|
|
24
|
+
ALPHAMISSENSE_CACHE_URL,
|
|
25
|
+
ALPHAMISSENSE_DB_FILENAME,
|
|
26
|
+
ALPHAMISSENSE_EXPECTED_SHA256,
|
|
27
|
+
install_prebuilt_cache,
|
|
28
|
+
)
|
|
29
|
+
from allelix.databases.gnomad_loader import GNOMAD_DB_FILENAME
|
|
30
|
+
from allelix.databases.manager import (
|
|
31
|
+
download,
|
|
32
|
+
get_database_info,
|
|
33
|
+
verify_file_hash,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
|
|
39
|
+
from allelix.models import Annotation, Variant
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)

# Upper bound on bound SQL parameters per bulk-lookup query. The bulk lookup
# methods chunk their inputs by this size to stay within SQLite's variable
# limit; lookups keyed on (rsid, alt) use half of it because each key binds
# two parameters.
_BULK_BATCH_SIZE = 900
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class AlphaMissenseAnnotator(Annotator):
    """Missense variant pathogenicity enrichment from AlphaMissense.

    Subclasses Annotator for ``db update`` / ``db status`` / ``is_ready()``
    integration. ``annotate()`` always returns ``[]`` — AlphaMissense does
    not participate in the per-variant annotation loop. Scores are read
    through ``lookup()``, ``bulk_lookup()`` and ``bulk_lookup_by_alt()``.
    """

    name: ClassVar[str] = "alphamissense"
    display_name: ClassVar[str] = "AlphaMissense"
    attribution: ClassVar[str] = "AlphaMissense"
    requires_download: ClassVar[bool] = True
    server_driven_freshness: ClassVar[bool] = False
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="CC-BY-4.0",
        license_url="https://creativecommons.org/licenses/by/4.0/",
        attribution_text=(
            "AlphaMissense predictions from Cheng et al., Science 2023"
            " (doi:10.1126/science.adg7492). Licensed under CC BY 4.0."
        ),
        source_url="https://zenodo.org/records/10813168",
        citation="Cheng et al., Science 2023 (doi:10.1126/science.adg7492)",
        commercial_ok=True,
    )

    def __init__(self, data_dir: Path) -> None:
        """Bind to the data directory; the SQLite connection is opened lazily."""
        super().__init__(data_dir)
        self._db_path = data_dir / ALPHAMISSENSE_DB_FILENAME
        self._conn: sqlite3.Connection | None = None

    def _connection(self) -> sqlite3.Connection:
        """Return the cached SQLite connection, opening it on first use.

        The gnomAD version check runs once, right after the connection is
        opened.

        Raises:
            FileNotFoundError: If the AlphaMissense cache file does not exist.
        """
        if self._conn is None:
            if not self._db_path.exists():
                raise FileNotFoundError(
                    f"AlphaMissense cache not found at {self._db_path}. "
                    "Run `allelix db update` first."
                )
            self._conn = sqlite3.connect(self._db_path)
            self._check_gnomad_version()
        return self._conn

    def _check_gnomad_version(self) -> None:
        """Warn if the gnomAD version used to build the AM cache differs from installed."""
        assert self._conn is not None
        row = self._conn.execute(
            "SELECT version FROM database_versions WHERE name = 'alphamissense_gnomad_source'"
        ).fetchone()
        if row is None:
            # The cache carries no gnomAD stamp, so there is nothing to compare.
            return
        stamped = row[0]
        if stamped == "no_gnomad":
            logger.warning(
                "AlphaMissense cache was built without gnomAD (--no-gnomad). "
                "rsID lookups will return no results."
            )
            return
        gnomad_info = get_database_info(self.data_dir / GNOMAD_DB_FILENAME, "gnomad")
        if gnomad_info is None:
            # No gnomAD database info available locally; skip the comparison.
            return
        installed = gnomad_info["version"]
        if installed and stamped != installed:
            logger.warning(
                "AlphaMissense cache was built against gnomAD %s but installed "
                "gnomAD is %s. rsID mappings may be stale. Rebuild with: "
                "python scripts/build_alphamissense_cache.py",
                stamped,
                installed,
            )

    def setup(self) -> None:
        """Download the pre-built AlphaMissense cache from HuggingFace.

        The archive is staged in the data directory, checked against the
        expected SHA-256, and then installed as the local cache. Any
        connection this instance already holds is closed before the install,
        so it never keeps pointing at the replaced database file. The staged
        archive is removed whether verification and installation succeed or
        fail; a failure to remove it is logged, not raised.
        """
        gz_path = self.data_dir / "alphamissense.sqlite.gz"
        download(ALPHAMISSENSE_CACHE_URL, gz_path)
        try:
            verify_file_hash(gz_path, "sha256", ALPHAMISSENSE_EXPECTED_SHA256)
            # Drop any open handle on the old cache before it is replaced.
            self.close()
            install_prebuilt_cache(
                gz_path,
                self._db_path,
                source_url=ALPHAMISSENSE_CACHE_URL,
            )
        finally:
            # Never leave an unverified or already-consumed archive behind.
            try:
                gz_path.unlink()
            except OSError:
                logger.warning("Could not remove staged file at %s", gz_path)

    def is_ready(self) -> bool:
        """True when the AlphaMissense SQLite cache exists with current schema version."""
        info = get_database_info(self._db_path, "alphamissense")
        if info is None:
            return False
        tag = info.get("local_version_tag") or ""
        # A missing/empty tag is accepted as ready as well.
        # NOTE(review): presumably covers caches stamped without a tag — confirm.
        return tag == f"sv:{ALPHAMISSENSE_SCHEMA_VERSION}" or not tag

    def version(self) -> str | None:
        """Return the cached database version, or None."""
        info = get_database_info(self._db_path, "alphamissense")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Return the number of variants in the cache, or None."""
        info = get_database_info(self._db_path, "alphamissense")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection if open; safe to call repeatedly."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Code-driven source — no runtime freshness probe (ADR-0030)."""
        return None

    def cached_remote_signal(self) -> str | None:
        """Code-driven source — no cached signal to compare (ADR-0030)."""
        return None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Not used — AlphaMissense enriches, does not annotate. Always returns []."""
        return []

    def lookup(self, rsid: str) -> tuple[float, str] | None:
        """Return (am_pathogenicity, am_class) for a single rsID, or None.

        When several rows share the rsID, the highest score is returned.
        """
        conn = self._connection()
        # SQLite takes the bare ``am_class`` column from the row holding MAX.
        row = conn.execute(
            "SELECT MAX(am_pathogenicity), am_class FROM alphamissense_scores WHERE rsid = ?",
            (rsid,),
        ).fetchone()
        # An aggregate without GROUP BY yields one all-NULL row when nothing matches.
        if row is None or row[0] is None:
            return None
        return (row[0], row[1])

    def bulk_lookup(self, rsids: set[str]) -> dict[str, tuple[float, str]]:
        """Return ``{rsid: (am_pathogenicity, am_class)}`` for found rsIDs.

        Fallback for annotations without a known alt allele. Uses MAX to
        resolve multi-allelic sites. Prefer ``bulk_lookup_by_alt`` when alt
        is available.

        Batches into chunks of ``_BULK_BATCH_SIZE`` (900) to stay within
        SQLite's variable limit. rsIDs absent from the cache are omitted.
        """
        if not rsids:
            return {}
        conn = self._connection()
        result: dict[str, tuple[float, str]] = {}
        rsid_list = list(rsids)
        for i in range(0, len(rsid_list), _BULK_BATCH_SIZE):
            batch = rsid_list[i : i + _BULK_BATCH_SIZE]
            # Only "?" placeholders are interpolated; values are always bound.
            placeholders = ",".join("?" * len(batch))
            rows = conn.execute(
                f"SELECT rsid, MAX(am_pathogenicity), am_class"
                f" FROM alphamissense_scores"
                f" WHERE rsid IN ({placeholders}) GROUP BY rsid",
                batch,
            ).fetchall()
            for rsid, score, cls in rows:
                if score is not None:
                    result[rsid] = (score, cls)
        return result

    def bulk_lookup_by_alt(
        self, keys: set[tuple[str, str]]
    ) -> dict[tuple[str, str], tuple[float, str]]:
        """Return ``{(rsid, alt): (am_pathogenicity, am_class)}`` for exact matches.

        Keys with no matching row, or with a NULL score, are omitted.
        """
        if not keys:
            return {}
        conn = self._connection()
        result: dict[tuple[str, str], tuple[float, str]] = {}
        key_list = list(keys)
        # Each key binds two parameters, so halve the per-query key count.
        batch_size = _BULK_BATCH_SIZE // 2
        for i in range(0, len(key_list), batch_size):
            batch = key_list[i : i + batch_size]
            clauses = " OR ".join(["(rsid = ? AND alt = ?)"] * len(batch))
            params = [v for rsid, alt in batch for v in (rsid, alt)]
            rows = conn.execute(
                f"SELECT rsid, alt, am_pathogenicity, am_class"
                f" FROM alphamissense_scores WHERE {clauses}",
                params,
            ).fetchall()
            for rsid, alt, score, cls in rows:
                if score is not None:
                    result[(rsid, alt)] = (score, cls)
        return result
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Abstract base class for reference-database annotators."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import contextlib
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum, auto
|
|
11
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from types import TracebackType
|
|
17
|
+
|
|
18
|
+
from allelix.models import Annotation, Variant
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class LicenseDescriptor:
    """Single source of truth for a data source's license terms.

    Immutable (frozen) record. Each annotator class declares one as its
    ``license`` ClassVar; ``is_non_commercial()`` and ``permission()`` read
    it to decide whether the source may be used.
    """

    # SPDX license identifier (e.g. "CC-BY-4.0"). Identifiers starting with
    # "LicenseRef-" or "custom-" require an explicit ``commercial_ok``
    # (enforced by Annotator.__init_subclass__).
    spdx: str
    # URL of the license text.
    license_url: str
    # Attribution statement for the data source.
    attribution_text: str
    # Optional URL of the upstream data source.
    source_url: str | None = None
    # Optional literature citation for the data source.
    citation: str | None = None
    # Explicit commercial-use flag. When not None it overrides the
    # SPDX-based classification in is_non_commercial().
    commercial_ok: bool | None = None
    # Whether a license can be obtained for an otherwise non-commercial
    # source; permission() returns BLOCK_FINAL when this is False.
    licensable: bool = False
    # Must be set when ``licensable`` is True (enforced by
    # Annotator.__init_subclass__).
    # NOTE(review): presumably where a commercial license is bought — confirm.
    purchase_url: str | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# SPDX identifiers that is_non_commercial() treats as prohibiting commercial
# use when a descriptor leaves ``commercial_ok`` unset (None).
_NON_COMMERCIAL_SPDX: frozenset[str] = frozenset(
    {
        "CC-BY-NC-SA-3.0-US",
        "CC-BY-NC-SA-4.0",
        "CC-BY-NC-4.0",
    }
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def is_non_commercial(descriptor: LicenseDescriptor) -> bool:
    """Report whether the described license forbids commercial use.

    An explicit ``commercial_ok`` on the descriptor always wins. Only when
    it is unset (None) is the SPDX identifier checked against the known
    non-commercial identifiers.
    """
    explicit = descriptor.commercial_ok
    if explicit is None:
        return descriptor.spdx in _NON_COMMERCIAL_SPDX
    return not explicit
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class Permission(Enum):
    """Three-state permission result for a source in the current license context.

    Returned by ``permission()``.
    """

    # The source may be used in the current context.
    ALLOW = auto()
    # Non-commercial source in a commercial context, and the descriptor is
    # not licensable.
    BLOCK_FINAL = auto()
    # Non-commercial source in a commercial context; the descriptor is
    # licensable but no license is held.
    BLOCK_PURCHASABLE = auto()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def permission(
    descriptor: LicenseDescriptor,
    *,
    commercial: bool,
    license_held: bool,
) -> Permission:
    """Decide whether a source may be used in the current license context.

    Args:
        descriptor: License terms of the source.
        commercial: Whether the current use is commercial.
        license_held: Whether a license for this source is held.

    Returns:
        ``Permission.ALLOW`` for non-commercial use, for sources whose
        license permits commercial use, and for licensable sources with a
        held license. ``Permission.BLOCK_FINAL`` when the source cannot be
        licensed. ``Permission.BLOCK_PURCHASABLE`` when it can be licensed
        but no license is held.
    """
    # The license terms only matter once the use is commercial.
    if not commercial or not is_non_commercial(descriptor):
        return Permission.ALLOW
    if descriptor.licensable:
        return Permission.ALLOW if license_held else Permission.BLOCK_PURCHASABLE
    return Permission.BLOCK_FINAL
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def is_clinvar_homref(
    variant: Variant,
    clinvar_ref_provider: Callable[[str, str], str | None] | None,
) -> bool:
    """Tell whether ClinVar's REF allele makes this genotype homozygous reference.

    Implements the ADR-0023 hom-ref check. The provider is called with the
    variant's rsid and build. The result is True only when it returns a
    single-character reference allele that equals both of the variant's
    alleles. With no provider, or no reference known, the result is False.
    """
    if clinvar_ref_provider is None:
        return False
    reference = clinvar_ref_provider(variant.rsid, variant.build)
    # Unknown references and multi-base references never count as hom-ref.
    if reference is None or len(reference) != 1:
        return False
    return variant.allele1 == reference and variant.allele2 == reference
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class Annotator(ABC):
    """Common contract for every reference-database annotator.

    An annotator is bound to a `data_dir` when constructed. `setup()` does
    the one-time download/parse into the cache, and `is_ready()` reports
    whether that cache exists and is queryable. `annotate(variant)` yields
    zero or more `Annotation` objects for the variant, checking both rsid
    AND genotype, per the regulatory posture (ADR-0003) and the
    genotype-matching rule (ADR-0007).

    Annotators own resources (SQLite connections, file handles). Release
    them through `close()` or the context manager protocol; the CLI uses
    `contextlib.ExitStack` to guarantee deterministic cleanup.

    Attributes:
        name: Lowercase identifier (e.g., "clinvar").
        display_name: Human-readable name ("ClinVar").
        attribution: Display label used in user-facing reports ("ClinVar").
            Equal to `display_name` for first-party single-source annotators.
        requires_download: Whether `setup()` needs network/disk space.
    """

    name: ClassVar[str]
    display_name: ClassVar[str]
    attribution: ClassVar[str]
    requires_download: ClassVar[bool] = True
    server_driven_freshness: ClassVar[bool] = True
    license: ClassVar[LicenseDescriptor]

    def __init_subclass__(cls, **kwargs: object) -> None:
        """Validate the subclass's ``license`` declaration at class creation.

        Raises:
            TypeError: If the subclass has no ``license`` and defines no
                abstract method of its own, if a "LicenseRef-"/"custom-"
                SPDX identifier comes without an explicit ``commercial_ok``,
                or if ``licensable`` is True without a ``purchase_url``.
        """
        super().__init_subclass__(**kwargs)

        if not hasattr(cls, "license"):
            # Only a class that introduces abstract methods itself may omit it.
            declares_abstract = any(
                getattr(member, "__isabstractmethod__", False)
                for member in cls.__dict__.values()
            )
            if declares_abstract:
                return
            msg = f"{cls.__name__} must declare a 'license' ClassVar of type LicenseDescriptor"
            raise TypeError(msg)

        descriptor = cls.license
        custom_spdx = descriptor.spdx.startswith(("LicenseRef-", "custom-"))
        if custom_spdx and descriptor.commercial_ok is None:
            # Custom identifiers cannot be classified from the SPDX name alone.
            msg = (
                f"{cls.__name__} uses custom SPDX '{descriptor.spdx}' but "
                f"does not declare commercial_ok (True or False)"
            )
            raise TypeError(msg)
        if descriptor.licensable and descriptor.purchase_url is None:
            msg = (
                f"{cls.__name__} declares licensable=True but "
                f"purchase_url is None — set it explicitly"
            )
            raise TypeError(msg)

    def __init__(self, data_dir: Path) -> None:
        """Remember the data directory; it is created elsewhere."""
        self.data_dir = data_dir

    def __del__(self) -> None:
        """Best-effort `close()` on garbage collection to avoid ResourceWarning."""
        with contextlib.suppress(Exception):
            self.close()

    def __enter__(self) -> Annotator:
        """Enter a `with` block; the annotator itself is the context value."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Leave the `with` block by calling `close()`; exceptions propagate."""
        self.close()

    @abstractmethod
    def setup(self) -> None:
        """Download and prepare the reference database. Must be idempotent."""
        ...

    @abstractmethod
    def annotate(self, variant: Variant) -> list[Annotation]:
        """Produce every annotation that applies to this variant.

        Implementations MUST check both rsid AND genotype. A database hit
        alone is not sufficient: the user has to carry the flagged allele.
        """
        ...

    @abstractmethod
    def is_ready(self) -> bool:
        """Report whether the local cache exists and can be queried."""
        ...

    @abstractmethod
    def version(self) -> str | None:
        """Give the cached database version, or None when not set up."""
        ...

    @abstractmethod
    def close(self) -> None:
        """Release open resources such as database connections and file handles."""
        ...

    @abstractmethod
    def fetch_remote_signal(self) -> str | None:
        """Fetch a small remote freshness signal (md5 hash, ETag, Last-Modified).

        Implementations MUST return a prefixed, opaque string (e.g.
        `"md5:abcdef…"`, `"lm:Wed, 21 Oct 2025 …"`, `"etag:…"`). The prefix
        ensures that a future server-side switch in signal type triggers a
        refresh rather than a silent miss.

        Returns None on any failure (network error, timeout, missing
        header, source doesn't expose a signal) and never raises.
        `db update` treats None as "can't verify freshness" and falls
        through to skip with a notice. See ADR-0012.
        """
        ...

    @abstractmethod
    def cached_remote_signal(self) -> str | None:
        """Give the remote signal stored at the last successful download, or None.

        None is returned when the cache is missing entirely OR when it was
        written by a pre-v0.4.2 release that didn't capture signals.
        """
        ...
|