allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""gnomAD population frequency enrichment.
|
|
4
|
+
|
|
5
|
+
gnomAD is not a clinical annotator — it does not produce Annotation
|
|
6
|
+
objects. It enriches existing annotations with population allele
|
|
7
|
+
frequency context. The pipeline calls ``bulk_lookup()`` after all
|
|
8
|
+
annotators have run, and stamps each annotation's ``allele_frequency``
|
|
9
|
+
field.
|
|
10
|
+
|
|
11
|
+
License: ODbL v1.0 (Open Database License). We extract only rsID +
|
|
12
|
+
allele frequencies (no SpliceAI or other restrictively licensed fields).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import sqlite3
|
|
19
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
20
|
+
|
|
21
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor
|
|
22
|
+
from allelix.databases._versions import GNOMAD_SCHEMA_VERSION
|
|
23
|
+
from allelix.databases.gnomad_loader import (
|
|
24
|
+
GNOMAD_CACHE_URL,
|
|
25
|
+
GNOMAD_DB_FILENAME,
|
|
26
|
+
GNOMAD_EXPECTED_SHA256,
|
|
27
|
+
install_prebuilt_cache,
|
|
28
|
+
)
|
|
29
|
+
from allelix.databases.manager import (
|
|
30
|
+
download,
|
|
31
|
+
get_database_info,
|
|
32
|
+
verify_file_hash,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
|
|
38
|
+
from allelix.models import Annotation, Variant
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
# Upper bound on bound parameters per bulk query. The bulk_* lookups chunk
# their inputs to this size to stay within SQLite's variable limit.
_BULK_BATCH_SIZE = 900
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class GnomadAnnotator(Annotator):
    """Population frequency enrichment from gnomAD.

    Subclasses Annotator for ``db update`` / ``db status`` / ``is_ready()``
    integration. ``annotate()`` always returns ``[]`` — gnomAD does not
    participate in the per-variant annotation loop.
    """

    name: ClassVar[str] = "gnomad"
    display_name: ClassVar[str] = "gnomAD"
    attribution: ClassVar[str] = "gnomAD"
    requires_download: ClassVar[bool] = True
    server_driven_freshness: ClassVar[bool] = False
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="ODbL-1.0",
        license_url="https://opendatacommons.org/licenses/odbl/1-0/",
        attribution_text=("Population frequencies sourced from gnomAD, used under ODbL v1.0."),
        source_url="https://gnomad.broadinstitute.org",
        commercial_ok=True,
    )

    def __init__(self, data_dir: Path) -> None:
        """Bind to the data directory; the SQLite connection is opened lazily."""
        super().__init__(data_dir)
        self._db_path = data_dir / GNOMAD_DB_FILENAME
        self._conn: sqlite3.Connection | None = None

    def _connection(self) -> sqlite3.Connection:
        """Return the cached SQLite connection, opening it on first use.

        Raises:
            FileNotFoundError: if the cache file has not been installed yet.
        """
        if self._conn is None:
            # Check first: sqlite3.connect() would otherwise create an empty
            # database file at a missing path.
            if not self._db_path.exists():
                raise FileNotFoundError(
                    f"gnomAD cache not found at {self._db_path}. Run `allelix db update` first."
                )
            self._conn = sqlite3.connect(self._db_path)
        return self._conn

    def setup(self) -> None:
        """Download the pre-built gnomAD exome frequency cache from HuggingFace.

        The staged ``.gz`` download is removed whether or not verification
        and installation succeed, so a failed refresh does not leave a
        rejected archive behind in the data directory.
        """
        gz_path = self.data_dir / "gnomad.sqlite.gz"
        download(GNOMAD_CACHE_URL, gz_path)
        try:
            verify_file_hash(gz_path, "sha256", GNOMAD_EXPECTED_SHA256)
            install_prebuilt_cache(
                gz_path,
                self._db_path,
                source_url=GNOMAD_CACHE_URL,
            )
        finally:
            try:
                gz_path.unlink()
            except FileNotFoundError:
                # Nothing left to clean up.
                pass
            except OSError:
                logger.warning("Could not remove staged file at %s", gz_path)

    def is_ready(self) -> bool:
        """True when the gnomAD SQLite cache exists with current schema version."""
        info = get_database_info(self._db_path, "gnomad")
        if info is None:
            return False
        tag = info.get("local_version_tag") or ""
        # A cache with no tag at all is accepted as ready.
        return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag

    def version(self) -> str | None:
        """Return the cached database version, or None."""
        info = get_database_info(self._db_path, "gnomad")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Return the number of rsIDs in the cache, or None."""
        info = get_database_info(self._db_path, "gnomad")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection if open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Code-driven source — no runtime freshness probe (ADR-0030)."""
        return None

    def cached_remote_signal(self) -> str | None:
        """Code-driven source — no cached signal to compare (ADR-0030)."""
        return None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Not used — gnomAD enriches, does not annotate. Always returns []."""
        return []

    def lookup(self, rsid: str) -> float | None:
        """Return global allele frequency for a single rsID, or None.

        Uses ``MAX(af)`` across all rows stored for the rsID.
        """
        conn = self._connection()
        row = conn.execute(
            "SELECT MAX(af) FROM gnomad_frequencies WHERE rsid = ?", (rsid,)
        ).fetchone()
        return row[0] if row else None

    def bulk_lookup(self, rsids: set[str]) -> dict[str, float]:
        """Return ``{rsid: af}`` for all rsIDs found in the cache.

        Fallback for annotations without a known alt allele. Uses MAX to
        resolve multi-allelic sites. Prefer ``bulk_lookup_by_alt`` when alt
        is available.

        Batches into chunks of 900 to stay within SQLite's variable limit.
        """
        if not rsids:
            return {}
        conn = self._connection()
        result: dict[str, float] = {}
        rsid_list = list(rsids)
        for i in range(0, len(rsid_list), _BULK_BATCH_SIZE):
            batch = rsid_list[i : i + _BULK_BATCH_SIZE]
            # Only the "?" placeholders are interpolated; values stay bound.
            placeholders = ",".join("?" * len(batch))
            rows = conn.execute(
                f"SELECT rsid, MAX(af) FROM gnomad_frequencies"
                f" WHERE rsid IN ({placeholders}) GROUP BY rsid",
                batch,
            ).fetchall()
            for rsid, af in rows:
                if af is not None:
                    result[rsid] = af
        return result

    def bulk_resolve_coordinates(
        self, rsids: set[str]
    ) -> dict[str, list[tuple[str, int, str, str]]]:
        """Return ``{rsid: [(chrom, pos, ref, alt), ...]}`` from the gnomAD cache.

        Maps rsIDs to genomic coordinates for coordinate-based lookups
        (CADD, future VCF-keyed sources). Multi-allelic sites return
        multiple tuples per rsid.
        """
        if not rsids:
            return {}
        conn = self._connection()
        result: dict[str, list[tuple[str, int, str, str]]] = {}
        rsid_list = list(rsids)
        for i in range(0, len(rsid_list), _BULK_BATCH_SIZE):
            batch = rsid_list[i : i + _BULK_BATCH_SIZE]
            placeholders = ",".join("?" * len(batch))
            rows = conn.execute(
                f"SELECT rsid, chrom, pos, ref, alt FROM gnomad_frequencies"
                f" WHERE rsid IN ({placeholders})",
                batch,
            ).fetchall()
            for rsid, chrom, pos, ref, alt in rows:
                result.setdefault(rsid, []).append((chrom, pos, ref, alt))
        return result

    def bulk_lookup_by_alt(self, keys: set[tuple[str, str]]) -> dict[tuple[str, str], float]:
        """Return ``{(rsid, alt): af}`` for exact allele matches."""
        if not keys:
            return {}
        conn = self._connection()
        result: dict[tuple[str, str], float] = {}
        key_list = list(keys)
        # Each key binds two parameters (rsid and alt), so halve the batch.
        batch_size = _BULK_BATCH_SIZE // 2
        for i in range(0, len(key_list), batch_size):
            batch = key_list[i : i + batch_size]
            clauses = " OR ".join(["(rsid = ? AND alt = ?)"] * len(batch))
            params = [v for rsid, alt in batch for v in (rsid, alt)]
            rows = conn.execute(
                f"SELECT rsid, alt, af FROM gnomad_frequencies WHERE {clauses}",
                params,
            ).fetchall()
            for rsid, alt, af in rows:
                if af is not None:
                    result[(rsid, alt)] = af
        return result
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""GWAS Catalog annotator. Source-attributed trait associations."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import contextlib
|
|
8
|
+
import logging
|
|
9
|
+
import sqlite3
|
|
10
|
+
import zipfile
|
|
11
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
12
|
+
|
|
13
|
+
from allelix.annotators.base import Annotator, LicenseDescriptor
|
|
14
|
+
from allelix.databases.gwas_loader import (
|
|
15
|
+
_CATEGORIZER_VERSION,
|
|
16
|
+
_REQUIRED_GWAS_COLUMNS,
|
|
17
|
+
GWAS_CATALOG_URL,
|
|
18
|
+
GWAS_DB_FILENAME,
|
|
19
|
+
load_gwas_tsv,
|
|
20
|
+
schema_is_current,
|
|
21
|
+
)
|
|
22
|
+
from allelix.databases.manager import (
|
|
23
|
+
_ensure_local_version_tag_column,
|
|
24
|
+
download,
|
|
25
|
+
get_database_info,
|
|
26
|
+
head_request_headers,
|
|
27
|
+
)
|
|
28
|
+
from allelix.models import Annotation
|
|
29
|
+
|
|
30
|
+
# Trait categories that GWASCatalogAnnotator.annotate() skips while trait
# filtering is enabled.
_EXCLUDED_TRAIT_CATEGORIES = frozenset(
    (
        "behavioral",
        "body_measurement",
        "hematological_measurement",
        "lipid_measurement",
        "other_measurement",
    )
)

# rsIDs whose GWAS annotations are flagged ``is_must_include``.
_MUST_INCLUDE_RSIDS = frozenset(
    (
        "rs10737680",  # CFH — age-related macular degeneration
        "rs11209026",  # IL23R — inflammatory bowel disease
        "rs9271366",  # HLA-DRB1 — multiple sclerosis
    )
)
|
|
47
|
+
|
|
48
|
+
if TYPE_CHECKING:
|
|
49
|
+
from pathlib import Path
|
|
50
|
+
|
|
51
|
+
from allelix.models import Variant
|
|
52
|
+
|
|
53
|
+
logger = logging.getLogger(__name__)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _magnitude(p_value: float | None, or_beta: float | None) -> float:
    """Score an association from its p-value, nudged by effect size.

    The p-value selects a base tier (2.0 when it is missing or not below
    any cutoff). A positive ``or_beta`` far from 1 in either direction adds
    a bonus of 0.5 or 1.0, and a boosted score never exceeds 9.0.
    """
    # (exclusive upper bound on p, tier); strictest cutoff first.
    tiers = (
        (5e-100, 8.0),
        (5e-20, 7.0),
        (5e-8, 6.0),
        (5e-6, 4.0),
        (5e-4, 3.0),
    )
    score = 2.0
    if p_value is not None:
        for cutoff, tier in tiers:
            if p_value < cutoff:
                score = tier
                break

    # Missing or non-positive effect sizes leave the base tier untouched.
    if or_beta is None or not or_beta > 0:
        return score

    if or_beta >= 3.0 or or_beta <= 0.33:
        bonus = 1.0
    elif or_beta >= 2.0 or or_beta <= 0.5:
        bonus = 0.5
    else:
        return score
    return min(score + bonus, 9.0)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Magnitude ceiling applied in GWASCatalogAnnotator.annotate() to associations
# whose study did not specify a risk allele (ADR-0024).
_UNKNOWN_RISK_ALLELE_MAG_CAP = 3.0
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class GWASCatalogAnnotator(Annotator):
    """Annotates variants with GWAS Catalog trait associations."""

    name: ClassVar[str] = "gwas"
    display_name: ClassVar[str] = "GWAS Catalog"
    attribution: ClassVar[str] = "GWAS Catalog"
    requires_download: ClassVar[bool] = True
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="custom-embl-ebi",
        license_url="https://www.ebi.ac.uk/gwas/docs/about",
        attribution_text=(
            "GWAS Catalog data sourced from NHGRI-EBI GWAS Catalog,"
            " available under EMBL-EBI Terms of Use."
        ),
        source_url="https://www.ebi.ac.uk/gwas/",
        commercial_ok=True,
    )

    def __init__(self, data_dir: Path, *, filter_traits: bool = True) -> None:
        """Initialize with path to the data directory.

        When ``filter_traits`` is true, ``annotate()`` skips associations
        whose ``trait_category`` is in ``_EXCLUDED_TRAIT_CATEGORIES``.
        """
        super().__init__(data_dir)
        self._db_path = data_dir / GWAS_DB_FILENAME
        self._conn: sqlite3.Connection | None = None
        self._filter_traits = filter_traits

    def _connection(self) -> sqlite3.Connection:
        """Return the cached SQLite connection, opening it on first use.

        Raises:
            FileNotFoundError: if the GWAS cache file does not exist.
        """
        if self._conn is None:
            # sqlite3.connect() silently creates an empty database at a
            # missing path; fail loudly instead of leaving a stray file that
            # later errors with "no such table".
            if not self._db_path.exists():
                raise FileNotFoundError(
                    f"GWAS Catalog cache not found at {self._db_path}. "
                    "Run `allelix db update` first."
                )
            self._conn = sqlite3.connect(self._db_path)
        return self._conn

    def setup(self) -> None:
        """Download GWAS Catalog associations ZIP, extract TSV, and ingest.

        Raises:
            RuntimeError: if no remote freshness signal can be obtained, or
                the downloaded archive contains no ``.tsv`` member.
        """
        url = GWAS_CATALOG_URL
        signal = self.fetch_remote_signal()
        if signal is None:
            msg = (
                "gwas: cannot verify remote freshness signal. "
                "Refresh aborted to avoid persisting an incomplete cache stamp. "
                "Retry, or pass --force if you accept that next `db update` "
                "will re-download to re-establish the signal."
            )
            raise RuntimeError(msg)
        zip_path = self.data_dir / "gwas_catalog_associations.zip"
        tsv_path = self.data_dir / "gwas_catalog_associations.tsv"
        # No content-hash verification: EBI publishes no checksum for this
        # file and the content is mutable, so there is nothing to pin or
        # fetch. TLS + Content-Length truncation guard only. See ADR-0029.
        download(url, zip_path)
        try:
            with zipfile.ZipFile(zip_path) as zf:
                tsv_names = [n for n in zf.namelist() if n.endswith(".tsv")]
                if not tsv_names:
                    msg = f"No .tsv file found in {zip_path}"
                    raise RuntimeError(msg)
                zf.extract(tsv_names[0], self.data_dir)
            extracted = self.data_dir / tsv_names[0]
            if extracted != tsv_path:
                # replace(), not rename(): a TSV from an earlier run may still
                # be on disk, and rename() raises FileExistsError on Windows
                # when the target already exists.
                extracted.replace(tsv_path)
            load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
        finally:
            try:
                zip_path.unlink()
            except FileNotFoundError:
                pass
            except OSError:
                logger.warning("Could not remove staged file at %s", zip_path)

    def is_ready(self) -> bool:
        """Return True when the local GWAS cache exists and has current schema.

        Handles three states:

        1. Current tag in ``local_version_tag`` — ready.
        2. No tag (legacy cache or ``|cv:`` still in ``remote_signal``) —
           one-shot migration via ``_stamp_existing_gwas_cache``.
        3. Stale tag (categorizer bumped) — auto-reingest from cached TSV
           if still on disk.
        """
        info = get_database_info(self._db_path, "gwas")
        if info is None:
            return False
        tag = info.get("local_version_tag") or ""
        if tag == f"cv:{_CATEGORIZER_VERSION}":
            return _has_current_gwas_columns(self._db_path)
        if not tag and _stamp_existing_gwas_cache(self._db_path):
            return _has_current_gwas_columns(self._db_path)
        tsv_path = self.data_dir / "gwas_catalog_associations.tsv"
        if tsv_path.exists():
            print(
                "GWAS categorizer changed — re-ingesting from cached TSV...",
                flush=True,
            )
            try:
                load_gwas_tsv(
                    tsv_path,
                    self._db_path,
                    source_url=GWAS_CATALOG_URL,
                    remote_signal=self.cached_remote_signal(),
                )
            except Exception:
                # Best-effort refresh: report "not ready" rather than crash.
                logger.warning("Auto-reingest from cached TSV failed", exc_info=True)
                return False
            return schema_is_current(self._db_path)
        return False

    def version(self) -> str | None:
        """Return the cached database version string, or None."""
        info = get_database_info(self._db_path, "gwas")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Return the number of cached GWAS association records, or None."""
        info = get_database_info(self._db_path, "gwas")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection if open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Probe the GWAS Catalog URL for ETag or Last-Modified.

        Returns ``"etag:<value>"`` when an ETag is present, otherwise
        ``"lm:<value>"`` for Last-Modified, otherwise None.
        """
        headers = head_request_headers(GWAS_CATALOG_URL)
        if headers is None:
            return None
        # HTTP field names are case-insensitive and some servers send them in
        # lowercase, so normalise before looking them up.
        # NOTE(review): assumes the headers object exposes .items() — confirm.
        lowered = {str(key).lower(): value for key, value in headers.items()}
        etag = lowered.get("etag")
        if etag:
            return f"etag:{etag.strip()}"
        last_modified = lowered.get("last-modified")
        if last_modified:
            return f"lm:{last_modified.strip()}"
        return None

    def cached_remote_signal(self) -> str | None:
        """Return the remote signal stored during the last successful ingest."""
        info = get_database_info(self._db_path, "gwas")
        if not info:
            return None
        # Empty or missing signals are reported as None.
        return info["remote_signal"] or None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Return GWAS Catalog annotations for variants the user carries.

        Carrier matching uses the risk allele when specified. When the
        GWAS entry doesn't specify a risk allele, the annotation fires
        on rsid match alone with a magnitude penalty.
        """
        if variant.is_no_call:
            return []

        sql = (
            "SELECT risk_allele, trait, p_value, or_beta, gene, "
            "study_accession, pubmed_id, trait_category "
            "FROM gwas_associations WHERE rsid = ?"
        )
        rows = self._connection().execute(sql, (variant.rsid,)).fetchall()
        annotations: list[Annotation] = []
        user_diploid = _user_diploid(variant)
        # Per-variant, not per-row: compute once outside the loop.
        must_include = variant.rsid in _MUST_INCLUDE_RSIDS

        for row in rows:
            (
                risk_allele,
                trait,
                p_value,
                or_beta,
                gene,
                study_accession,
                pubmed_id,
                trait_category,
            ) = row

            if self._filter_traits and trait_category in _EXCLUDED_TRAIT_CATEGORIES:
                continue

            if risk_allele is not None:
                # Carrier rule: at least one of the user's alleles must be
                # the study's risk allele.
                if variant.allele1 != risk_allele and variant.allele2 != risk_allele:
                    continue
                mag = _magnitude(p_value, or_beta)
                risk_note = ""
            else:
                # ADR-0024: unknown risk allele fires on rsID match alone
                # but capped at 3.0 so it doesn't pass typical --min-magnitude
                # thresholds. Without knowing which allele is the risk allele,
                # we can't apply the carrier rule (ADR-0007).
                mag = min(_magnitude(p_value, or_beta), _UNKNOWN_RISK_ALLELE_MAG_CAP)
                risk_note = " (risk allele not specified in study)"

            p_str = f"p={p_value:.1e}" if p_value is not None else "p=N/A"
            gene_str = gene or "—"
            description = f"GWAS Catalog: {trait} ({p_str}, gene: {gene_str}){risk_note}"

            references: list[str] = []
            if pubmed_id:
                references.append(f"pubmed:{pubmed_id}")
            if study_accession:
                references.append(f"gwas:{study_accession}")

            annotations.append(
                Annotation(
                    source=self.name,
                    rsid=variant.rsid,
                    significance="gwas_association",
                    category="trait",
                    magnitude=mag,
                    description=description,
                    attribution=self.attribution,
                    genotype_match=user_diploid,
                    references=references,
                    condition=trait,
                    gene=gene or "",
                    alt="",
                    is_must_include=must_include,
                )
            )
        return annotations
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _user_diploid(variant: Variant) -> str:
    """Render the user's genotype as a single string.

    Two single-character alleles are sorted and concatenated (``"AG"``).
    Anything else is returned as ``"allele1/allele2"`` in call order.
    """
    first, second = variant.allele1, variant.allele2
    if len(first) != 1 or len(second) != 1:
        return f"{first}/{second}"
    return "".join(sorted((first, second)))
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _has_current_gwas_columns(db_path: Path) -> bool:
    """Report whether ``gwas_associations`` has every required column.

    Returns False when the database cannot be read.
    """
    try:
        with contextlib.closing(sqlite3.connect(db_path)) as handle:
            table_info = handle.execute("PRAGMA table_info(gwas_associations)").fetchall()
    except sqlite3.DatabaseError:
        return False
    # The second field of each table_info row is the column name.
    present = {column[1] for column in table_info}
    return _REQUIRED_GWAS_COLUMNS.issubset(present)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _stamp_existing_gwas_cache(db_path: Path) -> bool:
    """One-shot migration: stamp ``local_version_tag`` on a GWAS cache.

    Legacy caches may carry ``|cv:N`` inside ``remote_signal``; that suffix
    is stripped and the current categorizer tag is written to
    ``local_version_tag``. A row that already has a tag is never rewritten.

    Returns True if the current categorizer version is stamped on the row
    when this function finishes. Returns False otherwise, including when the
    file or row is missing or SQLite reports an error.
    """
    if not db_path.exists():
        return False
    current_tag = f"cv:{_CATEGORIZER_VERSION}"
    try:
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            _ensure_local_version_tag_column(conn)
            record = conn.execute(
                "SELECT remote_signal, local_version_tag FROM database_versions WHERE name='gwas'"
            ).fetchone()
            if not record:
                return False
            stored_signal, stored_tag = record
            if stored_tag is not None:
                # Already stamped: succeed only if the tag is the current one.
                return stored_tag == current_tag
            # Drop everything from the first "|cv:" onwards.
            signal_without_cv = (stored_signal or "").partition("|cv:")[0]
            conn.execute(
                "UPDATE database_versions "
                "SET remote_signal = ?, local_version_tag = ? "
                "WHERE name = 'gwas'",
                (signal_without_cv, current_tag),
            )
            conn.commit()
            return True
    except sqlite3.DatabaseError:
        # Also covers sqlite3.OperationalError, which is a subclass.
        return False
|