allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""CPIC API → per-allele function lookup.
|
|
4
|
+
|
|
5
|
+
CPIC (Clinical Pharmacogenetics Implementation Consortium) publishes
|
|
6
|
+
structured per-allele functional status assignments via its public
|
|
7
|
+
PostgREST API. Each allele (haplotype or single variant) carries a
|
|
8
|
+
`clinicalfunctionalstatus` chosen from a small enumeration:
|
|
9
|
+
|
|
10
|
+
Normal function | Decreased function | No function |
|
|
11
|
+
Increased function | Uncertain function | (gene-specific tags)
|
|
12
|
+
|
|
13
|
+
The PharmGKB filter is a join: for an annotation row matching the
|
|
14
|
+
user's `(rsid, genotype)`, look up each base of the user's genotype
|
|
15
|
+
in the per-allele function table. If both alleles map to
|
|
16
|
+
`Normal function`, the row is a non-finding (the user does not carry
|
|
17
|
+
the studied variant). Otherwise the row emits.
|
|
18
|
+
|
|
19
|
+
ADR-0020 documents this as the canonical structured source. Three
|
|
20
|
+
CPIC tables are joined client-side:
|
|
21
|
+
|
|
22
|
+
sequence_location.dbsnpid (rsid)
|
|
23
|
+
↔ allele_location_value.locationid
|
|
24
|
+
↔ allele_location_value.alleledefinitionid
|
|
25
|
+
↔ allele.definitionid
|
|
26
|
+
→ allele.clinicalfunctionalstatus
|
|
27
|
+
|
|
28
|
+
Result: `(rsid, base) → function_class` for every CPIC-curated
|
|
29
|
+
variant. Genes outside CPIC's scope have no entries; the filter
|
|
30
|
+
treats absence as "no opinion" and emits the row.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import json
|
|
36
|
+
import logging
|
|
37
|
+
import time
|
|
38
|
+
import urllib.error
|
|
39
|
+
import urllib.request
|
|
40
|
+
|
|
41
|
+
from allelix import __version__
|
|
42
|
+
|
|
43
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Root URL that every CPIC endpoint path in this module is appended to.
CPIC_API_BASE = "https://api.cpicpgx.org/v1"
# Timeout (seconds) passed to `urllib.request.urlopen` for CPIC requests.
CPIC_TIMEOUT_SECONDS = 60
# Upper bound of the `Range: 0-N` header sent with bulk table fetches.
CPIC_MAX_ROWS = 99_999  # PostgREST defaults to limit=1000 without explicit Range.

# M-1: retry on transient failures. CPIC's PostgREST API is generally
# reliable but TCP RSTs and brief 5xx blips do happen; one retry burst
# saves the user from a manual `db update --force` rerun. Backoff is
# capped low because the loader runs interactively at db-update time.
CPIC_RETRY_ATTEMPTS = 3
# Sleep (seconds) before the next try, indexed by the zero-based number of
# the attempt that just failed. No sleep follows the final attempt.
CPIC_RETRY_BACKOFF_SECONDS: tuple[float, ...] = (1.0, 2.0, 4.0)

# Sent as the `User-Agent` header on every CPIC request.
USER_AGENT = f"allelix/{__version__} (+https://github.com/dial481/allelix)"

# CPIC's clinicalfunctionalstatus enumeration → Allelix's function_class enum.
# Anything not mapped here is treated as "not Normal" (i.e., a variant) and
# the row containing that allele emits. We never coerce an unknown status
# into Normal - silent suppression is the failure mode v0.5-v0.8 kept
# producing.
FUNCTION_CLASS_NORMAL = "normal"
FUNCTION_CLASS_DECREASED = "decreased"
FUNCTION_CLASS_NO_FUNCTION = "no_function"
FUNCTION_CLASS_INCREASED = "increased"
FUNCTION_CLASS_UNCERTAIN = "uncertain"

# Keys are lowercase because `_classify_cpic_status` strips and lowercases
# the CPIC string before looking it up here. Note that "possibly increased
# function" is folded into the same class as "increased function".
_CPIC_TO_FUNCTION_CLASS: dict[str, str] = {
    "normal function": FUNCTION_CLASS_NORMAL,
    "decreased function": FUNCTION_CLASS_DECREASED,
    "no function": FUNCTION_CLASS_NO_FUNCTION,
    "increased function": FUNCTION_CLASS_INCREASED,
    "possibly increased function": FUNCTION_CLASS_INCREASED,
    "uncertain function": FUNCTION_CLASS_UNCERTAIN,
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _classify_cpic_status(status: str | None) -> str | None:
|
|
80
|
+
"""Map a CPIC `clinicalfunctionalstatus` string to a function_class enum.
|
|
81
|
+
|
|
82
|
+
Returns None for empty/unrecognized strings — the caller skips the
|
|
83
|
+
row rather than guessing.
|
|
84
|
+
"""
|
|
85
|
+
if not status:
|
|
86
|
+
return None
|
|
87
|
+
return _CPIC_TO_FUNCTION_CLASS.get(status.strip().lower())
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def fetch_cpic_remote_signal(api_base: str = CPIC_API_BASE) -> str | None:
    """Return a freshness signal for CPIC's data, or None on failure.

    M-2: PharmGKB's bulk-download Last-Modified header tells us nothing
    about CPIC's curation database. CPIC publishes a `change_log` table
    with one row per curated change; the most recent date is a stable,
    cheap freshness proxy. The signal format is `lastchange:{date}`.

    Never retries — this is the lightweight probe used at `db update`
    freshness-check time. Persistent CPIC outages should NOT block the
    user; returning None signals "can't verify" and the CLI prints
    "pass --force to refresh anyway" rather than aborting.

    Args:
        api_base: Root URL of the CPIC PostgREST API.

    Returns:
        ``"lastchange:<date>"`` built from the newest `change_log` row, or
        ``None`` when the request fails, the body cannot be decoded or
        parsed, or the payload is not a non-empty list whose first item is
        a dict with a truthy ``date``.
    """
    # Function-scope import: only the exception type is needed, and it keeps
    # this block self-contained without touching the module import list.
    import http.client

    url = f"{api_base}/change_log?select=date&order=date.desc&limit=1"
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": USER_AGENT,
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=CPIC_TIMEOUT_SECONDS) as response:
            rows = json.loads(response.read().decode("utf-8"))
    except (
        OSError,  # URLError/HTTPError, timeouts, resets while reading the body
        http.client.HTTPException,  # e.g. IncompleteRead on a cut-off body
        json.JSONDecodeError,
        UnicodeDecodeError,
    ) as exc:
        # HTTPError carries an open response object; release it if present.
        if hasattr(exc, "close"):
            exc.close()
        logger.warning("CPIC freshness probe failed: %s", exc)
        return None
    # Check the container type first: indexing a dict or scalar payload with
    # [0] would raise instead of yielding the promised None.
    if not isinstance(rows, list) or not rows or not isinstance(rows[0], dict):
        return None
    date = rows[0].get("date")
    if not date:
        return None
    return f"lastchange:{date}"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _http_get_json(url: str, timeout: float = CPIC_TIMEOUT_SECONDS) -> list[dict]:
    """Fetch a CPIC PostgREST endpoint and return the JSON body.

    Sends `Range: 0-N` to bypass PostgREST's default 1000-row cap.
    Retries up to `CPIC_RETRY_ATTEMPTS` times (at least one attempt is
    always made) with the backoff schedule in `CPIC_RETRY_BACKOFF_SECONDS`
    (M-1). Raises the last exception after the final attempt so the caller
    surfaces a clear failure instead of silently producing an empty lookup.

    Retried failures are: any `OSError` (which includes `URLError`,
    `HTTPError`, timeouts and connection resets raised while the body is
    being read), `http.client.HTTPException` (e.g. a truncated body), and
    body decode/parse errors.

    Args:
        url: Full endpoint URL including the query string.
        timeout: Per-attempt timeout in seconds passed to `urlopen`.

    Returns:
        The decoded JSON payload, returned as-is without shape validation.

    Raises:
        OSError, http.client.HTTPException, json.JSONDecodeError,
        UnicodeDecodeError: the error from the final attempt.
    """
    # Function-scope import: only the exception type is needed, and it keeps
    # this block self-contained without touching the module import list.
    import http.client

    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": USER_AGENT,
            "Accept": "application/json",
            "Range-Unit": "items",
            "Range": f"0-{CPIC_MAX_ROWS}",
        },
    )
    # Guarantee one attempt even if the constant is misconfigured to <= 0.
    attempts = max(1, CPIC_RETRY_ATTEMPTS)
    schedule = CPIC_RETRY_BACKOFF_SECONDS
    last_error: Exception | None = None
    for attempt in range(attempts):
        try:
            with urllib.request.urlopen(request, timeout=timeout) as response:
                payload = json.loads(response.read().decode("utf-8"))
        except (
            OSError,
            http.client.HTTPException,
            json.JSONDecodeError,
            UnicodeDecodeError,
        ) as exc:
            # HTTPError carries an open response object; release it if present.
            if hasattr(exc, "close"):
                exc.close()
            last_error = exc
            if attempt + 1 < attempts:
                # Reuse the last schedule entry when there are more attempts
                # than entries, rather than failing with IndexError.
                backoff = schedule[min(attempt, len(schedule) - 1)] if schedule else 0.0
                logger.warning(
                    "CPIC fetch failed (attempt %d/%d): %s — retrying in %.1fs",
                    attempt + 1,
                    attempts,
                    exc,
                    backoff,
                )
                time.sleep(backoff)
        else:
            # `Range: 0-N` is inclusive, so N + 1 rows means the window was
            # filled completely and the table may hold more rows than we got.
            if isinstance(payload, list) and len(payload) > CPIC_MAX_ROWS:
                logger.warning(
                    "CPIC response for %s filled the %d-row Range window; "
                    "results may be truncated",
                    url,
                    CPIC_MAX_ROWS + 1,
                )
            return payload
    if last_error is None:
        # Defensive: the loop always runs and either returns or records an error.
        raise RuntimeError("CPIC fetch finished without a result or an error")
    raise last_error
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def fetch_cpic_allele_functions(
    api_base: str = CPIC_API_BASE,
) -> dict[tuple[str, str], str]:
    """Download three CPIC tables and join them into `(rsid, base) → function_class`.

    ADR-0020: the result is the data source for the PharmGKB non-finding
    filter. The join is done client-side over:

    - `sequence_location` (id, dbsnpid), restricted to rows with a dbsnpid
    - `allele_location_value` (alleledefinitionid, locationid, variantallele)
    - `allele` (definitionid, clinicalfunctionalstatus)

    Rows are dropped when the location has no rsid, when the allele's
    status is not recognised by `_classify_cpic_status`, or when the
    variant allele is not exactly one of A/C/G/T (multi-base haplotype
    components do not apply to the SNV genotype-matching path, ADR-0009).

    Network errors from `_http_get_json` (`urllib.error.URLError` or
    similar) propagate unchanged; the caller chooses between aborting
    `db update` and falling back to a cached lookup.
    """
    # Fetch order: sequence_location, allele_location_value, allele.
    sequence_locations = _http_get_json(
        f"{api_base}/sequence_location?dbsnpid=not.is.null&select=id,dbsnpid"
    )
    location_values = _http_get_json(
        f"{api_base}/allele_location_value?select=alleledefinitionid,locationid,variantallele"
    )
    alleles = _http_get_json(
        f"{api_base}/allele?select=definitionid,clinicalfunctionalstatus"
    )

    # sequence_location.id → rsid
    rsid_by_location: dict[int, str] = {}
    for seq_row in sequence_locations:
        location_id = seq_row.get("id")
        dbsnp_id = seq_row.get("dbsnpid")
        if location_id is None or not dbsnp_id:
            continue
        rsid_by_location[location_id] = dbsnp_id

    # allele.definitionid → function_class (unrecognised statuses are left out)
    function_by_definition: dict[int, str] = {}
    for allele_row in alleles:
        definition_id = allele_row.get("definitionid")
        classified = _classify_cpic_status(allele_row.get("clinicalfunctionalstatus"))
        if definition_id is None or classified is None:
            continue
        function_by_definition[definition_id] = classified

    lookup: dict[tuple[str, str], str] = {}
    for value_row in location_values:
        rsid = rsid_by_location.get(value_row.get("locationid"))
        function_class = function_by_definition.get(value_row.get("alleledefinitionid"))
        base = (value_row.get("variantallele") or "").strip().upper()
        if function_class is None or not rsid:
            continue
        if base not in ("A", "C", "G", "T"):
            continue
        # Conflict policy: the first classification seen for a (rsid, base)
        # wins, except that a stored Normal is replaced by a later
        # non-Normal one. Suppression downstream requires every base to be
        # Normal, so on a Normal-vs-non-Normal disagreement the safe choice
        # is to keep the variant classification and let the row emit.
        key = (rsid, base)
        current = lookup.get(key)
        if current is None or (
            current == FUNCTION_CLASS_NORMAL and function_class != FUNCTION_CLASS_NORMAL
        ):
            lookup[key] = function_class
    return lookup
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""gnomAD exome frequency cache loader.
|
|
4
|
+
|
|
5
|
+
The pre-built SQLite cache is downloaded from HuggingFace during
|
|
6
|
+
``db update``. Contains all ~16M exome rsIDs from gnomAD v4.1 with
|
|
7
|
+
genomic coordinates (chrom/pos/ref/alt) for future AlphaMissense/CADD
|
|
8
|
+
integration.
|
|
9
|
+
|
|
10
|
+
The cache can also be built locally from gnomAD exome VCFs via
|
|
11
|
+
``scripts/build_gnomad_cache.py`` (streaming or local file mode).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
from allelix.databases._versions import GNOMAD_SCHEMA_VERSION
|
|
19
|
+
from allelix.databases.loader_utils import install_prebuilt_gz_cache
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
# File name used for the gnomAD SQLite cache.
GNOMAD_DB_FILENAME = "gnomad.sqlite"

# Download URL of the gzipped pre-built cache on HuggingFace.
# NOTE(review): the path segment after /resolve/ looks like a pinned revision
# hash, which would make the download immutable — confirm.
GNOMAD_CACHE_URL = (
    "https://huggingface.co/datasets/dial481/allelix-gnomad"
    "/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
)

# NOTE(review): presumably the SHA-256 the downloader verifies for the file at
# GNOMAD_CACHE_URL; it is not used in this module — verify against the caller.
GNOMAD_EXPECTED_SHA256 = "e001b6c472b89075f18c82a34ccfb1e8e5c524f8502b988db1a546d25b0c6fe4"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def install_prebuilt_cache(
    gz_path: Path,
    db_path: Path,
    *,
    source_url: str = "",
    remote_signal: str | None = None,
) -> None:
    """Install the gzipped pre-built gnomAD SQLite cache at ``db_path``.

    Thin wrapper over `install_prebuilt_gz_cache`: it forwards both paths
    and the keyword arguments unchanged, fixes the database name to
    ``"gnomad"``, and supplies a schema tag of the form ``sv:<version>``
    built from `GNOMAD_SCHEMA_VERSION`.
    """
    schema_tag = f"sv:{GNOMAD_SCHEMA_VERSION}"
    install_prebuilt_gz_cache(
        gz_path,
        db_path,
        "gnomad",
        source_url=source_url,
        remote_signal=remote_signal,
        schema_version_tag=schema_tag,
    )
|