allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,234 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """CPIC API → per-allele function lookup.
4
+
5
+ CPIC (Clinical Pharmacogenetics Implementation Consortium) publishes
6
+ structured per-allele functional status assignments via its public
7
+ PostgREST API. Each allele (haplotype or single variant) carries a
8
+ `clinicalfunctionalstatus` chosen from a small enumeration:
9
+
10
+ Normal function | Decreased function | No function |
11
+ Increased function | Uncertain function | (gene-specific tags)
12
+
13
+ The PharmGKB filter is a join: for an annotation row matching the
14
+ user's `(rsid, genotype)`, look up each base of the user's genotype
15
+ in the per-allele function table. If both alleles map to
16
+ `Normal function`, the row is a non-finding (the user does not carry
17
+ the studied variant). Otherwise the row emits.
18
+
19
+ ADR-0020 documents this as the canonical structured source. Three
20
+ CPIC tables are joined client-side:
21
+
22
+ sequence_location.dbsnpid (rsid)
23
+ ↔ allele_location_value.locationid
24
+ ↔ allele_location_value.alleledefinitionid
25
+ ↔ allele.definitionid
26
+ → allele.clinicalfunctionalstatus
27
+
28
+ Result: `(rsid, base) → function_class` for every CPIC-curated
29
+ variant. Genes outside CPIC's scope have no entries; the filter
30
+ treats absence as "no opinion" and emits the row.
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import json
36
+ import logging
37
+ import time
38
+ import urllib.error
39
+ import urllib.request
40
+
41
+ from allelix import __version__
42
+
43
logger = logging.getLogger(__name__)

# Root of CPIC's public PostgREST API; every endpoint URL in this module is
# built by appending a table name and query string to it.
CPIC_API_BASE = "https://api.cpicpgx.org/v1"
# Per-request timeout handed to `urllib.request.urlopen`.
CPIC_TIMEOUT_SECONDS = 60
# Upper bound of the `Range: 0-N` header sent by `_http_get_json`.
CPIC_MAX_ROWS = 99_999  # PostgREST defaults to limit=1000 without explicit Range.

# M-1: retry on transient failures. CPIC's PostgREST API is generally
# reliable but TCP RSTs and brief 5xx blips do happen; one retry burst
# saves the user from a manual `db update --force` rerun. Backoff is
# capped low because the loader runs interactively at db-update time.
CPIC_RETRY_ATTEMPTS = 3
# Indexed by zero-based attempt number. No sleep follows the final attempt,
# so only the first `CPIC_RETRY_ATTEMPTS - 1` entries are consumed.
CPIC_RETRY_BACKOFF_SECONDS: tuple[float, ...] = (1.0, 2.0, 4.0)

# Sent as the `User-Agent` header on every CPIC request.
USER_AGENT = f"allelix/{__version__} (+https://github.com/dial481/allelix)"

# CPIC's clinicalfunctionalstatus enumeration → Allelix's function_class enum.
# Anything not mapped here is treated as "not Normal" (i.e., a variant) and
# the row containing that allele emits. We never coerce an unknown status
# into Normal - silent suppression is the failure mode v0.5-v0.8 kept
# producing.
FUNCTION_CLASS_NORMAL = "normal"
FUNCTION_CLASS_DECREASED = "decreased"
FUNCTION_CLASS_NO_FUNCTION = "no_function"
FUNCTION_CLASS_INCREASED = "increased"
FUNCTION_CLASS_UNCERTAIN = "uncertain"

# Keys are lower-case because `_classify_cpic_status` strips and lower-cases
# the incoming status before the lookup. Note that "possibly increased
# function" is folded into the same class as "increased function".
_CPIC_TO_FUNCTION_CLASS: dict[str, str] = {
    "normal function": FUNCTION_CLASS_NORMAL,
    "decreased function": FUNCTION_CLASS_DECREASED,
    "no function": FUNCTION_CLASS_NO_FUNCTION,
    "increased function": FUNCTION_CLASS_INCREASED,
    "possibly increased function": FUNCTION_CLASS_INCREASED,
    "uncertain function": FUNCTION_CLASS_UNCERTAIN,
}
77
+
78
+
79
def _classify_cpic_status(status: str | None) -> str | None:
    """Translate a CPIC `clinicalfunctionalstatus` into a function_class value.

    The lookup ignores letter case and surrounding whitespace. A missing,
    empty, or unmapped status yields None, leaving the caller to drop the
    allele rather than guess at a classification.
    """
    lookup_key = status.strip().lower() if status else None
    return None if lookup_key is None else _CPIC_TO_FUNCTION_CLASS.get(lookup_key)
88
+
89
+
90
def fetch_cpic_remote_signal(api_base: str = CPIC_API_BASE) -> str | None:
    """Return a freshness signal for CPIC's data, or None on failure.

    M-2: PharmGKB's bulk-download Last-Modified header tells us nothing
    about CPIC's curation database. CPIC publishes a `change_log` table
    with one row per curated change; the most recent date is a stable,
    cheap freshness proxy. The signal format is `lastchange:{date}`.

    Never retries — this is the lightweight probe used at `db update`
    freshness-check time. Persistent CPIC outages should NOT block the
    user; returning None signals "can't verify" and the CLI prints
    "pass --force to refresh anyway" rather than aborting.

    Args:
        api_base: Root URL of the CPIC PostgREST API.

    Returns:
        `lastchange:{date}` built from the newest `change_log` row, or
        None when the request fails, the body cannot be decoded, or the
        payload is not a non-empty list whose first item is a dict with a
        truthy `date`.
    """
    # Function-scope import so the broader exception tuple below does not
    # depend on a change to the module's import block.
    import http.client

    url = f"{api_base}/change_log?select=date&order=date.desc&limit=1"
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": USER_AGENT,
            "Accept": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=CPIC_TIMEOUT_SECONDS) as response:
            rows = json.loads(response.read().decode("utf-8"))
    except (OSError, http.client.HTTPException, ValueError) as exc:
        # OSError covers URLError/HTTPError, TimeoutError and connection
        # resets raised while reading the response (those are NOT wrapped in
        # URLError). HTTPException covers truncated or malformed responses.
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        if hasattr(exc, "close"):
            # HTTPError carries the response object; release it.
            exc.close()
        logger.warning("CPIC freshness probe failed: %s", exc)
        return None
    # A JSON object or scalar (e.g. an error document) must not reach the
    # `rows[0]` lookup, which would raise instead of returning None.
    if not isinstance(rows, list) or not rows or not isinstance(rows[0], dict):
        return None
    date = rows[0].get("date")
    if not date:
        return None
    return f"lastchange:{date}"
125
+
126
+
127
def _http_get_json(url: str, timeout: float = CPIC_TIMEOUT_SECONDS) -> list[dict]:
    """Fetch a CPIC PostgREST endpoint and return the JSON body.

    Sends `Range: 0-N` to bypass PostgREST's default 1000-row cap.
    Retries up to `CPIC_RETRY_ATTEMPTS` times with exponential backoff
    on transient transport failures (M-1). Re-raises the last exception
    after the final attempt so the caller surfaces a clear failure
    instead of silently producing an empty lookup.

    Args:
        url: Fully-formed endpoint URL, including any query string.
        timeout: Per-attempt timeout in seconds passed to `urlopen`.

    Returns:
        The decoded JSON payload. It is not validated; the annotation
        reflects what the callers in this module expect.

    Raises:
        OSError: Transport failure on the final attempt. This includes
            `urllib.error.URLError`/`HTTPError`, `TimeoutError` and
            connection resets raised while reading the response.
        http.client.HTTPException: Truncated or malformed HTTP response
            on the final attempt.
        ValueError: Body that is not valid UTF-8 or not valid JSON
            (`UnicodeDecodeError` / `json.JSONDecodeError`) on the final
            attempt.
    """
    # Function-scope import so the broader exception tuple below does not
    # depend on a change to the module's import block.
    import http.client

    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": USER_AGENT,
            "Accept": "application/json",
            "Range-Unit": "items",
            "Range": f"0-{CPIC_MAX_ROWS}",
        },
    )
    # Always make at least one attempt, even if the constant is set to 0.
    attempts = max(1, CPIC_RETRY_ATTEMPTS)
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(request, timeout=timeout) as response:
                return json.loads(response.read().decode("utf-8"))
        except (OSError, http.client.HTTPException, ValueError) as exc:
            # urlopen wraps only request-phase socket errors in URLError; a
            # reset or truncation during getresponse()/read() arrives as a
            # bare OSError or HTTPException, so catch those to retry as well.
            if hasattr(exc, "close"):
                # HTTPError carries the response object; release it.
                exc.close()
            if attempt == attempts:
                # Bare raise keeps the original traceback and does not rely
                # on an `assert`, which is stripped under `python -O`.
                raise
            if CPIC_RETRY_BACKOFF_SECONDS:
                # Clamp so a retry count larger than the schedule reuses the
                # last delay instead of raising IndexError.
                step = min(attempt - 1, len(CPIC_RETRY_BACKOFF_SECONDS) - 1)
                backoff = CPIC_RETRY_BACKOFF_SECONDS[step]
            else:
                backoff = 0.0
            logger.warning(
                "CPIC fetch failed (attempt %d/%d): %s — retrying in %.1fs",
                attempt,
                attempts,
                exc,
                backoff,
            )
            time.sleep(backoff)
    # Every iteration returns, re-raises, or continues to a later attempt.
    raise RuntimeError("unreachable: CPIC retry loop exited without a result")
166
+
167
+
168
def fetch_cpic_allele_functions(
    api_base: str = CPIC_API_BASE,
) -> dict[tuple[str, str], str]:
    """Assemble the `(rsid, base) → function_class` table from CPIC's API.

    ADR-0020: this table is the data source for the PharmGKB non-finding
    filter. Three CPIC tables are downloaded and joined in memory:

    - `sequence_location` (id, dbsnpid)
    - `allele_location_value` (alleledefinitionid, locationid, variantallele)
    - `allele` (definitionid, clinicalfunctionalstatus)

    Only single-base alleles (A/C/G/T) make it into the result; the
    multi-base haplotype components CPIC also stores are irrelevant to
    the SNV genotype-matching path (ADR-0009).

    Any exception raised by `_http_get_json` (e.g.
    `urllib.error.URLError`) propagates unchanged. Whether to abort
    `db update` or fall back to a cached lookup is the caller's call.
    """
    # Downloads happen in this fixed order; each one may raise.
    sequence_rows = _http_get_json(
        f"{api_base}/sequence_location?dbsnpid=not.is.null&select=id,dbsnpid"
    )
    value_rows = _http_get_json(
        f"{api_base}/allele_location_value?select=alleledefinitionid,locationid,variantallele"
    )
    allele_rows = _http_get_json(
        f"{api_base}/allele?select=definitionid,clinicalfunctionalstatus"
    )

    # sequence_location.id → rsid
    rsid_by_location: dict[int, str] = {}
    for entry in sequence_rows:
        location_id, dbsnp_id = entry.get("id"), entry.get("dbsnpid")
        if location_id is None or not dbsnp_id:
            continue
        rsid_by_location[location_id] = dbsnp_id

    # allele.definitionid → function_class (unmapped statuses are dropped)
    class_by_definition: dict[int, str] = {}
    for entry in allele_rows:
        definition = entry.get("definitionid")
        classified = _classify_cpic_status(entry.get("clinicalfunctionalstatus"))
        if definition is None or classified is None:
            continue
        class_by_definition[definition] = classified

    single_bases = {"A", "C", "G", "T"}
    lookup: dict[tuple[str, str], str] = {}
    for entry in value_rows:
        rsid = rsid_by_location.get(entry.get("locationid"))
        classified = class_by_definition.get(entry.get("alleledefinitionid"))
        base = (entry.get("variantallele") or "").strip().upper()
        if not rsid or classified is None:
            continue
        if base not in single_bases:
            continue
        # Conflict policy: the first classification seen for a key sticks,
        # except that a stored Normal yields to any later classification.
        # Net effect: when CPIC lists the same (rsid, base) as both Normal
        # and non-Normal, the non-Normal value wins, so a real variant is
        # never silently suppressed. Two differing non-Normal values keep
        # whichever was seen first.
        key = (rsid, base)
        current = lookup.get(key)
        if current is None or current == FUNCTION_CLASS_NORMAL:
            lookup[key] = classified
    return lookup
@@ -0,0 +1,49 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """gnomAD exome frequency cache loader.
4
+
5
+ The pre-built SQLite cache is downloaded from HuggingFace during
6
+ ``db update``. Contains all ~16M exome rsIDs from gnomAD v4.1 with
7
+ genomic coordinates (chrom/pos/ref/alt) for future AlphaMissense/CADD
8
+ integration.
9
+
10
+ The cache can also be built locally from gnomAD exome VCFs via
11
+ ``scripts/build_gnomad_cache.py`` (streaming or local file mode).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from typing import TYPE_CHECKING
17
+
18
+ from allelix.databases._versions import GNOMAD_SCHEMA_VERSION
19
+ from allelix.databases.loader_utils import install_prebuilt_gz_cache
20
+
21
+ if TYPE_CHECKING:
22
+ from pathlib import Path
23
+
24
# File name of the gnomAD SQLite cache.
GNOMAD_DB_FILENAME = "gnomad.sqlite"

# Download location of the gzipped pre-built cache. The `/resolve/<hash>/`
# path segment pins the download to one specific dataset revision.
GNOMAD_CACHE_URL = (
    "https://huggingface.co/datasets/dial481/allelix-gnomad"
    "/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
)

# NOTE(review): presumably the SHA-256 of the artifact at GNOMAD_CACHE_URL;
# the code that checks it is not in this module — confirm against the caller.
GNOMAD_EXPECTED_SHA256 = "e001b6c472b89075f18c82a34ccfb1e8e5c524f8502b988db1a546d25b0c6fe4"
32
+
33
+
34
def install_prebuilt_cache(
    gz_path: Path,
    db_path: Path,
    *,
    source_url: str = "",
    remote_signal: str | None = None,
) -> None:
    """Decompress a gzipped pre-built SQLite cache into place.

    Thin gnomAD-specific wrapper: forwards to `install_prebuilt_gz_cache`
    with the cache name `"gnomad"` and a schema tag derived from
    `GNOMAD_SCHEMA_VERSION`.

    Args:
        gz_path: Path of the downloaded `.gz` archive.
        db_path: Destination path for the SQLite cache.
        source_url: Passed through unchanged to the shared installer.
        remote_signal: Passed through unchanged to the shared installer.
    """
    schema_tag = f"sv:{GNOMAD_SCHEMA_VERSION}"
    install_prebuilt_gz_cache(
        gz_path,
        db_path,
        "gnomad",
        source_url=source_url,
        remote_signal=remote_signal,
        schema_version_tag=schema_tag,
    )