allelix 2.0.0__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {allelix-2.0.0 → allelix-2.0.2}/PKG-INFO +13 -1
- {allelix-2.0.0 → allelix-2.0.2}/README.md +12 -0
- allelix-2.0.2/allelix/__init__.py +41 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/base.py +24 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/clinvar.py +21 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/gnomad.py +10 -2
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/gwas.py +22 -7
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/pharmgkb.py +3 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/_helpers.py +44 -3
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/db.py +12 -23
- allelix-2.0.2/allelix/data/high_value_snps.yaml +136 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/_versions.py +1 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/alphamissense_loader.py +1 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/gnomad_loader.py +1 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/gwas_loader.py +25 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/manager.py +21 -4
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/pharmgkb_loader.py +17 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/snpedia_parser.py +17 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/models.py +19 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/_helpers.py +18 -4
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/base.py +8 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/ftdna.py +18 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/livingdna.py +22 -1
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/vcf.py +30 -7
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/_pipeline.py +150 -45
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/diff.py +43 -4
- allelix-2.0.2/allelix/reports/terminal.py +241 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/utils/allele.py +16 -14
- {allelix-2.0.0 → allelix-2.0.2}/allelix/utils/build_detect.py +31 -9
- {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/PKG-INFO +13 -1
- {allelix-2.0.0 → allelix-2.0.2}/pyproject.toml +1 -1
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_cli.py +99 -10
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_end_to_end.py +28 -6
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_models.py +33 -0
- allelix-2.0.2/tests/test_version.py +100 -0
- allelix-2.0.0/allelix/__init__.py +0 -12
- allelix-2.0.0/allelix/data/high_value_snps.yaml +0 -64
- allelix-2.0.0/allelix/reports/terminal.py +0 -205
- allelix-2.0.0/tests/test_version.py +0 -52
- {allelix-2.0.0 → allelix-2.0.2}/LICENSE +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/alphamissense.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/cadd.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/snpedia.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/_options.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/analyze.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/config.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/focused.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/utility.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/compare.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/config.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/data/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/data/clinvar_clnsig_snapshot.yaml +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/cadd_loader.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/cpic_loader.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/loader_utils.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/schema.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/snpedia_loader.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/exporters/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/exporters/plink.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/ancestrydna.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/ftdna_illumina.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/myhappygenes.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/myheritage.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/twentythreeandme.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/py.typed +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/high_value.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/html.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/json_report.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/methylation.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix/utils/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/SOURCES.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/dependency_links.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/entry_points.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/requires.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/top_level.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/setup.cfg +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_cli_helpers.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_compare.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_config.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_mock_data_invariants.py +0 -0
- {allelix-2.0.0 → allelix-2.0.2}/tests/test_registry.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: allelix
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.0.2
|
|
4
4
|
Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
5
5
|
Author: Allelix
|
|
6
6
|
Maintainer-email: dial481 <dial481@users.noreply.github.com>
|
|
@@ -161,6 +161,18 @@ This is not a disclaimer afterthought. It is a design constraint that affects mo
|
|
|
161
161
|
- Reference databases are downloaded via `allelix db update` and cached locally.
|
|
162
162
|
- Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
|
|
163
163
|
|
|
164
|
+
### Output files contain real annotations of your genome
|
|
165
|
+
|
|
166
|
+
The JSON / HTML / terminal output of `allelix analyze` and its
|
|
167
|
+
focused subcommands contains real annotations against your specific
|
|
168
|
+
variants — drug-response calls, carrier-status flags, hereditary-
|
|
169
|
+
disease findings. Wherever you write them via `--output <path>`,
|
|
170
|
+
that's where they sit until you delete them. Allelix doesn't
|
|
171
|
+
auto-clean and won't warn you when you write to `/tmp/` or any
|
|
172
|
+
other shared location. Treat the files as personal data: read them,
|
|
173
|
+
move them somewhere you control, or delete when you're done. A
|
|
174
|
+
data-lifecycle subcommand is planned for v2.1.
|
|
175
|
+
|
|
164
176
|
## Configuration
|
|
165
177
|
|
|
166
178
|
Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.
|
|
@@ -124,6 +124,18 @@ This is not a disclaimer afterthought. It is a design constraint that affects mo
|
|
|
124
124
|
- Reference databases are downloaded via `allelix db update` and cached locally.
|
|
125
125
|
- Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
|
|
126
126
|
|
|
127
|
+
### Output files contain real annotations of your genome
|
|
128
|
+
|
|
129
|
+
The JSON / HTML / terminal output of `allelix analyze` and its
|
|
130
|
+
focused subcommands contains real annotations against your specific
|
|
131
|
+
variants — drug-response calls, carrier-status flags, hereditary-
|
|
132
|
+
disease findings. Wherever you write them via `--output <path>`,
|
|
133
|
+
that's where they sit until you delete them. Allelix doesn't
|
|
134
|
+
auto-clean and won't warn you when you write to `/tmp/` or any
|
|
135
|
+
other shared location. Treat the files as personal data: read them,
|
|
136
|
+
move them somewhere you control, or delete when you're done. A
|
|
137
|
+
data-lifecycle subcommand is planned for v2.1.
|
|
138
|
+
|
|
127
139
|
## Configuration
|
|
128
140
|
|
|
129
141
|
Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 Allelix
|
|
3
|
+
"""Allelix: open-source genotype analysis toolkit."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _read_pyproject_version() -> str | None:
|
|
11
|
+
"""Read the package version from ``pyproject.toml``.
|
|
12
|
+
|
|
13
|
+
GH #34: fall back to ``pyproject.toml`` when run from a bare source
|
|
14
|
+
checkout (no editable install, no installed package metadata). Keeps
|
|
15
|
+
``--version`` and the outbound HTTP User-Agent reporting the real
|
|
16
|
+
version string instead of the ``0.0.0+local`` sentinel that
|
|
17
|
+
misidentifies our traffic to NCBI / EBI / HuggingFace.
|
|
18
|
+
|
|
19
|
+
Returns ``None`` on any failure — the caller falls back to the
|
|
20
|
+
sentinel rather than crashing import.
|
|
21
|
+
"""
|
|
22
|
+
import tomllib
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
|
|
26
|
+
try:
|
|
27
|
+
with pyproject.open("rb") as fh:
|
|
28
|
+
data = tomllib.load(fh)
|
|
29
|
+
except (OSError, tomllib.TOMLDecodeError):
|
|
30
|
+
return None
|
|
31
|
+
project = data.get("project") or {}
|
|
32
|
+
v = project.get("version")
|
|
33
|
+
return v if isinstance(v, str) and v else None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
__version__ = version("allelix")
|
|
38
|
+
except PackageNotFoundError:
|
|
39
|
+
# Source checkout without an editable install. Try pyproject.toml
|
|
40
|
+
# before falling back to the sentinel.
|
|
41
|
+
__version__ = _read_pyproject_version() or "0.0.0+local"
|
|
@@ -142,7 +142,30 @@ class Annotator(ABC):
|
|
|
142
142
|
self.data_dir = data_dir
|
|
143
143
|
|
|
144
144
|
def __del__(self) -> None:
|
|
145
|
-
"""
|
|
145
|
+
"""Safety-net resource release on GC. Deliberately retained.
|
|
146
|
+
|
|
147
|
+
GH #36 (audit second pass) flagged ``__del__`` as a Python
|
|
148
|
+
antipattern — GC timing is nondeterministic and raised exceptions
|
|
149
|
+
are silently swallowed. The correct usage pattern is the
|
|
150
|
+
``__enter__`` / ``__exit__`` context manager pair below, wired
|
|
151
|
+
through ``contextlib.ExitStack`` in ``reports/_pipeline.py``.
|
|
152
|
+
|
|
153
|
+
However: removing ``__del__`` exposes residual SQLite connection
|
|
154
|
+
leaks in code paths that construct an annotator outside a
|
|
155
|
+
context manager. ``ResourceWarning`` is elevated to error by
|
|
156
|
+
``pytest`` config, so leaks fail the suite as
|
|
157
|
+
``PytestUnraisableExceptionWarning`` — caught in the v2.0.2
|
|
158
|
+
ship gate when ``__del__`` was first removed. Until every call
|
|
159
|
+
site is verified to use ``with`` / ``ExitStack`` / explicit
|
|
160
|
+
``close()``, this safety net stays. v2.1 task: audit and
|
|
161
|
+
remove.
|
|
162
|
+
|
|
163
|
+
``contextlib.suppress(Exception)`` is deliberate — ``__del__``
|
|
164
|
+
must never raise. The GC timing and shutdown-ordering edges
|
|
165
|
+
are explicitly silenced; this is exactly the
|
|
166
|
+
"if you must keep ``__del__``, make absolutely sure it can
|
|
167
|
+
never raise" mitigation the audit recommended.
|
|
168
|
+
"""
|
|
146
169
|
with contextlib.suppress(Exception):
|
|
147
170
|
self.close()
|
|
148
171
|
|
|
@@ -14,6 +14,7 @@ and dispatches per-variant by `variant.build`.
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
|
|
16
16
|
import logging
|
|
17
|
+
import re
|
|
17
18
|
import sqlite3
|
|
18
19
|
from typing import TYPE_CHECKING, ClassVar
|
|
19
20
|
|
|
@@ -42,6 +43,14 @@ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
|
|
|
42
43
|
|
|
43
44
|
_BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
|
|
44
45
|
|
|
46
|
+
# GH #21: a remote .md5 endpoint can return an HTML error page on a
|
|
47
|
+
# transient blip. The first whitespace-separated token of the body is
|
|
48
|
+
# what we treat as the hash, so without this gate `<!DOCTYPE` would be
|
|
49
|
+
# accepted as the "signal" and later passed to `verify_file_hash`, which
|
|
50
|
+
# would then delete the freshly downloaded VCF. MD5 is exactly 32 hex
|
|
51
|
+
# digits; reject anything else.
|
|
52
|
+
_MD5_HEX_RE = re.compile(r"^[0-9a-fA-F]{32}$")
|
|
53
|
+
|
|
45
54
|
|
|
46
55
|
def clinvar_db_filename(build: str) -> str:
|
|
47
56
|
"""Per-build cache filename. Two coexisting SQLite files per data_dir."""
|
|
@@ -323,7 +332,18 @@ class ClinVarAnnotator(Annotator):
|
|
|
323
332
|
if not body:
|
|
324
333
|
return None
|
|
325
334
|
first_token = body.strip().split(None, 1)[0] if body.strip() else ""
|
|
326
|
-
if not first_token:
|
|
335
|
+
if not _MD5_HEX_RE.fullmatch(first_token):
|
|
336
|
+
# CDN error page, redirect interstitial, or empty body. Treat
|
|
337
|
+
# as a transient signal failure rather than poisoning the
|
|
338
|
+
# cache: callers handle `None` as "freshness unknown, skip"
|
|
339
|
+
# in `db update`, and `setup()` raises rather than passing
|
|
340
|
+
# garbage to `verify_file_hash` (which would delete the VCF).
|
|
341
|
+
logger.warning(
|
|
342
|
+
"clinvar(%s): .md5 endpoint returned a body whose first token "
|
|
343
|
+
"is not a 32-char hex digest (got %r); treating as no signal",
|
|
344
|
+
build,
|
|
345
|
+
first_token[:32],
|
|
346
|
+
)
|
|
327
347
|
return None
|
|
328
348
|
return f"md5:{first_token}"
|
|
329
349
|
|
|
@@ -94,12 +94,20 @@ class GnomadAnnotator(Annotator):
|
|
|
94
94
|
logger.warning("Could not remove staged file at %s", gz_path)
|
|
95
95
|
|
|
96
96
|
def is_ready(self) -> bool:
|
|
97
|
-
"""True when the gnomAD SQLite cache exists with current schema version."""
|
|
97
|
+
"""True when the gnomAD SQLite cache exists with current schema version.
|
|
98
|
+
|
|
99
|
+
GH #22: a cache with no ``local_version_tag`` used to be accepted
|
|
100
|
+
as ready (the previous ``or not tag`` escape). That defeated the
|
|
101
|
+
whole point of ``GNOMAD_SCHEMA_VERSION``: if it ever gets bumped,
|
|
102
|
+
every tagless legacy cache would silently pass as the new
|
|
103
|
+
version. Reject tagless caches so the user is told to re-run
|
|
104
|
+
``db update``.
|
|
105
|
+
"""
|
|
98
106
|
info = get_database_info(self._db_path, "gnomad")
|
|
99
107
|
if info is None:
|
|
100
108
|
return False
|
|
101
109
|
tag = info.get("local_version_tag") or ""
|
|
102
|
-
return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag
|
|
110
|
+
return tag == f"sv:{GNOMAD_SCHEMA_VERSION}"
|
|
103
111
|
|
|
104
112
|
def version(self) -> str | None:
|
|
105
113
|
"""Return the cached database version, or None."""
|
|
@@ -16,6 +16,7 @@ from allelix.databases.gwas_loader import (
|
|
|
16
16
|
_REQUIRED_GWAS_COLUMNS,
|
|
17
17
|
GWAS_CATALOG_URL,
|
|
18
18
|
GWAS_DB_FILENAME,
|
|
19
|
+
GWAS_MIN_ROWS,
|
|
19
20
|
load_gwas_tsv,
|
|
20
21
|
schema_is_current,
|
|
21
22
|
)
|
|
@@ -57,18 +58,25 @@ _BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
|
|
|
57
58
|
|
|
58
59
|
|
|
59
60
|
def _magnitude(p_value: float | None, or_beta: float | None) -> float:
|
|
60
|
-
"""Derive magnitude from p-value and optional effect size."""
|
|
61
|
+
"""Derive magnitude from p-value and optional effect size.
|
|
62
|
+
|
|
63
|
+
GH #17: boundary comparisons are inclusive (``<=``) so the canonical
|
|
64
|
+
genome-wide-significance threshold ``p = 5e-8`` lands inside the
|
|
65
|
+
significant bucket rather than the suggestive bucket below it.
|
|
66
|
+
Strict ``<`` made the exact threshold value fall a full magnitude
|
|
67
|
+
below a barely-significant hit.
|
|
68
|
+
"""
|
|
61
69
|
if p_value is None:
|
|
62
70
|
base = 2.0
|
|
63
|
-
elif p_value < 5e-100:
|
|
71
|
+
elif p_value <= 5e-100:
|
|
64
72
|
base = 8.0
|
|
65
|
-
elif p_value < 5e-20:
|
|
73
|
+
elif p_value <= 5e-20:
|
|
66
74
|
base = 7.0
|
|
67
|
-
elif p_value < 5e-8:
|
|
75
|
+
elif p_value <= 5e-8:
|
|
68
76
|
base = 6.0
|
|
69
|
-
elif p_value < 5e-6:
|
|
77
|
+
elif p_value <= 5e-6:
|
|
70
78
|
base = 4.0
|
|
71
|
-
elif p_value < 5e-4:
|
|
79
|
+
elif p_value <= 5e-4:
|
|
72
80
|
base = 3.0
|
|
73
81
|
else:
|
|
74
82
|
base = 2.0
|
|
@@ -143,7 +151,13 @@ class GWASCatalogAnnotator(Annotator):
|
|
|
143
151
|
extracted = self.data_dir / tsv_names[0]
|
|
144
152
|
if extracted != tsv_path:
|
|
145
153
|
extracted.rename(tsv_path)
|
|
146
|
-
load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
|
|
154
|
+
load_gwas_tsv(
|
|
155
|
+
tsv_path,
|
|
156
|
+
self._db_path,
|
|
157
|
+
source_url=url,
|
|
158
|
+
remote_signal=signal,
|
|
159
|
+
min_rows=GWAS_MIN_ROWS,
|
|
160
|
+
)
|
|
147
161
|
finally:
|
|
148
162
|
try:
|
|
149
163
|
zip_path.unlink()
|
|
@@ -183,6 +197,7 @@ class GWASCatalogAnnotator(Annotator):
|
|
|
183
197
|
self._db_path,
|
|
184
198
|
source_url=GWAS_CATALOG_URL,
|
|
185
199
|
remote_signal=self.cached_remote_signal(),
|
|
200
|
+
min_rows=GWAS_MIN_ROWS,
|
|
186
201
|
)
|
|
187
202
|
except Exception:
|
|
188
203
|
logger.warning("Auto-reingest from cached TSV failed", exc_info=True)
|
|
@@ -24,6 +24,7 @@ from allelix.databases.manager import (
|
|
|
24
24
|
from allelix.databases.pharmgkb_loader import (
|
|
25
25
|
PHARMGKB_CLINICAL_URL,
|
|
26
26
|
PHARMGKB_DB_FILENAME,
|
|
27
|
+
PHARMGKB_MIN_ROWS,
|
|
27
28
|
_normalize_genotype,
|
|
28
29
|
load_pharmgkb_tsv,
|
|
29
30
|
schema_is_current,
|
|
@@ -146,6 +147,7 @@ class PharmGKBAnnotator(Annotator):
|
|
|
146
147
|
source_url=url,
|
|
147
148
|
remote_signal=signal,
|
|
148
149
|
allele_function_lookup=cpic_lookup,
|
|
150
|
+
min_rows=PHARMGKB_MIN_ROWS,
|
|
149
151
|
)
|
|
150
152
|
|
|
151
153
|
def is_ready(self) -> bool:
|
|
@@ -489,6 +491,7 @@ def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
|
|
|
489
491
|
version=old_version,
|
|
490
492
|
remote_signal=old_signal,
|
|
491
493
|
allele_function_lookup=cpic_lookup,
|
|
494
|
+
min_rows=PHARMGKB_MIN_ROWS,
|
|
492
495
|
)
|
|
493
496
|
except Exception:
|
|
494
497
|
logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)
|
|
@@ -336,6 +336,12 @@ def _emit_build_diagnostics(result: object) -> None:
|
|
|
336
336
|
source = "detected"
|
|
337
337
|
elif diag.header_build:
|
|
338
338
|
source = "header (no position confirmation)"
|
|
339
|
+
elif diag.chr_prefix_inferred:
|
|
340
|
+
# GH #38: chr-prefixed contig names ("chr1", "chrX", ...) reliably
|
|
341
|
+
# indicate GRCh38 in modern caller output. We DID detect a build;
|
|
342
|
+
# the banner and the warning should say so instead of reading as
|
|
343
|
+
# a blind default.
|
|
344
|
+
source = "inferred from chr-prefixed contig names"
|
|
339
345
|
else:
|
|
340
346
|
source = "fallback (no known SNPs matched)"
|
|
341
347
|
console.print(
|
|
@@ -349,20 +355,55 @@ def _emit_build_diagnostics(result: object) -> None:
|
|
|
349
355
|
f"This is a real-world data-quality issue — your provider may have "
|
|
350
356
|
f"mislabeled the build (see ADR-0021).[/yellow]"
|
|
351
357
|
)
|
|
358
|
+
elif diag.chr_prefix_inferred:
|
|
359
|
+
# GH #38: positive, accurate message — the inference path
|
|
360
|
+
# actually fired. Still recommend `--build` for users who
|
|
361
|
+
# want to lock in the answer; chr-prefix is a strong signal
|
|
362
|
+
# but UCSC hg19 also uses `chr` prefixes, so the heuristic
|
|
363
|
+
# isn't guaranteed against a hg19-converted file.
|
|
364
|
+
console.print(
|
|
365
|
+
f"[dim]Inferred {diag.effective_build} from chr-prefixed contig "
|
|
366
|
+
f"names (GRCh38 convention). Pass --build grch37 if this file is "
|
|
367
|
+
f"UCSC hg19 with chr-prefixed contigs instead.[/dim]"
|
|
368
|
+
)
|
|
352
369
|
elif not diag.override and diag.detected_build is None and diag.header_build is None:
|
|
353
370
|
# Common shape: VCF from a variant caller where the ID column is `.`
|
|
354
|
-
# and the header has no ##contig assembly tag
|
|
355
|
-
#
|
|
371
|
+
# and the header has no ##contig assembly tag, AND no chr-prefix
|
|
372
|
+
# signal was observed. All three auto-detect paths failed.
|
|
356
373
|
# Loudly recommend an explicit --build because picking the wrong one
|
|
357
374
|
# silently means every annotation lookup uses wrong coordinates.
|
|
358
375
|
console.print(
|
|
359
376
|
f"[yellow]Could not auto-detect genome build (no rsIDs in input, "
|
|
360
|
-
f"no ##contig assembly tag). Defaulted to "
|
|
377
|
+
f"no ##contig assembly tag, no chr-prefixed contigs). Defaulted to "
|
|
361
378
|
f"{diag.effective_build}. If the file is the other build, pass "
|
|
362
379
|
f"--build grch37 or --build grch38 explicitly — annotation "
|
|
363
380
|
f"coordinates differ between builds and silently using the wrong "
|
|
364
381
|
f"one will miss every hit.[/yellow]"
|
|
365
382
|
)
|
|
383
|
+
elif (
|
|
384
|
+
not diag.override
|
|
385
|
+
and diag.detected_build is None
|
|
386
|
+
and diag.header_build is not None
|
|
387
|
+
and diag.inspected_count > 0
|
|
388
|
+
):
|
|
389
|
+
# Position-detection inspected known-rsID rows but couldn't pick a
|
|
390
|
+
# build — either votes tied across builds or no row matched any
|
|
391
|
+
# build's reference position. Without this warning, the pipeline
|
|
392
|
+
# silently falls through to header_build, and a GRCh36 file with a
|
|
393
|
+
# GRCh37-mislabeled header gets the GRCh37 ClinVar cache (the
|
|
394
|
+
# silent-coords trap #15). The dim "header (no position
|
|
395
|
+
# confirmation)" status line shows the same facts but reads as
|
|
396
|
+
# routine — yellow is what the situation deserves.
|
|
397
|
+
console.print(
|
|
398
|
+
f"[yellow]Build detection inconclusive: "
|
|
399
|
+
f"{diag.inspected_count} known-rsID position checks ran but "
|
|
400
|
+
f"did not converge on a build. Using the file's header-claimed "
|
|
401
|
+
f"build ({diag.header_build}), which has not been confirmed "
|
|
402
|
+
f"against your position data. If the file is actually a "
|
|
403
|
+
f"different build, pass --build grch37 or --build grch38 to "
|
|
404
|
+
f"force — wrong coordinates will silently mis-annotate every "
|
|
405
|
+
f"variant.[/yellow]"
|
|
406
|
+
)
|
|
366
407
|
if diag.effective_build == "GRCh36":
|
|
367
408
|
console.print(
|
|
368
409
|
"[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "
|
|
@@ -18,29 +18,12 @@ from allelix.databases import resolve_data_dir
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from pathlib import Path
|
|
20
20
|
|
|
21
|
-
from allelix.annotators.base import Annotator
|
|
22
|
-
|
|
23
21
|
|
|
24
22
|
@main.group()
|
|
25
23
|
def db() -> None:
|
|
26
24
|
"""Manage local reference database cache."""
|
|
27
25
|
|
|
28
26
|
|
|
29
|
-
def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
|
|
30
|
-
"""Write a remote signal to an existing cache without re-downloading."""
|
|
31
|
-
import contextlib
|
|
32
|
-
import sqlite3
|
|
33
|
-
|
|
34
|
-
from allelix.databases.manager import stamp_remote_signal
|
|
35
|
-
|
|
36
|
-
db_path = getattr(annotator, "_db_path", None)
|
|
37
|
-
if db_path is None:
|
|
38
|
-
return
|
|
39
|
-
with contextlib.closing(sqlite3.connect(db_path)) as conn:
|
|
40
|
-
stamp_remote_signal(conn, annotator.name, signal)
|
|
41
|
-
conn.commit()
|
|
42
|
-
|
|
43
|
-
|
|
44
27
|
def _confirm_cadd_license(*, license_held: bool = False) -> bool:
|
|
45
28
|
"""Show the CADD license notice and ask for confirmation."""
|
|
46
29
|
if license_held:
|
|
@@ -207,14 +190,20 @@ def db_update(
|
|
|
207
190
|
continue
|
|
208
191
|
|
|
209
192
|
if cached is None:
|
|
210
|
-
|
|
193
|
+
# GH #20: a cache with no stored freshness signal almost
|
|
194
|
+
# always predates the signal mechanism — i.e., it is old.
|
|
195
|
+
# The previous behavior was to stamp the live remote signal
|
|
196
|
+
# onto the cache and call it current, which permanently
|
|
197
|
+
# marked stale data as fresh (only `--force` would escape).
|
|
198
|
+
# Treat tagless caches as needing a refresh.
|
|
211
199
|
console.print(
|
|
212
|
-
f" [
|
|
213
|
-
|
|
200
|
+
f" [bold]{annotator.name}[/bold]: cache predates the "
|
|
201
|
+
"freshness signal; re-downloading…"
|
|
202
|
+
)
|
|
203
|
+
else:
|
|
204
|
+
console.print(
|
|
205
|
+
f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…"
|
|
214
206
|
)
|
|
215
|
-
continue
|
|
216
|
-
|
|
217
|
-
console.print(f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
|
|
218
207
|
if _helpers._run_setup(annotator):
|
|
219
208
|
console.print(
|
|
220
209
|
f" [green]✓ {annotator.name} refreshed[/green] "
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# High-value SNPs: clinically important variants where a no-call
|
|
2
|
+
# should be explicitly flagged rather than silently omitted.
|
|
3
|
+
#
|
|
4
|
+
# Schema:
|
|
5
|
+
# rsid: dbSNP identifier
|
|
6
|
+
# gene: gene symbol
|
|
7
|
+
# cluster: optional grouping (e.g., "APOE" for the two-SNP APOE haplotype)
|
|
8
|
+
# note: human-readable warning text for no-call reports
|
|
9
|
+
#
|
|
10
|
+
# To add a SNP: append an entry following this format. Entries with the
|
|
11
|
+
# same cluster are grouped in warnings (e.g., "APOE genotype cannot be
|
|
12
|
+
# determined" when either rs429358 or rs7412 is a no-call).
|
|
13
|
+
|
|
14
|
+
- rsid: rs429358
|
|
15
|
+
gene: APOE
|
|
16
|
+
cluster: APOE
|
|
17
|
+
note: Required (with rs7412) to determine APOE genotype
|
|
18
|
+
|
|
19
|
+
- rsid: rs7412
|
|
20
|
+
gene: APOE
|
|
21
|
+
cluster: APOE
|
|
22
|
+
note: Required (with rs429358) to determine APOE genotype
|
|
23
|
+
|
|
24
|
+
- rsid: rs5742904
|
|
25
|
+
gene: APOB
|
|
26
|
+
note: Familial hypercholesterolemia marker (FH)
|
|
27
|
+
|
|
28
|
+
- rsid: rs80357906
|
|
29
|
+
gene: BRCA1
|
|
30
|
+
note: Hereditary breast/ovarian cancer marker
|
|
31
|
+
|
|
32
|
+
- rsid: rs1801133
|
|
33
|
+
gene: MTHFR
|
|
34
|
+
cluster: MTHFR
|
|
35
|
+
note: Methylation pathway (C677T)
|
|
36
|
+
|
|
37
|
+
- rsid: rs1801131
|
|
38
|
+
gene: MTHFR
|
|
39
|
+
cluster: MTHFR
|
|
40
|
+
note: Methylation pathway (A1298C)
|
|
41
|
+
|
|
42
|
+
- rsid: rs4680
|
|
43
|
+
gene: COMT
|
|
44
|
+
note: Catechol-O-methyltransferase activity
|
|
45
|
+
|
|
46
|
+
- rsid: rs1065852
|
|
47
|
+
gene: CYP2D6
|
|
48
|
+
note: Opioid / SSRI metabolism
|
|
49
|
+
|
|
50
|
+
- rsid: rs4244285
|
|
51
|
+
gene: CYP2C19
|
|
52
|
+
note: Clopidogrel, PPIs metabolism
|
|
53
|
+
|
|
54
|
+
- rsid: rs1799853
|
|
55
|
+
gene: CYP2C9
|
|
56
|
+
note: Warfarin metabolism
|
|
57
|
+
|
|
58
|
+
- rsid: rs4149056
|
|
59
|
+
gene: SLCO1B1
|
|
60
|
+
note: Statin myopathy risk
|
|
61
|
+
|
|
62
|
+
- rsid: rs3918290
|
|
63
|
+
gene: DPYD
|
|
64
|
+
note: Fluoropyrimidine toxicity
|
|
65
|
+
|
|
66
|
+
# v2.0.2 additions (GH #7): clinically actionable single-SNP variants
|
|
67
|
+
# verified to be on consumer arrays. Two new clusters: HFE (hereditary
|
|
68
|
+
# hemochromatosis compound-het) and TPMT (thiopurine *3 haplotype).
|
|
69
|
+
|
|
70
|
+
- rsid: rs6025
|
|
71
|
+
gene: F5
|
|
72
|
+
note: Factor V Leiden — hereditary thrombophilia (FDA-cleared GHR variant)
|
|
73
|
+
|
|
74
|
+
- rsid: rs1799963
|
|
75
|
+
gene: F2
|
|
76
|
+
note: Prothrombin G20210A — hereditary thrombophilia
|
|
77
|
+
|
|
78
|
+
- rsid: rs1800562
|
|
79
|
+
gene: HFE
|
|
80
|
+
cluster: HFE
|
|
81
|
+
note: C282Y — hereditary hemochromatosis (compound het with H63D is the clinical form)
|
|
82
|
+
|
|
83
|
+
- rsid: rs1799945
|
|
84
|
+
gene: HFE
|
|
85
|
+
cluster: HFE
|
|
86
|
+
note: H63D — hereditary hemochromatosis (compound het with C282Y is the clinical form)
|
|
87
|
+
|
|
88
|
+
- rsid: rs113993960
|
|
89
|
+
gene: CFTR
|
|
90
|
+
note: F508del — most common CF allele; carrier status for reproductive planning
|
|
91
|
+
|
|
92
|
+
- rsid: rs334
|
|
93
|
+
gene: HBB
|
|
94
|
+
note: Sickle cell (HbS) — most-screened-for variant worldwide; carrier status
|
|
95
|
+
|
|
96
|
+
- rsid: rs80359550
|
|
97
|
+
gene: BRCA2
|
|
98
|
+
note: BRCA2 6174delT — most common Ashkenazi founder mutation (BRCA1 covered by rs80357906)
|
|
99
|
+
|
|
100
|
+
- rsid: rs9923231
|
|
101
|
+
gene: VKORC1
|
|
102
|
+
note: Warfarin dosing (CPIC Level A, pairs with CYP2C9 rs1799853)
|
|
103
|
+
|
|
104
|
+
- rsid: rs1057910
|
|
105
|
+
gene: CYP2C9
|
|
106
|
+
note: CYP2C9*3 — completes warfarin metabolizer profile alongside *2 (rs1799853)
|
|
107
|
+
|
|
108
|
+
- rsid: rs12248560
|
|
109
|
+
gene: CYP2C19
|
|
110
|
+
note: CYP2C19*17 ultrarapid metabolizer — completes clopidogrel profile alongside *2 (rs4244285)
|
|
111
|
+
|
|
112
|
+
- rsid: rs3892097
|
|
113
|
+
gene: CYP2D6
|
|
114
|
+
note: CYP2D6*4 — most common LOF in Europeans (complements rs1065852 *10)
|
|
115
|
+
|
|
116
|
+
- rsid: rs776746
|
|
117
|
+
gene: CYP3A5
|
|
118
|
+
note: CYP3A5*3 — tacrolimus dosing (CPIC Level A)
|
|
119
|
+
|
|
120
|
+
- rsid: rs1142345
|
|
121
|
+
gene: TPMT
|
|
122
|
+
cluster: TPMT
|
|
123
|
+
note: TPMT*3C — thiopurine dosing (CPIC Level A; with rs1800460 resolves *3A/*3B/*3C)
|
|
124
|
+
|
|
125
|
+
- rsid: rs1800460
|
|
126
|
+
gene: TPMT
|
|
127
|
+
cluster: TPMT
|
|
128
|
+
note: TPMT*3B — thiopurine dosing (CPIC Level A; with rs1142345 resolves *3A/*3B/*3C)
|
|
129
|
+
|
|
130
|
+
- rsid: rs116855232
|
|
131
|
+
gene: NUDT15
|
|
132
|
+
note: Thiopurine toxicity (CPIC Level A; critical in East Asian populations, complements TPMT cluster)
|
|
133
|
+
|
|
134
|
+
- rsid: rs34637584
|
|
135
|
+
gene: LRRK2
|
|
136
|
+
note: G2019S — most common monogenic Parkinson's variant
|
|
@@ -9,7 +9,7 @@ column of ``database_versions`` (e.g. ``iv:1``) so ``is_ready()`` can
|
|
|
9
9
|
reject stale caches without forcing a full re-download.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
-
CLINVAR_INTERPRETER_VERSION = 1
|
|
12
|
+
CLINVAR_INTERPRETER_VERSION = 2 # v2.0.1: GH #42 CLNDN-join in iter_clinvar_records
|
|
13
13
|
PHARMGKB_INTERPRETER_VERSION = 1
|
|
14
14
|
GNOMAD_SCHEMA_VERSION = 1
|
|
15
15
|
ALPHAMISSENSE_SCHEMA_VERSION = 1
|
|
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
|
|
|
23
23
|
ALPHAMISSENSE_DB_FILENAME = "alphamissense.sqlite"
|
|
24
24
|
|
|
25
25
|
ALPHAMISSENSE_CACHE_URL = (
|
|
26
|
-
"https://huggingface.co/datasets/
|
|
26
|
+
"https://huggingface.co/datasets/allelix/allelix-alphamissense"
|
|
27
27
|
"/resolve/13a15e199536512b5e2d208d79c4f93c0a73f71f/alphamissense.sqlite.gz"
|
|
28
28
|
)
|
|
29
29
|
|
|
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
|
|
|
24
24
|
GNOMAD_DB_FILENAME = "gnomad.sqlite"
|
|
25
25
|
|
|
26
26
|
GNOMAD_CACHE_URL = (
|
|
27
|
-
"https://huggingface.co/datasets/
|
|
27
|
+
"https://huggingface.co/datasets/allelix/allelix-gnomad"
|
|
28
28
|
"/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
|
|
29
29
|
)
|
|
30
30
|
|
|
@@ -465,16 +465,32 @@ def iter_gwas_records(tsv_path: Path) -> Iterator[dict[str, object]]:
|
|
|
465
465
|
yield from best.values()
|
|
466
466
|
|
|
467
467
|
|
|
468
|
+
# Truncation sanity floor for production loads. This guards the count
|
|
469
|
+
# returned by iter_gwas_records — i.e. rows AFTER haplotype/no-trait
|
|
470
|
+
# filtering and (rsid, trait) dedup, not the raw catalog. EBI curates
|
|
471
|
+
# ~625K lead associations (GWAS Catalog, 2025); the loaded count is lower
|
|
472
|
+
# than that but still far above this floor. 100K only catches gross
|
|
473
|
+
# truncation (a mid-stream download committed as "complete") while staying
|
|
474
|
+
# permissive against legitimate upstream drift. min_rows defaults to 0 so
|
|
475
|
+
# synthetic fixtures of any size load cleanly. See GH #19.
|
|
476
|
+
GWAS_MIN_ROWS = 100_000
|
|
477
|
+
|
|
478
|
+
|
|
468
479
|
def load_gwas_tsv(
|
|
469
480
|
tsv_path: Path,
|
|
470
481
|
db_path: Path,
|
|
471
482
|
source_url: str = "",
|
|
472
483
|
remote_signal: str | None = None,
|
|
484
|
+
min_rows: int = 0,
|
|
473
485
|
) -> int:
|
|
474
486
|
"""Parse a GWAS Catalog TSV into a fresh SQLite cache atomically.
|
|
475
487
|
|
|
476
488
|
Writes to a `.tmp` sibling and `os.replace`s onto `db_path` only after a
|
|
477
489
|
successful commit. Returns the number of records loaded.
|
|
490
|
+
|
|
491
|
+
``min_rows`` is a sanity floor checked before the final ``os.replace``.
|
|
492
|
+
Set by production callers (see ``GwasAnnotator.setup``) to
|
|
493
|
+
``GWAS_MIN_ROWS``; defaults to 0 so test fixtures of any size load.
|
|
478
494
|
"""
|
|
479
495
|
tmp_path = db_path.parent / f"{db_path.name}.tmp"
|
|
480
496
|
if tmp_path.exists():
|
|
@@ -535,6 +551,15 @@ def load_gwas_tsv(
|
|
|
535
551
|
),
|
|
536
552
|
)
|
|
537
553
|
conn.commit()
|
|
554
|
+
if count < min_rows:
|
|
555
|
+
msg = (
|
|
556
|
+
f"GWAS Catalog load aborted: only {count:,} rows ingested "
|
|
557
|
+
f"(floor {min_rows:,}). The download was likely truncated "
|
|
558
|
+
f"in flight (chunked transfer with no Content-Length, or "
|
|
559
|
+
f"connection drop mid-stream). Retry with "
|
|
560
|
+
f"`allelix db update --force`."
|
|
561
|
+
)
|
|
562
|
+
raise OSError(msg)
|
|
538
563
|
os.replace(tmp_path, db_path)
|
|
539
564
|
return count
|
|
540
565
|
except Exception:
|
|
@@ -197,9 +197,20 @@ def parse_clinvar_version(vcf_path: Path) -> str | None:
|
|
|
197
197
|
def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
|
|
198
198
|
"""Stream parse a ClinVar VCF (.vcf or .vcf.gz). Skip entries without an RS id.
|
|
199
199
|
|
|
200
|
-
Multi-allelic rows (ALT="A,T") are split into one record per ALT.
|
|
201
|
-
INFO fields
|
|
202
|
-
|
|
200
|
+
Multi-allelic rows (ALT="A,T") are split into one record per ALT.
|
|
201
|
+
Parallel INFO fields ``CLNSIG`` and ``ALLELEID`` are separated by
|
|
202
|
+
``|`` and index-paired with the ALTs.
|
|
203
|
+
|
|
204
|
+
GH #42: ``CLNDN`` is NOT index-paired with ALTs — its ``|`` separator
|
|
205
|
+
enumerates the union of conditions across all SCV submissions on the
|
|
206
|
+
variant, with no positional mapping to CLNSIG. Joining the full list
|
|
207
|
+
into a single ``condition`` string per record avoids the Frankenstein
|
|
208
|
+
pairing (one SCV's classification next to another SCV's condition)
|
|
209
|
+
that index-picking introduced. The primary classification
|
|
210
|
+
(``CLNSIG[0]``) is kept as-is — that value is correct as a
|
|
211
|
+
variant-level claim; only the condition-pairing was misleading.
|
|
212
|
+
Full per-(classification, condition) pairing via
|
|
213
|
+
``submission_summary.txt.gz`` is tracked for v2.1.
|
|
203
214
|
"""
|
|
204
215
|
opener = gzip.open if vcf_path.suffix == ".gz" else open
|
|
205
216
|
with opener(vcf_path, "rt", encoding="utf-8") as fh:
|
|
@@ -231,6 +242,12 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
|
|
|
231
242
|
review_status = info_dict.get("CLNREVSTAT", "")
|
|
232
243
|
gene = _extract_gene(info_dict.get("GENEINFO", ""))
|
|
233
244
|
|
|
245
|
+
# GH #42: CLNDN's `|`-separator is per-SCV, not per-ALT.
|
|
246
|
+
# Join the full list once per row (same string emitted for
|
|
247
|
+
# every ALT split-out of this record). Empty/`.`/blank
|
|
248
|
+
# tokens are filtered out so callers don't see leading/trailing
|
|
249
|
+
# separators.
|
|
250
|
+
joined_condition = "; ".join(c.replace("_", " ") for c in clndns if c and c != ".")
|
|
234
251
|
for i, alt in enumerate(alts):
|
|
235
252
|
yield {
|
|
236
253
|
"rsid": f"rs{rs}",
|
|
@@ -239,7 +256,7 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
|
|
|
239
256
|
"ref": ref,
|
|
240
257
|
"alt": alt,
|
|
241
258
|
"clinical_significance": _pick(clnsigs, i),
|
|
242
|
-
"condition":
|
|
259
|
+
"condition": joined_condition,
|
|
243
260
|
"gene": gene,
|
|
244
261
|
"review_status": review_status,
|
|
245
262
|
"allele_id": _safe_int(_pick(allele_ids, i)),
|