allelix 2.0.0__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {allelix-2.0.0 → allelix-2.0.1}/PKG-INFO +1 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/clinvar.py +21 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/gnomad.py +10 -2
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/gwas.py +22 -7
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/pharmgkb.py +3 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/_helpers.py +24 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/db.py +12 -23
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/_versions.py +1 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/alphamissense_loader.py +1 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/gnomad_loader.py +1 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/gwas_loader.py +25 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/manager.py +21 -4
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/pharmgkb_loader.py +17 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/snpedia_parser.py +17 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/models.py +19 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/_helpers.py +18 -4
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ftdna.py +18 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/livingdna.py +22 -1
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/_pipeline.py +69 -39
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/diff.py +43 -4
- allelix-2.0.1/allelix/reports/terminal.py +241 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/utils/allele.py +16 -14
- {allelix-2.0.0 → allelix-2.0.1}/allelix/utils/build_detect.py +31 -9
- {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/PKG-INFO +1 -1
- {allelix-2.0.0 → allelix-2.0.1}/pyproject.toml +1 -1
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_cli.py +99 -10
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_models.py +33 -0
- allelix-2.0.0/allelix/reports/terminal.py +0 -205
- {allelix-2.0.0 → allelix-2.0.1}/LICENSE +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/README.md +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/alphamissense.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/base.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/cadd.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/snpedia.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/_options.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/analyze.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/config.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/focused.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/utility.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/compare.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/config.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/data/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/data/clinvar_clnsig_snapshot.yaml +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/data/high_value_snps.yaml +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/cadd_loader.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/cpic_loader.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/loader_utils.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/schema.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/snpedia_loader.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/exporters/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/exporters/plink.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ancestrydna.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/base.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ftdna_illumina.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/myhappygenes.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/myheritage.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/twentythreeandme.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/vcf.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/py.typed +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/high_value.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/html.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/json_report.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/methylation.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix/utils/__init__.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/SOURCES.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/dependency_links.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/entry_points.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/requires.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/top_level.txt +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/setup.cfg +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_cli_helpers.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_compare.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_config.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_end_to_end.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_mock_data_invariants.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_registry.py +0 -0
- {allelix-2.0.0 → allelix-2.0.1}/tests/test_version.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: allelix
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.0.1
|
|
4
4
|
Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
|
|
5
5
|
Author: Allelix
|
|
6
6
|
Maintainer-email: dial481 <dial481@users.noreply.github.com>
|
|
@@ -14,6 +14,7 @@ and dispatches per-variant by `variant.build`.
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
|
|
16
16
|
import logging
|
|
17
|
+
import re
|
|
17
18
|
import sqlite3
|
|
18
19
|
from typing import TYPE_CHECKING, ClassVar
|
|
19
20
|
|
|
@@ -42,6 +43,14 @@ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
|
|
|
42
43
|
|
|
43
44
|
_BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
|
|
44
45
|
|
|
46
|
+
# GH #21: a remote .md5 endpoint can return an HTML error page on a
|
|
47
|
+
# transient blip. The first whitespace-separated token of the body is
|
|
48
|
+
# what we treat as the hash, so without this gate `<!DOCTYPE` would be
|
|
49
|
+
# accepted as the "signal" and later passed to `verify_file_hash`, which
|
|
50
|
+
# would then delete the freshly downloaded VCF. MD5 is exactly 32 hex
|
|
51
|
+
# digits; reject anything else.
|
|
52
|
+
_MD5_HEX_RE = re.compile(r"^[0-9a-fA-F]{32}$")
|
|
53
|
+
|
|
45
54
|
|
|
46
55
|
def clinvar_db_filename(build: str) -> str:
|
|
47
56
|
"""Per-build cache filename. Two coexisting SQLite files per data_dir."""
|
|
@@ -323,7 +332,18 @@ class ClinVarAnnotator(Annotator):
|
|
|
323
332
|
if not body:
|
|
324
333
|
return None
|
|
325
334
|
first_token = body.strip().split(None, 1)[0] if body.strip() else ""
|
|
326
|
-
if not first_token:
|
|
335
|
+
if not _MD5_HEX_RE.fullmatch(first_token):
|
|
336
|
+
# CDN error page, redirect interstitial, or empty body. Treat
|
|
337
|
+
# as a transient signal failure rather than poisoning the
|
|
338
|
+
# cache: callers handle `None` as "freshness unknown, skip"
|
|
339
|
+
# in `db update`, and `setup()` raises rather than passing
|
|
340
|
+
# garbage to `verify_file_hash` (which would delete the VCF).
|
|
341
|
+
logger.warning(
|
|
342
|
+
"clinvar(%s): .md5 endpoint returned a body whose first token "
|
|
343
|
+
"is not a 32-char hex digest (got %r); treating as no signal",
|
|
344
|
+
build,
|
|
345
|
+
first_token[:32],
|
|
346
|
+
)
|
|
327
347
|
return None
|
|
328
348
|
return f"md5:{first_token}"
|
|
329
349
|
|
|
@@ -94,12 +94,20 @@ class GnomadAnnotator(Annotator):
|
|
|
94
94
|
logger.warning("Could not remove staged file at %s", gz_path)
|
|
95
95
|
|
|
96
96
|
def is_ready(self) -> bool:
|
|
97
|
-
"""True when the gnomAD SQLite cache exists with current schema version."""
|
|
97
|
+
"""True when the gnomAD SQLite cache exists with current schema version.
|
|
98
|
+
|
|
99
|
+
GH #22: a cache with no ``local_version_tag`` used to be accepted
|
|
100
|
+
as ready (the previous ``or not tag`` escape). That defeated the
|
|
101
|
+
whole point of ``GNOMAD_SCHEMA_VERSION``: if it ever gets bumped,
|
|
102
|
+
every tagless legacy cache would silently pass as the new
|
|
103
|
+
version. Reject tagless caches so the user is told to re-run
|
|
104
|
+
``db update``.
|
|
105
|
+
"""
|
|
98
106
|
info = get_database_info(self._db_path, "gnomad")
|
|
99
107
|
if info is None:
|
|
100
108
|
return False
|
|
101
109
|
tag = info.get("local_version_tag") or ""
|
|
102
|
-
return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag
|
|
110
|
+
return tag == f"sv:{GNOMAD_SCHEMA_VERSION}"
|
|
103
111
|
|
|
104
112
|
def version(self) -> str | None:
|
|
105
113
|
"""Return the cached database version, or None."""
|
|
@@ -16,6 +16,7 @@ from allelix.databases.gwas_loader import (
|
|
|
16
16
|
_REQUIRED_GWAS_COLUMNS,
|
|
17
17
|
GWAS_CATALOG_URL,
|
|
18
18
|
GWAS_DB_FILENAME,
|
|
19
|
+
GWAS_MIN_ROWS,
|
|
19
20
|
load_gwas_tsv,
|
|
20
21
|
schema_is_current,
|
|
21
22
|
)
|
|
@@ -57,18 +58,25 @@ _BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
|
|
|
57
58
|
|
|
58
59
|
|
|
59
60
|
def _magnitude(p_value: float | None, or_beta: float | None) -> float:
|
|
60
|
-
"""Derive magnitude from p-value and optional effect size."""
|
|
61
|
+
"""Derive magnitude from p-value and optional effect size.
|
|
62
|
+
|
|
63
|
+
GH #17: boundary comparisons are inclusive (``<=``) so the canonical
|
|
64
|
+
genome-wide-significance threshold ``p = 5e-8`` lands inside the
|
|
65
|
+
significant bucket rather than the suggestive bucket below it.
|
|
66
|
+
Strict ``<`` made the exact threshold value fall a full magnitude
|
|
67
|
+
below a barely-significant hit.
|
|
68
|
+
"""
|
|
61
69
|
if p_value is None:
|
|
62
70
|
base = 2.0
|
|
63
|
-
elif p_value
|
|
71
|
+
elif p_value <= 5e-100:
|
|
64
72
|
base = 8.0
|
|
65
|
-
elif p_value
|
|
73
|
+
elif p_value <= 5e-20:
|
|
66
74
|
base = 7.0
|
|
67
|
-
elif p_value
|
|
75
|
+
elif p_value <= 5e-8:
|
|
68
76
|
base = 6.0
|
|
69
|
-
elif p_value
|
|
77
|
+
elif p_value <= 5e-6:
|
|
70
78
|
base = 4.0
|
|
71
|
-
elif p_value
|
|
79
|
+
elif p_value <= 5e-4:
|
|
72
80
|
base = 3.0
|
|
73
81
|
else:
|
|
74
82
|
base = 2.0
|
|
@@ -143,7 +151,13 @@ class GWASCatalogAnnotator(Annotator):
|
|
|
143
151
|
extracted = self.data_dir / tsv_names[0]
|
|
144
152
|
if extracted != tsv_path:
|
|
145
153
|
extracted.rename(tsv_path)
|
|
146
|
-
load_gwas_tsv(
|
|
154
|
+
load_gwas_tsv(
|
|
155
|
+
tsv_path,
|
|
156
|
+
self._db_path,
|
|
157
|
+
source_url=url,
|
|
158
|
+
remote_signal=signal,
|
|
159
|
+
min_rows=GWAS_MIN_ROWS,
|
|
160
|
+
)
|
|
147
161
|
finally:
|
|
148
162
|
try:
|
|
149
163
|
zip_path.unlink()
|
|
@@ -183,6 +197,7 @@ class GWASCatalogAnnotator(Annotator):
|
|
|
183
197
|
self._db_path,
|
|
184
198
|
source_url=GWAS_CATALOG_URL,
|
|
185
199
|
remote_signal=self.cached_remote_signal(),
|
|
200
|
+
min_rows=GWAS_MIN_ROWS,
|
|
186
201
|
)
|
|
187
202
|
except Exception:
|
|
188
203
|
logger.warning("Auto-reingest from cached TSV failed", exc_info=True)
|
|
@@ -24,6 +24,7 @@ from allelix.databases.manager import (
|
|
|
24
24
|
from allelix.databases.pharmgkb_loader import (
|
|
25
25
|
PHARMGKB_CLINICAL_URL,
|
|
26
26
|
PHARMGKB_DB_FILENAME,
|
|
27
|
+
PHARMGKB_MIN_ROWS,
|
|
27
28
|
_normalize_genotype,
|
|
28
29
|
load_pharmgkb_tsv,
|
|
29
30
|
schema_is_current,
|
|
@@ -146,6 +147,7 @@ class PharmGKBAnnotator(Annotator):
|
|
|
146
147
|
source_url=url,
|
|
147
148
|
remote_signal=signal,
|
|
148
149
|
allele_function_lookup=cpic_lookup,
|
|
150
|
+
min_rows=PHARMGKB_MIN_ROWS,
|
|
149
151
|
)
|
|
150
152
|
|
|
151
153
|
def is_ready(self) -> bool:
|
|
@@ -489,6 +491,7 @@ def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
|
|
|
489
491
|
version=old_version,
|
|
490
492
|
remote_signal=old_signal,
|
|
491
493
|
allele_function_lookup=cpic_lookup,
|
|
494
|
+
min_rows=PHARMGKB_MIN_ROWS,
|
|
492
495
|
)
|
|
493
496
|
except Exception:
|
|
494
497
|
logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)
|
|
@@ -363,6 +363,30 @@ def _emit_build_diagnostics(result: object) -> None:
|
|
|
363
363
|
f"coordinates differ between builds and silently using the wrong "
|
|
364
364
|
f"one will miss every hit.[/yellow]"
|
|
365
365
|
)
|
|
366
|
+
elif (
|
|
367
|
+
not diag.override
|
|
368
|
+
and diag.detected_build is None
|
|
369
|
+
and diag.header_build is not None
|
|
370
|
+
and diag.inspected_count > 0
|
|
371
|
+
):
|
|
372
|
+
# Position-detection inspected known-rsID rows but couldn't pick a
|
|
373
|
+
# build — either votes tied across builds or no row matched any
|
|
374
|
+
# build's reference position. Without this warning, the pipeline
|
|
375
|
+
# silently falls through to header_build, and a GRCh36 file with a
|
|
376
|
+
# GRCh37-mislabeled header gets the GRCh37 ClinVar cache (the
|
|
377
|
+
# silent-coords trap #15). The dim "header (no position
|
|
378
|
+
# confirmation)" status line shows the same facts but reads as
|
|
379
|
+
# routine — yellow is what the situation deserves.
|
|
380
|
+
console.print(
|
|
381
|
+
f"[yellow]Build detection inconclusive: "
|
|
382
|
+
f"{diag.inspected_count} known-rsID position checks ran but "
|
|
383
|
+
f"did not converge on a build. Using the file's header-claimed "
|
|
384
|
+
f"build ({diag.header_build}), which has not been confirmed "
|
|
385
|
+
f"against your position data. If the file is actually a "
|
|
386
|
+
f"different build, pass --build grch37 or --build grch38 to "
|
|
387
|
+
f"force — wrong coordinates will silently mis-annotate every "
|
|
388
|
+
f"variant.[/yellow]"
|
|
389
|
+
)
|
|
366
390
|
if diag.effective_build == "GRCh36":
|
|
367
391
|
console.print(
|
|
368
392
|
"[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "
|
|
@@ -18,29 +18,12 @@ from allelix.databases import resolve_data_dir
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from pathlib import Path
|
|
20
20
|
|
|
21
|
-
from allelix.annotators.base import Annotator
|
|
22
|
-
|
|
23
21
|
|
|
24
22
|
@main.group()
|
|
25
23
|
def db() -> None:
|
|
26
24
|
"""Manage local reference database cache."""
|
|
27
25
|
|
|
28
26
|
|
|
29
|
-
def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
|
|
30
|
-
"""Write a remote signal to an existing cache without re-downloading."""
|
|
31
|
-
import contextlib
|
|
32
|
-
import sqlite3
|
|
33
|
-
|
|
34
|
-
from allelix.databases.manager import stamp_remote_signal
|
|
35
|
-
|
|
36
|
-
db_path = getattr(annotator, "_db_path", None)
|
|
37
|
-
if db_path is None:
|
|
38
|
-
return
|
|
39
|
-
with contextlib.closing(sqlite3.connect(db_path)) as conn:
|
|
40
|
-
stamp_remote_signal(conn, annotator.name, signal)
|
|
41
|
-
conn.commit()
|
|
42
|
-
|
|
43
|
-
|
|
44
27
|
def _confirm_cadd_license(*, license_held: bool = False) -> bool:
|
|
45
28
|
"""Show the CADD license notice and ask for confirmation."""
|
|
46
29
|
if license_held:
|
|
@@ -207,14 +190,20 @@ def db_update(
|
|
|
207
190
|
continue
|
|
208
191
|
|
|
209
192
|
if cached is None:
|
|
210
|
-
|
|
193
|
+
# GH #20: a cache with no stored freshness signal almost
|
|
194
|
+
# always predates the signal mechanism — i.e., it is old.
|
|
195
|
+
# The previous behavior was to stamp the live remote signal
|
|
196
|
+
# onto the cache and call it current, which permanently
|
|
197
|
+
# marked stale data as fresh (only `--force` would escape).
|
|
198
|
+
# Treat tagless caches as needing a refresh.
|
|
211
199
|
console.print(
|
|
212
|
-
f" [
|
|
213
|
-
|
|
200
|
+
f" [bold]{annotator.name}[/bold]: cache predates the "
|
|
201
|
+
"freshness signal; re-downloading…"
|
|
202
|
+
)
|
|
203
|
+
else:
|
|
204
|
+
console.print(
|
|
205
|
+
f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…"
|
|
214
206
|
)
|
|
215
|
-
continue
|
|
216
|
-
|
|
217
|
-
console.print(f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
|
|
218
207
|
if _helpers._run_setup(annotator):
|
|
219
208
|
console.print(
|
|
220
209
|
f" [green]✓ {annotator.name} refreshed[/green] "
|
|
@@ -9,7 +9,7 @@ column of ``database_versions`` (e.g. ``iv:1``) so ``is_ready()`` can
|
|
|
9
9
|
reject stale caches without forcing a full re-download.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
-
CLINVAR_INTERPRETER_VERSION = 1
|
|
12
|
+
CLINVAR_INTERPRETER_VERSION = 2 # v2.0.1: GH #42 CLNDN-join in iter_clinvar_records
|
|
13
13
|
PHARMGKB_INTERPRETER_VERSION = 1
|
|
14
14
|
GNOMAD_SCHEMA_VERSION = 1
|
|
15
15
|
ALPHAMISSENSE_SCHEMA_VERSION = 1
|
|
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
|
|
|
23
23
|
ALPHAMISSENSE_DB_FILENAME = "alphamissense.sqlite"
|
|
24
24
|
|
|
25
25
|
ALPHAMISSENSE_CACHE_URL = (
|
|
26
|
-
"https://huggingface.co/datasets/
|
|
26
|
+
"https://huggingface.co/datasets/allelix/allelix-alphamissense"
|
|
27
27
|
"/resolve/13a15e199536512b5e2d208d79c4f93c0a73f71f/alphamissense.sqlite.gz"
|
|
28
28
|
)
|
|
29
29
|
|
|
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
|
|
|
24
24
|
GNOMAD_DB_FILENAME = "gnomad.sqlite"
|
|
25
25
|
|
|
26
26
|
GNOMAD_CACHE_URL = (
|
|
27
|
-
"https://huggingface.co/datasets/
|
|
27
|
+
"https://huggingface.co/datasets/allelix/allelix-gnomad"
|
|
28
28
|
"/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
|
|
29
29
|
)
|
|
30
30
|
|
|
@@ -465,16 +465,32 @@ def iter_gwas_records(tsv_path: Path) -> Iterator[dict[str, object]]:
|
|
|
465
465
|
yield from best.values()
|
|
466
466
|
|
|
467
467
|
|
|
468
|
+
# Truncation sanity floor for production loads. This guards the count
|
|
469
|
+
# returned by iter_gwas_records — i.e. rows AFTER haplotype/no-trait
|
|
470
|
+
# filtering and (rsid, trait) dedup, not the raw catalog. EBI curates
|
|
471
|
+
# ~625K lead associations (GWAS Catalog, 2025); the loaded count is lower
|
|
472
|
+
# than that but still far above this floor. 100K only catches gross
|
|
473
|
+
# truncation (a mid-stream download committed as "complete") while staying
|
|
474
|
+
# permissive against legitimate upstream drift. Set to 0 from tests so
|
|
475
|
+
# synthetic fixtures of any size load cleanly. See GH #19.
|
|
476
|
+
GWAS_MIN_ROWS = 100_000
|
|
477
|
+
|
|
478
|
+
|
|
468
479
|
def load_gwas_tsv(
|
|
469
480
|
tsv_path: Path,
|
|
470
481
|
db_path: Path,
|
|
471
482
|
source_url: str = "",
|
|
472
483
|
remote_signal: str | None = None,
|
|
484
|
+
min_rows: int = 0,
|
|
473
485
|
) -> int:
|
|
474
486
|
"""Parse a GWAS Catalog TSV into a fresh SQLite cache atomically.
|
|
475
487
|
|
|
476
488
|
Writes to a `.tmp` sibling and `os.replace`s onto `db_path` only after a
|
|
477
489
|
successful commit. Returns the number of records loaded.
|
|
490
|
+
|
|
491
|
+
``min_rows`` is a sanity floor checked before the final ``os.replace``.
|
|
492
|
+
Set by production callers (see ``GWASCatalogAnnotator.setup``) to
|
|
493
|
+
``GWAS_MIN_ROWS``; defaults to 0 so test fixtures of any size load.
|
|
478
494
|
"""
|
|
479
495
|
tmp_path = db_path.parent / f"{db_path.name}.tmp"
|
|
480
496
|
if tmp_path.exists():
|
|
@@ -535,6 +551,15 @@ def load_gwas_tsv(
|
|
|
535
551
|
),
|
|
536
552
|
)
|
|
537
553
|
conn.commit()
|
|
554
|
+
if count < min_rows:
|
|
555
|
+
msg = (
|
|
556
|
+
f"GWAS Catalog load aborted: only {count:,} rows ingested "
|
|
557
|
+
f"(floor {min_rows:,}). The download was likely truncated "
|
|
558
|
+
f"in flight (chunked transfer with no Content-Length, or "
|
|
559
|
+
f"connection drop mid-stream). Retry with "
|
|
560
|
+
f"`allelix db update --force`."
|
|
561
|
+
)
|
|
562
|
+
raise OSError(msg)
|
|
538
563
|
os.replace(tmp_path, db_path)
|
|
539
564
|
return count
|
|
540
565
|
except Exception:
|
|
@@ -197,9 +197,20 @@ def parse_clinvar_version(vcf_path: Path) -> str | None:
|
|
|
197
197
|
def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
|
|
198
198
|
"""Stream parse a ClinVar VCF (.vcf or .vcf.gz). Skip entries without an RS id.
|
|
199
199
|
|
|
200
|
-
Multi-allelic rows (ALT="A,T") are split into one record per ALT.
|
|
201
|
-
INFO fields
|
|
202
|
-
|
|
200
|
+
Multi-allelic rows (ALT="A,T") are split into one record per ALT.
|
|
201
|
+
Parallel INFO fields ``CLNSIG`` and ``ALLELEID`` are separated by
|
|
202
|
+
``|`` and index-paired with the ALTs.
|
|
203
|
+
|
|
204
|
+
GH #42: ``CLNDN`` is NOT index-paired with ALTs — its ``|`` separator
|
|
205
|
+
enumerates the union of conditions across all SCV submissions on the
|
|
206
|
+
variant, with no positional mapping to CLNSIG. Joining the full list
|
|
207
|
+
into a single ``condition`` string per record avoids the Frankenstein
|
|
208
|
+
pairing (one SCV's classification next to another SCV's condition)
|
|
209
|
+
that index-picking introduced. The primary classification
|
|
210
|
+
(``CLNSIG[0]``) is kept as-is — that value is correct as a
|
|
211
|
+
variant-level claim; only the condition-pairing was misleading.
|
|
212
|
+
Full per-(classification, condition) pairing via
|
|
213
|
+
``submission_summary.txt.gz`` is tracked for v2.1.
|
|
203
214
|
"""
|
|
204
215
|
opener = gzip.open if vcf_path.suffix == ".gz" else open
|
|
205
216
|
with opener(vcf_path, "rt", encoding="utf-8") as fh:
|
|
@@ -231,6 +242,12 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
|
|
|
231
242
|
review_status = info_dict.get("CLNREVSTAT", "")
|
|
232
243
|
gene = _extract_gene(info_dict.get("GENEINFO", ""))
|
|
233
244
|
|
|
245
|
+
# GH #42: CLNDN's `|`-separator is per-SCV, not per-ALT.
|
|
246
|
+
# Join the full list once per row (same string emitted for
|
|
247
|
+
# every ALT split-out of this record). Empty/`.`/blank
|
|
248
|
+
# tokens are filtered out so callers don't see leading/trailing
|
|
249
|
+
# separators.
|
|
250
|
+
joined_condition = "; ".join(c.replace("_", " ") for c in clndns if c and c != ".")
|
|
234
251
|
for i, alt in enumerate(alts):
|
|
235
252
|
yield {
|
|
236
253
|
"rsid": f"rs{rs}",
|
|
@@ -239,7 +256,7 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
|
|
|
239
256
|
"ref": ref,
|
|
240
257
|
"alt": alt,
|
|
241
258
|
"clinical_significance": _pick(clnsigs, i),
|
|
242
|
-
"condition":
|
|
259
|
+
"condition": joined_condition,
|
|
243
260
|
"gene": gene,
|
|
244
261
|
"review_status": review_status,
|
|
245
262
|
"allele_id": _safe_int(_pick(allele_ids, i)),
|
|
@@ -334,6 +334,13 @@ def _safe_float(value: str) -> float | None:
|
|
|
334
334
|
return None
|
|
335
335
|
|
|
336
336
|
|
|
337
|
+
# Truncation sanity floor for production loads. Current ClinPGx clinical
|
|
338
|
+
# annotations ship ~13K rows; ~5K is a generous floor that catches gross
|
|
339
|
+
# truncation while staying permissive against upstream-data drift. Set
|
|
340
|
+
# to 0 from tests so synthetic fixtures of any size load cleanly. See GH #19.
|
|
341
|
+
PHARMGKB_MIN_ROWS = 5_000
|
|
342
|
+
|
|
343
|
+
|
|
337
344
|
def load_pharmgkb_tsv(
|
|
338
345
|
zip_or_dir: Path,
|
|
339
346
|
db_path: Path,
|
|
@@ -341,6 +348,7 @@ def load_pharmgkb_tsv(
|
|
|
341
348
|
version: str = "",
|
|
342
349
|
remote_signal: str | None = None,
|
|
343
350
|
allele_function_lookup: dict[tuple[str, str], str] | None = None,
|
|
351
|
+
min_rows: int = 0,
|
|
344
352
|
) -> int:
|
|
345
353
|
"""Load a ClinPGx clinical-annotations dump into a fresh SQLite cache atomically.
|
|
346
354
|
|
|
@@ -430,6 +438,15 @@ def load_pharmgkb_tsv(
|
|
|
430
438
|
),
|
|
431
439
|
)
|
|
432
440
|
conn.commit()
|
|
441
|
+
if count < min_rows:
|
|
442
|
+
msg = (
|
|
443
|
+
f"ClinPGx load aborted: only {count:,} rows ingested "
|
|
444
|
+
f"(floor {min_rows:,}). The download was likely truncated "
|
|
445
|
+
f"in flight (chunked transfer with no Content-Length, or "
|
|
446
|
+
f"connection drop mid-stream). Retry with "
|
|
447
|
+
f"`allelix db update --force`."
|
|
448
|
+
)
|
|
449
|
+
raise OSError(msg)
|
|
433
450
|
os.replace(tmp_path, db_path)
|
|
434
451
|
return count
|
|
435
452
|
except Exception:
|
|
@@ -147,6 +147,13 @@ def _dedupe_existing(conn: sqlite3.Connection) -> int:
|
|
|
147
147
|
return before - after
|
|
148
148
|
|
|
149
149
|
|
|
150
|
+
# GH #12: identifier allowlist for the raw_table f-string interpolation
|
|
151
|
+
# below. The interpolation cannot be parameterized (SQLite doesn't support
|
|
152
|
+
# bind variables for identifiers); the allowlist gives a programmatic
|
|
153
|
+
# guarantee that only these two literals can reach the SQL.
|
|
154
|
+
_VALID_RAW_TABLES: frozenset[str] = frozenset({"_raw_pages", "pages"})
|
|
155
|
+
|
|
156
|
+
|
|
150
157
|
def detect_raw_table(conn: sqlite3.Connection) -> str | None:
|
|
151
158
|
"""Return the name of the raw pages table, or None if absent."""
|
|
152
159
|
tables = {
|
|
@@ -183,6 +190,16 @@ def _parse_raw_pages_inner(conn: sqlite3.Connection, *, verbose: bool = False) -
|
|
|
183
190
|
raw_table = detect_raw_table(conn)
|
|
184
191
|
if raw_table is None:
|
|
185
192
|
return 0
|
|
193
|
+
# GH #12: `raw_table` flows into three SQL queries via f-string
|
|
194
|
+
# interpolation because SQLite doesn't support parameterized
|
|
195
|
+
# identifiers. Today it's safe — `detect_raw_table` only ever returns
|
|
196
|
+
# one of two literals or None — but the function's `str | None` return
|
|
197
|
+
# type doesn't pin that. A future edit (config-driven table name,
|
|
198
|
+
# scraped metadata) could drift it into an injection path. Allowlist
|
|
199
|
+
# explicitly so the guarantee outlives memory of the original design.
|
|
200
|
+
if raw_table not in _VALID_RAW_TABLES:
|
|
201
|
+
msg = f"unexpected raw table name: {raw_table!r}"
|
|
202
|
+
raise ValueError(msg)
|
|
186
203
|
|
|
187
204
|
if verbose:
|
|
188
205
|
logger.info("Parsing SNPedia raw pages from '%s' table", raw_table)
|
|
@@ -40,6 +40,25 @@ class Variant:
|
|
|
40
40
|
allele2: str
|
|
41
41
|
build: str = DEFAULT_BUILD
|
|
42
42
|
|
|
43
|
+
def __post_init__(self) -> None:
|
|
44
|
+
"""Normalize allele case at construction (GH #14).
|
|
45
|
+
|
|
46
|
+
Reference databases (ClinVar, gnomAD, ClinPGx, etc.) all ship
|
|
47
|
+
uppercase alleles, and carrier matching is raw set membership
|
|
48
|
+
against ``{allele1, allele2}`` — a lowercase user allele would
|
|
49
|
+
silently fail to match and zero annotations would be produced
|
|
50
|
+
for a real carrier. Production parsers all emit uppercase
|
|
51
|
+
today, but a user-supplied filter file (custom panel) or a
|
|
52
|
+
future format variant could leak lowercase through. Normalize
|
|
53
|
+
at the model boundary so the invariant is impossible to
|
|
54
|
+
violate downstream. The no-call marker is left as-is;
|
|
55
|
+
multi-base alleles (indels) are uppercased in place.
|
|
56
|
+
"""
|
|
57
|
+
if self.allele1 and self.allele1 != NO_CALL_MARKER:
|
|
58
|
+
self.allele1 = self.allele1.upper()
|
|
59
|
+
if self.allele2 and self.allele2 != NO_CALL_MARKER:
|
|
60
|
+
self.allele2 = self.allele2.upper()
|
|
61
|
+
|
|
43
62
|
@property
|
|
44
63
|
def is_heterozygous(self) -> bool:
|
|
45
64
|
"""True if the two alleles differ (and neither is a no-call)."""
|
|
@@ -17,10 +17,24 @@ logger = logging.getLogger(__name__)
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def split_csv_line(line: str) -> list[str]:
|
|
20
|
-
"""Split a comma-delimited line and strip
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
(
|
|
20
|
+
"""Split a comma-delimited line and strip surrounding quotes from each field.
|
|
21
|
+
|
|
22
|
+
Implementation is ``line.split(",")`` followed by a per-field
|
|
23
|
+
``strip().strip('"')``. This is NOT a real CSV parser: a quoted field
|
|
24
|
+
containing a literal comma yields the wrong column count and is
|
|
25
|
+
silently dropped by callers' ``len(parts) != EXPECTED_COLUMNS``
|
|
26
|
+
guard.
|
|
27
|
+
|
|
28
|
+
Adequate for FTDNA / MyHeritage / Living DNA because every value in
|
|
29
|
+
those exports is either an rsID, chromosome identifier, integer
|
|
30
|
+
position, or concatenated genotype string — none of which contain
|
|
31
|
+
commas. If a future format ever ships embedded commas in quoted
|
|
32
|
+
fields, swap to ``csv.reader`` rather than relying on this helper.
|
|
33
|
+
|
|
34
|
+
Strips both surrounding double quotes (``"rs1"``) and the
|
|
35
|
+
double-double-quote variant some MyHeritage exports produce
|
|
36
|
+
(``""rs1""``) — the latter because all surrounding quotes are removed by one
|
|
37
|
+
``strip('"')``.
|
|
24
38
|
"""
|
|
25
39
|
return [field.strip().strip('"') for field in line.split(",")]
|
|
26
40
|
|
|
@@ -42,6 +42,14 @@ SNIFF_LINE_LIMIT = 50
|
|
|
42
42
|
EXPECTED_COLUMNS = 4
|
|
43
43
|
HEADER_CANONICAL = "RSID,CHROMOSOME,POSITION,RESULT"
|
|
44
44
|
|
|
45
|
+
# GH #26: FTDNA and MyHeritage files share the same data shape and
|
|
46
|
+
# header line. Without an explicit exclusion, both `can_parse`
|
|
47
|
+
# implementations accept MyHeritage files and the routing was masked
|
|
48
|
+
# only by registry order (MyHeritage listed first in parsers/__init__.py).
|
|
49
|
+
# Reorder the registry and FTDNA silently mislabels source format. The
|
|
50
|
+
# discriminator is the MyHeritage signature comment in the first line.
|
|
51
|
+
_MYHERITAGE_SIGNATURE = "MyHeritage"
|
|
52
|
+
|
|
45
53
|
|
|
46
54
|
def _is_header_line(line: str) -> bool:
|
|
47
55
|
"""True if *line* is the FTDNA column header (quoted or unquoted)."""
|
|
@@ -58,7 +66,14 @@ class FTDNAParser(GenotypeParser):
|
|
|
58
66
|
url: ClassVar[str] = "https://www.familytreedna.com"
|
|
59
67
|
|
|
60
68
|
def can_parse(self, file_path: Path) -> bool:
|
|
61
|
-
"""Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header."""
|
|
69
|
+
"""Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header.
|
|
70
|
+
|
|
71
|
+
GH #26: rejects files carrying the MyHeritage signature comment.
|
|
72
|
+
Both formats are byte-identical past the first comment line, so
|
|
73
|
+
the discriminator must be checked explicitly — otherwise FTDNA
|
|
74
|
+
also claims MyHeritage files and routing depends on registry
|
|
75
|
+
order (which has silently mislabeled formats in past audits).
|
|
76
|
+
"""
|
|
62
77
|
try:
|
|
63
78
|
with file_path.open("r", encoding="utf-8") as fh:
|
|
64
79
|
for _ in range(SNIFF_LINE_LIMIT):
|
|
@@ -66,6 +81,8 @@ class FTDNAParser(GenotypeParser):
|
|
|
66
81
|
if not line:
|
|
67
82
|
return False
|
|
68
83
|
line = line.rstrip("\r\n")
|
|
84
|
+
if _MYHERITAGE_SIGNATURE in line:
|
|
85
|
+
return False
|
|
69
86
|
if not line or line.startswith("#"):
|
|
70
87
|
continue
|
|
71
88
|
return _is_header_line(line)
|
|
@@ -32,6 +32,7 @@ Specifics:
|
|
|
32
32
|
from __future__ import annotations
|
|
33
33
|
|
|
34
34
|
import logging
|
|
35
|
+
import re
|
|
35
36
|
from typing import TYPE_CHECKING, ClassVar
|
|
36
37
|
|
|
37
38
|
from allelix.models import DEFAULT_BUILD, Variant
|
|
@@ -49,6 +50,12 @@ SIGNATURE = "Living DNA"
|
|
|
49
50
|
SNIFF_LINE_LIMIT = 50
|
|
50
51
|
EXPECTED_COLUMNS = 4
|
|
51
52
|
|
|
53
|
+
# GH #16: only inspect comment lines that look like a build marker.
|
|
54
|
+
# Without this filter every comment line is fed to
|
|
55
|
+
# ``normalize_build_label`` and a stray date / version digit can override
|
|
56
|
+
# the real build line.
|
|
57
|
+
_BUILD_MARKER_RE = re.compile(r"\b(build|reference|genome)\b", re.IGNORECASE)
|
|
58
|
+
|
|
52
59
|
|
|
53
60
|
class LivingDNAParser(GenotypeParser):
|
|
54
61
|
"""Parser for Living DNA consumer genotype files."""
|
|
@@ -104,16 +111,30 @@ class LivingDNAParser(GenotypeParser):
|
|
|
104
111
|
)
|
|
105
112
|
|
|
106
113
|
def get_metadata(self, file_path: Path) -> GenotypeMetadata:
|
|
107
|
-
"""Extract build from header comments. Living DNA has no sample ID field."""
|
|
114
|
+
"""Extract build from header comments. Living DNA has no sample ID field.
|
|
115
|
+
|
|
116
|
+
GH #16: previously every comment line was passed into
|
|
117
|
+
``normalize_build_label`` and the *last* match won. A download-
|
|
118
|
+
date comment like ``# downloaded 2038-01-01`` would silently
|
|
119
|
+
retag the file as GRCh38. Only lines that look like an explicit
|
|
120
|
+
build marker (containing ``build``, ``reference``, or ``genome``)
|
|
121
|
+
are inspected, and the first match wins — matching the format
|
|
122
|
+
spec which puts the build line near the top:
|
|
123
|
+
|
|
124
|
+
# Human Genome Reference Build 37 (GRCh37.p13).
|
|
125
|
+
"""
|
|
108
126
|
build = DEFAULT_BUILD
|
|
109
127
|
with file_path.open("r", encoding="utf-8") as fh:
|
|
110
128
|
for raw in fh:
|
|
111
129
|
line = raw.rstrip("\r\n")
|
|
112
130
|
if not line.startswith("#"):
|
|
113
131
|
break
|
|
132
|
+
if not _BUILD_MARKER_RE.search(line):
|
|
133
|
+
continue
|
|
114
134
|
normalized = normalize_build_label(line)
|
|
115
135
|
if normalized:
|
|
116
136
|
build = normalized
|
|
137
|
+
break
|
|
117
138
|
return GenotypeMetadata(
|
|
118
139
|
format=self.name,
|
|
119
140
|
sample_id="",
|