allelix 2.0.0__tar.gz → 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. {allelix-2.0.0 → allelix-2.0.2}/PKG-INFO +13 -1
  2. {allelix-2.0.0 → allelix-2.0.2}/README.md +12 -0
  3. allelix-2.0.2/allelix/__init__.py +41 -0
  4. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/base.py +24 -1
  5. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/clinvar.py +21 -1
  6. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/gnomad.py +10 -2
  7. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/gwas.py +22 -7
  8. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/pharmgkb.py +3 -0
  9. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/_helpers.py +44 -3
  10. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/db.py +12 -23
  11. allelix-2.0.2/allelix/data/high_value_snps.yaml +136 -0
  12. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/_versions.py +1 -1
  13. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/alphamissense_loader.py +1 -1
  14. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/gnomad_loader.py +1 -1
  15. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/gwas_loader.py +25 -0
  16. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/manager.py +21 -4
  17. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/pharmgkb_loader.py +17 -0
  18. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/snpedia_parser.py +17 -0
  19. {allelix-2.0.0 → allelix-2.0.2}/allelix/models.py +19 -0
  20. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/_helpers.py +18 -4
  21. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/base.py +8 -1
  22. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/ftdna.py +18 -1
  23. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/livingdna.py +22 -1
  24. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/vcf.py +30 -7
  25. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/_pipeline.py +150 -45
  26. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/diff.py +43 -4
  27. allelix-2.0.2/allelix/reports/terminal.py +241 -0
  28. {allelix-2.0.0 → allelix-2.0.2}/allelix/utils/allele.py +16 -14
  29. {allelix-2.0.0 → allelix-2.0.2}/allelix/utils/build_detect.py +31 -9
  30. {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/PKG-INFO +13 -1
  31. {allelix-2.0.0 → allelix-2.0.2}/pyproject.toml +1 -1
  32. {allelix-2.0.0 → allelix-2.0.2}/tests/test_cli.py +99 -10
  33. {allelix-2.0.0 → allelix-2.0.2}/tests/test_end_to_end.py +28 -6
  34. {allelix-2.0.0 → allelix-2.0.2}/tests/test_models.py +33 -0
  35. allelix-2.0.2/tests/test_version.py +100 -0
  36. allelix-2.0.0/allelix/__init__.py +0 -12
  37. allelix-2.0.0/allelix/data/high_value_snps.yaml +0 -64
  38. allelix-2.0.0/allelix/reports/terminal.py +0 -205
  39. allelix-2.0.0/tests/test_version.py +0 -52
  40. {allelix-2.0.0 → allelix-2.0.2}/LICENSE +0 -0
  41. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/__init__.py +0 -0
  42. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/alphamissense.py +0 -0
  43. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/cadd.py +0 -0
  44. {allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/snpedia.py +0 -0
  45. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/__init__.py +0 -0
  46. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/_options.py +0 -0
  47. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/analyze.py +0 -0
  48. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/config.py +0 -0
  49. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/focused.py +0 -0
  50. {allelix-2.0.0 → allelix-2.0.2}/allelix/cli/utility.py +0 -0
  51. {allelix-2.0.0 → allelix-2.0.2}/allelix/compare.py +0 -0
  52. {allelix-2.0.0 → allelix-2.0.2}/allelix/config.py +0 -0
  53. {allelix-2.0.0 → allelix-2.0.2}/allelix/data/__init__.py +0 -0
  54. {allelix-2.0.0 → allelix-2.0.2}/allelix/data/clinvar_clnsig_snapshot.yaml +0 -0
  55. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/__init__.py +0 -0
  56. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/cadd_loader.py +0 -0
  57. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/cpic_loader.py +0 -0
  58. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/loader_utils.py +0 -0
  59. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/schema.py +0 -0
  60. {allelix-2.0.0 → allelix-2.0.2}/allelix/databases/snpedia_loader.py +0 -0
  61. {allelix-2.0.0 → allelix-2.0.2}/allelix/exporters/__init__.py +0 -0
  62. {allelix-2.0.0 → allelix-2.0.2}/allelix/exporters/plink.py +0 -0
  63. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/__init__.py +0 -0
  64. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/ancestrydna.py +0 -0
  65. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/ftdna_illumina.py +0 -0
  66. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/myhappygenes.py +0 -0
  67. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/myheritage.py +0 -0
  68. {allelix-2.0.0 → allelix-2.0.2}/allelix/parsers/twentythreeandme.py +0 -0
  69. {allelix-2.0.0 → allelix-2.0.2}/allelix/py.typed +0 -0
  70. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/__init__.py +0 -0
  71. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/high_value.py +0 -0
  72. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/html.py +0 -0
  73. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/json_report.py +0 -0
  74. {allelix-2.0.0 → allelix-2.0.2}/allelix/reports/methylation.py +0 -0
  75. {allelix-2.0.0 → allelix-2.0.2}/allelix/utils/__init__.py +0 -0
  76. {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/SOURCES.txt +0 -0
  77. {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/dependency_links.txt +0 -0
  78. {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/entry_points.txt +0 -0
  79. {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/requires.txt +0 -0
  80. {allelix-2.0.0 → allelix-2.0.2}/allelix.egg-info/top_level.txt +0 -0
  81. {allelix-2.0.0 → allelix-2.0.2}/setup.cfg +0 -0
  82. {allelix-2.0.0 → allelix-2.0.2}/tests/test_cli_helpers.py +0 -0
  83. {allelix-2.0.0 → allelix-2.0.2}/tests/test_compare.py +0 -0
  84. {allelix-2.0.0 → allelix-2.0.2}/tests/test_config.py +0 -0
  85. {allelix-2.0.0 → allelix-2.0.2}/tests/test_mock_data_invariants.py +0 -0
  86. {allelix-2.0.0 → allelix-2.0.2}/tests/test_registry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 2.0.0
3
+ Version: 2.0.2
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
5
  Author: Allelix
6
6
  Maintainer-email: dial481 <dial481@users.noreply.github.com>
@@ -161,6 +161,18 @@ This is not a disclaimer afterthought. It is a design constraint that affects mo
161
161
  - Reference databases are downloaded via `allelix db update` and cached locally.
162
162
  - Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
163
163
 
164
+ ### Output files contain real annotations of your genome
165
+
166
+ The JSON / HTML / terminal output of `allelix analyze` and its
167
+ focused subcommands contains real annotations against your specific
168
+ variants — drug-response calls, carrier-status flags, hereditary-
169
+ disease findings. Wherever you write them via `--output <path>`,
170
+ that's where they sit until you delete them. Allelix doesn't
171
+ auto-clean and won't warn you when you write to `/tmp/` or any
172
+ other shared location. Treat the files as personal data: read them,
173
+ move them somewhere you control, or delete them when you're done. A
174
+ data-lifecycle subcommand is planned for v2.1.
175
+
164
176
  ## Configuration
165
177
 
166
178
  Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.
@@ -124,6 +124,18 @@ This is not a disclaimer afterthought. It is a design constraint that affects mo
124
124
  - Reference databases are downloaded via `allelix db update` and cached locally.
125
125
  - Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
126
126
 
127
+ ### Output files contain real annotations of your genome
128
+
129
+ The JSON / HTML / terminal output of `allelix analyze` and its
130
+ focused subcommands contains real annotations against your specific
131
+ variants — drug-response calls, carrier-status flags, hereditary-
132
+ disease findings. Wherever you write them via `--output <path>`,
133
+ that's where they sit until you delete them. Allelix doesn't
134
+ auto-clean and won't warn you when you write to `/tmp/` or any
135
+ other shared location. Treat the files as personal data: read them,
136
+ move them somewhere you control, or delete them when you're done. A
137
+ data-lifecycle subcommand is planned for v2.1.
138
+
127
139
  ## Configuration
128
140
 
129
141
  Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.
@@ -0,0 +1,41 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 Allelix
3
+ """Allelix: open-source genotype analysis toolkit."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from importlib.metadata import PackageNotFoundError, version
8
+
9
+
10
+ def _read_pyproject_version() -> str | None:
11
+ """Read the package version from ``pyproject.toml``.
12
+
13
+ GH #34: fall back to ``pyproject.toml`` when run from a bare source
14
+ checkout (no editable install, no installed package metadata). Keeps
15
+ ``--version`` and the outbound HTTP User-Agent reporting the real
16
+ version string instead of the ``0.0.0+local`` sentinel that
17
+ misidentifies our traffic to NCBI / EBI / HuggingFace.
18
+
19
+ Returns ``None`` on any failure — the caller falls back to the
20
+ sentinel rather than crashing import.
21
+ """
22
+ import tomllib
23
+ from pathlib import Path
24
+
25
+ pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
26
+ try:
27
+ with pyproject.open("rb") as fh:
28
+ data = tomllib.load(fh)
29
+ except (OSError, tomllib.TOMLDecodeError):
30
+ return None
31
+ project = data.get("project") or {}
32
+ v = project.get("version")
33
+ return v if isinstance(v, str) and v else None
34
+
35
+
36
+ try:
37
+ __version__ = version("allelix")
38
+ except PackageNotFoundError:
39
+ # Source checkout without an editable install. Try pyproject.toml
40
+ # before falling back to the sentinel.
41
+ __version__ = _read_pyproject_version() or "0.0.0+local"
@@ -142,7 +142,30 @@ class Annotator(ABC):
142
142
  self.data_dir = data_dir
143
143
 
144
144
  def __del__(self) -> None:
145
- """Release resources on GC to prevent ResourceWarning."""
145
+ """Safety-net resource release on GC. Deliberately retained.
146
+
147
+ GH #36 (audit second pass) flagged ``__del__`` as a Python
148
+ antipattern — GC timing is nondeterministic and raised exceptions
149
+ are silently swallowed. The correct usage pattern is the
150
+ ``__enter__`` / ``__exit__`` context manager pair below, wired
151
+ through ``contextlib.ExitStack`` in ``reports/_pipeline.py``.
152
+
153
+ However: removing ``__del__`` exposes residual SQLite connection
154
+ leaks in code paths that construct an annotator outside a
155
+ context manager. ``ResourceWarning`` is elevated to error by
156
+ ``pytest`` config, so leaks fail the suite as
157
+ ``PytestUnraisableExceptionWarning`` — caught in the v2.0.2
158
+ ship gate when ``__del__`` was first removed. Until every call
159
+ site is verified to use ``with`` / ``ExitStack`` / explicit
160
+ ``close()``, this safety net stays. v2.1 task: audit and
161
+ remove.
162
+
163
+ ``contextlib.suppress(Exception)`` is deliberate — ``__del__``
164
+ must never raise. The GC timing and shutdown-ordering edges
165
+ are explicitly silenced; this is exactly the
166
+ "if you must keep ``__del__``, make absolutely sure it can
167
+ never raise" mitigation the audit recommended.
168
+ """
146
169
  with contextlib.suppress(Exception):
147
170
  self.close()
148
171
 
@@ -14,6 +14,7 @@ and dispatches per-variant by `variant.build`.
14
14
  from __future__ import annotations
15
15
 
16
16
  import logging
17
+ import re
17
18
  import sqlite3
18
19
  from typing import TYPE_CHECKING, ClassVar
19
20
 
@@ -42,6 +43,14 @@ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
42
43
 
43
44
  _BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
44
45
 
46
+ # GH #21: a remote .md5 endpoint can return an HTML error page on a
47
+ # transient blip. The first whitespace-separated token of the body is
48
+ # what we treat as the hash, so without this gate `<!DOCTYPE` would be
49
+ # accepted as the "signal" and later passed to `verify_file_hash`, which
50
+ # would then delete the freshly downloaded VCF. MD5 is exactly 32 hex
51
+ # digits; reject anything else.
52
+ _MD5_HEX_RE = re.compile(r"^[0-9a-fA-F]{32}$")
53
+
45
54
 
46
55
  def clinvar_db_filename(build: str) -> str:
47
56
  """Per-build cache filename. Two coexisting SQLite files per data_dir."""
@@ -323,7 +332,18 @@ class ClinVarAnnotator(Annotator):
323
332
  if not body:
324
333
  return None
325
334
  first_token = body.strip().split(None, 1)[0] if body.strip() else ""
326
- if not first_token:
335
+ if not _MD5_HEX_RE.fullmatch(first_token):
336
+ # CDN error page, redirect interstitial, or empty body. Treat
337
+ # as a transient signal failure rather than poisoning the
338
+ # cache: callers handle `None` as "freshness unknown, skip"
339
+ # in `db update`, and `setup()` raises rather than passing
340
+ # garbage to `verify_file_hash` (which would delete the VCF).
341
+ logger.warning(
342
+ "clinvar(%s): .md5 endpoint returned a body whose first token "
343
+ "is not a 32-char hex digest (got %r); treating as no signal",
344
+ build,
345
+ first_token[:32],
346
+ )
327
347
  return None
328
348
  return f"md5:{first_token}"
329
349
 
@@ -94,12 +94,20 @@ class GnomadAnnotator(Annotator):
94
94
  logger.warning("Could not remove staged file at %s", gz_path)
95
95
 
96
96
  def is_ready(self) -> bool:
97
- """True when the gnomAD SQLite cache exists with current schema version."""
97
+ """True when the gnomAD SQLite cache exists with current schema version.
98
+
99
+ GH #22: a cache with no ``local_version_tag`` used to be accepted
100
+ as ready (the previous ``or not tag`` escape). That defeated the
101
+ whole point of ``GNOMAD_SCHEMA_VERSION``: if it ever gets bumped,
102
+ every tagless legacy cache would silently pass as the new
103
+ version. Reject tagless caches so the user is told to re-run
104
+ ``db update``.
105
+ """
98
106
  info = get_database_info(self._db_path, "gnomad")
99
107
  if info is None:
100
108
  return False
101
109
  tag = info.get("local_version_tag") or ""
102
- return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag
110
+ return tag == f"sv:{GNOMAD_SCHEMA_VERSION}"
103
111
 
104
112
  def version(self) -> str | None:
105
113
  """Return the cached database version, or None."""
@@ -16,6 +16,7 @@ from allelix.databases.gwas_loader import (
16
16
  _REQUIRED_GWAS_COLUMNS,
17
17
  GWAS_CATALOG_URL,
18
18
  GWAS_DB_FILENAME,
19
+ GWAS_MIN_ROWS,
19
20
  load_gwas_tsv,
20
21
  schema_is_current,
21
22
  )
@@ -57,18 +58,25 @@ _BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
57
58
 
58
59
 
59
60
  def _magnitude(p_value: float | None, or_beta: float | None) -> float:
60
- """Derive magnitude from p-value and optional effect size."""
61
+ """Derive magnitude from p-value and optional effect size.
62
+
63
+ GH #17: boundary comparisons are inclusive (``<=``) so the canonical
64
+ genome-wide-significance threshold ``p = 5e-8`` lands inside the
65
+ significant bucket rather than the suggestive bucket below it.
66
+ Strict ``<`` dropped the exact threshold value to base 4.0, two full
67
+ points below a barely-significant hit (base 6.0).
68
+ """
61
69
  if p_value is None:
62
70
  base = 2.0
63
- elif p_value < 5e-100:
71
+ elif p_value <= 5e-100:
64
72
  base = 8.0
65
- elif p_value < 5e-20:
73
+ elif p_value <= 5e-20:
66
74
  base = 7.0
67
- elif p_value < 5e-8:
75
+ elif p_value <= 5e-8:
68
76
  base = 6.0
69
- elif p_value < 5e-6:
77
+ elif p_value <= 5e-6:
70
78
  base = 4.0
71
- elif p_value < 5e-4:
79
+ elif p_value <= 5e-4:
72
80
  base = 3.0
73
81
  else:
74
82
  base = 2.0
@@ -143,7 +151,13 @@ class GWASCatalogAnnotator(Annotator):
143
151
  extracted = self.data_dir / tsv_names[0]
144
152
  if extracted != tsv_path:
145
153
  extracted.rename(tsv_path)
146
- load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
154
+ load_gwas_tsv(
155
+ tsv_path,
156
+ self._db_path,
157
+ source_url=url,
158
+ remote_signal=signal,
159
+ min_rows=GWAS_MIN_ROWS,
160
+ )
147
161
  finally:
148
162
  try:
149
163
  zip_path.unlink()
@@ -183,6 +197,7 @@ class GWASCatalogAnnotator(Annotator):
183
197
  self._db_path,
184
198
  source_url=GWAS_CATALOG_URL,
185
199
  remote_signal=self.cached_remote_signal(),
200
+ min_rows=GWAS_MIN_ROWS,
186
201
  )
187
202
  except Exception:
188
203
  logger.warning("Auto-reingest from cached TSV failed", exc_info=True)
@@ -24,6 +24,7 @@ from allelix.databases.manager import (
24
24
  from allelix.databases.pharmgkb_loader import (
25
25
  PHARMGKB_CLINICAL_URL,
26
26
  PHARMGKB_DB_FILENAME,
27
+ PHARMGKB_MIN_ROWS,
27
28
  _normalize_genotype,
28
29
  load_pharmgkb_tsv,
29
30
  schema_is_current,
@@ -146,6 +147,7 @@ class PharmGKBAnnotator(Annotator):
146
147
  source_url=url,
147
148
  remote_signal=signal,
148
149
  allele_function_lookup=cpic_lookup,
150
+ min_rows=PHARMGKB_MIN_ROWS,
149
151
  )
150
152
 
151
153
  def is_ready(self) -> bool:
@@ -489,6 +491,7 @@ def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
489
491
  version=old_version,
490
492
  remote_signal=old_signal,
491
493
  allele_function_lookup=cpic_lookup,
494
+ min_rows=PHARMGKB_MIN_ROWS,
492
495
  )
493
496
  except Exception:
494
497
  logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)
@@ -336,6 +336,12 @@ def _emit_build_diagnostics(result: object) -> None:
336
336
  source = "detected"
337
337
  elif diag.header_build:
338
338
  source = "header (no position confirmation)"
339
+ elif diag.chr_prefix_inferred:
340
+ # GH #38: chr-prefixed contig names ("chr1", "chrX", ...) reliably
341
+ # indicate GRCh38 in modern caller output. We DID detect a build;
342
+ # the banner and the warning should say so instead of reading as
343
+ # a blind default.
344
+ source = "inferred from chr-prefixed contig names"
339
345
  else:
340
346
  source = "fallback (no known SNPs matched)"
341
347
  console.print(
@@ -349,20 +355,55 @@ def _emit_build_diagnostics(result: object) -> None:
349
355
  f"This is a real-world data-quality issue — your provider may have "
350
356
  f"mislabeled the build (see ADR-0021).[/yellow]"
351
357
  )
358
+ elif diag.chr_prefix_inferred:
359
+ # GH #38: positive, accurate message — the inference path
360
+ # actually fired. Still recommend `--build` for users who
361
+ # want to lock in the answer; chr-prefix is a strong signal
362
+ # but UCSC hg19 also uses `chr` prefixes, so the heuristic
363
+ # isn't guaranteed against a hg19-converted file.
364
+ console.print(
365
+ f"[dim]Inferred {diag.effective_build} from chr-prefixed contig "
366
+ f"names (GRCh38 convention). Pass --build grch37 if this file is "
367
+ f"UCSC hg19 with chr-prefixed contigs instead.[/dim]"
368
+ )
352
369
  elif not diag.override and diag.detected_build is None and diag.header_build is None:
353
370
  # Common shape: VCF from a variant caller where the ID column is `.`
354
- # and the header has no ##contig assembly tag. The detector had no
355
- # rsID signal AND no header signal — both auto-detect paths failed.
371
+ # and the header has no ##contig assembly tag, AND no chr-prefix
372
+ # signal was observed. All three auto-detect paths failed.
356
373
  # Loudly recommend an explicit --build because picking the wrong one
357
374
  # silently means every annotation lookup uses wrong coordinates.
358
375
  console.print(
359
376
  f"[yellow]Could not auto-detect genome build (no rsIDs in input, "
360
- f"no ##contig assembly tag in header). Defaulted to "
377
+ f"no ##contig assembly tag, no chr-prefixed contigs). Defaulted to "
361
378
  f"{diag.effective_build}. If the file is the other build, pass "
362
379
  f"--build grch37 or --build grch38 explicitly — annotation "
363
380
  f"coordinates differ between builds and silently using the wrong "
364
381
  f"one will miss every hit.[/yellow]"
365
382
  )
383
+ elif (
384
+ not diag.override
385
+ and diag.detected_build is None
386
+ and diag.header_build is not None
387
+ and diag.inspected_count > 0
388
+ ):
389
+ # Position-detection inspected known-rsID rows but couldn't pick a
390
+ # build — either votes tied across builds or no row matched any
391
+ # build's reference position. Without this warning, the pipeline
392
+ # silently falls through to header_build, and a GRCh36 file with a
393
+ # GRCh37-mislabeled header gets the GRCh37 ClinVar cache (the
394
+ # silent-coords trap #15). The dim "header (no position
395
+ # confirmation)" status line shows the same facts but reads as
396
+ # routine — yellow is what the situation deserves.
397
+ console.print(
398
+ f"[yellow]Build detection inconclusive: "
399
+ f"{diag.inspected_count} known-rsID position checks ran but "
400
+ f"did not converge on a build. Using the file's header-claimed "
401
+ f"build ({diag.header_build}), which has not been confirmed "
402
+ f"against your position data. If the file is actually a "
403
+ f"different build, pass --build grch37 or --build grch38 to "
404
+ f"force — wrong coordinates will silently mis-annotate every "
405
+ f"variant.[/yellow]"
406
+ )
366
407
  if diag.effective_build == "GRCh36":
367
408
  console.print(
368
409
  "[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "
@@ -18,29 +18,12 @@ from allelix.databases import resolve_data_dir
18
18
  if TYPE_CHECKING:
19
19
  from pathlib import Path
20
20
 
21
- from allelix.annotators.base import Annotator
22
-
23
21
 
24
22
  @main.group()
25
23
  def db() -> None:
26
24
  """Manage local reference database cache."""
27
25
 
28
26
 
29
- def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
30
- """Write a remote signal to an existing cache without re-downloading."""
31
- import contextlib
32
- import sqlite3
33
-
34
- from allelix.databases.manager import stamp_remote_signal
35
-
36
- db_path = getattr(annotator, "_db_path", None)
37
- if db_path is None:
38
- return
39
- with contextlib.closing(sqlite3.connect(db_path)) as conn:
40
- stamp_remote_signal(conn, annotator.name, signal)
41
- conn.commit()
42
-
43
-
44
27
  def _confirm_cadd_license(*, license_held: bool = False) -> bool:
45
28
  """Show the CADD license notice and ask for confirmation."""
46
29
  if license_held:
@@ -207,14 +190,20 @@ def db_update(
207
190
  continue
208
191
 
209
192
  if cached is None:
210
- _stamp_remote_signal(annotator, remote)
193
+ # GH #20: a cache with no stored freshness signal almost
194
+ # always predates the signal mechanism — i.e., it is old.
195
+ # The previous behavior was to stamp the live remote signal
196
+ # onto the cache and call it current, which permanently
197
+ # marked stale data as fresh (only `--force` would escape).
198
+ # Treat tagless caches as needing a refresh.
211
199
  console.print(
212
- f" [dim]{annotator.name}: stamped remote signal "
213
- f"(version {annotator.version() or '(unknown)'})[/dim]"
200
+ f" [bold]{annotator.name}[/bold]: cache predates the "
201
+ "freshness signal; re-downloading…"
202
+ )
203
+ else:
204
+ console.print(
205
+ f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…"
214
206
  )
215
- continue
216
-
217
- console.print(f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
218
207
  if _helpers._run_setup(annotator):
219
208
  console.print(
220
209
  f" [green]✓ {annotator.name} refreshed[/green] "
@@ -0,0 +1,136 @@
1
+ # High-value SNPs: clinically important variants where a no-call
2
+ # should be explicitly flagged rather than silently omitted.
3
+ #
4
+ # Schema:
5
+ # rsid: dbSNP identifier
6
+ # gene: gene symbol
7
+ # cluster: optional grouping (e.g., "APOE" for the two-SNP APOE haplotype)
8
+ # note: human-readable warning text for no-call reports
9
+ #
10
+ # To add a SNP: append an entry following this format. Entries with the
11
+ # same cluster are grouped in warnings (e.g., "APOE genotype cannot be
12
+ # determined" when either rs429358 or rs7412 is a no-call).
13
+
14
+ - rsid: rs429358
15
+ gene: APOE
16
+ cluster: APOE
17
+ note: Required (with rs7412) to determine APOE genotype
18
+
19
+ - rsid: rs7412
20
+ gene: APOE
21
+ cluster: APOE
22
+ note: Required (with rs429358) to determine APOE genotype
23
+
24
+ - rsid: rs5742904
25
+ gene: APOB
26
+ note: Familial hypercholesterolemia marker (FH)
27
+
28
+ - rsid: rs80357906
29
+ gene: BRCA1
30
+ note: Hereditary breast/ovarian cancer marker
31
+
32
+ - rsid: rs1801133
33
+ gene: MTHFR
34
+ cluster: MTHFR
35
+ note: Methylation pathway (C677T)
36
+
37
+ - rsid: rs1801131
38
+ gene: MTHFR
39
+ cluster: MTHFR
40
+ note: Methylation pathway (A1298C)
41
+
42
+ - rsid: rs4680
43
+ gene: COMT
44
+ note: Catechol-O-methyltransferase activity
45
+
46
+ - rsid: rs1065852
47
+ gene: CYP2D6
48
+ note: Opioid / SSRI metabolism
49
+
50
+ - rsid: rs4244285
51
+ gene: CYP2C19
52
+ note: Clopidogrel, PPIs metabolism
53
+
54
+ - rsid: rs1799853
55
+ gene: CYP2C9
56
+ note: Warfarin metabolism
57
+
58
+ - rsid: rs4149056
59
+ gene: SLCO1B1
60
+ note: Statin myopathy risk
61
+
62
+ - rsid: rs3918290
63
+ gene: DPYD
64
+ note: Fluoropyrimidine toxicity
65
+
66
+ # v2.0.2 additions (GH #7): clinically actionable single-SNP variants
67
+ # verified to be on consumer arrays. Two new clusters: HFE (hereditary
68
+ # hemochromatosis compound-het) and TPMT (thiopurine *3 haplotype).
69
+
70
+ - rsid: rs6025
71
+ gene: F5
72
+ note: Factor V Leiden — hereditary thrombophilia (FDA-cleared GHR variant)
73
+
74
+ - rsid: rs1799963
75
+ gene: F2
76
+ note: Prothrombin G20210A — hereditary thrombophilia
77
+
78
+ - rsid: rs1800562
79
+ gene: HFE
80
+ cluster: HFE
81
+ note: C282Y — hereditary hemochromatosis (compound het with H63D is the clinical form)
82
+
83
+ - rsid: rs1799945
84
+ gene: HFE
85
+ cluster: HFE
86
+ note: H63D — hereditary hemochromatosis (compound het with C282Y is the clinical form)
87
+
88
+ - rsid: rs113993960
89
+ gene: CFTR
90
+ note: F508del — most common CF allele; carrier status for reproductive planning
91
+
92
+ - rsid: rs334
93
+ gene: HBB
94
+ note: Sickle cell (HbS) — most-screened-for variant worldwide; carrier status
95
+
96
+ - rsid: rs80359550
97
+ gene: BRCA2
98
+ note: BRCA2 6174delT — most common Ashkenazi founder mutation (BRCA1 covered by rs80357906)
99
+
100
+ - rsid: rs9923231
101
+ gene: VKORC1
102
+ note: Warfarin dosing (CPIC Level A, pairs with CYP2C9 rs1799853)
103
+
104
+ - rsid: rs1057910
105
+ gene: CYP2C9
106
+ note: CYP2C9*3 — completes warfarin metabolizer profile alongside *2 (rs1799853)
107
+
108
+ - rsid: rs12248560
109
+ gene: CYP2C19
110
+ note: CYP2C19*17 ultrarapid metabolizer — completes clopidogrel profile alongside *2 (rs4244285)
111
+
112
+ - rsid: rs3892097
113
+ gene: CYP2D6
114
+ note: CYP2D6*4 — most common LOF in Europeans (complements rs1065852 *10)
115
+
116
+ - rsid: rs776746
117
+ gene: CYP3A5
118
+ note: CYP3A5*3 — tacrolimus dosing (CPIC Level A)
119
+
120
+ - rsid: rs1142345
121
+ gene: TPMT
122
+ cluster: TPMT
123
+ note: TPMT*3C — thiopurine dosing (CPIC Level A; with rs1800460 resolves *3A/*3B/*3C)
124
+
125
+ - rsid: rs1800460
126
+ gene: TPMT
127
+ cluster: TPMT
128
+ note: TPMT*3B — thiopurine dosing (CPIC Level A; with rs1142345 resolves *3A/*3B/*3C)
129
+
130
+ - rsid: rs116855232
131
+ gene: NUDT15
132
+ note: Thiopurine toxicity (CPIC Level A; critical in East Asian populations, complements TPMT cluster)
133
+
134
+ - rsid: rs34637584
135
+ gene: LRRK2
136
+ note: G2019S — most common monogenic Parkinson's variant
@@ -9,7 +9,7 @@ column of ``database_versions`` (e.g. ``iv:1``) so ``is_ready()`` can
9
9
  reject stale caches without forcing a full re-download.
10
10
  """
11
11
 
12
- CLINVAR_INTERPRETER_VERSION = 1
12
+ CLINVAR_INTERPRETER_VERSION = 2 # v2.0.1: GH #42 CLNDN-join in iter_clinvar_records
13
13
  PHARMGKB_INTERPRETER_VERSION = 1
14
14
  GNOMAD_SCHEMA_VERSION = 1
15
15
  ALPHAMISSENSE_SCHEMA_VERSION = 1
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
23
23
  ALPHAMISSENSE_DB_FILENAME = "alphamissense.sqlite"
24
24
 
25
25
  ALPHAMISSENSE_CACHE_URL = (
26
- "https://huggingface.co/datasets/dial481/allelix-alphamissense"
26
+ "https://huggingface.co/datasets/allelix/allelix-alphamissense"
27
27
  "/resolve/13a15e199536512b5e2d208d79c4f93c0a73f71f/alphamissense.sqlite.gz"
28
28
  )
29
29
 
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
24
24
  GNOMAD_DB_FILENAME = "gnomad.sqlite"
25
25
 
26
26
  GNOMAD_CACHE_URL = (
27
- "https://huggingface.co/datasets/dial481/allelix-gnomad"
27
+ "https://huggingface.co/datasets/allelix/allelix-gnomad"
28
28
  "/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
29
29
  )
30
30
 
@@ -465,16 +465,32 @@ def iter_gwas_records(tsv_path: Path) -> Iterator[dict[str, object]]:
465
465
  yield from best.values()
466
466
 
467
467
 
468
+ # Truncation sanity floor for production loads. This guards the count
469
+ # returned by iter_gwas_records — i.e. rows AFTER haplotype/no-trait
470
+ # filtering and (rsid, trait) dedup, not the raw catalog. EBI curates
471
+ # ~625K lead associations (GWAS Catalog, 2025); the loaded count is lower
472
+ # than that but still far above this floor. 100K only catches gross
473
+ # truncation (a mid-stream download committed as "complete") while staying
474
+ # permissive against legitimate upstream drift. Set to 0 from tests so
475
+ # synthetic fixtures of any size load cleanly. See GH #19.
476
+ GWAS_MIN_ROWS = 100_000
477
+
478
+
468
479
  def load_gwas_tsv(
469
480
  tsv_path: Path,
470
481
  db_path: Path,
471
482
  source_url: str = "",
472
483
  remote_signal: str | None = None,
484
+ min_rows: int = 0,
473
485
  ) -> int:
474
486
  """Parse a GWAS Catalog TSV into a fresh SQLite cache atomically.
475
487
 
476
488
  Writes to a `.tmp` sibling and `os.replace`s onto `db_path` only after a
477
489
  successful commit. Returns the number of records loaded.
490
+
491
+ ``min_rows`` is a sanity floor checked before the final ``os.replace``.
492
+ Set by production callers (see ``GwasAnnotator.setup``) to
493
+ ``GWAS_MIN_ROWS``; defaults to 0 so test fixtures of any size load.
478
494
  """
479
495
  tmp_path = db_path.parent / f"{db_path.name}.tmp"
480
496
  if tmp_path.exists():
@@ -535,6 +551,15 @@ def load_gwas_tsv(
535
551
  ),
536
552
  )
537
553
  conn.commit()
554
+ if count < min_rows:
555
+ msg = (
556
+ f"GWAS Catalog load aborted: only {count:,} rows ingested "
557
+ f"(floor {min_rows:,}). The download was likely truncated "
558
+ f"in flight (chunked transfer with no Content-Length, or "
559
+ f"connection drop mid-stream). Retry with "
560
+ f"`allelix db update --force`."
561
+ )
562
+ raise OSError(msg)
538
563
  os.replace(tmp_path, db_path)
539
564
  return count
540
565
  except Exception:
@@ -197,9 +197,20 @@ def parse_clinvar_version(vcf_path: Path) -> str | None:
197
197
  def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
198
198
  """Stream parse a ClinVar VCF (.vcf or .vcf.gz). Skip entries without an RS id.
199
199
 
200
- Multi-allelic rows (ALT="A,T") are split into one record per ALT. Parallel
201
- INFO fields (CLNSIG, CLNDN, ALLELEID) are separated by `|` per ClinVar's
202
- convention and index-paired with the ALTs.
200
+ Multi-allelic rows (ALT="A,T") are split into one record per ALT.
201
+ Parallel INFO fields ``CLNSIG`` and ``ALLELEID`` are separated by
202
+ ``|`` and index-paired with the ALTs.
203
+
204
+ GH #42: ``CLNDN`` is NOT index-paired with ALTs — its ``|`` separator
205
+ enumerates the union of conditions across all SCV submissions on the
206
+ variant, with no positional mapping to CLNSIG. Joining the full list
207
+ into a single ``condition`` string per record avoids the Frankenstein
208
+ pairing (one SCV's classification next to another SCV's condition)
209
+ that index-picking introduced. The primary classification
210
+ (``CLNSIG[0]``) is kept as-is — that value is correct as a
211
+ variant-level claim; only the condition-pairing was misleading.
212
+ Full per-(classification, condition) pairing via
213
+ ``submission_summary.txt.gz`` is tracked for v2.1.
203
214
  """
204
215
  opener = gzip.open if vcf_path.suffix == ".gz" else open
205
216
  with opener(vcf_path, "rt", encoding="utf-8") as fh:
@@ -231,6 +242,12 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
231
242
  review_status = info_dict.get("CLNREVSTAT", "")
232
243
  gene = _extract_gene(info_dict.get("GENEINFO", ""))
233
244
 
245
+ # GH #42: CLNDN's `|`-separator is per-SCV, not per-ALT.
246
+ # Join the full list once per row (same string emitted for
247
+ # every ALT split-out of this record). Empty/`.`/blank
248
+ # tokens are filtered out so callers don't see leading/trailing
249
+ # separators.
250
+ joined_condition = "; ".join(c.replace("_", " ") for c in clndns if c and c != ".")
234
251
  for i, alt in enumerate(alts):
235
252
  yield {
236
253
  "rsid": f"rs{rs}",
@@ -239,7 +256,7 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
239
256
  "ref": ref,
240
257
  "alt": alt,
241
258
  "clinical_significance": _pick(clnsigs, i),
242
- "condition": _pick(clndns, i).replace("_", " "),
259
+ "condition": joined_condition,
243
260
  "gene": gene,
244
261
  "review_status": review_status,
245
262
  "allele_id": _safe_int(_pick(allele_ids, i)),