allelix 2.0.0__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {allelix-2.0.0 → allelix-2.0.1}/PKG-INFO +1 -1
  2. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/clinvar.py +21 -1
  3. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/gnomad.py +10 -2
  4. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/gwas.py +22 -7
  5. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/pharmgkb.py +3 -0
  6. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/_helpers.py +24 -0
  7. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/db.py +12 -23
  8. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/_versions.py +1 -1
  9. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/alphamissense_loader.py +1 -1
  10. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/gnomad_loader.py +1 -1
  11. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/gwas_loader.py +25 -0
  12. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/manager.py +21 -4
  13. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/pharmgkb_loader.py +17 -0
  14. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/snpedia_parser.py +17 -0
  15. {allelix-2.0.0 → allelix-2.0.1}/allelix/models.py +19 -0
  16. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/_helpers.py +18 -4
  17. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ftdna.py +18 -1
  18. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/livingdna.py +22 -1
  19. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/_pipeline.py +69 -39
  20. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/diff.py +43 -4
  21. allelix-2.0.1/allelix/reports/terminal.py +241 -0
  22. {allelix-2.0.0 → allelix-2.0.1}/allelix/utils/allele.py +16 -14
  23. {allelix-2.0.0 → allelix-2.0.1}/allelix/utils/build_detect.py +31 -9
  24. {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/PKG-INFO +1 -1
  25. {allelix-2.0.0 → allelix-2.0.1}/pyproject.toml +1 -1
  26. {allelix-2.0.0 → allelix-2.0.1}/tests/test_cli.py +99 -10
  27. {allelix-2.0.0 → allelix-2.0.1}/tests/test_models.py +33 -0
  28. allelix-2.0.0/allelix/reports/terminal.py +0 -205
  29. {allelix-2.0.0 → allelix-2.0.1}/LICENSE +0 -0
  30. {allelix-2.0.0 → allelix-2.0.1}/README.md +0 -0
  31. {allelix-2.0.0 → allelix-2.0.1}/allelix/__init__.py +0 -0
  32. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/__init__.py +0 -0
  33. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/alphamissense.py +0 -0
  34. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/base.py +0 -0
  35. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/cadd.py +0 -0
  36. {allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/snpedia.py +0 -0
  37. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/__init__.py +0 -0
  38. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/_options.py +0 -0
  39. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/analyze.py +0 -0
  40. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/config.py +0 -0
  41. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/focused.py +0 -0
  42. {allelix-2.0.0 → allelix-2.0.1}/allelix/cli/utility.py +0 -0
  43. {allelix-2.0.0 → allelix-2.0.1}/allelix/compare.py +0 -0
  44. {allelix-2.0.0 → allelix-2.0.1}/allelix/config.py +0 -0
  45. {allelix-2.0.0 → allelix-2.0.1}/allelix/data/__init__.py +0 -0
  46. {allelix-2.0.0 → allelix-2.0.1}/allelix/data/clinvar_clnsig_snapshot.yaml +0 -0
  47. {allelix-2.0.0 → allelix-2.0.1}/allelix/data/high_value_snps.yaml +0 -0
  48. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/__init__.py +0 -0
  49. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/cadd_loader.py +0 -0
  50. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/cpic_loader.py +0 -0
  51. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/loader_utils.py +0 -0
  52. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/schema.py +0 -0
  53. {allelix-2.0.0 → allelix-2.0.1}/allelix/databases/snpedia_loader.py +0 -0
  54. {allelix-2.0.0 → allelix-2.0.1}/allelix/exporters/__init__.py +0 -0
  55. {allelix-2.0.0 → allelix-2.0.1}/allelix/exporters/plink.py +0 -0
  56. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/__init__.py +0 -0
  57. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ancestrydna.py +0 -0
  58. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/base.py +0 -0
  59. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ftdna_illumina.py +0 -0
  60. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/myhappygenes.py +0 -0
  61. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/myheritage.py +0 -0
  62. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/twentythreeandme.py +0 -0
  63. {allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/vcf.py +0 -0
  64. {allelix-2.0.0 → allelix-2.0.1}/allelix/py.typed +0 -0
  65. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/__init__.py +0 -0
  66. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/high_value.py +0 -0
  67. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/html.py +0 -0
  68. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/json_report.py +0 -0
  69. {allelix-2.0.0 → allelix-2.0.1}/allelix/reports/methylation.py +0 -0
  70. {allelix-2.0.0 → allelix-2.0.1}/allelix/utils/__init__.py +0 -0
  71. {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/SOURCES.txt +0 -0
  72. {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/dependency_links.txt +0 -0
  73. {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/entry_points.txt +0 -0
  74. {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/requires.txt +0 -0
  75. {allelix-2.0.0 → allelix-2.0.1}/allelix.egg-info/top_level.txt +0 -0
  76. {allelix-2.0.0 → allelix-2.0.1}/setup.cfg +0 -0
  77. {allelix-2.0.0 → allelix-2.0.1}/tests/test_cli_helpers.py +0 -0
  78. {allelix-2.0.0 → allelix-2.0.1}/tests/test_compare.py +0 -0
  79. {allelix-2.0.0 → allelix-2.0.1}/tests/test_config.py +0 -0
  80. {allelix-2.0.0 → allelix-2.0.1}/tests/test_end_to_end.py +0 -0
  81. {allelix-2.0.0 → allelix-2.0.1}/tests/test_mock_data_invariants.py +0 -0
  82. {allelix-2.0.0 → allelix-2.0.1}/tests/test_registry.py +0 -0
  83. {allelix-2.0.0 → allelix-2.0.1}/tests/test_version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: allelix
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
5
5
  Author: Allelix
6
6
  Maintainer-email: dial481 <dial481@users.noreply.github.com>
@@ -14,6 +14,7 @@ and dispatches per-variant by `variant.build`.
14
14
  from __future__ import annotations
15
15
 
16
16
  import logging
17
+ import re
17
18
  import sqlite3
18
19
  from typing import TYPE_CHECKING, ClassVar
19
20
 
@@ -42,6 +43,14 @@ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
42
43
 
43
44
  _BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
44
45
 
46
+ # GH #21: a remote .md5 endpoint can return an HTML error page on a
47
+ # transient blip. The first whitespace-separated token of the body is
48
+ # what we treat as the hash, so without this gate `<!DOCTYPE` would be
49
+ # accepted as the "signal" and later passed to `verify_file_hash`, which
50
+ # would then delete the freshly downloaded VCF. MD5 is exactly 32 hex
51
+ # digits; reject anything else.
52
+ _MD5_HEX_RE = re.compile(r"^[0-9a-fA-F]{32}$")
53
+
45
54
 
46
55
  def clinvar_db_filename(build: str) -> str:
47
56
  """Per-build cache filename. Two coexisting SQLite files per data_dir."""
@@ -323,7 +332,18 @@ class ClinVarAnnotator(Annotator):
323
332
  if not body:
324
333
  return None
325
334
  first_token = body.strip().split(None, 1)[0] if body.strip() else ""
326
- if not first_token:
335
+ if not _MD5_HEX_RE.fullmatch(first_token):
336
+ # CDN error page, redirect interstitial, or empty body. Treat
337
+ # as a transient signal failure rather than poisoning the
338
+ # cache: callers handle `None` as "freshness unknown, skip"
339
+ # in `db update`, and `setup()` raises rather than passing
340
+ # garbage to `verify_file_hash` (which would delete the VCF).
341
+ logger.warning(
342
+ "clinvar(%s): .md5 endpoint returned a body whose first token "
343
+ "is not a 32-char hex digest (got %r); treating as no signal",
344
+ build,
345
+ first_token[:32],
346
+ )
327
347
  return None
328
348
  return f"md5:{first_token}"
329
349
 
@@ -94,12 +94,20 @@ class GnomadAnnotator(Annotator):
94
94
  logger.warning("Could not remove staged file at %s", gz_path)
95
95
 
96
96
  def is_ready(self) -> bool:
97
- """True when the gnomAD SQLite cache exists with current schema version."""
97
+ """True when the gnomAD SQLite cache exists with current schema version.
98
+
99
+ GH #22: a cache with no ``local_version_tag`` used to be accepted
100
+ as ready (the previous ``or not tag`` escape). That defeated the
101
+ whole point of ``GNOMAD_SCHEMA_VERSION``: if it ever gets bumped,
102
+ every tagless legacy cache would silently pass as the new
103
+ version. Reject tagless caches so the user is told to re-run
104
+ ``db update``.
105
+ """
98
106
  info = get_database_info(self._db_path, "gnomad")
99
107
  if info is None:
100
108
  return False
101
109
  tag = info.get("local_version_tag") or ""
102
- return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag
110
+ return tag == f"sv:{GNOMAD_SCHEMA_VERSION}"
103
111
 
104
112
  def version(self) -> str | None:
105
113
  """Return the cached database version, or None."""
@@ -16,6 +16,7 @@ from allelix.databases.gwas_loader import (
16
16
  _REQUIRED_GWAS_COLUMNS,
17
17
  GWAS_CATALOG_URL,
18
18
  GWAS_DB_FILENAME,
19
+ GWAS_MIN_ROWS,
19
20
  load_gwas_tsv,
20
21
  schema_is_current,
21
22
  )
@@ -57,18 +58,25 @@ _BATCH_CHUNK = 500 # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
57
58
 
58
59
 
59
60
  def _magnitude(p_value: float | None, or_beta: float | None) -> float:
60
- """Derive magnitude from p-value and optional effect size."""
61
+ """Derive magnitude from p-value and optional effect size.
62
+
63
+ GH #17: boundary comparisons are inclusive (``<=``) so the canonical
64
+ genome-wide-significance threshold ``p = 5e-8`` lands inside the
65
+ significant bucket rather than the suggestive bucket below it.
66
+ Strict ``<`` made the exact threshold value fall a full bucket
67
+ (base 4.0 instead of 6.0) below a barely-significant hit.
68
+ """
61
69
  if p_value is None:
62
70
  base = 2.0
63
- elif p_value < 5e-100:
71
+ elif p_value <= 5e-100:
64
72
  base = 8.0
65
- elif p_value < 5e-20:
73
+ elif p_value <= 5e-20:
66
74
  base = 7.0
67
- elif p_value < 5e-8:
75
+ elif p_value <= 5e-8:
68
76
  base = 6.0
69
- elif p_value < 5e-6:
77
+ elif p_value <= 5e-6:
70
78
  base = 4.0
71
- elif p_value < 5e-4:
79
+ elif p_value <= 5e-4:
72
80
  base = 3.0
73
81
  else:
74
82
  base = 2.0
@@ -143,7 +151,13 @@ class GWASCatalogAnnotator(Annotator):
143
151
  extracted = self.data_dir / tsv_names[0]
144
152
  if extracted != tsv_path:
145
153
  extracted.rename(tsv_path)
146
- load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
154
+ load_gwas_tsv(
155
+ tsv_path,
156
+ self._db_path,
157
+ source_url=url,
158
+ remote_signal=signal,
159
+ min_rows=GWAS_MIN_ROWS,
160
+ )
147
161
  finally:
148
162
  try:
149
163
  zip_path.unlink()
@@ -183,6 +197,7 @@ class GWASCatalogAnnotator(Annotator):
183
197
  self._db_path,
184
198
  source_url=GWAS_CATALOG_URL,
185
199
  remote_signal=self.cached_remote_signal(),
200
+ min_rows=GWAS_MIN_ROWS,
186
201
  )
187
202
  except Exception:
188
203
  logger.warning("Auto-reingest from cached TSV failed", exc_info=True)
@@ -24,6 +24,7 @@ from allelix.databases.manager import (
24
24
  from allelix.databases.pharmgkb_loader import (
25
25
  PHARMGKB_CLINICAL_URL,
26
26
  PHARMGKB_DB_FILENAME,
27
+ PHARMGKB_MIN_ROWS,
27
28
  _normalize_genotype,
28
29
  load_pharmgkb_tsv,
29
30
  schema_is_current,
@@ -146,6 +147,7 @@ class PharmGKBAnnotator(Annotator):
146
147
  source_url=url,
147
148
  remote_signal=signal,
148
149
  allele_function_lookup=cpic_lookup,
150
+ min_rows=PHARMGKB_MIN_ROWS,
149
151
  )
150
152
 
151
153
  def is_ready(self) -> bool:
@@ -489,6 +491,7 @@ def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
489
491
  version=old_version,
490
492
  remote_signal=old_signal,
491
493
  allele_function_lookup=cpic_lookup,
494
+ min_rows=PHARMGKB_MIN_ROWS,
492
495
  )
493
496
  except Exception:
494
497
  logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)
@@ -363,6 +363,30 @@ def _emit_build_diagnostics(result: object) -> None:
363
363
  f"coordinates differ between builds and silently using the wrong "
364
364
  f"one will miss every hit.[/yellow]"
365
365
  )
366
+ elif (
367
+ not diag.override
368
+ and diag.detected_build is None
369
+ and diag.header_build is not None
370
+ and diag.inspected_count > 0
371
+ ):
372
+ # Position-detection inspected known-rsID rows but couldn't pick a
373
+ # build — either votes tied across builds or no row matched any
374
+ # build's reference position. Without this warning, the pipeline
375
+ # silently falls through to header_build, and a GRCh36 file with a
376
+ # GRCh37-mislabeled header gets the GRCh37 ClinVar cache (the
377
+ # silent-coords trap #15). The dim "header (no position
378
+ # confirmation)" status line shows the same facts but reads as
379
+ # routine — yellow is what the situation deserves.
380
+ console.print(
381
+ f"[yellow]Build detection inconclusive: "
382
+ f"{diag.inspected_count} known-rsID position checks ran but "
383
+ f"did not converge on a build. Using the file's header-claimed "
384
+ f"build ({diag.header_build}), which has not been confirmed "
385
+ f"against your position data. If the file is actually a "
386
+ f"different build, pass --build grch37 or --build grch38 to "
387
+ f"force — wrong coordinates will silently mis-annotate every "
388
+ f"variant.[/yellow]"
389
+ )
366
390
  if diag.effective_build == "GRCh36":
367
391
  console.print(
368
392
  "[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "
@@ -18,29 +18,12 @@ from allelix.databases import resolve_data_dir
18
18
  if TYPE_CHECKING:
19
19
  from pathlib import Path
20
20
 
21
- from allelix.annotators.base import Annotator
22
-
23
21
 
24
22
  @main.group()
25
23
  def db() -> None:
26
24
  """Manage local reference database cache."""
27
25
 
28
26
 
29
- def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
30
- """Write a remote signal to an existing cache without re-downloading."""
31
- import contextlib
32
- import sqlite3
33
-
34
- from allelix.databases.manager import stamp_remote_signal
35
-
36
- db_path = getattr(annotator, "_db_path", None)
37
- if db_path is None:
38
- return
39
- with contextlib.closing(sqlite3.connect(db_path)) as conn:
40
- stamp_remote_signal(conn, annotator.name, signal)
41
- conn.commit()
42
-
43
-
44
27
  def _confirm_cadd_license(*, license_held: bool = False) -> bool:
45
28
  """Show the CADD license notice and ask for confirmation."""
46
29
  if license_held:
@@ -207,14 +190,20 @@ def db_update(
207
190
  continue
208
191
 
209
192
  if cached is None:
210
- _stamp_remote_signal(annotator, remote)
193
+ # GH #20: a cache with no stored freshness signal almost
194
+ # always predates the signal mechanism — i.e., it is old.
195
+ # The previous behavior was to stamp the live remote signal
196
+ # onto the cache and call it current, which permanently
197
+ # marked stale data as fresh (only `--force` would escape).
198
+ # Treat tagless caches as needing a refresh.
211
199
  console.print(
212
- f" [dim]{annotator.name}: stamped remote signal "
213
- f"(version {annotator.version() or '(unknown)'})[/dim]"
200
+ f" [bold]{annotator.name}[/bold]: cache predates the "
201
+ "freshness signal; re-downloading…"
202
+ )
203
+ else:
204
+ console.print(
205
+ f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…"
214
206
  )
215
- continue
216
-
217
- console.print(f" [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
218
207
  if _helpers._run_setup(annotator):
219
208
  console.print(
220
209
  f" [green]✓ {annotator.name} refreshed[/green] "
@@ -9,7 +9,7 @@ column of ``database_versions`` (e.g. ``iv:1``) so ``is_ready()`` can
9
9
  reject stale caches without forcing a full re-download.
10
10
  """
11
11
 
12
- CLINVAR_INTERPRETER_VERSION = 1
12
+ CLINVAR_INTERPRETER_VERSION = 2 # v2.0.1: GH #42 CLNDN-join in iter_clinvar_records
13
13
  PHARMGKB_INTERPRETER_VERSION = 1
14
14
  GNOMAD_SCHEMA_VERSION = 1
15
15
  ALPHAMISSENSE_SCHEMA_VERSION = 1
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
23
23
  ALPHAMISSENSE_DB_FILENAME = "alphamissense.sqlite"
24
24
 
25
25
  ALPHAMISSENSE_CACHE_URL = (
26
- "https://huggingface.co/datasets/dial481/allelix-alphamissense"
26
+ "https://huggingface.co/datasets/allelix/allelix-alphamissense"
27
27
  "/resolve/13a15e199536512b5e2d208d79c4f93c0a73f71f/alphamissense.sqlite.gz"
28
28
  )
29
29
 
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
24
24
  GNOMAD_DB_FILENAME = "gnomad.sqlite"
25
25
 
26
26
  GNOMAD_CACHE_URL = (
27
- "https://huggingface.co/datasets/dial481/allelix-gnomad"
27
+ "https://huggingface.co/datasets/allelix/allelix-gnomad"
28
28
  "/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
29
29
  )
30
30
 
@@ -465,16 +465,32 @@ def iter_gwas_records(tsv_path: Path) -> Iterator[dict[str, object]]:
465
465
  yield from best.values()
466
466
 
467
467
 
468
+ # Truncation sanity floor for production loads. This guards the count
469
+ # returned by iter_gwas_records — i.e. rows AFTER haplotype/no-trait
470
+ # filtering and (rsid, trait) dedup, not the raw catalog. EBI curates
471
+ # ~625K lead associations (GWAS Catalog, 2025); the loaded count is lower
472
+ # than that but still far above this floor. 100K only catches gross
473
+ # truncation (a mid-stream download committed as "complete") while staying
474
+ # permissive against legitimate upstream drift. Set to 0 from tests so
475
+ # synthetic fixtures of any size load cleanly. See GH #19.
476
+ GWAS_MIN_ROWS = 100_000
477
+
478
+
468
479
  def load_gwas_tsv(
469
480
  tsv_path: Path,
470
481
  db_path: Path,
471
482
  source_url: str = "",
472
483
  remote_signal: str | None = None,
484
+ min_rows: int = 0,
473
485
  ) -> int:
474
486
  """Parse a GWAS Catalog TSV into a fresh SQLite cache atomically.
475
487
 
476
488
  Writes to a `.tmp` sibling and `os.replace`s onto `db_path` only after a
477
489
  successful commit. Returns the number of records loaded.
490
+
491
+ ``min_rows`` is a sanity floor checked before the final ``os.replace``.
492
+ Set by production callers (see ``GWASCatalogAnnotator.setup``) to
493
+ ``GWAS_MIN_ROWS``; defaults to 0 so test fixtures of any size load.
478
494
  """
479
495
  tmp_path = db_path.parent / f"{db_path.name}.tmp"
480
496
  if tmp_path.exists():
@@ -535,6 +551,15 @@ def load_gwas_tsv(
535
551
  ),
536
552
  )
537
553
  conn.commit()
554
+ if count < min_rows:
555
+ msg = (
556
+ f"GWAS Catalog load aborted: only {count:,} rows ingested "
557
+ f"(floor {min_rows:,}). The download was likely truncated "
558
+ f"in flight (chunked transfer with no Content-Length, or "
559
+ f"connection drop mid-stream). Retry with "
560
+ f"`allelix db update --force`."
561
+ )
562
+ raise OSError(msg)
538
563
  os.replace(tmp_path, db_path)
539
564
  return count
540
565
  except Exception:
@@ -197,9 +197,20 @@ def parse_clinvar_version(vcf_path: Path) -> str | None:
197
197
  def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
198
198
  """Stream parse a ClinVar VCF (.vcf or .vcf.gz). Skip entries without an RS id.
199
199
 
200
- Multi-allelic rows (ALT="A,T") are split into one record per ALT. Parallel
201
- INFO fields (CLNSIG, CLNDN, ALLELEID) are separated by `|` per ClinVar's
202
- convention and index-paired with the ALTs.
200
+ Multi-allelic rows (ALT="A,T") are split into one record per ALT.
201
+ Parallel INFO fields ``CLNSIG`` and ``ALLELEID`` are separated by
202
+ ``|`` and index-paired with the ALTs.
203
+
204
+ GH #42: ``CLNDN`` is NOT index-paired with ALTs — its ``|`` separator
205
+ enumerates the union of conditions across all SCV submissions on the
206
+ variant, with no positional mapping to CLNSIG. Joining the full list
207
+ into a single ``condition`` string per record avoids the Frankenstein
208
+ pairing (one SCV's classification next to another SCV's condition)
209
+ that index-picking introduced. The primary classification
210
+ (``CLNSIG[0]``) is kept as-is — that value is correct as a
211
+ variant-level claim; only the condition-pairing was misleading.
212
+ Full per-(classification, condition) pairing via
213
+ ``submission_summary.txt.gz`` is tracked for v2.1.
203
214
  """
204
215
  opener = gzip.open if vcf_path.suffix == ".gz" else open
205
216
  with opener(vcf_path, "rt", encoding="utf-8") as fh:
@@ -231,6 +242,12 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
231
242
  review_status = info_dict.get("CLNREVSTAT", "")
232
243
  gene = _extract_gene(info_dict.get("GENEINFO", ""))
233
244
 
245
+ # GH #42: CLNDN's `|`-separator is per-SCV, not per-ALT.
246
+ # Join the full list once per row (same string emitted for
247
+ # every ALT split-out of this record). Empty/`.`/blank
248
+ # tokens are filtered out so callers don't see leading/trailing
249
+ # separators.
250
+ joined_condition = "; ".join(c.replace("_", " ") for c in clndns if c and c != ".")
234
251
  for i, alt in enumerate(alts):
235
252
  yield {
236
253
  "rsid": f"rs{rs}",
@@ -239,7 +256,7 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
239
256
  "ref": ref,
240
257
  "alt": alt,
241
258
  "clinical_significance": _pick(clnsigs, i),
242
- "condition": _pick(clndns, i).replace("_", " "),
259
+ "condition": joined_condition,
243
260
  "gene": gene,
244
261
  "review_status": review_status,
245
262
  "allele_id": _safe_int(_pick(allele_ids, i)),
@@ -334,6 +334,13 @@ def _safe_float(value: str) -> float | None:
334
334
  return None
335
335
 
336
336
 
337
+ # Truncation sanity floor for production loads. Current ClinPGx clinical
338
+ # annotations ship ~13K rows; ~5K is a generous floor that catches gross
339
+ # truncation while staying permissive against upstream-data drift. Set
340
+ # to 0 from tests so synthetic fixtures of any size load cleanly. See GH #19.
341
+ PHARMGKB_MIN_ROWS = 5_000
342
+
343
+
337
344
  def load_pharmgkb_tsv(
338
345
  zip_or_dir: Path,
339
346
  db_path: Path,
@@ -341,6 +348,7 @@ def load_pharmgkb_tsv(
341
348
  version: str = "",
342
349
  remote_signal: str | None = None,
343
350
  allele_function_lookup: dict[tuple[str, str], str] | None = None,
351
+ min_rows: int = 0,
344
352
  ) -> int:
345
353
  """Load a ClinPGx clinical-annotations dump into a fresh SQLite cache atomically.
346
354
 
@@ -430,6 +438,15 @@ def load_pharmgkb_tsv(
430
438
  ),
431
439
  )
432
440
  conn.commit()
441
+ if count < min_rows:
442
+ msg = (
443
+ f"ClinPGx load aborted: only {count:,} rows ingested "
444
+ f"(floor {min_rows:,}). The download was likely truncated "
445
+ f"in flight (chunked transfer with no Content-Length, or "
446
+ f"connection drop mid-stream). Retry with "
447
+ f"`allelix db update --force`."
448
+ )
449
+ raise OSError(msg)
433
450
  os.replace(tmp_path, db_path)
434
451
  return count
435
452
  except Exception:
@@ -147,6 +147,13 @@ def _dedupe_existing(conn: sqlite3.Connection) -> int:
147
147
  return before - after
148
148
 
149
149
 
150
+ # GH #12: identifier allowlist for the raw_table f-string interpolation
151
+ # below. The interpolation cannot be parameterized (SQLite doesn't support
152
+ # bind variables for identifiers); the allowlist gives a programmatic
153
+ # guarantee that only these two literals can reach the SQL.
154
+ _VALID_RAW_TABLES: frozenset[str] = frozenset({"_raw_pages", "pages"})
155
+
156
+
150
157
  def detect_raw_table(conn: sqlite3.Connection) -> str | None:
151
158
  """Return the name of the raw pages table, or None if absent."""
152
159
  tables = {
@@ -183,6 +190,16 @@ def _parse_raw_pages_inner(conn: sqlite3.Connection, *, verbose: bool = False) -
183
190
  raw_table = detect_raw_table(conn)
184
191
  if raw_table is None:
185
192
  return 0
193
+ # GH #12: `raw_table` flows into three SQL queries via f-string
194
+ # interpolation because SQLite doesn't support parameterized
195
+ # identifiers. Today it's safe — `detect_raw_table` only ever returns
196
+ # one of two literals or None — but the function's `str | None` return
197
+ # type doesn't pin that. A future edit (config-driven table name,
198
+ # scraped metadata) could drift it into an injection path. Allowlist
199
+ # explicitly so the guarantee outlives memory of the original design.
200
+ if raw_table not in _VALID_RAW_TABLES:
201
+ msg = f"unexpected raw table name: {raw_table!r}"
202
+ raise ValueError(msg)
186
203
 
187
204
  if verbose:
188
205
  logger.info("Parsing SNPedia raw pages from '%s' table", raw_table)
@@ -40,6 +40,25 @@ class Variant:
40
40
  allele2: str
41
41
  build: str = DEFAULT_BUILD
42
42
 
43
+ def __post_init__(self) -> None:
44
+ """Normalize allele case at construction (GH #14).
45
+
46
+ Reference databases (ClinVar, gnomAD, ClinPGx, etc.) all ship
47
+ uppercase alleles, and carrier matching is raw set membership
48
+ against ``{allele1, allele2}`` — a lowercase user allele would
49
+ silently fail to match and zero annotations would be produced
50
+ for a real carrier. Production parsers all emit uppercase
51
+ today, but a user-supplied filter file (custom panel) or a
52
+ future format variant could leak lowercase through. Normalize
53
+ at the model boundary so the invariant is impossible to
54
+ violate downstream. The no-call marker is left as-is;
55
+ multi-base alleles (indels) are uppercased in place.
56
+ """
57
+ if self.allele1 and self.allele1 != NO_CALL_MARKER:
58
+ self.allele1 = self.allele1.upper()
59
+ if self.allele2 and self.allele2 != NO_CALL_MARKER:
60
+ self.allele2 = self.allele2.upper()
61
+
43
62
  @property
44
63
  def is_heterozygous(self) -> bool:
45
64
  """True if the two alleles differ (and neither is a no-call)."""
@@ -17,10 +17,24 @@ logger = logging.getLogger(__name__)
17
17
 
18
18
 
19
19
  def split_csv_line(line: str) -> list[str]:
20
- """Split a comma-delimited line and strip double-quotes from each field.
21
-
22
- Handles single-quoted, double-quoted, and double-double-quoted fields
23
- (the MyHeritage "extra quotes" variant).
20
+ """Split a comma-delimited line and strip surrounding quotes from each field.
21
+
22
+ Implementation is ``line.split(",")`` followed by a per-field
23
+ ``strip().strip('"')``. This is NOT a real CSV parser: a quoted field
24
+ containing a literal comma yields the wrong column count and is
25
+ silently dropped by callers' ``len(parts) != EXPECTED_COLUMNS``
26
+ guard.
27
+
28
+ Adequate for FTDNA / MyHeritage / Living DNA because every value in
29
+ those exports is either an rsID, chromosome identifier, integer
30
+ position, or concatenated genotype string — none of which contain
31
+ commas. If a future format ever ships embedded commas in quoted
32
+ fields, swap to ``csv.reader`` rather than relying on this helper.
33
+
34
+ Strips both surrounding double quotes (``"rs1"``) and the
35
+ double-double-quote variant some MyHeritage exports produce
36
+ (``""rs1""``) — the latter because ``strip('"')`` removes every
37
+ leading and trailing ``"`` character in a single call.
24
38
  """
25
39
  return [field.strip().strip('"') for field in line.split(",")]
26
40
 
@@ -42,6 +42,14 @@ SNIFF_LINE_LIMIT = 50
42
42
  EXPECTED_COLUMNS = 4
43
43
  HEADER_CANONICAL = "RSID,CHROMOSOME,POSITION,RESULT"
44
44
 
45
+ # GH #26: FTDNA and MyHeritage files share the same data shape and
46
+ # header line. Without an explicit exclusion, both `can_parse`
47
+ # implementations accept MyHeritage files and the routing was masked
48
+ # only by registry order (MyHeritage listed first in parsers/__init__.py).
49
+ # Reorder the registry and FTDNA silently mislabels source format. The
50
+ # discriminator is the MyHeritage signature comment in the first line.
51
+ _MYHERITAGE_SIGNATURE = "MyHeritage"
52
+
45
53
 
46
54
  def _is_header_line(line: str) -> bool:
47
55
  """True if *line* is the FTDNA column header (quoted or unquoted)."""
@@ -58,7 +66,14 @@ class FTDNAParser(GenotypeParser):
58
66
  url: ClassVar[str] = "https://www.familytreedna.com"
59
67
 
60
68
  def can_parse(self, file_path: Path) -> bool:
61
- """Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header."""
69
+ """Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header.
70
+
71
+ GH #26: rejects files carrying the MyHeritage signature comment.
72
+ Both formats are byte-identical past the first comment line, so
73
+ the discriminator must be checked explicitly — otherwise FTDNA
74
+ also claims MyHeritage files and routing depends on registry
75
+ order (which has silently mislabeled formats in past audits).
76
+ """
62
77
  try:
63
78
  with file_path.open("r", encoding="utf-8") as fh:
64
79
  for _ in range(SNIFF_LINE_LIMIT):
@@ -66,6 +81,8 @@ class FTDNAParser(GenotypeParser):
66
81
  if not line:
67
82
  return False
68
83
  line = line.rstrip("\r\n")
84
+ if _MYHERITAGE_SIGNATURE in line:
85
+ return False
69
86
  if not line or line.startswith("#"):
70
87
  continue
71
88
  return _is_header_line(line)
@@ -32,6 +32,7 @@ Specifics:
32
32
  from __future__ import annotations
33
33
 
34
34
  import logging
35
+ import re
35
36
  from typing import TYPE_CHECKING, ClassVar
36
37
 
37
38
  from allelix.models import DEFAULT_BUILD, Variant
@@ -49,6 +50,12 @@ SIGNATURE = "Living DNA"
49
50
  SNIFF_LINE_LIMIT = 50
50
51
  EXPECTED_COLUMNS = 4
51
52
 
53
+ # GH #16: only inspect comment lines that look like a build marker.
54
+ # Without this filter every comment line is fed to
55
+ # ``normalize_build_label`` and a stray date / version digit can override
56
+ # the real build line.
57
+ _BUILD_MARKER_RE = re.compile(r"\b(build|reference|genome)\b", re.IGNORECASE)
58
+
52
59
 
53
60
  class LivingDNAParser(GenotypeParser):
54
61
  """Parser for Living DNA consumer genotype files."""
@@ -104,16 +111,30 @@ class LivingDNAParser(GenotypeParser):
104
111
  )
105
112
 
106
113
  def get_metadata(self, file_path: Path) -> GenotypeMetadata:
107
- """Extract build from header comments. Living DNA has no sample ID field."""
114
+ """Extract build from header comments. Living DNA has no sample ID field.
115
+
116
+ GH #16: previously every comment line was passed into
117
+ ``normalize_build_label`` and the *last* match won. A download-
118
+ date comment like ``# downloaded 2038-01-01`` would silently
119
+ retag the file as GRCh38. Only lines that look like an explicit
120
+ build marker (containing ``build``, ``reference``, or ``genome``)
121
+ are inspected, and the first match wins — matching the format
122
+ spec which puts the build line near the top:
123
+
124
+ # Human Genome Reference Build 37 (GRCh37.p13).
125
+ """
108
126
  build = DEFAULT_BUILD
109
127
  with file_path.open("r", encoding="utf-8") as fh:
110
128
  for raw in fh:
111
129
  line = raw.rstrip("\r\n")
112
130
  if not line.startswith("#"):
113
131
  break
132
+ if not _BUILD_MARKER_RE.search(line):
133
+ continue
114
134
  normalized = normalize_build_label(line)
115
135
  if normalized:
116
136
  build = normalized
137
+ break
117
138
  return GenotypeMetadata(
118
139
  format=self.name,
119
140
  sample_id="",