PyPI - allelix - Versions diffs - 2.0.0__tar.gz → 2.0.2__tar.gz - Mend

allelix 2.0.0__tar.gz → 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

{allelix-2.0.0 → allelix-2.0.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: allelix
-Version: 2.0.0
+Version: 2.0.2
 Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
 Author: Allelix
 Maintainer-email: dial481 <dial481@users.noreply.github.com>
@@ -161,6 +161,18 @@ This is not a disclaimer afterthought. It is a design constraint that affects mo
 - Reference databases are downloaded via `allelix db update` and cached locally.
 - Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
+### Output files contain real annotations of your genome
+The JSON / HTML / terminal output of `allelix analyze` and its
+focused subcommands contains real annotations against your specific
+variants — drug-response calls, carrier-status flags, hereditary-
+disease findings. Wherever you write them via `--output <path>`,
+that's where they sit until you delete them. Allelix doesn't
+auto-clean and won't warn you when you write to `/tmp/` or any
+other shared location. Treat the files as personal data: read them,
+move them somewhere you control, or delete them when you're done. A
+data-lifecycle subcommand is planned for v2.1.
 ## Configuration
 Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.

{allelix-2.0.0 → allelix-2.0.2}/README.md RENAMED Viewed

@@ -124,6 +124,18 @@ This is not a disclaimer afterthought. It is a design constraint that affects mo
 - Reference databases are downloaded via `allelix db update` and cached locally.
 - Analysis runs offline against local database caches. A brief freshness check runs before analysis by default (skipped with `--no-update`).
+### Output files contain real annotations of your genome
+The JSON / HTML / terminal output of `allelix analyze` and its
+focused subcommands contains real annotations against your specific
+variants — drug-response calls, carrier-status flags, hereditary-
+disease findings. Wherever you write them via `--output <path>`,
+that's where they sit until you delete them. Allelix doesn't
+auto-clean and won't warn you when you write to `/tmp/` or any
+other shared location. Treat the files as personal data: read them,
+move them somewhere you control, or delete them when you're done. A
+data-lifecycle subcommand is planned for v2.1.
 ## Configuration
 Allelix stores persistent configuration in `config.toml` (in the data directory, default `~/.local/share/allelix/`). A default config is created on first run.

allelix-2.0.2/allelix/__init__.py ADDED Viewed

@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2026 Allelix
+"""Allelix: open-source genotype analysis toolkit."""
+from __future__ import annotations
+from importlib.metadata import PackageNotFoundError, version
+def _read_pyproject_version() -> str | None:
+    """Read the package version from ``pyproject.toml``.
+    GH #34: fall back to ``pyproject.toml`` when run from a bare source
+    checkout (no editable install, no installed package metadata). Keeps
+    ``--version`` and the outbound HTTP User-Agent reporting the real
+    version string instead of the ``0.0.0+local`` sentinel that
+    misidentifies our traffic to NCBI / EBI / HuggingFace.
+    Returns ``None`` on any failure — the caller falls back to the
+    sentinel rather than crashing import.
+    """
+    import tomllib
+    from pathlib import Path
+    pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
+    try:
+        with pyproject.open("rb") as fh:
+            data = tomllib.load(fh)
+    except (OSError, tomllib.TOMLDecodeError):
+        return None
+    project = data.get("project") or {}
+    v = project.get("version")
+    return v if isinstance(v, str) and v else None
+try:
+    __version__ = version("allelix")
+except PackageNotFoundError:
+    # Source checkout without an editable install. Try pyproject.toml
+    # before falling back to the sentinel.
+    __version__ = _read_pyproject_version() or "0.0.0+local"

{allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/base.py RENAMED Viewed

@@ -142,7 +142,30 @@ class Annotator(ABC):
         self.data_dir = data_dir
     def __del__(self) -> None:
-        """Release resources on GC to prevent ResourceWarning."""
+        """Safety-net resource release on GC. Deliberately retained.
+        GH #36 (audit second pass) flagged ``__del__`` as a Python
+        antipattern — GC timing is nondeterministic and raised exceptions
+        are silently swallowed. The correct usage pattern is the
+        ``__enter__`` / ``__exit__`` context manager pair below, wired
+        through ``contextlib.ExitStack`` in ``reports/_pipeline.py``.
+        However: removing ``__del__`` exposes residual SQLite connection
+        leaks in code paths that construct an annotator outside a
+        context manager. ``ResourceWarning`` is elevated to error by
+        ``pytest`` config, so leaks fail the suite as
+        ``PytestUnraisableExceptionWarning`` — caught in the v2.0.2
+        ship gate when ``__del__`` was first removed. Until every call
+        site is verified to use ``with`` / ``ExitStack`` / explicit
+        ``close()``, this safety net stays. v2.1 task: audit and
+        remove.
+        ``contextlib.suppress(Exception)`` is deliberate — ``__del__``
+        must never raise. The GC timing and shutdown-ordering edges
+        are explicitly silenced; this is exactly the
+        "if you must keep ``__del__``, make absolutely sure it can
+        never raise" mitigation the audit recommended.
+        """
         with contextlib.suppress(Exception):
             self.close()

{allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/clinvar.py RENAMED Viewed

@@ -14,6 +14,7 @@ and dispatches per-variant by `variant.build`.
 from __future__ import annotations
 import logging
+import re
 import sqlite3
 from typing import TYPE_CHECKING, ClassVar
@@ -42,6 +43,14 @@ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
 _BATCH_CHUNK = 500  # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
+# GH #21: a remote .md5 endpoint can return an HTML error page on a
+# transient blip. The first whitespace-separated token of the body is
+# what we treat as the hash, so without this gate `<!DOCTYPE` would be
+# accepted as the "signal" and later passed to `verify_file_hash`, which
+# would then delete the freshly downloaded VCF. MD5 is exactly 32 hex
+# digits; reject anything else.
+_MD5_HEX_RE = re.compile(r"^[0-9a-fA-F]{32}$")
 def clinvar_db_filename(build: str) -> str:
     """Per-build cache filename. Two coexisting SQLite files per data_dir."""
@@ -323,7 +332,18 @@ class ClinVarAnnotator(Annotator):
         if not body:
             return None
         first_token = body.strip().split(None, 1)[0] if body.strip() else ""
-        if not first_token:
+        if not _MD5_HEX_RE.fullmatch(first_token):
+            # CDN error page, redirect interstitial, or empty body. Treat
+            # as a transient signal failure rather than poisoning the
+            # cache: callers handle `None` as "freshness unknown, skip"
+            # in `db update`, and `setup()` raises rather than passing
+            # garbage to `verify_file_hash` (which would delete the VCF).
+            logger.warning(
+                "clinvar(%s): .md5 endpoint returned a body whose first token "
+                "is not a 32-char hex digest (got %r); treating as no signal",
+                build,
+                first_token[:32],
+            )
             return None
         return f"md5:{first_token}"

{allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/gnomad.py RENAMED Viewed

@@ -94,12 +94,20 @@ class GnomadAnnotator(Annotator):
             logger.warning("Could not remove staged file at %s", gz_path)
     def is_ready(self) -> bool:
-        """True when the gnomAD SQLite cache exists with current schema version."""
+        """True when the gnomAD SQLite cache exists with current schema version.
+        GH #22: a cache with no ``local_version_tag`` used to be accepted
+        as ready (the previous ``or not tag`` escape). That defeated the
+        whole point of ``GNOMAD_SCHEMA_VERSION``: if it ever gets bumped,
+        every tagless legacy cache would silently pass as the new
+        version. Reject tagless caches so the user is told to re-run
+        ``db update``.
+        """
         info = get_database_info(self._db_path, "gnomad")
         if info is None:
             return False
         tag = info.get("local_version_tag") or ""
-        return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag
+        return tag == f"sv:{GNOMAD_SCHEMA_VERSION}"
     def version(self) -> str | None:
         """Return the cached database version, or None."""

{allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/gwas.py RENAMED Viewed

@@ -16,6 +16,7 @@ from allelix.databases.gwas_loader import (
     _REQUIRED_GWAS_COLUMNS,
     GWAS_CATALOG_URL,
     GWAS_DB_FILENAME,
+    GWAS_MIN_ROWS,
     load_gwas_tsv,
     schema_is_current,
 )
@@ -57,18 +58,25 @@ _BATCH_CHUNK = 500  # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
 def _magnitude(p_value: float | None, or_beta: float | None) -> float:
-    """Derive magnitude from p-value and optional effect size."""
+    """Derive magnitude from p-value and optional effect size.
+    GH #17: boundary comparisons are inclusive (``<=``) so the canonical
+    genome-wide-significance threshold ``p = 5e-8`` lands inside the
+    significant bucket rather than the suggestive bucket below it.
+    Strict ``<`` made the exact threshold value fall a full magnitude
+    below a barely-significant hit.
+    """
     if p_value is None:
         base = 2.0
-    elif p_value < 5e-100:
+    elif p_value <= 5e-100:
         base = 8.0
-    elif p_value < 5e-20:
+    elif p_value <= 5e-20:
         base = 7.0
-    elif p_value < 5e-8:
+    elif p_value <= 5e-8:
         base = 6.0
-    elif p_value < 5e-6:
+    elif p_value <= 5e-6:
         base = 4.0
-    elif p_value < 5e-4:
+    elif p_value <= 5e-4:
         base = 3.0
     else:
         base = 2.0
@@ -143,7 +151,13 @@ class GWASCatalogAnnotator(Annotator):
                 extracted = self.data_dir / tsv_names[0]
                 if extracted != tsv_path:
                     extracted.rename(tsv_path)
-            load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
+            load_gwas_tsv(
+                tsv_path,
+                self._db_path,
+                source_url=url,
+                remote_signal=signal,
+                min_rows=GWAS_MIN_ROWS,
+            )
         finally:
             try:
                 zip_path.unlink()
@@ -183,6 +197,7 @@ class GWASCatalogAnnotator(Annotator):
                     self._db_path,
                     source_url=GWAS_CATALOG_URL,
                     remote_signal=self.cached_remote_signal(),
+                    min_rows=GWAS_MIN_ROWS,
                 )
             except Exception:
                 logger.warning("Auto-reingest from cached TSV failed", exc_info=True)

{allelix-2.0.0 → allelix-2.0.2}/allelix/annotators/pharmgkb.py RENAMED Viewed

@@ -24,6 +24,7 @@ from allelix.databases.manager import (
 from allelix.databases.pharmgkb_loader import (
     PHARMGKB_CLINICAL_URL,
     PHARMGKB_DB_FILENAME,
+    PHARMGKB_MIN_ROWS,
     _normalize_genotype,
     load_pharmgkb_tsv,
     schema_is_current,
@@ -146,6 +147,7 @@ class PharmGKBAnnotator(Annotator):
             source_url=url,
             remote_signal=signal,
             allele_function_lookup=cpic_lookup,
+            min_rows=PHARMGKB_MIN_ROWS,
         )
     def is_ready(self) -> bool:
@@ -489,6 +491,7 @@ def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
             version=old_version,
             remote_signal=old_signal,
             allele_function_lookup=cpic_lookup,
+            min_rows=PHARMGKB_MIN_ROWS,
         )
     except Exception:
         logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)

{allelix-2.0.0 → allelix-2.0.2}/allelix/cli/_helpers.py RENAMED Viewed

@@ -336,6 +336,12 @@ def _emit_build_diagnostics(result: object) -> None:
         source = "detected"
     elif diag.header_build:
         source = "header (no position confirmation)"
+    elif diag.chr_prefix_inferred:
+        # GH #38: chr-prefixed contig names ("chr1", "chrX", ...) reliably
+        # indicate GRCh38 in modern caller output. We DID detect a build;
+        # the banner and the warning should say so instead of reading as
+        # a blind default.
+        source = "inferred from chr-prefixed contig names"
     else:
         source = "fallback (no known SNPs matched)"
     console.print(
@@ -349,20 +355,55 @@ def _emit_build_diagnostics(result: object) -> None:
             f"This is a real-world data-quality issue — your provider may have "
             f"mislabeled the build (see ADR-0021).[/yellow]"
         )
+    elif diag.chr_prefix_inferred:
+        # GH #38: positive, accurate message — the inference path
+        # actually fired. Still recommend `--build` for users who
+        # want to lock in the answer; chr-prefix is a strong signal
+        # but UCSC hg19 also uses `chr` prefixes, so the heuristic
+        # isn't guaranteed against a hg19-converted file.
+        console.print(
+            f"[dim]Inferred {diag.effective_build} from chr-prefixed contig "
+            f"names (GRCh38 convention). Pass --build grch37 if this file is "
+            f"UCSC hg19 with chr-prefixed contigs instead.[/dim]"
+        )
     elif not diag.override and diag.detected_build is None and diag.header_build is None:
         # Common shape: VCF from a variant caller where the ID column is `.`
-        # and the header has no ##contig assembly tag. The detector had no
-        # rsID signal AND no header signal — both auto-detect paths failed.
+        # and the header has no ##contig assembly tag, AND no chr-prefix
+        # signal was observed. All three auto-detect paths failed.
         # Loudly recommend an explicit --build because picking the wrong one
         # silently means every annotation lookup uses wrong coordinates.
         console.print(
             f"[yellow]Could not auto-detect genome build (no rsIDs in input, "
-            f"no ##contig assembly tag in header). Defaulted to "
+            f"no ##contig assembly tag, no chr-prefixed contigs). Defaulted to "
             f"{diag.effective_build}. If the file is the other build, pass "
             f"--build grch37 or --build grch38 explicitly — annotation "
             f"coordinates differ between builds and silently using the wrong "
             f"one will miss every hit.[/yellow]"
         )
+    elif (
+        not diag.override
+        and diag.detected_build is None
+        and diag.header_build is not None
+        and diag.inspected_count > 0
+    ):
+        # Position-detection inspected known-rsID rows but couldn't pick a
+        # build — either votes tied across builds or no row matched any
+        # build's reference position. Without this warning, the pipeline
+        # silently falls through to header_build, and a GRCh36 file with a
+        # GRCh37-mislabeled header gets the GRCh37 ClinVar cache (the
+        # silent-coords trap #15). The dim "header (no position
+        # confirmation)" status line shows the same facts but reads as
+        # routine — yellow is what the situation deserves.
+        console.print(
+            f"[yellow]Build detection inconclusive: "
+            f"{diag.inspected_count} known-rsID position checks ran but "
+            f"did not converge on a build. Using the file's header-claimed "
+            f"build ({diag.header_build}), which has not been confirmed "
+            f"against your position data. If the file is actually a "
+            f"different build, pass --build grch37 or --build grch38 to "
+            f"force — wrong coordinates will silently mis-annotate every "
+            f"variant.[/yellow]"
+        )
     if diag.effective_build == "GRCh36":
         console.print(
             "[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "

{allelix-2.0.0 → allelix-2.0.2}/allelix/cli/db.py RENAMED Viewed

@@ -18,29 +18,12 @@ from allelix.databases import resolve_data_dir
 if TYPE_CHECKING:
     from pathlib import Path
-    from allelix.annotators.base import Annotator
 @main.group()
 def db() -> None:
     """Manage local reference database cache."""
-def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
-    """Write a remote signal to an existing cache without re-downloading."""
-    import contextlib
-    import sqlite3
-    from allelix.databases.manager import stamp_remote_signal
-    db_path = getattr(annotator, "_db_path", None)
-    if db_path is None:
-        return
-    with contextlib.closing(sqlite3.connect(db_path)) as conn:
-        stamp_remote_signal(conn, annotator.name, signal)
-        conn.commit()
 def _confirm_cadd_license(*, license_held: bool = False) -> bool:
     """Show the CADD license notice and ask for confirmation."""
     if license_held:
@@ -207,14 +190,20 @@ def db_update(
                 continue
             if cached is None:
-                _stamp_remote_signal(annotator, remote)
+                # GH #20: a cache with no stored freshness signal almost
+                # always predates the signal mechanism — i.e., it is old.
+                # The previous behavior was to stamp the live remote signal
+                # onto the cache and call it current, which permanently
+                # marked stale data as fresh (only `--force` would escape).
+                # Treat tagless caches as needing a refresh.
                 console.print(
-                    f"  [dim]{annotator.name}: stamped remote signal "
-                    f"(version {annotator.version() or '(unknown)'})[/dim]"
+                    f"  [bold]{annotator.name}[/bold]: cache predates the "
+                    "freshness signal; re-downloading…"
+                )
+            else:
+                console.print(
+                    f"  [bold]{annotator.name}[/bold]: remote signal changed; refreshing…"
                 )
-                continue
-            console.print(f"  [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
             if _helpers._run_setup(annotator):
                 console.print(
                     f"  [green]✓ {annotator.name} refreshed[/green] "

allelix-2.0.2/allelix/data/high_value_snps.yaml ADDED Viewed

@@ -0,0 +1,136 @@
+# High-value SNPs: clinically important variants where a no-call
+# should be explicitly flagged rather than silently omitted.
+#
+# Schema:
+#   rsid:     dbSNP identifier
+#   gene:     gene symbol
+#   cluster:  optional grouping (e.g., "APOE" for the two-SNP APOE haplotype)
+#   note:     human-readable warning text for no-call reports
+#
+# To add a SNP: append an entry following this format. Entries with the
+# same cluster are grouped in warnings (e.g., "APOE genotype cannot be
+# determined" when either rs429358 or rs7412 is a no-call).
+- rsid: rs429358
+  gene: APOE
+  cluster: APOE
+  note: Required (with rs7412) to determine APOE genotype
+- rsid: rs7412
+  gene: APOE
+  cluster: APOE
+  note: Required (with rs429358) to determine APOE genotype
+- rsid: rs5742904
+  gene: APOB
+  note: Familial hypercholesterolemia marker (FH)
+- rsid: rs80357906
+  gene: BRCA1
+  note: Hereditary breast/ovarian cancer marker
+- rsid: rs1801133
+  gene: MTHFR
+  cluster: MTHFR
+  note: Methylation pathway (C677T)
+- rsid: rs1801131
+  gene: MTHFR
+  cluster: MTHFR
+  note: Methylation pathway (A1298C)
+- rsid: rs4680
+  gene: COMT
+  note: Catechol-O-methyltransferase activity
+- rsid: rs1065852
+  gene: CYP2D6
+  note: Opioid / SSRI metabolism
+- rsid: rs4244285
+  gene: CYP2C19
+  note: Clopidogrel, PPIs metabolism
+- rsid: rs1799853
+  gene: CYP2C9
+  note: Warfarin metabolism
+- rsid: rs4149056
+  gene: SLCO1B1
+  note: Statin myopathy risk
+- rsid: rs3918290
+  gene: DPYD
+  note: Fluoropyrimidine toxicity
+# v2.0.2 additions (GH #7): clinically actionable single-SNP variants
+# verified to be on consumer arrays. Two new clusters: HFE (hereditary
+# hemochromatosis compound-het) and TPMT (thiopurine *3 haplotype).
+- rsid: rs6025
+  gene: F5
+  note: Factor V Leiden — hereditary thrombophilia (FDA-cleared GHR variant)
+- rsid: rs1799963
+  gene: F2
+  note: Prothrombin G20210A — hereditary thrombophilia
+- rsid: rs1800562
+  gene: HFE
+  cluster: HFE
+  note: C282Y — hereditary hemochromatosis (compound het with H63D is the clinical form)
+- rsid: rs1799945
+  gene: HFE
+  cluster: HFE
+  note: H63D — hereditary hemochromatosis (compound het with C282Y is the clinical form)
+- rsid: rs113993960
+  gene: CFTR
+  note: F508del — most common CF allele; carrier status for reproductive planning
+- rsid: rs334
+  gene: HBB
+  note: Sickle cell (HbS) — most-screened-for variant worldwide; carrier status
+- rsid: rs80359550
+  gene: BRCA2
+  note: BRCA2 6174delT — most common Ashkenazi founder mutation (BRCA1 covered by rs80357906)
+- rsid: rs9923231
+  gene: VKORC1
+  note: Warfarin dosing (CPIC Level A, pairs with CYP2C9 rs1799853)
+- rsid: rs1057910
+  gene: CYP2C9
+  note: CYP2C9*3 — completes warfarin metabolizer profile alongside *2 (rs1799853)
+- rsid: rs12248560
+  gene: CYP2C19
+  note: CYP2C19*17 ultrarapid metabolizer — completes clopidogrel profile alongside *2 (rs4244285)
+- rsid: rs3892097
+  gene: CYP2D6
+  note: CYP2D6*4 — most common LOF in Europeans (complements rs1065852 *10)
+- rsid: rs776746
+  gene: CYP3A5
+  note: CYP3A5*3 — tacrolimus dosing (CPIC Level A)
+- rsid: rs1142345
+  gene: TPMT
+  cluster: TPMT
+  note: TPMT*3C — thiopurine dosing (CPIC Level A; with rs1800460 resolves *3A/*3B/*3C)
+- rsid: rs1800460
+  gene: TPMT
+  cluster: TPMT
+  note: TPMT*3B — thiopurine dosing (CPIC Level A; with rs1142345 resolves *3A/*3B/*3C)
+- rsid: rs116855232
+  gene: NUDT15
+  note: Thiopurine toxicity (CPIC Level A; critical in East Asian populations, complements TPMT cluster)
+- rsid: rs34637584
+  gene: LRRK2
+  note: G2019S — most common monogenic Parkinson's variant

{allelix-2.0.0 → allelix-2.0.2}/allelix/databases/_versions.py RENAMED Viewed

@@ -9,7 +9,7 @@ column of ``database_versions`` (e.g. ``iv:1``) so ``is_ready()`` can
 reject stale caches without forcing a full re-download.
 """
-CLINVAR_INTERPRETER_VERSION = 1
+CLINVAR_INTERPRETER_VERSION = 2  # v2.0.1: GH #42 CLNDN-join in iter_clinvar_records
 PHARMGKB_INTERPRETER_VERSION = 1
 GNOMAD_SCHEMA_VERSION = 1
 ALPHAMISSENSE_SCHEMA_VERSION = 1

{allelix-2.0.0 → allelix-2.0.2}/allelix/databases/alphamissense_loader.py RENAMED Viewed

@@ -23,7 +23,7 @@ if TYPE_CHECKING:
 ALPHAMISSENSE_DB_FILENAME = "alphamissense.sqlite"
 ALPHAMISSENSE_CACHE_URL = (
-    "https://huggingface.co/datasets/dial481/allelix-alphamissense"
+    "https://huggingface.co/datasets/allelix/allelix-alphamissense"
     "/resolve/13a15e199536512b5e2d208d79c4f93c0a73f71f/alphamissense.sqlite.gz"
 )

{allelix-2.0.0 → allelix-2.0.2}/allelix/databases/gnomad_loader.py RENAMED Viewed

@@ -24,7 +24,7 @@ if TYPE_CHECKING:
 GNOMAD_DB_FILENAME = "gnomad.sqlite"
 GNOMAD_CACHE_URL = (
-    "https://huggingface.co/datasets/dial481/allelix-gnomad"
+    "https://huggingface.co/datasets/allelix/allelix-gnomad"
     "/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
 )

{allelix-2.0.0 → allelix-2.0.2}/allelix/databases/gwas_loader.py RENAMED Viewed

@@ -465,16 +465,32 @@ def iter_gwas_records(tsv_path: Path) -> Iterator[dict[str, object]]:
     yield from best.values()
+# Truncation sanity floor for production loads. This guards the count
+# returned by iter_gwas_records — i.e. rows AFTER haplotype/no-trait
+# filtering and (rsid, trait) dedup, not the raw catalog. EBI curates
+# ~625K lead associations (GWAS Catalog, 2025); the loaded count is lower
+# than that but still far above this floor. 100K only catches gross
+# truncation (a mid-stream download committed as "complete") while staying
+# permissive against legitimate upstream drift. Set to 0 from tests so
+# synthetic fixtures of any size load cleanly. See GH #19.
+GWAS_MIN_ROWS = 100_000
 def load_gwas_tsv(
     tsv_path: Path,
     db_path: Path,
     source_url: str = "",
     remote_signal: str | None = None,
+    min_rows: int = 0,
 ) -> int:
     """Parse a GWAS Catalog TSV into a fresh SQLite cache atomically.
     Writes to a `.tmp` sibling and `os.replace`s onto `db_path` only after a
     successful commit. Returns the number of records loaded.
+    ``min_rows`` is a sanity floor checked before the final ``os.replace``.
+    Set by production callers (see ``GWASCatalogAnnotator.setup``) to
+    ``GWAS_MIN_ROWS``; defaults to 0 so test fixtures of any size load.
     """
     tmp_path = db_path.parent / f"{db_path.name}.tmp"
     if tmp_path.exists():
@@ -535,6 +551,15 @@ def load_gwas_tsv(
                 ),
             )
             conn.commit()
+        if count < min_rows:
+            msg = (
+                f"GWAS Catalog load aborted: only {count:,} rows ingested "
+                f"(floor {min_rows:,}). The download was likely truncated "
+                f"in flight (chunked transfer with no Content-Length, or "
+                f"connection drop mid-stream). Retry with "
+                f"`allelix db update --force`."
+            )
+            raise OSError(msg)
         os.replace(tmp_path, db_path)
         return count
     except Exception:

{allelix-2.0.0 → allelix-2.0.2}/allelix/databases/manager.py RENAMED Viewed

@@ -197,9 +197,20 @@ def parse_clinvar_version(vcf_path: Path) -> str | None:
 def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
     """Stream parse a ClinVar VCF (.vcf or .vcf.gz). Skip entries without an RS id.
-    Multi-allelic rows (ALT="A,T") are split into one record per ALT. Parallel
-    INFO fields (CLNSIG, CLNDN, ALLELEID) are separated by `|` per ClinVar's
-    convention and index-paired with the ALTs.
+    Multi-allelic rows (ALT="A,T") are split into one record per ALT.
+    Parallel INFO fields ``CLNSIG`` and ``ALLELEID`` are separated by
+    ``|`` and index-paired with the ALTs.
+    GH #42: ``CLNDN`` is NOT index-paired with ALTs — its ``|`` separator
+    enumerates the union of conditions across all SCV submissions on the
+    variant, with no positional mapping to CLNSIG. Joining the full list
+    into a single ``condition`` string per record avoids the Frankenstein
+    pairing (one SCV's classification next to another SCV's condition)
+    that index-picking introduced. The primary classification
+    (``CLNSIG[0]``) is kept as-is — that value is correct as a
+    variant-level claim; only the condition-pairing was misleading.
+    Full per-(classification, condition) pairing via
+    ``submission_summary.txt.gz`` is tracked for v2.1.
     """
     opener = gzip.open if vcf_path.suffix == ".gz" else open
     with opener(vcf_path, "rt", encoding="utf-8") as fh:
@@ -231,6 +242,12 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
             review_status = info_dict.get("CLNREVSTAT", "")
             gene = _extract_gene(info_dict.get("GENEINFO", ""))
+            # GH #42: CLNDN's `|`-separator is per-SCV, not per-ALT.
+            # Join the full list once per row (same string emitted for
+            # every ALT split-out of this record). Empty/`.`/blank
+            # tokens are filtered out so callers don't see leading/trailing
+            # separators.
+            joined_condition = "; ".join(c.replace("_", " ") for c in clndns if c and c != ".")
             for i, alt in enumerate(alts):
                 yield {
                     "rsid": f"rs{rs}",
@@ -239,7 +256,7 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
                     "ref": ref,
                     "alt": alt,
                     "clinical_significance": _pick(clnsigs, i),
-                    "condition": _pick(clndns, i).replace("_", " "),
+                    "condition": joined_condition,
                     "gene": gene,
                     "review_status": review_status,
                     "allele_id": _safe_int(_pick(allele_ids, i)),

allelix 2.0.0__tar.gz → 2.0.2__tar.gz

allelix 2.0.0__tar.gz → 2.0.2__tar.gz