PyPI - allelix - Versions diffs - 2.0.0__tar.gz → 2.0.1__tar.gz - Mend

allelix 2.0.0__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

{allelix-2.0.0 → allelix-2.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: allelix
-Version: 2.0.0
+Version: 2.0.1
 Summary: Open-source genotype analysis toolkit. Format-agnostic ingestion, database-agnostic annotation, offline-first.
 Author: Allelix
 Maintainer-email: dial481 <dial481@users.noreply.github.com>

{allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/clinvar.py RENAMED Viewed

@@ -14,6 +14,7 @@ and dispatches per-variant by `variant.build`.
 from __future__ import annotations
 import logging
+import re
 import sqlite3
 from typing import TYPE_CHECKING, ClassVar
@@ -42,6 +43,14 @@ CLINVAR_SUPPORTED_BUILDS: tuple[str, ...] = ("GRCh37", "GRCh38")
 _BATCH_CHUNK = 500  # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
+# GH #21: a remote .md5 endpoint can return an HTML error page on a
+# transient blip. The first whitespace-separated token of the body is
+# what we treat as the hash, so without this gate `<!DOCTYPE` would be
+# accepted as the "signal" and later passed to `verify_file_hash`, which
+# would then delete the freshly downloaded VCF. MD5 is exactly 32 hex
+# digits; reject anything else.
+_MD5_HEX_RE = re.compile(r"^[0-9a-fA-F]{32}$")
 def clinvar_db_filename(build: str) -> str:
     """Per-build cache filename. Two coexisting SQLite files per data_dir."""
@@ -323,7 +332,18 @@ class ClinVarAnnotator(Annotator):
         if not body:
             return None
         first_token = body.strip().split(None, 1)[0] if body.strip() else ""
-        if not first_token:
+        if not _MD5_HEX_RE.fullmatch(first_token):
+            # CDN error page, redirect interstitial, or empty body. Treat
+            # as a transient signal failure rather than poisoning the
+            # cache: callers handle `None` as "freshness unknown, skip"
+            # in `db update`, and `setup()` raises rather than passing
+            # garbage to `verify_file_hash` (which would delete the VCF).
+            logger.warning(
+                "clinvar(%s): .md5 endpoint returned a body whose first token "
+                "is not a 32-char hex digest (got %r); treating as no signal",
+                build,
+                first_token[:32],
+            )
             return None
         return f"md5:{first_token}"

{allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/gnomad.py RENAMED Viewed

@@ -94,12 +94,20 @@ class GnomadAnnotator(Annotator):
             logger.warning("Could not remove staged file at %s", gz_path)
     def is_ready(self) -> bool:
-        """True when the gnomAD SQLite cache exists with current schema version."""
+        """True when the gnomAD SQLite cache exists with current schema version.
+        GH #22: a cache with no ``local_version_tag`` used to be accepted
+        as ready (the previous ``or not tag`` escape). That defeated the
+        whole point of ``GNOMAD_SCHEMA_VERSION``: if it ever gets bumped,
+        every tagless legacy cache would silently pass as the new
+        version. Reject tagless caches so the user is told to re-run
+        ``db update``.
+        """
         info = get_database_info(self._db_path, "gnomad")
         if info is None:
             return False
         tag = info.get("local_version_tag") or ""
-        return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag
+        return tag == f"sv:{GNOMAD_SCHEMA_VERSION}"
     def version(self) -> str | None:
         """Return the cached database version, or None."""

{allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/gwas.py RENAMED Viewed

@@ -16,6 +16,7 @@ from allelix.databases.gwas_loader import (
     _REQUIRED_GWAS_COLUMNS,
     GWAS_CATALOG_URL,
     GWAS_DB_FILENAME,
+    GWAS_MIN_ROWS,
     load_gwas_tsv,
     schema_is_current,
 )
@@ -57,18 +58,25 @@ _BATCH_CHUNK = 500  # SQLite default SQLITE_MAX_VARIABLE_NUMBER is 999
 def _magnitude(p_value: float | None, or_beta: float | None) -> float:
-    """Derive magnitude from p-value and optional effect size."""
+    """Derive magnitude from p-value and optional effect size.
+    GH #17: boundary comparisons are inclusive (``<=``) so the canonical
+    genome-wide-significance threshold ``p = 5e-8`` lands inside the
+    significant bucket rather than the suggestive bucket below it.
+    Strict ``<`` made the exact threshold value drop into the next
+    bucket (base 4.0 instead of 6.0), below a barely-significant hit.
+    """
     if p_value is None:
         base = 2.0
-    elif p_value < 5e-100:
+    elif p_value <= 5e-100:
         base = 8.0
-    elif p_value < 5e-20:
+    elif p_value <= 5e-20:
         base = 7.0
-    elif p_value < 5e-8:
+    elif p_value <= 5e-8:
         base = 6.0
-    elif p_value < 5e-6:
+    elif p_value <= 5e-6:
         base = 4.0
-    elif p_value < 5e-4:
+    elif p_value <= 5e-4:
         base = 3.0
     else:
         base = 2.0
@@ -143,7 +151,13 @@ class GWASCatalogAnnotator(Annotator):
                 extracted = self.data_dir / tsv_names[0]
                 if extracted != tsv_path:
                     extracted.rename(tsv_path)
-            load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
+            load_gwas_tsv(
+                tsv_path,
+                self._db_path,
+                source_url=url,
+                remote_signal=signal,
+                min_rows=GWAS_MIN_ROWS,
+            )
         finally:
             try:
                 zip_path.unlink()
@@ -183,6 +197,7 @@ class GWASCatalogAnnotator(Annotator):
                     self._db_path,
                     source_url=GWAS_CATALOG_URL,
                     remote_signal=self.cached_remote_signal(),
+                    min_rows=GWAS_MIN_ROWS,
                 )
             except Exception:
                 logger.warning("Auto-reingest from cached TSV failed", exc_info=True)

{allelix-2.0.0 → allelix-2.0.1}/allelix/annotators/pharmgkb.py RENAMED Viewed

@@ -24,6 +24,7 @@ from allelix.databases.manager import (
 from allelix.databases.pharmgkb_loader import (
     PHARMGKB_CLINICAL_URL,
     PHARMGKB_DB_FILENAME,
+    PHARMGKB_MIN_ROWS,
     _normalize_genotype,
     load_pharmgkb_tsv,
     schema_is_current,
@@ -146,6 +147,7 @@ class PharmGKBAnnotator(Annotator):
             source_url=url,
             remote_signal=signal,
             allele_function_lookup=cpic_lookup,
+            min_rows=PHARMGKB_MIN_ROWS,
         )
     def is_ready(self) -> bool:
@@ -489,6 +491,7 @@ def _reingest_pharmgkb_from_cached_zip(db_path: Path, data_dir: Path) -> bool:
             version=old_version,
             remote_signal=old_signal,
             allele_function_lookup=cpic_lookup,
+            min_rows=PHARMGKB_MIN_ROWS,
         )
     except Exception:
         logger.warning("Auto-reingest from cached ZIP failed", exc_info=True)

{allelix-2.0.0 → allelix-2.0.1}/allelix/cli/_helpers.py RENAMED Viewed

@@ -363,6 +363,30 @@ def _emit_build_diagnostics(result: object) -> None:
             f"coordinates differ between builds and silently using the wrong "
             f"one will miss every hit.[/yellow]"
         )
+    elif (
+        not diag.override
+        and diag.detected_build is None
+        and diag.header_build is not None
+        and diag.inspected_count > 0
+    ):
+        # Position-detection inspected known-rsID rows but couldn't pick a
+        # build — either votes tied across builds or no row matched any
+        # build's reference position. Without this warning, the pipeline
+        # silently falls through to header_build, and a GRCh36 file with a
+        # GRCh37-mislabeled header gets the GRCh37 ClinVar cache (the
+        # silent-coords trap #15). The dim "header (no position
+        # confirmation)" status line shows the same facts but reads as
+        # routine — yellow is what the situation deserves.
+        console.print(
+            f"[yellow]Build detection inconclusive: "
+            f"{diag.inspected_count} known-rsID position checks ran but "
+            f"did not converge on a build. Using the file's header-claimed "
+            f"build ({diag.header_build}), which has not been confirmed "
+            f"against your position data. If the file is actually a "
+            f"different build, pass --build grch37 or --build grch38 to "
+            f"force — wrong coordinates will silently mis-annotate every "
+            f"variant.[/yellow]"
+        )
     if diag.effective_build == "GRCh36":
         console.print(
             "[yellow]Warning: GRCh36 (hg18) detected. rsID-based annotations "

{allelix-2.0.0 → allelix-2.0.1}/allelix/cli/db.py RENAMED Viewed

@@ -18,29 +18,12 @@ from allelix.databases import resolve_data_dir
 if TYPE_CHECKING:
     from pathlib import Path
-    from allelix.annotators.base import Annotator
 @main.group()
 def db() -> None:
     """Manage local reference database cache."""
-def _stamp_remote_signal(annotator: Annotator, signal: str) -> None:
-    """Write a remote signal to an existing cache without re-downloading."""
-    import contextlib
-    import sqlite3
-    from allelix.databases.manager import stamp_remote_signal
-    db_path = getattr(annotator, "_db_path", None)
-    if db_path is None:
-        return
-    with contextlib.closing(sqlite3.connect(db_path)) as conn:
-        stamp_remote_signal(conn, annotator.name, signal)
-        conn.commit()
 def _confirm_cadd_license(*, license_held: bool = False) -> bool:
     """Show the CADD license notice and ask for confirmation."""
     if license_held:
@@ -207,14 +190,20 @@ def db_update(
                 continue
             if cached is None:
-                _stamp_remote_signal(annotator, remote)
+                # GH #20: a cache with no stored freshness signal almost
+                # always predates the signal mechanism — i.e., it is old.
+                # The previous behavior was to stamp the live remote signal
+                # onto the cache and call it current, which permanently
+                # marked stale data as fresh (only `--force` would escape).
+                # Treat tagless caches as needing a refresh.
                 console.print(
-                    f"  [dim]{annotator.name}: stamped remote signal "
-                    f"(version {annotator.version() or '(unknown)'})[/dim]"
+                    f"  [bold]{annotator.name}[/bold]: cache predates the "
+                    "freshness signal; re-downloading…"
+                )
+            else:
+                console.print(
+                    f"  [bold]{annotator.name}[/bold]: remote signal changed; refreshing…"
                 )
-                continue
-            console.print(f"  [bold]{annotator.name}[/bold]: remote signal changed; refreshing…")
             if _helpers._run_setup(annotator):
                 console.print(
                     f"  [green]✓ {annotator.name} refreshed[/green] "

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/_versions.py RENAMED Viewed

@@ -9,7 +9,7 @@ column of ``database_versions`` (e.g. ``iv:1``) so ``is_ready()`` can
 reject stale caches without forcing a full re-download.
 """
-CLINVAR_INTERPRETER_VERSION = 1
+CLINVAR_INTERPRETER_VERSION = 2  # v2.0.1: GH #42 CLNDN-join in iter_clinvar_records
 PHARMGKB_INTERPRETER_VERSION = 1
 GNOMAD_SCHEMA_VERSION = 1
 ALPHAMISSENSE_SCHEMA_VERSION = 1

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/alphamissense_loader.py RENAMED Viewed

@@ -23,7 +23,7 @@ if TYPE_CHECKING:
 ALPHAMISSENSE_DB_FILENAME = "alphamissense.sqlite"
 ALPHAMISSENSE_CACHE_URL = (
-    "https://huggingface.co/datasets/dial481/allelix-alphamissense"
+    "https://huggingface.co/datasets/allelix/allelix-alphamissense"
     "/resolve/13a15e199536512b5e2d208d79c4f93c0a73f71f/alphamissense.sqlite.gz"
 )

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/gnomad_loader.py RENAMED Viewed

@@ -24,7 +24,7 @@ if TYPE_CHECKING:
 GNOMAD_DB_FILENAME = "gnomad.sqlite"
 GNOMAD_CACHE_URL = (
-    "https://huggingface.co/datasets/dial481/allelix-gnomad"
+    "https://huggingface.co/datasets/allelix/allelix-gnomad"
     "/resolve/f0aadfb7940290c44930dc0d1b9b093bc089173f/gnomad.sqlite.gz"
 )

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/gwas_loader.py RENAMED Viewed

@@ -465,16 +465,32 @@ def iter_gwas_records(tsv_path: Path) -> Iterator[dict[str, object]]:
     yield from best.values()
+# Truncation sanity floor for production loads. This guards the count
+# returned by iter_gwas_records — i.e. rows AFTER haplotype/no-trait
+# filtering and (rsid, trait) dedup, not the raw catalog. EBI curates
+# ~625K lead associations (GWAS Catalog, 2025); the loaded count is lower
+# than that but still far above this floor. 100K only catches gross
+# truncation (a mid-stream download committed as "complete") while staying
+# permissive against legitimate upstream drift. Set to 0 from tests so
+# synthetic fixtures of any size load cleanly. See GH #19.
+GWAS_MIN_ROWS = 100_000
 def load_gwas_tsv(
     tsv_path: Path,
     db_path: Path,
     source_url: str = "",
     remote_signal: str | None = None,
+    min_rows: int = 0,
 ) -> int:
     """Parse a GWAS Catalog TSV into a fresh SQLite cache atomically.
     Writes to a `.tmp` sibling and `os.replace`s onto `db_path` only after a
     successful commit. Returns the number of records loaded.
+    ``min_rows`` is a sanity floor checked before the final ``os.replace``.
+    Set by production callers (see ``GWASCatalogAnnotator``) to
+    ``GWAS_MIN_ROWS``; defaults to 0 so test fixtures of any size load.
     """
     tmp_path = db_path.parent / f"{db_path.name}.tmp"
     if tmp_path.exists():
@@ -535,6 +551,15 @@ def load_gwas_tsv(
                 ),
             )
             conn.commit()
+        if count < min_rows:
+            msg = (
+                f"GWAS Catalog load aborted: only {count:,} rows ingested "
+                f"(floor {min_rows:,}). The download was likely truncated "
+                f"in flight (chunked transfer with no Content-Length, or "
+                f"connection drop mid-stream). Retry with "
+                f"`allelix db update --force`."
+            )
+            raise OSError(msg)
         os.replace(tmp_path, db_path)
         return count
     except Exception:

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/manager.py RENAMED Viewed

@@ -197,9 +197,20 @@ def parse_clinvar_version(vcf_path: Path) -> str | None:
 def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
     """Stream parse a ClinVar VCF (.vcf or .vcf.gz). Skip entries without an RS id.
-    Multi-allelic rows (ALT="A,T") are split into one record per ALT. Parallel
-    INFO fields (CLNSIG, CLNDN, ALLELEID) are separated by `|` per ClinVar's
-    convention and index-paired with the ALTs.
+    Multi-allelic rows (ALT="A,T") are split into one record per ALT.
+    Parallel INFO fields ``CLNSIG`` and ``ALLELEID`` are separated by
+    ``|`` and index-paired with the ALTs.
+    GH #42: ``CLNDN`` is NOT index-paired with ALTs — its ``|`` separator
+    enumerates the union of conditions across all SCV submissions on the
+    variant, with no positional mapping to CLNSIG. Joining the full list
+    into a single ``condition`` string per record avoids the Frankenstein
+    pairing (one SCV's classification next to another SCV's condition)
+    that index-picking introduced. The primary classification
+    (``CLNSIG[0]``) is kept as-is — that value is correct as a
+    variant-level claim; only the condition-pairing was misleading.
+    Full per-(classification, condition) pairing via
+    ``submission_summary.txt.gz`` is tracked for v2.1.
     """
     opener = gzip.open if vcf_path.suffix == ".gz" else open
     with opener(vcf_path, "rt", encoding="utf-8") as fh:
@@ -231,6 +242,12 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
             review_status = info_dict.get("CLNREVSTAT", "")
             gene = _extract_gene(info_dict.get("GENEINFO", ""))
+            # GH #42: CLNDN's `|`-separator is per-SCV, not per-ALT.
+            # Join the full list once per row (same string emitted for
+            # every ALT split-out of this record). Empty/`.`/blank
+            # tokens are filtered out so callers don't see leading/trailing
+            # separators.
+            joined_condition = "; ".join(c.replace("_", " ") for c in clndns if c and c != ".")
             for i, alt in enumerate(alts):
                 yield {
                     "rsid": f"rs{rs}",
@@ -239,7 +256,7 @@ def iter_clinvar_records(vcf_path: Path) -> Iterator[dict[str, object]]:
                     "ref": ref,
                     "alt": alt,
                     "clinical_significance": _pick(clnsigs, i),
-                    "condition": _pick(clndns, i).replace("_", " "),
+                    "condition": joined_condition,
                     "gene": gene,
                     "review_status": review_status,
                     "allele_id": _safe_int(_pick(allele_ids, i)),

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/pharmgkb_loader.py RENAMED Viewed

@@ -334,6 +334,13 @@ def _safe_float(value: str) -> float | None:
         return None
+# Truncation sanity floor for production loads. Current ClinPGx clinical
+# annotations ship ~13K rows; ~5K is a generous floor that catches gross
+# truncation while staying permissive against upstream-data drift. Set
+# to 0 from tests so synthetic fixtures of any size load cleanly. See GH #19.
+PHARMGKB_MIN_ROWS = 5_000
 def load_pharmgkb_tsv(
     zip_or_dir: Path,
     db_path: Path,
@@ -341,6 +348,7 @@ def load_pharmgkb_tsv(
     version: str = "",
     remote_signal: str | None = None,
     allele_function_lookup: dict[tuple[str, str], str] | None = None,
+    min_rows: int = 0,
 ) -> int:
     """Load a ClinPGx clinical-annotations dump into a fresh SQLite cache atomically.
@@ -430,6 +438,15 @@ def load_pharmgkb_tsv(
                 ),
             )
             conn.commit()
+        if count < min_rows:
+            msg = (
+                f"ClinPGx load aborted: only {count:,} rows ingested "
+                f"(floor {min_rows:,}). The download was likely truncated "
+                f"in flight (chunked transfer with no Content-Length, or "
+                f"connection drop mid-stream). Retry with "
+                f"`allelix db update --force`."
+            )
+            raise OSError(msg)
         os.replace(tmp_path, db_path)
         return count
     except Exception:

{allelix-2.0.0 → allelix-2.0.1}/allelix/databases/snpedia_parser.py RENAMED Viewed

@@ -147,6 +147,13 @@ def _dedupe_existing(conn: sqlite3.Connection) -> int:
     return before - after
+# GH #12: identifier allowlist for the raw_table f-string interpolation
+# below. The interpolation cannot be parameterized (SQLite doesn't support
+# bind variables for identifiers); the allowlist gives a programmatic
+# guarantee that only these two literals can reach the SQL.
+_VALID_RAW_TABLES: frozenset[str] = frozenset({"_raw_pages", "pages"})
 def detect_raw_table(conn: sqlite3.Connection) -> str | None:
     """Return the name of the raw pages table, or None if absent."""
     tables = {
@@ -183,6 +190,16 @@ def _parse_raw_pages_inner(conn: sqlite3.Connection, *, verbose: bool = False) -
     raw_table = detect_raw_table(conn)
     if raw_table is None:
         return 0
+    # GH #12: `raw_table` flows into three SQL queries via f-string
+    # interpolation because SQLite doesn't support parameterized
+    # identifiers. Today it's safe — `detect_raw_table` only ever returns
+    # one of two literals or None — but the function's `str | None` return
+    # type doesn't pin that. A future edit (config-driven table name,
+    # scraped metadata) could drift it into an injection path. Allowlist
+    # explicitly so the guarantee outlives memory of the original design.
+    if raw_table not in _VALID_RAW_TABLES:
+        msg = f"unexpected raw table name: {raw_table!r}"
+        raise ValueError(msg)
     if verbose:
         logger.info("Parsing SNPedia raw pages from '%s' table", raw_table)

{allelix-2.0.0 → allelix-2.0.1}/allelix/models.py RENAMED Viewed

@@ -40,6 +40,25 @@ class Variant:
     allele2: str
     build: str = DEFAULT_BUILD
+    def __post_init__(self) -> None:
+        """Normalize allele case at construction (GH #14).
+        Reference databases (ClinVar, gnomAD, ClinPGx, etc.) all ship
+        uppercase alleles, and carrier matching is raw set membership
+        against ``{allele1, allele2}`` — a lowercase user allele would
+        silently fail to match and zero annotations would be produced
+        for a real carrier. Production parsers all emit uppercase
+        today, but a user-supplied filter file (custom panel) or a
+        future format variant could leak lowercase through. Normalize
+        at the model boundary so the invariant is impossible to
+        violate downstream. The no-call marker is left as-is;
+        multi-base alleles (indels) are uppercased in place.
+        """
+        if self.allele1 and self.allele1 != NO_CALL_MARKER:
+            self.allele1 = self.allele1.upper()
+        if self.allele2 and self.allele2 != NO_CALL_MARKER:
+            self.allele2 = self.allele2.upper()
     @property
     def is_heterozygous(self) -> bool:
         """True if the two alleles differ (and neither is a no-call)."""

{allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/_helpers.py RENAMED Viewed

@@ -17,10 +17,24 @@ logger = logging.getLogger(__name__)
 def split_csv_line(line: str) -> list[str]:
-    """Split a comma-delimited line and strip double-quotes from each field.
-    Handles single-quoted, double-quoted, and double-double-quoted fields
-    (the MyHeritage "extra quotes" variant).
+    """Split a comma-delimited line and strip surrounding quotes from each field.
+    Implementation is ``line.split(",")`` followed by a per-field
+    ``strip().strip('"')``. This is NOT a real CSV parser: a quoted field
+    containing a literal comma yields the wrong column count and is
+    silently dropped by callers' ``len(parts) != EXPECTED_COLUMNS``
+    guard.
+    Adequate for FTDNA / MyHeritage / Living DNA because every value in
+    those exports is either an rsID, chromosome identifier, integer
+    position, or concatenated genotype string — none of which contain
+    commas. If a future format ever ships embedded commas in quoted
+    fields, swap to ``csv.reader`` rather than relying on this helper.
+    Strips both surrounding double quotes (``"rs1"``) and the
+    double-double-quote variant some MyHeritage exports produce
+    (``""rs1""``) — a single ``strip('"')`` call removes every leading
+    and trailing quote character, so no second pass is needed.
     """
     return [field.strip().strip('"') for field in line.split(",")]

{allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/ftdna.py RENAMED Viewed

@@ -42,6 +42,14 @@ SNIFF_LINE_LIMIT = 50
 EXPECTED_COLUMNS = 4
 HEADER_CANONICAL = "RSID,CHROMOSOME,POSITION,RESULT"
+# GH #26: FTDNA and MyHeritage files share the same data shape and
+# header line. Without an explicit exclusion, both `can_parse`
+# implementations accept MyHeritage files and the routing was masked
+# only by registry order (MyHeritage listed first in parsers/__init__.py).
+# Reorder the registry and FTDNA silently mislabels source format. The
+# discriminator is the MyHeritage signature comment in the first line.
+_MYHERITAGE_SIGNATURE = "MyHeritage"
 def _is_header_line(line: str) -> bool:
     """True if *line* is the FTDNA column header (quoted or unquoted)."""
@@ -58,7 +66,14 @@ class FTDNAParser(GenotypeParser):
     url: ClassVar[str] = "https://www.familytreedna.com"
     def can_parse(self, file_path: Path) -> bool:
-        """Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header."""
+        """Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header.
+        GH #26: rejects files carrying the MyHeritage signature comment.
+        Both formats are byte-identical past the first comment line, so
+        the discriminator must be checked explicitly — otherwise FTDNA
+        also claims MyHeritage files and routing depends on registry
+        order (which has silently mislabeled formats in past audits).
+        """
         try:
             with file_path.open("r", encoding="utf-8") as fh:
                 for _ in range(SNIFF_LINE_LIMIT):
@@ -66,6 +81,8 @@ class FTDNAParser(GenotypeParser):
                     if not line:
                         return False
                     line = line.rstrip("\r\n")
+                    if _MYHERITAGE_SIGNATURE in line:
+                        return False
                     if not line or line.startswith("#"):
                         continue
                     return _is_header_line(line)

{allelix-2.0.0 → allelix-2.0.1}/allelix/parsers/livingdna.py RENAMED Viewed

@@ -32,6 +32,7 @@ Specifics:
 from __future__ import annotations
 import logging
+import re
 from typing import TYPE_CHECKING, ClassVar
 from allelix.models import DEFAULT_BUILD, Variant
@@ -49,6 +50,12 @@ SIGNATURE = "Living DNA"
 SNIFF_LINE_LIMIT = 50
 EXPECTED_COLUMNS = 4
+# GH #16: only inspect comment lines that look like a build marker.
+# Without this filter every comment line is fed to
+# ``normalize_build_label`` and a stray date / version digit can override
+# the real build line.
+_BUILD_MARKER_RE = re.compile(r"\b(build|reference|genome)\b", re.IGNORECASE)
 class LivingDNAParser(GenotypeParser):
     """Parser for Living DNA consumer genotype files."""
@@ -104,16 +111,30 @@ class LivingDNAParser(GenotypeParser):
                 )
     def get_metadata(self, file_path: Path) -> GenotypeMetadata:
-        """Extract build from header comments. Living DNA has no sample ID field."""
+        """Extract build from header comments. Living DNA has no sample ID field.
+        GH #16: previously every comment line was passed into
+        ``normalize_build_label`` and the *last* match won. A download-
+        date comment like ``# downloaded 2038-01-01`` would silently
+        retag the file as GRCh38. Only lines that look like an explicit
+        build marker (containing ``build``, ``reference``, or ``genome``)
+        are inspected, and the first match wins — matching the format
+        spec which puts the build line near the top:
+            # Human Genome Reference Build 37 (GRCh37.p13).
+        """
         build = DEFAULT_BUILD
         with file_path.open("r", encoding="utf-8") as fh:
             for raw in fh:
                 line = raw.rstrip("\r\n")
                 if not line.startswith("#"):
                     break
+                if not _BUILD_MARKER_RE.search(line):
+                    continue
                 normalized = normalize_build_label(line)
                 if normalized:
                     build = normalized
+                    break
         return GenotypeMetadata(
             format=self.name,
             sample_id="",

allelix 2.0.0__tar.gz → 2.0.1__tar.gz

allelix 2.0.0__tar.gz → 2.0.1__tar.gz