PyPI - allelix - Versions diffs - 1.8.1__py3-none-any.whl - Mend

allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

allelix/__init__.py +12 -0
allelix/annotators/__init__.py +90 -0
allelix/annotators/alphamissense.py +228 -0
allelix/annotators/base.py +214 -0
allelix/annotators/cadd.py +283 -0
allelix/annotators/clinvar.py +404 -0
allelix/annotators/gnomad.py +212 -0
allelix/annotators/gwas.py +354 -0
allelix/annotators/pharmgkb.py +406 -0
allelix/annotators/snpedia.py +276 -0
allelix/cli.py +1524 -0
allelix/compare.py +149 -0
allelix/config.py +143 -0
allelix/data/__init__.py +3 -0
allelix/data/high_value_snps.yaml +64 -0
allelix/databases/__init__.py +30 -0
allelix/databases/_versions.py +16 -0
allelix/databases/alphamissense_loader.py +48 -0
allelix/databases/cadd_loader.py +49 -0
allelix/databases/cpic_loader.py +234 -0
allelix/databases/gnomad_loader.py +49 -0
allelix/databases/gwas_loader.py +546 -0
allelix/databases/loader_utils.py +80 -0
allelix/databases/manager.py +515 -0
allelix/databases/pharmgkb_loader.py +437 -0
allelix/databases/schema.py +165 -0
allelix/databases/snpedia_loader.py +44 -0
allelix/databases/snpedia_parser.py +342 -0
allelix/exporters/__init__.py +3 -0
allelix/exporters/plink.py +144 -0
allelix/models.py +117 -0
allelix/parsers/__init__.py +73 -0
allelix/parsers/_helpers.py +41 -0
allelix/parsers/ancestrydna.py +130 -0
allelix/parsers/base.py +97 -0
allelix/parsers/ftdna.py +129 -0
allelix/parsers/livingdna.py +121 -0
allelix/parsers/myhappygenes.py +135 -0
allelix/parsers/myheritage.py +118 -0
allelix/parsers/twentythreeandme.py +150 -0
allelix/py.typed +0 -0
allelix/reports/__init__.py +40 -0
allelix/reports/_pipeline.py +497 -0
allelix/reports/diff.py +169 -0
allelix/reports/high_value.py +133 -0
allelix/reports/html.py +1130 -0
allelix/reports/json_report.py +163 -0
allelix/reports/methylation.py +50 -0
allelix/reports/terminal.py +203 -0
allelix/utils/__init__.py +3 -0
allelix/utils/allele.py +87 -0
allelix/utils/build_detect.py +203 -0
allelix-1.8.1.dist-info/METADATA +276 -0
allelix-1.8.1.dist-info/RECORD +58 -0
allelix-1.8.1.dist-info/WHEEL +5 -0
allelix-1.8.1.dist-info/entry_points.txt +2 -0
allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
allelix-1.8.1.dist-info/top_level.txt +1 -0

allelix/parsers/myheritage.py ADDED Viewed

@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2026 dial481
+"""Parser for MyHeritage DNA raw genotype export files.
+Format reference (from real sample files and snps package):
+    # MyHeritage, https://www.myheritage.com
+    RSID,CHROMOSOME,POSITION,RESULT
+    "rs4477212","1","82154","AA"
+    "rs3094315","1","752566","AG"
+    "rs9001001","1","100000","--"
+Specifics:
+    - CSV format, comma-delimited. Structurally identical to FTDNA.
+    - Detection key: ``MyHeritage`` in the first comment line.
+    - Header line: ``RSID,CHROMOSOME,POSITION,RESULT`` (quoted or unquoted).
+    - Data fields are double-quoted; some exports double-double-quote
+      fields (``""rs1""``). ``split_csv_line`` handles both.
+    - RESULT column is concatenated genotype (e.g., "AG" not "A","G").
+    - Haploid calls on MT/Y appear as single characters (e.g., "A").
+    - No-calls represented as ``--``.
+    - Build: not declared in file; position-based detection required.
+      Defaults to GRCh37.
+"""
+from __future__ import annotations
+import logging
+from typing import TYPE_CHECKING, ClassVar
+from allelix.models import DEFAULT_BUILD, Variant
+from allelix.parsers._helpers import split_csv_line, split_genotype
+from allelix.parsers.base import GenotypeMetadata, GenotypeParser
+from allelix.parsers.ftdna import HEADER_CANONICAL, _is_header_line
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from pathlib import Path
+logger = logging.getLogger(__name__)  # module logger; parse() reports skipped lines through it
+SIGNATURE = "MyHeritage"  # substring can_parse() looks for in the first line of the file
+SNIFF_LINE_LIMIT = 50  # NOTE(review): not referenced in the visible part of this module — confirm it is still needed
+EXPECTED_COLUMNS = 4  # fields per data row: RSID, CHROMOSOME, POSITION, RESULT
+class MyHeritageParser(GenotypeParser):
+    """Parser for MyHeritage DNA consumer genotype files."""
+    name: ClassVar[str] = "myheritage"
+    display_name: ClassVar[str] = "MyHeritage DNA"
+    file_extensions: ClassVar[list[str]] = [".csv"]
+    url: ClassVar[str] = "https://www.myheritage.com"
+    def can_parse(self, file_path: Path) -> bool:
+        """Recognize the file by ``MyHeritage`` in the first comment line."""
+        try:
+            with file_path.open("r", encoding="utf-8") as fh:
+                first_line = fh.readline()
+                return SIGNATURE in first_line
+        except (OSError, UnicodeDecodeError):
+            return False
+    def parse(self, file_path: Path) -> Iterator[Variant]:
+        """Stream Variant objects, skipping comments and malformed lines."""
+        with file_path.open("r", encoding="utf-8") as fh:
+            header_seen = False
+            for lineno, raw in enumerate(fh, start=1):
+                line = raw.rstrip("\r\n")
+                if not line or line.startswith("#"):
+                    continue
+                if not header_seen:
+                    if _is_header_line(line):
+                        header_seen = True
+                        continue
+                    logger.warning(
+                        "Line %d: expected %s header, got %r — skipping",
+                        lineno,
+                        HEADER_CANONICAL,
+                        line,
+                    )
+                    continue
+                parts = split_csv_line(line)
+                if len(parts) != EXPECTED_COLUMNS:
+                    logger.warning(
+                        "Line %d: expected %d columns, got %d — skipping",
+                        lineno,
+                        EXPECTED_COLUMNS,
+                        len(parts),
+                    )
+                    continue
+                rsid, chrom, pos_str, genotype = parts
+                try:
+                    position = int(pos_str)
+                except ValueError:
+                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
+                    continue
+                allele1, allele2 = split_genotype(genotype)
+                yield Variant(
+                    rsid=rsid,
+                    chromosome=chrom,
+                    position=position,
+                    allele1=allele1,
+                    allele2=allele2,
+                    build=DEFAULT_BUILD,
+                )
+    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
+        """Extract metadata. MyHeritage files have no sample ID or build field."""
+        return GenotypeMetadata(
+            format=self.name,
+            sample_id="",
+            build=DEFAULT_BUILD,
+        )

allelix/parsers/twentythreeandme.py ADDED Viewed

@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2026 dial481
+"""Parser for 23andMe raw genotype export files.
+Format reference (from real sample files and snps package):
+    # This data file generated by 23andMe at: Mon Jun 01 00:00:00 2020
+    #
+    # This file contains raw genotype data ...
+    #
+    # rsid  chromosome  position  genotype
+    rs4477212	1	82154	AA
+    rs3094315	1	752566	AG
+    i3000043	5	33951693	CT
+    rs9001001	1	100000	--
+Specifics:
+    - Comment lines start with `#`.
+    - Tab-delimited, 4 columns of data.
+    - Genotype is concatenated in one column (e.g., "AG" not "A<tab>G").
+    - Haploid calls on X/Y/MT appear as single characters (e.g., "A").
+    - No-calls represented as `--`.
+    - Contains I-prefixed internal probe IDs (i3000043, i5006212)
+      alongside rs-numbers. These are passed through as-is.
+    - Build 37 (older) or Build 38 (newer); check header comments.
+    - Detection key: canonical header ``# This data file generated by 23andMe``.
+      A bare "23andMe" substring in a comment is NOT sufficient — it would
+      false-positive on transcoded fixtures and third-party tool output.
+"""
+from __future__ import annotations
+import logging
+from typing import TYPE_CHECKING, ClassVar
+from allelix.models import DEFAULT_BUILD, NO_CALL_MARKER, Variant
+from allelix.parsers.base import GenotypeMetadata, GenotypeParser
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from pathlib import Path
+logger = logging.getLogger(__name__)  # module logger; used for skipped-line and bad-genotype warnings
+_CANONICAL_PREFIX = "# This data file generated by 23andMe"  # a comment line must start with this for can_parse() to accept the file
+SNIFF_LINE_LIMIT = 50  # maximum number of leading lines can_parse() reads while looking for the prefix
+EXPECTED_COLUMNS = 4  # tab-separated fields per data row: rsid, chromosome, position, genotype
+class TwentyThreeAndMeParser(GenotypeParser):
+    """Parser for 23andMe consumer DNA genotype files."""
+    name: ClassVar[str] = "23andme"
+    display_name: ClassVar[str] = "23andMe"
+    file_extensions: ClassVar[list[str]] = [".txt"]
+    url: ClassVar[str] = "https://www.23andme.com"
+    def can_parse(self, file_path: Path) -> bool:
+        """Recognize a real 23andMe export by its canonical first-line header.
+        Real exports start with ``# This data file generated by 23andMe at: ...``.
+        Some files have user-prepended comments before the canonical line; the
+        fallback loop scans up to ``SNIFF_LINE_LIMIT`` comment lines. A bare
+        "23andMe" mention without the canonical prefix is rejected.
+        """
+        try:
+            with file_path.open("r", encoding="utf-8") as fh:
+                for _ in range(SNIFF_LINE_LIMIT):
+                    line = fh.readline()
+                    if not line:
+                        return False
+                    if not line.startswith("#"):
+                        return False
+                    if line.startswith(_CANONICAL_PREFIX):
+                        return True
+        except (OSError, UnicodeDecodeError):
+            return False
+        return False
+    def parse(self, file_path: Path) -> Iterator[Variant]:
+        """Stream Variant objects, skipping comments and malformed lines."""
+        with file_path.open("r", encoding="utf-8") as fh:
+            for lineno, raw in enumerate(fh, start=1):
+                line = raw.rstrip("\r\n")
+                if not line or line.startswith("#"):
+                    continue
+                parts = line.split("\t")
+                if len(parts) != EXPECTED_COLUMNS:
+                    logger.warning(
+                        "Line %d: expected %d columns, got %d — skipping",
+                        lineno,
+                        EXPECTED_COLUMNS,
+                        len(parts),
+                    )
+                    continue
+                rsid, chrom, pos_str, genotype = parts
+                try:
+                    position = int(pos_str)
+                except ValueError:
+                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
+                    continue
+                allele1, allele2 = _split_genotype(genotype)
+                yield Variant(
+                    rsid=rsid,
+                    chromosome=chrom,
+                    position=position,
+                    allele1=allele1,
+                    allele2=allele2,
+                    build=DEFAULT_BUILD,
+                )
+    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
+        """Extract build from header comments. 23andMe files have no sample ID field."""
+        build = DEFAULT_BUILD
+        with file_path.open("r", encoding="utf-8") as fh:
+            for raw in fh:
+                line = raw.rstrip("\r\n")
+                if not line.startswith("#"):
+                    break
+                lowered = line.lower()
+                if "build 36" in lowered or "hg18" in lowered:
+                    build = "GRCh36"
+                elif "build 37" in lowered or "grch37" in lowered:
+                    build = "GRCh37"
+                elif "build 38" in lowered or "grch38" in lowered:
+                    build = "GRCh38"
+        return GenotypeMetadata(
+            format=self.name,
+            sample_id="",
+            build=build,
+        )
+def _split_genotype(genotype: str) -> tuple[str, str]:
+    """Split a concatenated genotype field into two alleles.
+    "AG" → ("A", "G"), "--" → ("-", "-"), "A" → ("A", "A") (haploid).
+    """
+    if genotype == "--":
+        return NO_CALL_MARKER, NO_CALL_MARKER
+    if len(genotype) == 2:
+        return genotype[0], genotype[1]
+    if len(genotype) == 1:
+        return genotype, genotype
+    logger.warning("Unexpected genotype format %r — treating as no-call", genotype)
+    return NO_CALL_MARKER, NO_CALL_MARKER

allelix/py.typed ADDED Viewed

File without changes

allelix/reports/__init__.py ADDED Viewed

@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# Copyright (C) 2026 dial481
+"""Report rendering: terminal, JSON, and HTML."""
+from __future__ import annotations
+import contextlib
+import os
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from pathlib import Path
+# Single source of truth for the project's compliance / regulatory contract.
+# Surfaced verbatim in JSON `regulatory_notice` and the HTML banner. See ADR-0003.
+REGULATORY_NOTICE = (
+    "This report is informational research output. It surfaces classifications "
+    "made by external databases (ClinVar, PharmGKB, …) for variants present in "
+    "the input genotype file. It is not medical advice and not a diagnosis. "
+    "Every classification is attributed to its source database; Allelix does "
+    "not independently classify variants."
+)  # adjacent string literals are joined into one single-line str
+def atomic_write_text(path: Path, content: str, encoding: str = "utf-8") -> None:
+    """Write `content` to `path` via a `.tmp` sibling + `os.replace`.
+    Mirrors `download()` / `load_clinvar_vcf` atomicity: a killed process
+    mid-write leaves either the previous file or no file at the target,
+    never a half-written one.
+    """
+    tmp = path.with_name(path.name + ".tmp")
+    try:
+        tmp.write_text(content, encoding=encoding)
+        os.replace(tmp, path)
+    except Exception:
+        if tmp.exists():
+            with contextlib.suppress(OSError):
+                tmp.unlink()
+        raise