allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,118 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser for MyHeritage DNA raw genotype export files.
4
+
5
+ Format reference (from real sample files and snps package):
6
+
7
+ # MyHeritage, https://www.myheritage.com
8
+ RSID,CHROMOSOME,POSITION,RESULT
9
+ "rs4477212","1","82154","AA"
10
+ "rs3094315","1","752566","AG"
11
+ "rs9001001","1","100000","--"
12
+
13
+ Specifics:
14
+ - CSV format, comma-delimited. Structurally identical to FTDNA.
15
+ - Detection key: ``MyHeritage`` in the first comment line.
16
+ - Header line: ``RSID,CHROMOSOME,POSITION,RESULT`` (quoted or unquoted).
17
+ - Data fields are double-quoted; some exports double-double-quote
18
+ fields (``""rs1""``). ``split_csv_line`` handles both.
19
+ - RESULT column is concatenated genotype (e.g., "AG" not "A","G").
20
+ - Haploid calls on MT/Y appear as single characters (e.g., "A").
21
+ - No-calls represented as ``--``.
22
+ - Build: not declared in file; position-based detection required.
23
+ Defaults to GRCh37.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import logging
29
+ from typing import TYPE_CHECKING, ClassVar
30
+
31
+ from allelix.models import DEFAULT_BUILD, Variant
32
+ from allelix.parsers._helpers import split_csv_line, split_genotype
33
+ from allelix.parsers.base import GenotypeMetadata, GenotypeParser
34
+ from allelix.parsers.ftdna import HEADER_CANONICAL, _is_header_line
35
+
36
+ if TYPE_CHECKING:
37
+ from collections.abc import Iterator
38
+ from pathlib import Path
39
+
40
logger = logging.getLogger(__name__)

# Substring that ``MyHeritageParser.can_parse`` looks for in the file's first line.
SIGNATURE = "MyHeritage"
# Not referenced by any code in this module.
SNIFF_LINE_LIMIT = 50
# Field count a data row must have (RSID, CHROMOSOME, POSITION, RESULT);
# ``MyHeritageParser.parse`` skips rows with any other count.
EXPECTED_COLUMNS = 4
45
+
46
+
47
class MyHeritageParser(GenotypeParser):
    """Parser for MyHeritage DNA consumer genotype files.

    Every file handle is opened with the ``utf-8-sig`` codec, which decodes
    exactly like ``utf-8`` but drops a leading byte-order mark. Without that,
    a BOM glued to the first ``#`` comment makes ``parse`` mistake the comment
    for a malformed header and log a spurious warning.
    """

    name: ClassVar[str] = "myheritage"
    display_name: ClassVar[str] = "MyHeritage DNA"
    file_extensions: ClassVar[list[str]] = [".csv"]
    url: ClassVar[str] = "https://www.myheritage.com"

    def can_parse(self, file_path: Path) -> bool:
        """Return True when the file's first line contains ``SIGNATURE``.

        Only the first line is read. A file that cannot be opened or decoded
        is reported as not parseable rather than raising.
        """
        try:
            with file_path.open("r", encoding="utf-8-sig") as fh:
                first_line = fh.readline()
        except (OSError, UnicodeDecodeError):
            return False
        return SIGNATURE in first_line

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and lines starting with ``#`` are ignored. Nothing is
        yielded until a header line has been seen; non-comment lines before
        it are logged and dropped. Rows with the wrong column count or a
        non-integer position are logged and skipped. Every variant is tagged
        with ``DEFAULT_BUILD``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                if not header_seen:
                    if _is_header_line(line):
                        header_seen = True
                    else:
                        logger.warning(
                            "Line %d: expected %s header, got %r — skipping",
                            lineno,
                            HEADER_CANONICAL,
                            line,
                        )
                    # Header or not, this line carries no genotype data.
                    continue

                parts = split_csv_line(line)
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, genotype = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                allele1, allele2 = split_genotype(genotype)

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Extract metadata. MyHeritage files have no sample ID or build field.

        The file is not read: the result always carries this parser's
        ``name``, an empty sample ID, and ``DEFAULT_BUILD``.
        """
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=DEFAULT_BUILD,
        )
@@ -0,0 +1,150 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser for 23andMe raw genotype export files.
4
+
5
+ Format reference (from real sample files and snps package):
6
+
7
+ # This data file generated by 23andMe at: Mon Jun 01 00:00:00 2020
8
+ #
9
+ # This file contains raw genotype data ...
10
+ #
11
+ # rsid chromosome position genotype
12
+ rs4477212 1 82154 AA
13
+ rs3094315 1 752566 AG
14
+ i3000043 5 33951693 CT
15
+ rs9001001 1 100000 --
16
+
17
+ Specifics:
18
+ - Comment lines start with `#`.
19
+ - Tab-delimited, 4 columns of data.
20
+ - Genotype is concatenated in one column (e.g., "AG" not "A<tab>G").
21
+ - Haploid calls on X/Y/MT appear as single characters (e.g., "A").
22
+ - No-calls represented as `--`.
23
+ - Contains i-prefixed internal probe IDs (i3000043, i5006212)
24
+ alongside rs-numbers. These are passed through as-is.
25
+ - Build 37 (older) or Build 38 (newer); check header comments.
26
+ - Detection key: canonical header ``# This data file generated by 23andMe``.
27
+ A bare "23andMe" substring in a comment is NOT sufficient — it would
28
+ false-positive on transcoded fixtures and third-party tool output.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import logging
34
+ from typing import TYPE_CHECKING, ClassVar
35
+
36
+ from allelix.models import DEFAULT_BUILD, NO_CALL_MARKER, Variant
37
+ from allelix.parsers.base import GenotypeMetadata, GenotypeParser
38
+
39
+ if TYPE_CHECKING:
40
+ from collections.abc import Iterator
41
+ from pathlib import Path
42
+
43
logger = logging.getLogger(__name__)

# Exact start of the header line that identifies a 23andMe export;
# ``TwentyThreeAndMeParser.can_parse`` matches it with ``str.startswith``.
_CANONICAL_PREFIX = "# This data file generated by 23andMe"
# Maximum number of leading lines ``can_parse`` reads while looking for the prefix.
SNIFF_LINE_LIMIT = 50
# Number of tab-separated fields a data row must have; ``parse`` skips other rows.
EXPECTED_COLUMNS = 4
48
+
49
+
50
class TwentyThreeAndMeParser(GenotypeParser):
    """Parser for 23andMe consumer DNA genotype files.

    Every file handle is opened with the ``utf-8-sig`` codec, which decodes
    exactly like ``utf-8`` but drops a leading byte-order mark. Without that,
    a BOM in front of the first ``#`` makes ``can_parse`` reject an otherwise
    genuine export.
    """

    name: ClassVar[str] = "23andme"
    display_name: ClassVar[str] = "23andMe"
    file_extensions: ClassVar[list[str]] = [".txt"]
    url: ClassVar[str] = "https://www.23andme.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize a real 23andMe export by its canonical header line.

        Real exports start with ``# This data file generated by 23andMe at: ...``.
        Some files have user-prepended comments before the canonical line, so
        up to ``SNIFF_LINE_LIMIT`` leading comment lines are scanned. A bare
        "23andMe" mention without the canonical prefix is rejected. A file that
        cannot be opened or decoded is reported as not parseable.
        """
        try:
            with file_path.open("r", encoding="utf-8-sig") as fh:
                for _ in range(SNIFF_LINE_LIMIT):
                    line = fh.readline()
                    # An empty string (EOF) also fails this check, so running
                    # out of lines and leaving the comment block both end the scan.
                    if not line.startswith("#"):
                        return False
                    if line.startswith(_CANONICAL_PREFIX):
                        return True
        except (OSError, UnicodeDecodeError):
            return False
        return False

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and lines starting with ``#`` are ignored. Rows that do not
        split into ``EXPECTED_COLUMNS`` tab-separated fields, or whose position
        is not an integer, are logged and skipped. Every variant is tagged with
        ``DEFAULT_BUILD``; the build named in the header comments is reported
        only by ``get_metadata``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, genotype = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                allele1, allele2 = _split_genotype(genotype)

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Extract build from header comments. 23andMe files have no sample ID field.

        Only the leading run of ``#`` lines is read. Each line is matched
        case-insensitively against the markers below; the first marker found on
        a line decides that line, and a later line overrides an earlier one.
        ``DEFAULT_BUILD`` is returned when no line names a build.
        """
        # Ordered so that the original markers keep their precedence; the UCSC
        # aliases hg19/hg38 are consulted only when none of those match.
        markers = (
            ("build 36", "GRCh36"),
            ("hg18", "GRCh36"),
            ("build 37", "GRCh37"),
            ("grch37", "GRCh37"),
            ("build 38", "GRCh38"),
            ("grch38", "GRCh38"),
            ("hg19", "GRCh37"),
            ("hg38", "GRCh38"),
        )
        build = DEFAULT_BUILD
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for raw in fh:
                line = raw.rstrip("\r\n")
                if not line.startswith("#"):
                    break
                lowered = line.lower()
                for needle, label in markers:
                    if needle in lowered:
                        build = label
                        break
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=build,
        )
136
+
137
+
138
def _split_genotype(genotype: str) -> tuple[str, str]:
    """Turn one concatenated genotype field into an ``(allele1, allele2)`` pair.

    A two-character call is split into its characters, a one-character
    (haploid) call is duplicated, and ``--`` becomes a pair of
    ``NO_CALL_MARKER``. Any other length is logged and also returned as a
    no-call pair.
    """
    if genotype != "--":
        width = len(genotype)
        if width == 2:
            first, second = genotype
            return first, second
        if width == 1:
            return genotype, genotype
        # Neither diploid, haploid, nor the explicit no-call token.
        logger.warning("Unexpected genotype format %r — treating as no-call", genotype)
    return NO_CALL_MARKER, NO_CALL_MARKER
allelix/py.typed ADDED
File without changes
@@ -0,0 +1,40 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Report rendering: terminal, JSON, and HTML."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import contextlib
8
+ import os
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from pathlib import Path
13
+
14
# Single source of truth for the project's compliance / regulatory contract.
# Surfaced verbatim in JSON `regulatory_notice` and the HTML banner. See ADR-0003.
# The adjacent string literals below concatenate into one paragraph; every
# fragment except the last ends with a space so the sentences stay separated.
REGULATORY_NOTICE = (
    "This report is informational research output. It surfaces classifications "
    "made by external databases (ClinVar, PharmGKB, …) for variants present in "
    "the input genotype file. It is not medical advice and not a diagnosis. "
    "Every classification is attributed to its source database; Allelix does "
    "not independently classify variants."
)
23
+
24
+
25
def atomic_write_text(path: Path, content: str, encoding: str = "utf-8") -> None:
    """Write `content` to `path` via a `.tmp` sibling + `os.replace`.

    Mirrors `download()` / `load_clinvar_vcf` atomicity: a killed process
    mid-write leaves either the previous file or no file at the target,
    never a half-written one.

    The temporary file is always `<path.name>.tmp` in the same directory, so
    an existing sibling with that name is overwritten. On any failure,
    including `KeyboardInterrupt` and `SystemExit`, the temporary file is
    removed on a best-effort basis and the original exception is re-raised.
    """
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_text(content, encoding=encoding)
        os.replace(tmp, path)
    except BaseException:
        # BaseException rather than Exception so that Ctrl-C during the write
        # does not leave a stale .tmp behind. OSError is suppressed because the
        # file may never have been created, and a cleanup failure must not
        # mask the error being re-raised.
        with contextlib.suppress(OSError):
            tmp.unlink()
        raise