allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,41 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Shared helpers for parsers with CSV or concatenated-genotype formats.
4
+
5
+ Used by FTDNA, MyHeritage, and Living DNA parsers. Extracted here to avoid
6
+ duplicating the genotype-splitting and CSV-line-splitting logic across
7
+ structurally similar formats.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+
14
+ from allelix.models import NO_CALL_MARKER
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def split_csv_line(line: str) -> list[str]:
    """Split a comma-delimited line and strip quoting from each field.

    Every field is trimmed of surrounding whitespace and then of any run of
    leading/trailing quote characters, single or double. All of the
    following therefore produce ``rs123``: ``rs123``, ``"rs123"``,
    ``'rs123'`` and ``""rs123""`` (the MyHeritage "extra quotes" variant).

    The split is a plain ``str.split(",")``: a comma inside a quoted field
    is still treated as a delimiter.

    Args:
        line: One raw line of text; a trailing newline is tolerated because
            each field is whitespace-stripped.

    Returns:
        The unquoted fields in order. An empty line yields ``[""]``.
    """
    # Strip both quote styles so data rows are treated the same way as the
    # header check, which already ignores single and double quotes.
    return [field.strip().strip("\"'") for field in line.split(",")]
26
+
27
+
28
def split_genotype(genotype: str) -> tuple[str, str]:
    """Turn a concatenated genotype string into an ``(allele1, allele2)`` pair.

    * Two characters are split positionally: ``"AG"`` -> ``("A", "G")``.
    * One character is duplicated (haploid MT/Y call): ``"A"`` -> ``("A", "A")``.
    * ``"--"`` is the explicit no-call and maps to a pair of
      ``NO_CALL_MARKER`` without logging.
    * Any other length is logged as unexpected and also reported as a
      no-call pair.
    """
    size = len(genotype)
    if size == 1:
        return genotype, genotype
    if size == 2 and genotype != "--":
        first, second = genotype
        return first, second
    if genotype != "--":
        # Neither a known no-call nor a 1- or 2-character call.
        logger.warning("Unexpected genotype format %r — treating as no-call", genotype)
    return NO_CALL_MARKER, NO_CALL_MARKER
@@ -0,0 +1,130 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser for AncestryDNA raw genotype export files.
4
+
5
+ Format reference (from real sample files and snps package):
6
+
7
+ #AncestryDNA raw data download
8
+ #This file was generated by AncestryDNA ...
9
+ #Data was collected using AncestryDNA array version: V2.0
10
+ #
11
+ rsid chromosome position allele1 allele2
12
+ rs4477212 1 82154 A C
13
+ rs9001001 1 100000 0 0
14
+
15
+ Specifics:
16
+ - Comment lines start with ``#``.
17
+ - Tab-delimited, 5 columns of data (alleles in separate columns).
18
+ - Detection key: ``#AncestryDNA`` in the first comment line.
19
+ - Chromosome codes: 23 = X, 24 = Y, 25 = PAR (pseudo-autosomal,
20
+ mapped to X), 26 = MT.
21
+ - No-calls represented as ``0`` for each allele.
22
+ - Build 37 (GRCh37).
23
+ - V1 chip (pre-May 2016, ~682K SNPs) and V2 chip (May 2016+,
24
+ ~664K SNPs) share the same column layout.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ from typing import TYPE_CHECKING, ClassVar
31
+
32
+ from allelix.models import DEFAULT_BUILD, NO_CALL_MARKER, Variant
33
+ from allelix.parsers.base import GenotypeMetadata, GenotypeParser
34
+
35
+ if TYPE_CHECKING:
36
+ from collections.abc import Iterator
37
+ from pathlib import Path
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ SIGNATURE = "#AncestryDNA"
42
+ EXPECTED_HEADER = "rsid\tchromosome\tposition\tallele1\tallele2"
43
+ EXPECTED_COLUMNS = 5
44
+ SNIFF_LINE_LIMIT = 50
45
+
46
+ _CHROM_MAP: dict[str, str] = {
47
+ "23": "X",
48
+ "24": "Y",
49
+ "25": "X",
50
+ "26": "MT",
51
+ }
52
+
53
+
54
class AncestryDNAParser(GenotypeParser):
    """Parser for AncestryDNA consumer DNA genotype files.

    Files are opened as ``utf-8-sig`` so that an optional UTF-8 byte-order
    mark is dropped on read. With plain ``utf-8`` a BOM stays attached to
    the first line, which then no longer starts with ``#`` and defeats both
    signature detection and comment skipping.
    """

    name: ClassVar[str] = "ancestrydna"
    display_name: ClassVar[str] = "AncestryDNA"
    file_extensions: ClassVar[list[str]] = [".txt"]
    url: ClassVar[str] = "https://www.ancestry.com/dna"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by its ``#AncestryDNA`` signature on the first line.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if the first line starts with the signature; False if it
            does not, or if the file cannot be opened or decoded.
        """
        try:
            # utf-8-sig skips a leading BOM and is otherwise identical to utf-8.
            with file_path.open("r", encoding="utf-8-sig") as fh:
                first_line = fh.readline()
        except (OSError, UnicodeDecodeError):
            return False
        return first_line.startswith(SIGNATURE)

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and ``#`` comments are ignored. Until a line equal to
        ``EXPECTED_HEADER`` has been seen, every other line is logged and
        dropped. After the header, a row is dropped with a warning when it
        does not have exactly ``EXPECTED_COLUMNS`` tab-separated fields or
        its position is not an integer.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row. Chromosome codes found in
            ``_CHROM_MAP`` are translated and ``"0"`` alleles become
            ``NO_CALL_MARKER``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                if not header_seen:
                    if line == EXPECTED_HEADER:
                        header_seen = True
                    else:
                        logger.warning(
                            "Line %d: expected header %r, got %r — skipping",
                            lineno,
                            EXPECTED_HEADER,
                            line,
                        )
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom_raw, pos_str, allele1, allele2 = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                # Numeric codes without a mapping (autosomes) pass through unchanged.
                chrom = _CHROM_MAP.get(chrom_raw, chrom_raw)

                # AncestryDNA writes "0" per allele for a no-call.
                if allele1 == "0":
                    allele1 = NO_CALL_MARKER
                if allele2 == "0":
                    allele2 = NO_CALL_MARKER

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Return fixed metadata; the file itself is not read.

        AncestryDNA has no sample ID field, so ``sample_id`` is always ``""``
        and ``build`` is always ``DEFAULT_BUILD``.

        Args:
            file_path: Path to the genotype file (unused).

        Returns:
            A `GenotypeMetadata` dict for this parser.
        """
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=DEFAULT_BUILD,
        )
@@ -0,0 +1,97 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Abstract base class for genotype file parsers."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import TYPE_CHECKING, ClassVar, TypedDict
9
+
10
+ if TYPE_CHECKING:
11
+ from collections.abc import Iterator
12
+ from pathlib import Path
13
+
14
+ from allelix.models import Variant
15
+
16
+
17
class GenotypeMetadata(TypedDict):
    """File-level metadata returned by `GenotypeParser.get_metadata`.

    Only values that can be derived from a file's header belong here. A SNP
    count is deliberately absent: rows that look like data during a header
    scan may still fail validation, so `parse()` is the only reliable
    source. Callers needing a count should use
    `sum(1 for _ in parser.parse(file_path))`.

    Keys:
        format: Parser name (matches `GenotypeParser.name`).
        sample_id: Vendor sample identifier, or "" if not present in the file.
        build: Reference genome build (e.g., "GRCh37").
    """

    # Registry name of the parser that produced this metadata.
    format: str
    # Empty string when the vendor format carries no sample identifier.
    sample_id: str
    # Reference genome build label.
    build: str
34
+
35
+
36
class GenotypeParser(ABC):
    """Abstract interface implemented by every genotype file parser.

    A concrete parser declares its descriptive metadata as class attributes
    and implements `can_parse`, `parse` and `get_metadata`. Parsers hold no
    per-file state, so one instance may be used on any number of files.

    Attributes:
        name: Lowercase identifier used by the registry and CLI (e.g., "myhappygenes").
        display_name: Human-readable name for reports ("MyHappyGenes (Tempus)").
        file_extensions: Common file extensions (e.g., [".txt"]). Informational only;
            auto-detection uses `can_parse`, not extension matching.
        url: Vendor URL.
    """

    name: ClassVar[str]
    display_name: ClassVar[str]
    file_extensions: ClassVar[list[str]]
    url: ClassVar[str]

    @abstractmethod
    def can_parse(self, file_path: Path) -> bool:
        """Decide, from a quick look at the file, whether this parser applies.

        Implementations must stay cheap: inspect header or structural lines
        only, never the whole file. The auto-detection registry calls this.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if this parser recognizes the format.
        """
        ...

    @abstractmethod
    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream normalized Variant objects from the file.

        Variants are produced one at a time; the file is never loaded whole.
        A malformed line is logged as a warning and skipped rather than
        aborting the parse.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per data row in the file.
        """
        ...

    @abstractmethod
    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Return metadata derivable from the header alone (no full parse).

        Args:
            file_path: Path to the genotype file.

        Returns:
            A `GenotypeMetadata` dict.
        """
        ...
@@ -0,0 +1,129 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser for Family Tree DNA (FTDNA) raw genotype export files.
4
+
5
+ Format reference (from real sample files and snps package):
6
+
7
+ # FTDNA raw data download
8
+ RSID,CHROMOSOME,POSITION,RESULT
9
+ "rs4477212","1","82154","AA"
10
+ "rs3094315","1","752566","AG"
11
+ "rs9001001","1","100000","--"
12
+
13
+ Specifics:
14
+ - CSV format, comma-delimited.
15
+ - Optional comment lines starting with ``#``.
16
+ - Header line: ``RSID,CHROMOSOME,POSITION,RESULT`` (quoted or unquoted).
17
+ - Data fields are double-quoted; parser strips quotes.
18
+ - RESULT column is concatenated genotype (e.g., "AG" not "A","G").
19
+ - Haploid calls on MT/Y appear as single characters (e.g., "A").
20
+ - No-calls represented as ``--``.
21
+ - Build 37 (most files).
22
+ - Detection key: header line matching ``RSID,CHROMOSOME,POSITION,RESULT``
23
+ (case-insensitive, with or without quotes) within the first 50 lines.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import logging
29
+ from typing import TYPE_CHECKING, ClassVar
30
+
31
+ from allelix.models import DEFAULT_BUILD, Variant
32
+ from allelix.parsers._helpers import split_csv_line, split_genotype
33
+ from allelix.parsers.base import GenotypeMetadata, GenotypeParser
34
+
35
+ if TYPE_CHECKING:
36
+ from collections.abc import Iterator
37
+ from pathlib import Path
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ SNIFF_LINE_LIMIT = 50
42
+ EXPECTED_COLUMNS = 4
43
+ HEADER_CANONICAL = "RSID,CHROMOSOME,POSITION,RESULT"
44
+
45
+
46
def _is_header_line(line: str) -> bool:
    """Report whether *line* is the FTDNA column header row.

    All single and double quote characters are removed from the line,
    surrounding whitespace is trimmed, and the upper-cased remainder is
    compared with ``HEADER_CANONICAL``.
    """
    unquoted = "".join(ch for ch in line if ch not in "\"'")
    return unquoted.strip().upper() == HEADER_CANONICAL
50
+
51
+
52
class FTDNAParser(GenotypeParser):
    """Parser for Family Tree DNA consumer DNA genotype files.

    Files are opened as ``utf-8-sig`` so that an optional UTF-8 byte-order
    mark is dropped on read. With plain ``utf-8`` the BOM would stay
    attached to the first line and make the header comparison fail.
    """

    name: ClassVar[str] = "ftdna"
    display_name: ClassVar[str] = "Family Tree DNA"
    file_extensions: ClassVar[list[str]] = [".csv"]
    url: ClassVar[str] = "https://www.familytreedna.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header.

        At most ``SNIFF_LINE_LIMIT`` lines are read. The first line that is
        neither blank nor a ``#`` comment decides the result.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if that deciding line is the FTDNA header. False if it is
            not, if no such line appears within the limit, or if the file
            cannot be opened or decoded.
        """
        try:
            # utf-8-sig skips a leading BOM and is otherwise identical to utf-8.
            with file_path.open("r", encoding="utf-8-sig") as fh:
                for _ in range(SNIFF_LINE_LIMIT):
                    raw = fh.readline()
                    if not raw:
                        # End of file before any candidate line.
                        return False
                    line = raw.rstrip("\r\n")
                    if not line or line.startswith("#"):
                        continue
                    return _is_header_line(line)
        except (OSError, UnicodeDecodeError):
            return False
        return False

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and ``#`` comments are ignored. Until the header row has
        been seen, every other line is logged and dropped. After the header,
        a row is dropped with a warning when it does not split into exactly
        ``EXPECTED_COLUMNS`` fields or its position is not an integer.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row, with the concatenated RESULT
            column split into two alleles by ``split_genotype``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                if not header_seen:
                    if _is_header_line(line):
                        header_seen = True
                    else:
                        logger.warning(
                            "Line %d: expected FTDNA header, got %r — skipping",
                            lineno,
                            line,
                        )
                    continue

                parts = split_csv_line(line)
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, genotype = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                allele1, allele2 = split_genotype(genotype)

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Return fixed metadata; the file itself is not read.

        FTDNA files have no sample ID field, so ``sample_id`` is always
        ``""`` and ``build`` is always ``DEFAULT_BUILD``.

        Args:
            file_path: Path to the genotype file (unused).

        Returns:
            A `GenotypeMetadata` dict for this parser.
        """
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=DEFAULT_BUILD,
        )
@@ -0,0 +1,121 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser for Living DNA raw genotype export files.
4
+
5
+ Format reference (from snps package and H600 Project wiki):
6
+
7
+ # Living DNA customer genotype data download file version: 1.0.1
8
+ # This file contains raw genotype data ...
9
+ # Human Genome Reference Build 37 (GRCh37.p13).
10
+ # Genotypes are presented on the forward strand.
11
+ #
12
+ # rsid chromosome position genotype
13
+ rs1801133 1 11856378 AG
14
+ AX-12345678 3 15000000 GG
15
+ 1:726912 1 726912 AA
16
+
17
+ Specifics:
18
+ - Tab-delimited despite ``.csv`` file extension.
19
+ - Detection key: ``Living DNA`` in the first line.
20
+ - Comment lines start with ``#``, including the column header line.
21
+ - Four columns: ``rsid``, ``chromosome``, ``position``, ``genotype``.
22
+ - Concatenated genotype in result column (e.g., "AA", "CT").
23
+ - No-calls represented as ``--``.
24
+ - Build 37 (GRCh37.p13), forward strand.
25
+ - SNP ID types: rs-numbers, ``AX-`` prefixed (Affymetrix),
26
+ ``AFFX-`` prefixed (Affymetrix control probes), and positional
27
+ notation (``CHR:POS``, e.g., ``1:726912``).
28
+ - Y and MT chromosomes delivered as separate files; main file
29
+ has chromosomes 1-22 and X.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import logging
35
+ from typing import TYPE_CHECKING, ClassVar
36
+
37
+ from allelix.models import DEFAULT_BUILD, Variant
38
+ from allelix.parsers._helpers import split_genotype
39
+ from allelix.parsers.base import GenotypeMetadata, GenotypeParser
40
+ from allelix.utils.build_detect import normalize_build_label
41
+
42
+ if TYPE_CHECKING:
43
+ from collections.abc import Iterator
44
+ from pathlib import Path
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+ SIGNATURE = "Living DNA"
49
+ SNIFF_LINE_LIMIT = 50
50
+ EXPECTED_COLUMNS = 4
51
+
52
+
53
class LivingDNAParser(GenotypeParser):
    """Parser for Living DNA consumer genotype files.

    Files are opened as ``utf-8-sig`` so that an optional UTF-8 byte-order
    mark is dropped on read. With plain ``utf-8`` the BOM would stay
    attached to the first line, which then no longer starts with ``#``:
    `parse` would log it as a malformed row and `get_metadata` would stop
    before reading any header comment.
    """

    name: ClassVar[str] = "livingdna"
    display_name: ClassVar[str] = "Living DNA"
    file_extensions: ClassVar[list[str]] = [".csv"]
    url: ClassVar[str] = "https://livingdna.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by ``Living DNA`` in the first line.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if the first line contains the signature; False if it does
            not, or if the file cannot be opened or decoded.
        """
        try:
            # utf-8-sig skips a leading BOM and is otherwise identical to utf-8.
            with file_path.open("r", encoding="utf-8-sig") as fh:
                first_line = fh.readline()
        except (OSError, UnicodeDecodeError):
            return False
        return SIGNATURE in first_line

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and ``#`` lines (which include the column header in this
        format) are ignored. A row is dropped with a warning when it does
        not have exactly ``EXPECTED_COLUMNS`` tab-separated fields or its
        position is not an integer.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row, tagged with ``DEFAULT_BUILD``;
            the build named in the header is reported by `get_metadata`.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, genotype = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                allele1, allele2 = split_genotype(genotype)

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Extract build from header comments. Living DNA has no sample ID field.

        Only the leading run of ``#`` lines is read; scanning stops at the
        first line that does not start with ``#``. Each comment line is
        passed to ``normalize_build_label`` and the last truthy result wins.
        If none is found the build stays ``DEFAULT_BUILD``.

        Args:
            file_path: Path to the genotype file.

        Returns:
            A `GenotypeMetadata` dict with an empty ``sample_id``.
        """
        build = DEFAULT_BUILD
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for raw in fh:
                line = raw.rstrip("\r\n")
                if not line.startswith("#"):
                    break
                normalized = normalize_build_label(line)
                if normalized:
                    build = normalized
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=build,
        )
@@ -0,0 +1,135 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser for MyHappyGenes (Tempus) genotype export files.
4
+
5
+ Format reference (from sample file):
6
+
7
+ # MyHappyGenes [TEMPUS]
8
+ # This file was generated by MyHappyGenes, Inc.
9
+ # ... (additional comment lines)
10
+ # Sample ID MHG000001
11
+ SNP Name Chr Position Allele1 - Forward Allele2 - Forward
12
+ rs9651229 1 632287 C C
13
+ rs9701872 1 632828 T T
14
+
15
+ Specifics:
16
+ - Comment lines start with `#`.
17
+ - Tab-delimited, 5 columns of data.
18
+ - Header claims build 37.1, but real-world exports we've verified
19
+ contain GRCh38 positions. The analyze pipeline runs auto-
20
+ detection (ADR-0021) and overrides the parser-reported build
21
+ based on position data. The build value returned here reflects
22
+ the literal header claim; downstream code should not assume it.
23
+ Forward strand with respect to the reference.
24
+ - No-calls represented as `-`.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ from typing import TYPE_CHECKING, ClassVar
31
+
32
+ from allelix.models import DEFAULT_BUILD, Variant
33
+ from allelix.parsers.base import GenotypeMetadata, GenotypeParser
34
+
35
+ if TYPE_CHECKING:
36
+ from collections.abc import Iterator
37
+ from pathlib import Path
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ SIGNATURE = "# MyHappyGenes"
42
+ SAMPLE_ID_PREFIX = "# Sample ID"
43
+ EXPECTED_HEADER = "SNP Name\tChr\tPosition\tAllele1 - Forward\tAllele2 - Forward"
44
+ EXPECTED_COLUMNS = 5
45
+ SNIFF_LINE_LIMIT = 50
46
+
47
+
48
class MyHappyGenesParser(GenotypeParser):
    """Parser for MyHappyGenes/Tempus consumer DNA genotype files.

    Files are opened as ``utf-8-sig`` so that an optional UTF-8 byte-order
    mark is dropped on read. With plain ``utf-8`` the BOM would stay
    attached to the first line, which then no longer starts with ``#``:
    `parse` would log it as a failed header candidate and `get_metadata`
    would stop before reaching the sample ID line.
    """

    name: ClassVar[str] = "myhappygenes"
    display_name: ClassVar[str] = "MyHappyGenes (Tempus)"
    file_extensions: ClassVar[list[str]] = [".txt"]
    url: ClassVar[str] = "https://myhappygenes.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by its `# MyHappyGenes` signature line.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if any of the first ``SNIFF_LINE_LIMIT`` lines contains the
            signature; False otherwise, or if the file cannot be opened or
            decoded.
        """
        try:
            # utf-8-sig skips a leading BOM and is otherwise identical to utf-8.
            with file_path.open("r", encoding="utf-8-sig") as fh:
                for _ in range(SNIFF_LINE_LIMIT):
                    line = fh.readline()
                    if not line:
                        # End of file before the signature appeared.
                        return False
                    if SIGNATURE in line:
                        return True
        except (OSError, UnicodeDecodeError):
            return False
        return False

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and ``#`` comments are ignored. Until a line equal to
        ``EXPECTED_HEADER`` has been seen, every other line is logged and
        dropped. After the header, a row is dropped with a warning when it
        does not have exactly ``EXPECTED_COLUMNS`` tab-separated fields or
        its position is not an integer.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row, with alleles passed through
            unchanged and tagged with ``DEFAULT_BUILD``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                if not header_seen:
                    if line == EXPECTED_HEADER:
                        header_seen = True
                    else:
                        logger.warning(
                            "Line %d: expected header %r, got %r — skipping",
                            lineno,
                            EXPECTED_HEADER,
                            line,
                        )
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, allele1, allele2 = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Extract sample ID from the file header. Cheap — no full parse.

        Blank lines are skipped. Scanning stops at the first line starting
        with ``SAMPLE_ID_PREFIX`` or at the first non-``#`` line, whichever
        comes first. The sample ID is the text after the first tab on the
        prefix line; if that line has no tab, ``sample_id`` stays ``""``.

        Args:
            file_path: Path to the genotype file.

        Returns:
            A `GenotypeMetadata` dict with ``build`` set to ``DEFAULT_BUILD``.
        """
        sample_id = ""
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for raw in fh:
                line = raw.rstrip("\r\n")
                if not line:
                    continue
                if line.startswith(SAMPLE_ID_PREFIX):
                    _, tab, value = line.partition("\t")
                    if tab:
                        sample_id = value.strip()
                    break
                if not line.startswith("#"):
                    break
        return GenotypeMetadata(
            format=self.name,
            sample_id=sample_id,
            build=DEFAULT_BUILD,
        )