allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Shared helpers for parsers with CSV or concatenated-genotype formats.
|
|
4
|
+
|
|
5
|
+
Used by FTDNA, MyHeritage, and Living DNA parsers. Extracted here to avoid
|
|
6
|
+
duplicating the genotype-splitting and CSV-line-splitting logic across
|
|
7
|
+
structurally similar formats.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
from allelix.models import NO_CALL_MARKER
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def split_csv_line(line: str) -> list[str]:
    """Break a comma-separated line into cleaned field values.

    The line is cut at every comma; quoting is not interpreted, so a comma
    inside a quoted field still splits it. Each piece then has its
    surrounding whitespace removed, followed by every leading and trailing
    ``"`` character. That covers plain fields, ``"value"`` fields, and the
    MyHeritage ``""value""`` "extra quotes" variant.
    """
    fields: list[str] = []
    for piece in line.split(","):
        trimmed = piece.strip()
        fields.append(trimmed.strip('"'))
    return fields
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def split_genotype(genotype: str) -> tuple[str, str]:
    """Turn a concatenated genotype string into an ``(allele1, allele2)`` pair.

    * ``"--"`` yields the no-call marker for both alleles.
    * Any other two-character string is split into its two characters,
      e.g. ``"AG"`` -> ``("A", "G")``.
    * A single character (haploid MT/Y call) is returned for both alleles,
      e.g. ``"A"`` -> ``("A", "A")``.
    * Anything else is logged as a warning and returned as a no-call pair.
    """
    size = len(genotype)
    if genotype != "--":
        if size == 2:
            first, second = genotype
            return first, second
        if size == 1:
            return genotype, genotype
        # Neither a no-call, a diploid call, nor a haploid call.
        logger.warning("Unexpected genotype format %r — treating as no-call", genotype)
    return NO_CALL_MARKER, NO_CALL_MARKER
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Parser for AncestryDNA raw genotype export files.
|
|
4
|
+
|
|
5
|
+
Format reference (from real sample files and snps package):
|
|
6
|
+
|
|
7
|
+
#AncestryDNA raw data download
|
|
8
|
+
#This file was generated by AncestryDNA ...
|
|
9
|
+
#Data was collected using AncestryDNA array version: V2.0
|
|
10
|
+
#
|
|
11
|
+
rsid chromosome position allele1 allele2
|
|
12
|
+
rs4477212 1 82154 A C
|
|
13
|
+
rs9001001 1 100000 0 0
|
|
14
|
+
|
|
15
|
+
Specifics:
|
|
16
|
+
- Comment lines start with ``#``.
|
|
17
|
+
- Tab-delimited, 5 columns of data (alleles in separate columns).
|
|
18
|
+
- Detection key: ``#AncestryDNA`` in the first comment line.
|
|
19
|
+
- Chromosome codes: 23 = X, 24 = Y, 25 = PAR (pseudo-autosomal,
|
|
20
|
+
mapped to X), 26 = MT.
|
|
21
|
+
- No-calls represented as ``0`` for each allele.
|
|
22
|
+
- Build 37 (GRCh37).
|
|
23
|
+
- V1 chip (pre-May 2016, ~682K SNPs) and V2 chip (May 2016+,
|
|
24
|
+
~664K SNPs) share the same column layout.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
31
|
+
|
|
32
|
+
from allelix.models import DEFAULT_BUILD, NO_CALL_MARKER, Variant
|
|
33
|
+
from allelix.parsers.base import GenotypeMetadata, GenotypeParser
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from collections.abc import Iterator
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
SIGNATURE = "#AncestryDNA"
|
|
42
|
+
EXPECTED_HEADER = "rsid\tchromosome\tposition\tallele1\tallele2"
|
|
43
|
+
EXPECTED_COLUMNS = 5
|
|
44
|
+
SNIFF_LINE_LIMIT = 50
|
|
45
|
+
|
|
46
|
+
_CHROM_MAP: dict[str, str] = {
|
|
47
|
+
"23": "X",
|
|
48
|
+
"24": "Y",
|
|
49
|
+
"25": "X",
|
|
50
|
+
"26": "MT",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AncestryDNAParser(GenotypeParser):
    """Parser for AncestryDNA consumer DNA genotype files.

    Files are opened with the ``utf-8-sig`` codec. It decodes plain UTF-8
    unchanged and drops a leading byte-order mark when one is present, so a
    BOM cannot hide the ``#AncestryDNA`` signature or the ``#`` of the first
    comment line.
    """

    name: ClassVar[str] = "ancestrydna"
    display_name: ClassVar[str] = "AncestryDNA"
    file_extensions: ClassVar[list[str]] = [".txt"]
    url: ClassVar[str] = "https://www.ancestry.com/dna"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by its ``#AncestryDNA`` signature on the first line.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if the first line starts with ``SIGNATURE``. False if it
            does not, or if the file cannot be opened or decoded.
        """
        try:
            with file_path.open("r", encoding="utf-8-sig") as fh:
                first_line = fh.readline()
        except (OSError, UnicodeDecodeError):
            return False
        return first_line.startswith(SIGNATURE)

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and lines starting with ``#`` are ignored. The first
        remaining line must equal ``EXPECTED_HEADER``; until that header has
        been seen, every line is logged and skipped. A data row needs exactly
        ``EXPECTED_COLUMNS`` tab-separated fields and an integer position,
        otherwise it is logged and skipped.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row. Chromosome codes listed in
            ``_CHROM_MAP`` are translated, an allele of ``"0"`` becomes
            ``NO_CALL_MARKER``, and ``build`` is always ``DEFAULT_BUILD``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue
                if not header_seen:
                    if line == EXPECTED_HEADER:
                        header_seen = True
                        continue
                    logger.warning(
                        "Line %d: expected header %r, got %r — skipping",
                        lineno,
                        EXPECTED_HEADER,
                        line,
                    )
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom_raw, pos_str, allele1, allele2 = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                # Numeric codes 23-26 map to X / Y / X (PAR) / MT; anything
                # else passes through unchanged.
                chrom = _CHROM_MAP.get(chrom_raw, chrom_raw)

                # AncestryDNA writes "0" for an allele that was not called.
                if allele1 == "0":
                    allele1 = NO_CALL_MARKER
                if allele2 == "0":
                    allele2 = NO_CALL_MARKER

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Return fixed metadata; the file itself is not read.

        Args:
            file_path: Path to the genotype file (unused).

        Returns:
            Metadata with this parser's name, an empty sample ID
            (AncestryDNA has no sample ID field), and ``DEFAULT_BUILD``.
        """
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=DEFAULT_BUILD,
        )
|
allelix/parsers/base.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Abstract base class for genotype file parsers."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import TYPE_CHECKING, ClassVar, TypedDict
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from collections.abc import Iterator
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from allelix.models import Variant
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class GenotypeMetadata(TypedDict):
    """Typed dict returned by `GenotypeParser.get_metadata`.

    It carries only what can be read from a file's header. A SNP count is
    deliberately left out: a header scan cannot tell which data-looking
    lines will pass validation, so the count has to come from `parse()`,
    e.g. `sum(1 for _ in parser.parse(file_path))`.

    Keys:
        format: Name of the parser (same value as `GenotypeParser.name`).
        sample_id: Sample identifier from the vendor, "" when the file has none.
        build: Reference genome build, e.g. "GRCh37".
    """

    format: str
    sample_id: str
    build: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class GenotypeParser(ABC):
    """Common interface every genotype file parser implements.

    A concrete parser sets the class attributes below and provides the three
    abstract methods. Parsers hold no state, so `can_parse` and `parse` can
    be invoked any number of times on different files.

    Attributes:
        name: Lowercase identifier used by the registry and CLI
            (e.g., "myhappygenes").
        display_name: Name shown in reports (e.g., "MyHappyGenes (Tempus)").
        file_extensions: Typical file extensions (e.g., [".txt"]). For
            information only; auto-detection relies on `can_parse`, not on
            the extension.
        url: Vendor URL.
    """

    name: ClassVar[str]
    display_name: ClassVar[str]
    file_extensions: ClassVar[list[str]]
    url: ClassVar[str]

    @abstractmethod
    def can_parse(self, file_path: Path) -> bool:
        """Decide whether this parser handles the given file.

        Implementations must be fast: look at header/structural lines only,
        never the whole file. The auto-detection registry calls this.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if the format is recognized by this parser.
        """
        ...

    @abstractmethod
    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Produce normalized Variant objects from the file.

        Implementations stream one variant at a time and never load the
        whole file. A malformed line is logged as a warning and skipped; it
        does not abort the parse.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per data row in the file.
        """
        ...

    @abstractmethod
    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Return file metadata that can be derived from the header.

        Implementations must be cheap and must not run a full parse.

        Args:
            file_path: Path to the genotype file.

        Returns:
            A `GenotypeMetadata` dict.
        """
        ...
|
allelix/parsers/ftdna.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Parser for Family Tree DNA (FTDNA) raw genotype export files.
|
|
4
|
+
|
|
5
|
+
Format reference (from real sample files and snps package):
|
|
6
|
+
|
|
7
|
+
# FTDNA raw data download
|
|
8
|
+
RSID,CHROMOSOME,POSITION,RESULT
|
|
9
|
+
"rs4477212","1","82154","AA"
|
|
10
|
+
"rs3094315","1","752566","AG"
|
|
11
|
+
"rs9001001","1","100000","--"
|
|
12
|
+
|
|
13
|
+
Specifics:
|
|
14
|
+
- CSV format, comma-delimited.
|
|
15
|
+
- Optional comment lines starting with ``#``.
|
|
16
|
+
- Header line: ``RSID,CHROMOSOME,POSITION,RESULT`` (quoted or unquoted).
|
|
17
|
+
- Data fields are double-quoted; parser strips quotes.
|
|
18
|
+
- RESULT column is concatenated genotype (e.g., "AG" not "A","G").
|
|
19
|
+
- Haploid calls on MT/Y appear as single characters (e.g., "A").
|
|
20
|
+
- No-calls represented as ``--``.
|
|
21
|
+
- Build 37 (most files).
|
|
22
|
+
- Detection key: header line matching ``RSID,CHROMOSOME,POSITION,RESULT``
|
|
23
|
+
(case-insensitive, with or without quotes) within the first 50 lines.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import logging
|
|
29
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
30
|
+
|
|
31
|
+
from allelix.models import DEFAULT_BUILD, Variant
|
|
32
|
+
from allelix.parsers._helpers import split_csv_line, split_genotype
|
|
33
|
+
from allelix.parsers.base import GenotypeMetadata, GenotypeParser
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from collections.abc import Iterator
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
SNIFF_LINE_LIMIT = 50
|
|
42
|
+
EXPECTED_COLUMNS = 4
|
|
43
|
+
HEADER_CANONICAL = "RSID,CHROMOSOME,POSITION,RESULT"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _is_header_line(line: str) -> bool:
|
|
47
|
+
"""True if *line* is the FTDNA column header (quoted or unquoted)."""
|
|
48
|
+
stripped = line.replace('"', "").replace("'", "").strip()
|
|
49
|
+
return stripped.upper() == HEADER_CANONICAL
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class FTDNAParser(GenotypeParser):
    """Parser for Family Tree DNA consumer DNA genotype files.

    Files are opened with the ``utf-8-sig`` codec. It decodes plain UTF-8
    unchanged and drops a leading byte-order mark when one is present, so a
    BOM in front of the first comment or header line does not defeat
    comment skipping or header recognition.
    """

    name: ClassVar[str] = "ftdna"
    display_name: ClassVar[str] = "Family Tree DNA"
    file_extensions: ClassVar[list[str]] = [".csv"]
    url: ClassVar[str] = "https://www.familytreedna.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by its ``RSID,CHROMOSOME,POSITION,RESULT`` header.

        At most ``SNIFF_LINE_LIMIT`` lines are read. Blank lines and ``#``
        comments are skipped; the first other line decides the result.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if the first non-blank, non-comment line is the FTDNA
            header. False if it is not, if no such line is found within the
            limit, or if the file cannot be opened or decoded.
        """
        try:
            with file_path.open("r", encoding="utf-8-sig") as fh:
                for _ in range(SNIFF_LINE_LIMIT):
                    line = fh.readline()
                    if not line:
                        # End of file before any candidate header line.
                        return False
                    line = line.rstrip("\r\n")
                    if not line or line.startswith("#"):
                        continue
                    return _is_header_line(line)
        except (OSError, UnicodeDecodeError):
            return False
        return False

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and lines starting with ``#`` are ignored. Until the
        header line has been seen, every other line is logged and skipped.
        A data row needs exactly ``EXPECTED_COLUMNS`` comma-separated fields
        and an integer position, otherwise it is logged and skipped.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row, with the RESULT column split
            into two alleles by ``split_genotype`` and ``build`` always set
            to ``DEFAULT_BUILD``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue
                if not header_seen:
                    if _is_header_line(line):
                        header_seen = True
                        continue
                    logger.warning(
                        "Line %d: expected FTDNA header, got %r — skipping",
                        lineno,
                        line,
                    )
                    continue

                parts = split_csv_line(line)
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, genotype = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                allele1, allele2 = split_genotype(genotype)

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Return fixed metadata; the file itself is not read.

        Args:
            file_path: Path to the genotype file (unused).

        Returns:
            Metadata with this parser's name, an empty sample ID (FTDNA
            files have no sample ID field), and ``DEFAULT_BUILD``.
        """
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=DEFAULT_BUILD,
        )
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Parser for Living DNA raw genotype export files.
|
|
4
|
+
|
|
5
|
+
Format reference (from snps package and H600 Project wiki):
|
|
6
|
+
|
|
7
|
+
# Living DNA customer genotype data download file version: 1.0.1
|
|
8
|
+
# This file contains raw genotype data ...
|
|
9
|
+
# Human Genome Reference Build 37 (GRCh37.p13).
|
|
10
|
+
# Genotypes are presented on the forward strand.
|
|
11
|
+
#
|
|
12
|
+
# rsid chromosome position genotype
|
|
13
|
+
rs1801133 1 11856378 AG
|
|
14
|
+
AX-12345678 3 15000000 GG
|
|
15
|
+
1:726912 1 726912 AA
|
|
16
|
+
|
|
17
|
+
Specifics:
|
|
18
|
+
- Tab-delimited despite ``.csv`` file extension.
|
|
19
|
+
- Detection key: ``Living DNA`` in the first line.
|
|
20
|
+
- Comment lines start with ``#``, including the column header line.
|
|
21
|
+
- Four columns: ``rsid``, ``chromosome``, ``position``, ``genotype``.
|
|
22
|
+
- Concatenated genotype in result column (e.g., "AA", "CT").
|
|
23
|
+
- No-calls represented as ``--``.
|
|
24
|
+
- Build 37 (GRCh37.p13), forward strand.
|
|
25
|
+
- SNP ID types: rs-numbers, ``AX-`` prefixed (Affymetrix),
|
|
26
|
+
``AFFX-`` prefixed (Affymetrix control probes), and positional
|
|
27
|
+
notation (``CHR:POS``, e.g., ``1:726912``).
|
|
28
|
+
- Y and MT chromosomes delivered as separate files; main file
|
|
29
|
+
has chromosomes 1-22 and X.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import logging
|
|
35
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
36
|
+
|
|
37
|
+
from allelix.models import DEFAULT_BUILD, Variant
|
|
38
|
+
from allelix.parsers._helpers import split_genotype
|
|
39
|
+
from allelix.parsers.base import GenotypeMetadata, GenotypeParser
|
|
40
|
+
from allelix.utils.build_detect import normalize_build_label
|
|
41
|
+
|
|
42
|
+
if TYPE_CHECKING:
|
|
43
|
+
from collections.abc import Iterator
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
|
|
46
|
+
logger = logging.getLogger(__name__)
|
|
47
|
+
|
|
48
|
+
SIGNATURE = "Living DNA"
|
|
49
|
+
SNIFF_LINE_LIMIT = 50
|
|
50
|
+
EXPECTED_COLUMNS = 4
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class LivingDNAParser(GenotypeParser):
    """Parser for Living DNA consumer genotype files.

    Files are opened with the ``utf-8-sig`` codec. It decodes plain UTF-8
    unchanged and drops a leading byte-order mark when one is present, so
    the first header comment is still seen as starting with ``#``.
    """

    name: ClassVar[str] = "livingdna"
    display_name: ClassVar[str] = "Living DNA"
    file_extensions: ClassVar[list[str]] = [".csv"]
    url: ClassVar[str] = "https://livingdna.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by ``Living DNA`` in the first line.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if the first line contains ``SIGNATURE``. False if it does
            not, or if the file cannot be opened or decoded.
        """
        try:
            with file_path.open("r", encoding="utf-8-sig") as fh:
                first_line = fh.readline()
        except (OSError, UnicodeDecodeError):
            return False
        return SIGNATURE in first_line

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and lines starting with ``#`` (which includes the
        column header) are ignored. A data row needs exactly
        ``EXPECTED_COLUMNS`` tab-separated fields and an integer position,
        otherwise it is logged and skipped.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row, with the genotype column split
            into two alleles by ``split_genotype`` and ``build`` always set
            to ``DEFAULT_BUILD``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, genotype = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                allele1, allele2 = split_genotype(genotype)

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Extract the build from the header comments.

        Only the leading run of ``#`` comment lines is read; the scan stops
        at the first line that is neither blank nor a comment. Blank lines
        are skipped, matching ``parse``, so an empty line inside the header
        does not end the scan early. If several comment lines produce a
        build label, the last one wins.

        Args:
            file_path: Path to the genotype file.

        Returns:
            Metadata with this parser's name, an empty sample ID (Living
            DNA has no sample ID field), and the detected build, or
            ``DEFAULT_BUILD`` when no comment line produced one.
        """
        build = DEFAULT_BUILD
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for raw in fh:
                line = raw.rstrip("\r\n")
                if not line:
                    continue
                if not line.startswith("#"):
                    # First data row: the header is over.
                    break
                # NOTE(review): normalize_build_label is presumed to return
                # a falsy value for lines that name no build; confirm.
                normalized = normalize_build_label(line)
                if normalized:
                    build = normalized
        return GenotypeMetadata(
            format=self.name,
            sample_id="",
            build=build,
        )
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Parser for MyHappyGenes (Tempus) genotype export files.
|
|
4
|
+
|
|
5
|
+
Format reference (from sample file):
|
|
6
|
+
|
|
7
|
+
# MyHappyGenes [TEMPUS]
|
|
8
|
+
# This file was generated by MyHappyGenes, Inc.
|
|
9
|
+
# ... (additional comment lines)
|
|
10
|
+
# Sample ID MHG000001
|
|
11
|
+
SNP Name Chr Position Allele1 - Forward Allele2 - Forward
|
|
12
|
+
rs9651229 1 632287 C C
|
|
13
|
+
rs9701872 1 632828 T T
|
|
14
|
+
|
|
15
|
+
Specifics:
|
|
16
|
+
- Comment lines start with `#`.
|
|
17
|
+
- Tab-delimited, 5 columns of data.
|
|
18
|
+
- Header claims build 37.1, but real-world exports we've verified
|
|
19
|
+
contain GRCh38 positions. The analyze pipeline runs auto-
|
|
20
|
+
detection (ADR-0021) and overrides the parser-reported build
|
|
21
|
+
based on position data. The build value returned here reflects
|
|
22
|
+
the literal header claim; downstream code should not assume it.
|
|
23
|
+
Forward strand with respect to the reference.
|
|
24
|
+
- No-calls represented as `-`.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
31
|
+
|
|
32
|
+
from allelix.models import DEFAULT_BUILD, Variant
|
|
33
|
+
from allelix.parsers.base import GenotypeMetadata, GenotypeParser
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from collections.abc import Iterator
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
SIGNATURE = "# MyHappyGenes"
|
|
42
|
+
SAMPLE_ID_PREFIX = "# Sample ID"
|
|
43
|
+
EXPECTED_HEADER = "SNP Name\tChr\tPosition\tAllele1 - Forward\tAllele2 - Forward"
|
|
44
|
+
EXPECTED_COLUMNS = 5
|
|
45
|
+
SNIFF_LINE_LIMIT = 50
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class MyHappyGenesParser(GenotypeParser):
    """Parser for MyHappyGenes/Tempus consumer DNA genotype files.

    Files are opened with the ``utf-8-sig`` codec. It decodes plain UTF-8
    unchanged and drops a leading byte-order mark when one is present, so
    the first header comment is still seen as starting with ``#``.
    Otherwise ``get_metadata`` would stop before reaching the sample ID
    line and ``parse`` would report the first comment as a bad header.
    """

    name: ClassVar[str] = "myhappygenes"
    display_name: ClassVar[str] = "MyHappyGenes (Tempus)"
    file_extensions: ClassVar[list[str]] = [".txt"]
    url: ClassVar[str] = "https://myhappygenes.com"

    def can_parse(self, file_path: Path) -> bool:
        """Recognize the file by its `# MyHappyGenes` signature line.

        Args:
            file_path: Path to the candidate genotype file.

        Returns:
            True if ``SIGNATURE`` occurs in any of the first
            ``SNIFF_LINE_LIMIT`` lines. False if it does not, or if the
            file cannot be opened or decoded.
        """
        try:
            with file_path.open("r", encoding="utf-8-sig") as fh:
                for _ in range(SNIFF_LINE_LIMIT):
                    line = fh.readline()
                    if not line:
                        # End of file reached without finding the signature.
                        return False
                    if SIGNATURE in line:
                        return True
        except (OSError, UnicodeDecodeError):
            return False
        return False

    def parse(self, file_path: Path) -> Iterator[Variant]:
        """Stream Variant objects, skipping comments and malformed lines.

        Blank lines and lines starting with ``#`` are ignored. The first
        remaining line must equal ``EXPECTED_HEADER``; until that header has
        been seen, every line is logged and skipped. A data row needs exactly
        ``EXPECTED_COLUMNS`` tab-separated fields and an integer position,
        otherwise it is logged and skipped.

        Args:
            file_path: Path to the genotype file.

        Yields:
            One Variant per valid data row, with alleles passed through as
            written and ``build`` always set to ``DEFAULT_BUILD``.
        """
        with file_path.open("r", encoding="utf-8-sig") as fh:
            header_seen = False
            for lineno, raw in enumerate(fh, start=1):
                line = raw.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue
                if not header_seen:
                    if line == EXPECTED_HEADER:
                        header_seen = True
                        continue
                    logger.warning(
                        "Line %d: expected header %r, got %r — skipping",
                        lineno,
                        EXPECTED_HEADER,
                        line,
                    )
                    continue

                parts = line.split("\t")
                if len(parts) != EXPECTED_COLUMNS:
                    logger.warning(
                        "Line %d: expected %d columns, got %d — skipping",
                        lineno,
                        EXPECTED_COLUMNS,
                        len(parts),
                    )
                    continue

                rsid, chrom, pos_str, allele1, allele2 = parts
                try:
                    position = int(pos_str)
                except ValueError:
                    logger.warning("Line %d: invalid position %r — skipping", lineno, pos_str)
                    continue

                yield Variant(
                    rsid=rsid,
                    chromosome=chrom,
                    position=position,
                    allele1=allele1,
                    allele2=allele2,
                    build=DEFAULT_BUILD,
                )

    def get_metadata(self, file_path: Path) -> GenotypeMetadata:
        """Extract the sample ID from the file header. Cheap — no full parse.

        Reading stops at the ``# Sample ID`` line or at the first line that
        is neither blank nor a comment, whichever comes first. The sample ID
        is the text after the first tab on that line; without a tab it
        stays empty. The build is not read from the header: this method
        always reports ``DEFAULT_BUILD``.

        Args:
            file_path: Path to the genotype file.

        Returns:
            Metadata with this parser's name, the sample ID ("" when not
            found), and ``DEFAULT_BUILD``.
        """
        sample_id = ""
        with file_path.open("r", encoding="utf-8-sig") as fh:
            for raw in fh:
                line = raw.rstrip("\r\n")
                if not line:
                    continue
                if line.startswith(SAMPLE_ID_PREFIX):
                    parts = line.split("\t", 1)
                    if len(parts) == 2:
                        sample_id = parts[1].strip()
                    break
                if not line.startswith("#"):
                    # First non-comment line: the header is over.
                    break
        return GenotypeMetadata(
            format=self.name,
            sample_id=sample_id,
            build=DEFAULT_BUILD,
        )
|