allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,342 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parse raw SNPedia wiki markup into structured genotype rows.
4
+
5
+ Called automatically by the SNPedia annotator when raw pages exist but
6
+ the structured ``snpedia_genotypes`` table does not. Can also be invoked
7
+ standalone via ``scripts/parse_snpedia.py``.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import contextlib
13
+ import logging
14
+ import sqlite3
15
+ from datetime import UTC, datetime
16
+
17
+ import mwparserfromhell
18
+
19
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Version number of the parsing logic in this module. It is written to
# ``database_versions.local_version_tag`` as ``pv:<N>`` and compared by
# ``parser_is_current`` to decide whether an existing cache was produced by
# this parser version.
_PARSER_VERSION = 6
22
+
23
+
24
+ def _parse_title_prefix(title: str) -> tuple[str, str] | None:
25
+ """Extract (prefix, number) from a title like 'Rs12345' or 'I4000178'.
26
+
27
+ Returns None if the title doesn't start with Rs or I followed by digits.
28
+ """
29
+ if title.startswith("Rs") or title.startswith("I"):
30
+ prefix = "Rs" if title.startswith("Rs") else "I"
31
+ rest = title[len(prefix) :]
32
+ digits = []
33
+ for ch in rest:
34
+ if ch.isdigit():
35
+ digits.append(ch)
36
+ else:
37
+ break
38
+ if digits:
39
+ return prefix, "".join(digits)
40
+ return None
41
+
42
+
43
+ def _parse_title_alleles(title: str) -> tuple[str, str] | None:
44
+ """Extract alleles from a title like 'Rs12345(A;G)' or 'I4000178(C;T)'.
45
+
46
+ Returns None if the title doesn't contain a valid (allele;allele) suffix.
47
+ """
48
+ paren_start = title.find("(")
49
+ if paren_start == -1 or not title.endswith(")"):
50
+ return None
51
+ inner = title[paren_start + 1 : -1]
52
+ semi = inner.find(";")
53
+ if semi == -1:
54
+ return None
55
+ a1 = inner[:semi].strip()
56
+ a2 = inner[semi + 1 :].strip()
57
+ if a1 and a2:
58
+ return a1, a2
59
+ return None
60
+
61
+
62
+ def _tmpl_param(tmpl: object, name: str) -> str:
63
+ """Extract a named parameter from a mwparserfromhell template."""
64
+ if tmpl.has(name):
65
+ return str(tmpl.get(name).value).strip()
66
+ return ""
67
+
68
+
69
# DDL run through ``executescript`` by ``_parse_raw_pages_inner``. Every
# statement uses IF NOT EXISTS, so applying it to an existing cache is safe.
#   * snpedia_genotypes           - one structured row per parsed genotype page
#   * idx_snpedia_rsid_alleles    - lookup index on (rsid, allele1, allele2)
#   * idx_snpedia_genotype_dedup  - unique index that backs INSERT OR IGNORE;
#     it indexes COALESCE(summary, '') rather than summary itself
#     (NOTE(review): presumably so rows with a NULL summary are deduplicated
#     too, since SQLite treats NULLs as distinct in unique indexes - confirm)
#   * database_versions           - bookkeeping row per database, including
#     ``remote_signal`` and ``local_version_tag``
_STRUCTURED_SCHEMA = """
CREATE TABLE IF NOT EXISTS snpedia_genotypes (
    rsid TEXT NOT NULL,
    allele1 TEXT NOT NULL,
    allele2 TEXT NOT NULL,
    magnitude REAL,
    repute TEXT,
    summary TEXT,
    gene TEXT,
    scraped_at TEXT
);

CREATE INDEX IF NOT EXISTS idx_snpedia_rsid_alleles
    ON snpedia_genotypes(rsid, allele1, allele2);

CREATE UNIQUE INDEX IF NOT EXISTS idx_snpedia_genotype_dedup
    ON snpedia_genotypes(rsid, allele1, allele2, COALESCE(summary, ''));

CREATE TABLE IF NOT EXISTS database_versions (
    name TEXT PRIMARY KEY,
    source_url TEXT NOT NULL,
    version TEXT,
    downloaded_at TEXT NOT NULL,
    record_count INTEGER NOT NULL,
    remote_signal TEXT,
    local_version_tag TEXT
);
"""
97
+
98
+
99
def parser_is_current(conn: sqlite3.Connection) -> bool:
    """Return True if the cache was built by the current parser version.

    Checks ``local_version_tag`` first. If absent, falls back to the
    legacy ``|pv:N`` suffix in ``remote_signal`` and migrates the tag
    in-place — avoiding a full re-parse just to populate the column.

    The legacy suffix is compared exactly against the current version, so
    a cache stamped ``|pv:60`` is not mistaken for parser version 6.

    Args:
        conn: Open connection to the SNPedia cache database.

    Returns:
        True when the stored parser version equals ``_PARSER_VERSION``;
        False when it differs, is missing, or the bookkeeping table or
        column cannot be queried.
    """
    tag = f"pv:{_PARSER_VERSION}"
    try:
        row = conn.execute(
            "SELECT local_version_tag FROM database_versions WHERE name='snpedia'"
        ).fetchone()
        if row and row[0] == tag:
            return True
    except sqlite3.OperationalError:
        # Table or column does not exist yet; try the legacy location.
        pass

    try:
        row = conn.execute(
            "SELECT remote_signal FROM database_versions WHERE name='snpedia'"
        ).fetchone()
        if row and row[0]:
            clean_signal, marker, legacy_version = row[0].partition("|pv:")
            if marker and legacy_version.strip() == str(_PARSER_VERSION):
                # Imported lazily, as in the original code
                # (NOTE(review): presumably to avoid an import cycle with
                # the manager module - confirm).
                from allelix.databases.manager import _ensure_local_version_tag_column

                _ensure_local_version_tag_column(conn)
                # Move the version out of remote_signal into its own column.
                conn.execute(
                    "UPDATE database_versions "
                    "SET remote_signal = ?, local_version_tag = ? WHERE name = 'snpedia'",
                    (clean_signal, tag),
                )
                conn.commit()
                return True
    except sqlite3.OperationalError:
        # No usable bookkeeping table: treat the cache as not current.
        pass
    return False
134
+
135
+
136
+ def _dedupe_existing(conn: sqlite3.Connection) -> int:
137
+ """Collapse pre-existing duplicate rows in old caches. Returns rows removed."""
138
+ before = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]
139
+ conn.execute("""
140
+ DELETE FROM snpedia_genotypes
141
+ WHERE rowid NOT IN (
142
+ SELECT MIN(rowid) FROM snpedia_genotypes
143
+ GROUP BY rsid, UPPER(allele1), UPPER(allele2), COALESCE(summary, '')
144
+ )
145
+ """)
146
+ after = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]
147
+ return before - after
148
+
149
+
150
def detect_raw_table(conn: sqlite3.Connection) -> str | None:
    """Find the table holding raw wiki pages.

    ``_raw_pages`` is preferred over ``pages`` when both exist.

    Returns:
        The table name, or None when neither table is present.
    """
    cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
    existing = {record[0] for record in cursor.fetchall()}
    for candidate in ("_raw_pages", "pages"):
        if candidate in existing:
            return candidate
    return None
161
+
162
+
163
def has_structured_table(conn: sqlite3.Connection) -> bool:
    """Report whether ``snpedia_genotypes`` exists and holds at least one row.

    A missing table surfaces as ``sqlite3.OperationalError`` and is reported
    as False.
    """
    try:
        (row_total,) = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()
    except sqlite3.OperationalError:
        return False
    return row_total > 0
170
+
171
+
172
def parse_raw_pages(db_path: str, *, verbose: bool = False) -> int:
    """Open the cache at *db_path* and rebuild the structured genotype rows.

    The connection is opened here and always closed again, whether or not
    parsing succeeds.

    Returns:
        The number of structured rows created.
    """
    conn = sqlite3.connect(db_path)
    try:
        return _parse_raw_pages_inner(conn, verbose=verbose)
    finally:
        conn.close()
179
+
180
+
181
# Single definition of the genotype INSERT, shared by every batch flush.
_INSERT_GENOTYPE_SQL = (
    "INSERT OR IGNORE INTO snpedia_genotypes "
    "(rsid, allele1, allele2, magnitude, repute, summary, gene, scraped_at) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
)

# Number of parsed rows buffered before they are written with executemany.
_INSERT_BATCH_SIZE = 1000


def _gene_from_snp_page(content: str) -> str:
    """Return the gene named by the first SNP info template on a page, or ""."""
    wikicode = mwparserfromhell.parse(content)
    for template in wikicode.filter_templates():
        tname = template.name.strip().lower()
        if tname in ("rsnum", "snp"):
            param = "Gene"
        elif tname == "23andme snp":
            param = "Gene_s"
        else:
            continue
        # The first recognised template decides, even if its gene is empty.
        return _tmpl_param(template, param)
    return ""


def _build_gene_map(conn: sqlite3.Connection, raw_table: str) -> dict[str, str]:
    """Map lower-cased SNP ids (e.g. ``rs123``) to gene names from SNP pages."""
    gene_map: dict[str, str] = {}
    # raw_table is one of the fixed names returned by detect_raw_table, so
    # interpolating it into the statement is safe.
    snp_rows = conn.execute(
        f"SELECT title, content FROM {raw_table} WHERE category = 'snp'"
    ).fetchall()
    print(f" Building gene map from {len(snp_rows)} SNP pages...", flush=True)
    for title, content in snp_rows:
        parsed_prefix = _parse_title_prefix(title)
        if not parsed_prefix or not content:
            continue
        prefix, num = parsed_prefix
        try:
            gene = _gene_from_snp_page(content)
        except Exception:
            # Best effort: one malformed page must not abort the whole parse.
            logger.debug("Failed to parse SNP page %s", title, exc_info=True)
            continue
        if gene:
            gene_map[f"{prefix.lower()}{num}"] = gene

    print(f" Gene map: {len(gene_map)} mappings built.", flush=True)
    return gene_map


def _genotype_row(
    title: str,
    content: str,
    scraped_at: str,
    gene_map: dict[str, str],
) -> tuple[str, str, str, float | None, str | None, str | None, str | None, str] | None:
    """Turn one raw genotype page into an insertable row, or None to skip it."""
    parsed_prefix = _parse_title_prefix(title)
    if not parsed_prefix or not content:
        return None

    prefix, num = parsed_prefix
    snp_id = f"{prefix.lower()}{num}"

    try:
        wikicode = mwparserfromhell.parse(content)
    except Exception:
        logger.debug("Failed to parse genotype page %s", title, exc_info=True)
        return None

    templates = [t for t in wikicode.filter_templates() if t.name.strip().lower() == "genotype"]
    if not templates:
        return None
    tmpl = templates[0]

    allele1 = _tmpl_param(tmpl, "allele1").upper()
    allele2 = _tmpl_param(tmpl, "allele2").upper()
    if not allele1 or not allele2:
        # The template lacks alleles: fall back to the "(A;G)" title suffix.
        title_alleles = _parse_title_alleles(title)
        if not title_alleles:
            return None
        allele1, allele2 = title_alleles[0].upper(), title_alleles[1].upper()
        if not allele1 or not allele2:
            return None

    # Store the pair in sorted order so (A;G) and (G;A) share one key.
    if allele1 > allele2:
        allele1, allele2 = allele2, allele1

    mag_str = _tmpl_param(tmpl, "magnitude")
    magnitude: float | None = None
    if mag_str:
        try:
            magnitude = float(mag_str)
        except ValueError:
            magnitude = None

    repute = _tmpl_param(tmpl, "repute") or None
    summary = _tmpl_param(tmpl, "summary") or None
    gene = gene_map.get(snp_id) or None
    return (snp_id, allele1, allele2, magnitude, repute, summary, gene, scraped_at)


def _record_snpedia_version(conn: sqlite3.Connection, raw_table: str, row_count: int) -> None:
    """Replace the 'snpedia' row in database_versions with fresh metadata."""
    date_row = conn.execute(f"SELECT MIN(scraped_at) FROM {raw_table}").fetchone()
    scrape_date = date_row[0][:10] if date_row and date_row[0] else "unknown"

    # Keep any existing remote signal, minus a legacy "|pv:N" suffix.
    existing_signal = ""
    try:
        sig_row = conn.execute(
            "SELECT remote_signal FROM database_versions WHERE name = 'snpedia'"
        ).fetchone()
        if sig_row and sig_row[0]:
            existing_signal = sig_row[0].split("|pv:")[0]
    except sqlite3.OperationalError:
        pass

    conn.execute("DELETE FROM database_versions WHERE name = 'snpedia'")
    conn.execute(
        "INSERT INTO database_versions "
        "(name, source_url, version, downloaded_at, record_count, "
        "remote_signal, local_version_tag) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        (
            "snpedia",
            "https://bots.snpedia.com/api.php",
            f"scraped {scrape_date} ({row_count} genotypes)",
            datetime.now(UTC).isoformat(),
            row_count,
            existing_signal,
            f"pv:{_PARSER_VERSION}",
        ),
    )


def _parse_raw_pages_inner(conn: sqlite3.Connection, *, verbose: bool = False) -> int:
    """Inner parser logic. Caller owns the connection lifecycle.

    Rebuilds ``snpedia_genotypes`` from the raw pages table: clears old rows,
    builds a gene map from SNP pages, parses every genotype page, then
    records the parser version in ``database_versions`` and commits.

    Args:
        conn: Open connection to the SNPedia cache database.
        verbose: When true, log which raw table is being parsed.

    Returns:
        Number of rows in ``snpedia_genotypes`` after parsing, or 0 when no
        raw pages table exists.
    """
    raw_table = detect_raw_table(conn)
    if raw_table is None:
        return 0

    if verbose:
        logger.info("Parsing SNPedia raw pages from '%s' table", raw_table)

    # Drop the unique index and collapse old duplicates before the schema
    # script recreates the index.
    conn.execute("DROP INDEX IF EXISTS idx_snpedia_genotype_dedup")

    has_table = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name='snpedia_genotypes'"
    ).fetchone()
    if has_table:
        deduped = _dedupe_existing(conn)
        if deduped:
            logger.info("Backfill dedupe: removed %d duplicate row(s)", deduped)

    conn.executescript(_STRUCTURED_SCHEMA)
    conn.execute("DELETE FROM snpedia_genotypes")

    gene_map = _build_gene_map(conn, raw_table)

    genotype_rows = conn.execute(
        f"SELECT title, content, scraped_at FROM {raw_table} WHERE category = 'genotype'"
    ).fetchall()
    print(f" Parsing {len(genotype_rows)} genotype pages...", flush=True)

    batch: list[tuple[str, str, str, float | None, str | None, str | None, str | None, str]] = []
    for title, content, scraped_at in genotype_rows:
        row = _genotype_row(title, content, scraped_at, gene_map)
        if row is None:
            continue
        batch.append(row)
        if len(batch) >= _INSERT_BATCH_SIZE:
            conn.executemany(_INSERT_GENOTYPE_SQL, batch)
            batch.clear()

    if batch:
        conn.executemany(_INSERT_GENOTYPE_SQL, batch)

    row_count = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]
    _record_snpedia_version(conn, raw_table, row_count)

    conn.commit()
    return row_count
@@ -0,0 +1,3 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Format exporters for parsed genotype data."""
@@ -0,0 +1,144 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """PLINK1 binary format (.bed/.bim/.fam) exporter."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ from allelix.utils.allele import complement, is_strand_ambiguous
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Iterator
13
+ from pathlib import Path
14
+
15
+ from allelix.models import Variant
16
+
17
# Three header bytes written at the start of every .bed file by export_plink.
# NOTE(review): presumably the PLINK1 magic number (0x6C 0x1B) followed by the
# variant-major mode byte (0x01) - confirm against the PLINK format spec.
_BED_MAGIC = bytes([0x6C, 0x1B, 0x01])

# Chromosome names that are rewritten to numeric codes in the .bim file.
# Names not listed here are written unchanged (see the ``.get`` fallback in
# export_plink).
_CHROM_CODES = {
    "X": "23",
    "Y": "24",
    "MT": "26",
}
24
+
25
+
26
+ def _orient_genotype(
27
+ allele1: str,
28
+ allele2: str,
29
+ ref: str,
30
+ alt: str,
31
+ ) -> tuple[str, str] | None:
32
+ """Map user alleles to {ref, alt} in a consistent orientation.
33
+
34
+ Returns None for palindromic sites, indels, or alleles that don't fit.
35
+ Both alleles are tested in the same orientation — no mixed-strand.
36
+ """
37
+ if len(allele1) != 1 or len(allele2) != 1:
38
+ return None
39
+ if is_strand_ambiguous(ref, alt):
40
+ return None
41
+
42
+ pair = {allele1, allele2}
43
+ if pair <= {ref, alt}:
44
+ return (allele1, allele2)
45
+
46
+ c1, c2 = complement(allele1), complement(allele2)
47
+ if {c1, c2} <= {ref, alt}:
48
+ return (c1, c2)
49
+
50
+ return None
51
+
52
+
53
def export_plink(
    variants: Iterator[Variant],
    prefix: Path,
    build: str,
    ref_alt_map: dict[str, tuple[str, str]] | None = None,
) -> tuple[int, int, int, int]:
    """Export parsed variants as a single-sample PLINK1 fileset.

    Three files are written next to each other: ``.fam`` (one fixed sample
    line), ``.bim`` (one row per exported variant) and ``.bed`` (header
    bytes followed by one genotype byte per exported variant).

    Args:
        variants: Parsed variant iterator (consumed once).
        prefix: Base path for output files. Each output path comes from
            ``prefix.with_suffix(...)``, which replaces any extension
            already present on ``prefix``.
        build: Genome build label. It is not read by this function.
        ref_alt_map: ``{rsid: (ref, alt)}``. When an rsid is present and the
            genotype can be oriented against it, A1/A2 are written as
            ref/alt. Otherwise the fallback coding is used (``A2="0"`` for
            homozygotes).

    Returns:
        ``(variants_written, no_calls_skipped, indels_skipped, monomorphic_count)``

    Note:
        No-call variants and variants with multi-character alleles are
        dropped rather than written.
    """
    fam_path, bim_path, bed_path = (
        prefix.with_suffix(extension) for extension in (".fam", ".bim", ".bed")
    )

    fam_path.write_text("0\tSAMPLE\t0\t0\t0\t-9\n")

    written = skipped = indels = monomorphic = 0

    with bim_path.open("w") as bim_f, bed_path.open("wb") as bed_f:
        bed_f.write(_BED_MAGIC)

        for v in variants:
            if v.is_no_call:
                skipped += 1
                continue
            if len(v.allele1) != 1 or len(v.allele2) != 1:
                indels += 1
                continue

            # Try ref/alt-aware coding first; stays None when it is unavailable.
            oriented: tuple[str, str] | None = None
            if ref_alt_map and v.rsid in ref_alt_map:
                ref, alt = ref_alt_map[v.rsid]
                oriented = _orient_genotype(v.allele1, v.allele2, ref, alt)

            a1: str
            a2: str
            bed_code: int
            if oriented is None:
                a1, a2, bed_code, is_mono = _fallback_coding(v)
                if is_mono:
                    monomorphic += 1
            else:
                a1, a2 = ref, alt
                # 0, 1 or 2 copies of alt select the two-bit genotype code.
                bed_code = (0b00, 0b10, 0b11)[oriented.count(alt)]

            chrom_code = _CHROM_CODES.get(v.chromosome, v.chromosome)
            bim_f.write(f"{chrom_code}\t{v.rsid}\t0\t{v.position}\t{a1}\t{a2}\n")
            bed_f.write(bytes([bed_code]))
            written += 1

    return written, skipped, indels, monomorphic
133
+
134
+
135
+ def _fallback_coding(v: Variant) -> tuple[str, str, int, bool]:
136
+ """Fallback allele coding when ref/alt is unknown.
137
+
138
+ Returns ``(a1, a2, bed_code, is_monomorphic)``.
139
+ """
140
+ if v.is_heterozygous:
141
+ alleles = sorted([v.allele1, v.allele2])
142
+ return alleles[0], alleles[1], 0b10, False
143
+
144
+ return v.allele1, "0", 0b00, True
allelix/models.py ADDED
@@ -0,0 +1,117 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Core data models for genotype variants and reference annotations.
4
+
5
+ Trust boundary: parsers are responsible for validating raw input. Model
6
+ constructors do not enforce chromosome names, position bounds, or allele
7
+ encodings — they trust their caller. If a Variant or Annotation is
8
+ constructed by code outside the `allelix.parsers` package, the caller owns
9
+ the validation.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass, field
15
+
16
# Allele value that marks a no-call; Variant.is_no_call and
# Annotation.zygosity both test against it.
NO_CALL_MARKER = "-"
# Build assigned to a Variant when the caller does not pass one.
DEFAULT_BUILD = "GRCh37"
18
+
19
+
20
@dataclass
class Variant:
    """One genotype call: the two alleles a sample carries at a position.

    This is the common representation every parser produces; annotators and
    reports work with Variants and never with raw file formats.

    Attributes:
        rsid: dbSNP reference identifier (e.g., "rs1801133").
        chromosome: Chromosome name. "1"-"22", "X", "Y", or "MT".
        position: 1-based genomic coordinate in the given build.
        allele1: First observed allele. A/T/G/C, multi-base for indels, or
            "-" for no-call.
        allele2: Second observed allele. Same encoding as allele1.
        build: Reference genome build. "GRCh37" (hg19) or "GRCh38" (hg38).
    """

    rsid: str
    chromosome: str
    position: int
    allele1: str
    allele2: str
    build: str = DEFAULT_BUILD

    @property
    def is_heterozygous(self) -> bool:
        """Whether the alleles differ; always False for a no-call."""
        return not self.is_no_call and self.allele1 != self.allele2

    @property
    def is_no_call(self) -> bool:
        """Whether at least one allele equals the no-call marker.

        Usually this means the assay failed at the position, though the exact
        meaning depends on the source format (some VCFs use `-` for indel
        deletions).
        """
        return NO_CALL_MARKER in (self.allele1, self.allele2)

    @property
    def genotype(self) -> str:
        """Both alleles joined by a slash (e.g., "C/T")."""
        return "{}/{}".format(self.allele1, self.allele2)
63
+
64
+
65
@dataclass
class Annotation:
    """A statement about a variant, attributed to one reference database.

    Allelix never asserts variant significance directly — every Annotation is
    attributed to its source database. See README § Regulatory Posture.

    Attributes:
        source: Lowercase database identifier (e.g., "clinvar", "pharmgkb").
        rsid: The variant this annotation applies to.
        significance: Source-prefixed classification (e.g., "clinvar_pathogenic").
        category: Coarse filter bucket. Use non-diagnostic labels: "clinical",
            "pharma", "carrier", "trait", "methylation". Never bare medical terms
            like "pathogenic" — those would read as Allelix's own classification.
        magnitude: 0-10 importance score (SNPedia-style).
        description: Human-readable explanation.
        attribution: Display name of the source ("ClinVar", "PharmGKB", ...).
        genotype_match: Which genotype triggers this annotation (e.g., "T/T").
        references: PubMed IDs or URLs supporting the claim.
        condition: Disease or condition name, if applicable.
        gene: Gene symbol, if known.
        review_status: ClinVar review status (CLNREVSTAT), empty for non-ClinVar.
        alt: Defaults to "". NOTE(review): presumably the alternate allele the
            claim refers to — confirm with the annotators.
        is_must_include: Internal flag for GWAS rollup; excluded from public output.
        allele_frequency: Defaults to None. NOTE(review): presumably a
            population frequency such as gnomAD's — confirm.
        am_pathogenicity: Defaults to None. NOTE(review): presumably the
            AlphaMissense score — confirm.
        am_class: Defaults to "". NOTE(review): presumably the AlphaMissense
            class label — confirm.
        cadd_phred: Defaults to None. NOTE(review): presumably the CADD
            PHRED-scaled score — confirm.
    """

    source: str
    rsid: str
    significance: str
    category: str
    magnitude: float
    description: str
    attribution: str
    genotype_match: str
    references: list[str] = field(default_factory=list)
    condition: str = ""
    gene: str = ""
    review_status: str = ""
    alt: str = ""
    is_must_include: bool = False
    allele_frequency: float | None = None
    am_pathogenicity: float | None = None
    am_class: str = ""
    cadd_phred: float | None = None

    @property
    def zygosity(self) -> str:
        """Label ``genotype_match`` as Heterozygous, Homozygous, or No Call.

        Any occurrence of the no-call marker yields "No Call". A value of the
        form "X/Y" is compared half against half; any other value is judged
        by whether all of its characters are the same.
        """
        call = self.genotype_match
        if NO_CALL_MARKER in call:
            return "No Call"

        halves = call.split("/")
        if len(halves) == 2:
            differs = halves[0] != halves[1]
        else:
            # No single "/" separator (e.g. "CT"): compare the characters.
            # An empty string has zero distinct characters, so it is reported
            # as Heterozygous.
            differs = len(set(call)) != 1
        return "Heterozygous" if differs else "Homozygous"
@@ -0,0 +1,73 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Parser registry. Auto-detection tries each registered parser; first match wins."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ from allelix.parsers.ancestrydna import AncestryDNAParser
10
+ from allelix.parsers.base import GenotypeParser
11
+ from allelix.parsers.ftdna import FTDNAParser
12
+ from allelix.parsers.livingdna import LivingDNAParser
13
+ from allelix.parsers.myhappygenes import MyHappyGenesParser
14
+ from allelix.parsers.myheritage import MyHeritageParser
15
+ from allelix.parsers.twentythreeandme import TwentyThreeAndMeParser
16
+
17
+ if TYPE_CHECKING:
18
+ from pathlib import Path
19
+
20
# Registered parser instances. Order matters: detect_parser returns the first
# entry whose can_parse() accepts the file, and get_parser_by_name returns the
# first entry with a matching name.
PARSERS: list[GenotypeParser] = [
    MyHappyGenesParser(),
    TwentyThreeAndMeParser(),
    AncestryDNAParser(),
    LivingDNAParser(),
    MyHeritageParser(),
    FTDNAParser(),
]
28
+
29
+
30
class ParserNotFoundError(ValueError):
    """No parser is available for the request.

    Raised when auto-detection finds no parser that accepts a file, or when
    a parser is requested by a name that is not registered.
    """
32
+
33
+
34
def get_parser_by_name(name: str) -> GenotypeParser:
    """Return the registered parser whose `name` attribute equals *name*.

    Args:
        name: Lowercase parser identifier (e.g., "myhappygenes").

    Raises:
        ParserNotFoundError: If no registered parser has that name. The
            message lists the names that are available.
    """
    found = next((candidate for candidate in PARSERS if candidate.name == name), None)
    if found is not None:
        return found

    available = ", ".join(candidate.name for candidate in PARSERS)
    raise ParserNotFoundError(f"Unknown parser {name!r}. Available: {available}")
48
+
49
+
50
def detect_parser(file_path: Path) -> GenotypeParser:
    """Pick the parser for a file by asking each registered parser in turn.

    Parsers are tried in registry order and the first one that accepts the
    file is returned.

    Args:
        file_path: Path to the genotype file.

    Raises:
        ParserNotFoundError: If no parser recognizes the format.
    """
    for candidate in PARSERS:
        if not candidate.can_parse(file_path):
            continue
        return candidate

    message = f"No parser recognized {file_path.name!r}. Try forcing a format with --format <name>."
    raise ParserNotFoundError(message)
65
+
66
+
67
# Names exported by ``from allelix.parsers import *``.
__all__ = [
    "PARSERS",
    "GenotypeParser",
    "ParserNotFoundError",
    "detect_parser",
    "get_parser_by_name",
]