allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Parse raw SNPedia wiki markup into structured genotype rows.
|
|
4
|
+
|
|
5
|
+
Called automatically by the SNPedia annotator when raw pages exist but
|
|
6
|
+
the structured ``snpedia_genotypes`` table does not. Can also be invoked
|
|
7
|
+
standalone via ``scripts/parse_snpedia.py``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import contextlib
|
|
13
|
+
import logging
|
|
14
|
+
import sqlite3
|
|
15
|
+
from datetime import UTC, datetime
|
|
16
|
+
|
|
17
|
+
import mwparserfromhell
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Version of the structured output produced by this module. It is stored in
# ``database_versions.local_version_tag`` as ``pv:<N>`` after a rebuild and
# compared by parser_is_current() to decide whether a cache is up to date.
_PARSER_VERSION = 6
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_title_prefix(title: str) -> tuple[str, str] | None:
|
|
25
|
+
"""Extract (prefix, number) from a title like 'Rs12345' or 'I4000178'.
|
|
26
|
+
|
|
27
|
+
Returns None if the title doesn't start with Rs or I followed by digits.
|
|
28
|
+
"""
|
|
29
|
+
if title.startswith("Rs") or title.startswith("I"):
|
|
30
|
+
prefix = "Rs" if title.startswith("Rs") else "I"
|
|
31
|
+
rest = title[len(prefix) :]
|
|
32
|
+
digits = []
|
|
33
|
+
for ch in rest:
|
|
34
|
+
if ch.isdigit():
|
|
35
|
+
digits.append(ch)
|
|
36
|
+
else:
|
|
37
|
+
break
|
|
38
|
+
if digits:
|
|
39
|
+
return prefix, "".join(digits)
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _parse_title_alleles(title: str) -> tuple[str, str] | None:
|
|
44
|
+
"""Extract alleles from a title like 'Rs12345(A;G)' or 'I4000178(C;T)'.
|
|
45
|
+
|
|
46
|
+
Returns None if the title doesn't contain a valid (allele;allele) suffix.
|
|
47
|
+
"""
|
|
48
|
+
paren_start = title.find("(")
|
|
49
|
+
if paren_start == -1 or not title.endswith(")"):
|
|
50
|
+
return None
|
|
51
|
+
inner = title[paren_start + 1 : -1]
|
|
52
|
+
semi = inner.find(";")
|
|
53
|
+
if semi == -1:
|
|
54
|
+
return None
|
|
55
|
+
a1 = inner[:semi].strip()
|
|
56
|
+
a2 = inner[semi + 1 :].strip()
|
|
57
|
+
if a1 and a2:
|
|
58
|
+
return a1, a2
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _tmpl_param(tmpl: object, name: str) -> str:
|
|
63
|
+
"""Extract a named parameter from a mwparserfromhell template."""
|
|
64
|
+
if tmpl.has(name):
|
|
65
|
+
return str(tmpl.get(name).value).strip()
|
|
66
|
+
return ""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
_STRUCTURED_SCHEMA = """
|
|
70
|
+
CREATE TABLE IF NOT EXISTS snpedia_genotypes (
|
|
71
|
+
rsid TEXT NOT NULL,
|
|
72
|
+
allele1 TEXT NOT NULL,
|
|
73
|
+
allele2 TEXT NOT NULL,
|
|
74
|
+
magnitude REAL,
|
|
75
|
+
repute TEXT,
|
|
76
|
+
summary TEXT,
|
|
77
|
+
gene TEXT,
|
|
78
|
+
scraped_at TEXT
|
|
79
|
+
);
|
|
80
|
+
|
|
81
|
+
CREATE INDEX IF NOT EXISTS idx_snpedia_rsid_alleles
|
|
82
|
+
ON snpedia_genotypes(rsid, allele1, allele2);
|
|
83
|
+
|
|
84
|
+
CREATE UNIQUE INDEX IF NOT EXISTS idx_snpedia_genotype_dedup
|
|
85
|
+
ON snpedia_genotypes(rsid, allele1, allele2, COALESCE(summary, ''));
|
|
86
|
+
|
|
87
|
+
CREATE TABLE IF NOT EXISTS database_versions (
|
|
88
|
+
name TEXT PRIMARY KEY,
|
|
89
|
+
source_url TEXT NOT NULL,
|
|
90
|
+
version TEXT,
|
|
91
|
+
downloaded_at TEXT NOT NULL,
|
|
92
|
+
record_count INTEGER NOT NULL,
|
|
93
|
+
remote_signal TEXT,
|
|
94
|
+
local_version_tag TEXT
|
|
95
|
+
);
|
|
96
|
+
"""
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def parser_is_current(conn: sqlite3.Connection) -> bool:
    """Return True if the cache was built by the current parser version.

    Checks ``local_version_tag`` first. If absent, falls back to the
    legacy ``|pv:N`` suffix in ``remote_signal`` and migrates the tag
    in-place — avoiding a full re-parse just to populate the column.

    The legacy suffix is compared exactly (the text after the last
    ``|pv:`` must equal the current version number), so a cache stamped
    ``|pv:60`` is not mistaken for parser version 6.

    Args:
        conn: Open connection to the SNPedia cache database.

    Returns:
        True when the stored parser version equals ``_PARSER_VERSION``.
        False when it differs, is missing, or the ``database_versions``
        table/column cannot be queried.
    """
    tag = f"pv:{_PARSER_VERSION}"
    try:
        row = conn.execute(
            "SELECT local_version_tag FROM database_versions WHERE name='snpedia'"
        ).fetchone()
        if row and row[0] == tag:
            return True
    except sqlite3.OperationalError:
        # The query failed (e.g. the column or table does not exist yet);
        # fall through to the legacy check.
        pass
    try:
        row = conn.execute(
            "SELECT remote_signal FROM database_versions WHERE name='snpedia'"
        ).fetchone()
        if row and row[0]:
            _, marker, stored_version = row[0].rpartition("|pv:")
            if marker and stored_version.strip() == str(_PARSER_VERSION):
                # Imported lazily, as in the original code, so the manager
                # module is only loaded when a migration is actually needed.
                from allelix.databases.manager import _ensure_local_version_tag_column

                _ensure_local_version_tag_column(conn)
                # Strip the legacy suffix and record the version in its own column.
                clean_signal = row[0].split("|pv:")[0]
                conn.execute(
                    "UPDATE database_versions "
                    "SET remote_signal = ?, local_version_tag = ? WHERE name = 'snpedia'",
                    (clean_signal, tag),
                )
                conn.commit()
                return True
    except sqlite3.OperationalError:
        pass
    return False
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _dedupe_existing(conn: sqlite3.Connection) -> int:
|
|
137
|
+
"""Collapse pre-existing duplicate rows in old caches. Returns rows removed."""
|
|
138
|
+
before = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]
|
|
139
|
+
conn.execute("""
|
|
140
|
+
DELETE FROM snpedia_genotypes
|
|
141
|
+
WHERE rowid NOT IN (
|
|
142
|
+
SELECT MIN(rowid) FROM snpedia_genotypes
|
|
143
|
+
GROUP BY rsid, UPPER(allele1), UPPER(allele2), COALESCE(summary, '')
|
|
144
|
+
)
|
|
145
|
+
""")
|
|
146
|
+
after = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]
|
|
147
|
+
return before - after
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def detect_raw_table(conn: sqlite3.Connection) -> str | None:
    """Find which table holds the raw scraped pages.

    ``_raw_pages`` takes precedence over ``pages`` when both exist.

    Returns:
        The table name, or ``None`` if neither table is present.
    """
    existing = {
        name
        for (name,) in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
    }
    for candidate in ("_raw_pages", "pages"):
        if candidate in existing:
            return candidate
    return None
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def has_structured_table(conn: sqlite3.Connection) -> bool:
    """Report whether ``snpedia_genotypes`` exists and holds at least one row.

    A failing query (for instance because the table is missing) counts as
    "no structured table" and returns False.
    """
    try:
        (row_total,) = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()
    except sqlite3.OperationalError:
        return False
    return row_total > 0
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def parse_raw_pages(db_path: str, *, verbose: bool = False) -> int:
    """Open the cache at *db_path* and rebuild its structured genotype rows.

    The connection is opened here and always closed on exit; the actual
    work is delegated to ``_parse_raw_pages_inner``.

    Returns:
        The number of structured rows created.
    """
    conn = sqlite3.connect(db_path)
    with contextlib.closing(conn):
        return _parse_raw_pages_inner(conn, verbose=verbose)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _parse_raw_pages_inner(conn: sqlite3.Connection, *, verbose: bool = False) -> int:
    """Inner parser logic. Caller owns the connection lifecycle.

    Rebuilds ``snpedia_genotypes`` from scratch out of the raw pages table:

    1. Locate the raw table; return 0 immediately if there is none.
    2. Drop the dedup index, collapse duplicates in an existing table, then
       (re)apply ``_STRUCTURED_SCHEMA`` and empty the genotype table.
    3. Build an rsid -> gene map from pages with ``category = 'snp'``.
    4. Parse pages with ``category = 'genotype'`` and insert one row per
       page that has a ``genotype`` template and two alleles.
    5. Replace the 'snpedia' row in ``database_versions`` and commit.

    Args:
        conn: Open connection to the cache database; it is committed here
            but never closed.
        verbose: When true, logs which raw table is being parsed.
            NOTE(review): the ``print`` progress lines below are emitted
            regardless of this flag.

    Returns:
        The number of rows in ``snpedia_genotypes`` after the rebuild.
    """
    raw_table = detect_raw_table(conn)
    if raw_table is None:
        return 0

    if verbose:
        logger.info("Parsing SNPedia raw pages from '%s' table", raw_table)

    # The unique index is recreated by _STRUCTURED_SCHEMA further down.
    conn.execute("DROP INDEX IF EXISTS idx_snpedia_genotype_dedup")

    has_table = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name='snpedia_genotypes'"
    ).fetchone()
    if has_table:
        # Presumably needed so the UNIQUE index can be created on an old cache
        # that still contains duplicate rows — TODO confirm.
        deduped = _dedupe_existing(conn)
        if deduped:
            logger.info("Backfill dedupe: removed %d duplicate row(s)", deduped)

    conn.executescript(_STRUCTURED_SCHEMA)
    # Start from an empty table; every row is regenerated from the raw pages.
    conn.execute("DELETE FROM snpedia_genotypes")

    # Build gene map from SNP pages
    gene_map: dict[str, str] = {}
    snp_rows = conn.execute(
        f"SELECT title, content FROM {raw_table} WHERE category = 'snp'"
    ).fetchall()
    print(f" Building gene map from {len(snp_rows)} SNP pages...", flush=True)
    for title, content in snp_rows:
        parsed_prefix = _parse_title_prefix(title)
        if not parsed_prefix or not content:
            continue
        prefix, num = parsed_prefix
        # Keys are the lower-cased prefix plus digits, e.g. "rs12345" / "i4000178".
        snp_key = f"{prefix.lower()}{num}"
        try:
            wikicode = mwparserfromhell.parse(content)
            for template in wikicode.filter_templates():
                tname = template.name.strip().lower()
                # The first template that yields a non-empty gene wins.
                if tname in ("rsnum", "snp"):
                    gene = _tmpl_param(template, "Gene")
                    if gene:
                        gene_map[snp_key] = gene
                        break
                if tname == "23andme snp":
                    gene = _tmpl_param(template, "Gene_s")
                    if gene:
                        gene_map[snp_key] = gene
                        break
        except Exception:
            # Best effort: an unparseable page only costs its gene mapping.
            logger.debug("Failed to parse SNP page %s", title, exc_info=True)
            continue

    print(f" Gene map: {len(gene_map)} mappings built.", flush=True)

    # Parse genotype pages
    genotype_rows = conn.execute(
        f"SELECT title, content, scraped_at FROM {raw_table} WHERE category = 'genotype'"
    ).fetchall()
    print(f" Parsing {len(genotype_rows)} genotype pages...", flush=True)

    # Pending rows, flushed to the database every 1000 entries.
    batch: list[tuple[str, str, str, float | None, str | None, str | None, str | None, str]] = []

    for title, content, scraped_at in genotype_rows:
        parsed_prefix = _parse_title_prefix(title)
        if not parsed_prefix or not content:
            continue

        prefix, num = parsed_prefix
        snp_id = f"{prefix.lower()}{num}"

        try:
            wikicode = mwparserfromhell.parse(content)
        except Exception:
            logger.debug("Failed to parse genotype page %s", title, exc_info=True)
            continue

        templates = [
            t for t in wikicode.filter_templates() if t.name.strip().lower() == "genotype"
        ]
        if not templates:
            continue

        # Only the first "genotype" template on the page is used.
        tmpl = templates[0]

        allele1 = _tmpl_param(tmpl, "allele1").upper()
        allele2 = _tmpl_param(tmpl, "allele2").upper()
        if not allele1 or not allele2:
            # Template lacks an allele: fall back to the "(A;G)" title suffix.
            title_alleles = _parse_title_alleles(title)
            if not title_alleles:
                continue
            allele1, allele2 = title_alleles[0].upper(), title_alleles[1].upper()
        if not allele1 or not allele2:
            continue

        # Store alleles in sorted order so (A;G) and (G;A) share one key.
        if allele1 > allele2:
            allele1, allele2 = allele2, allele1

        mag_str = _tmpl_param(tmpl, "magnitude")
        magnitude: float | None = None
        if mag_str:
            try:
                magnitude = float(mag_str)
            except ValueError:
                # Non-numeric magnitude text is stored as NULL.
                magnitude = None

        # Empty template values become NULL rather than ''.
        repute = _tmpl_param(tmpl, "repute") or None
        summary = _tmpl_param(tmpl, "summary") or None
        gene = gene_map.get(snp_id) or None

        batch.append((snp_id, allele1, allele2, magnitude, repute, summary, gene, scraped_at))

        if len(batch) >= 1000:
            # OR IGNORE: rows that collide with the unique dedup index are skipped.
            conn.executemany(
                "INSERT OR IGNORE INTO snpedia_genotypes "
                "(rsid, allele1, allele2, magnitude, repute, summary, gene, scraped_at) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                batch,
            )
            batch.clear()

    # Flush whatever is left after the last full batch.
    if batch:
        conn.executemany(
            "INSERT OR IGNORE INTO snpedia_genotypes "
            "(rsid, allele1, allele2, magnitude, repute, summary, gene, scraped_at) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            batch,
        )

    row_count = conn.execute("SELECT COUNT(*) FROM snpedia_genotypes").fetchone()[0]

    # First 10 characters of the earliest scraped_at value
    # (assumes an ISO "YYYY-MM-DD..." timestamp — TODO confirm).
    date_row = conn.execute(f"SELECT MIN(scraped_at) FROM {raw_table}").fetchone()
    scrape_date = date_row[0][:10] if date_row and date_row[0] else "unknown"

    # Carry the previous remote_signal over, minus any legacy "|pv:N" suffix.
    existing_signal = ""
    try:
        sig_row = conn.execute(
            "SELECT remote_signal FROM database_versions WHERE name = 'snpedia'"
        ).fetchone()
        if sig_row and sig_row[0]:
            existing_signal = sig_row[0].split("|pv:")[0]
    except sqlite3.OperationalError:
        pass

    # Replace the version row; local_version_tag records the parser version.
    conn.execute("DELETE FROM database_versions WHERE name = 'snpedia'")
    conn.execute(
        "INSERT INTO database_versions "
        "(name, source_url, version, downloaded_at, record_count, "
        "remote_signal, local_version_tag) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        (
            "snpedia",
            "https://bots.snpedia.com/api.php",
            f"scraped {scrape_date} ({row_count} genotypes)",
            datetime.now(UTC).isoformat(),
            row_count,
            existing_signal,
            f"pv:{_PARSER_VERSION}",
        ),
    )

    conn.commit()
    return row_count
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""PLINK1 binary format (.bed/.bim/.fam) exporter."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from allelix.utils.allele import complement, is_strand_ambiguous
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from allelix.models import Variant
|
|
16
|
+
|
|
17
|
+
# Three-byte header written at the start of the .bed file. Per the PLINK 1
# format reference: two magic bytes followed by 0x01 (variant-major layout).
_BED_MAGIC = bytes([0x6C, 0x1B, 0x01])

# Chromosome names that are replaced in the .bim file; any name not listed
# here is written unchanged.
_CHROM_CODES = {
    "X": "23",
    "Y": "24",
    "MT": "26",
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _orient_genotype(
|
|
27
|
+
allele1: str,
|
|
28
|
+
allele2: str,
|
|
29
|
+
ref: str,
|
|
30
|
+
alt: str,
|
|
31
|
+
) -> tuple[str, str] | None:
|
|
32
|
+
"""Map user alleles to {ref, alt} in a consistent orientation.
|
|
33
|
+
|
|
34
|
+
Returns None for palindromic sites, indels, or alleles that don't fit.
|
|
35
|
+
Both alleles are tested in the same orientation — no mixed-strand.
|
|
36
|
+
"""
|
|
37
|
+
if len(allele1) != 1 or len(allele2) != 1:
|
|
38
|
+
return None
|
|
39
|
+
if is_strand_ambiguous(ref, alt):
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
pair = {allele1, allele2}
|
|
43
|
+
if pair <= {ref, alt}:
|
|
44
|
+
return (allele1, allele2)
|
|
45
|
+
|
|
46
|
+
c1, c2 = complement(allele1), complement(allele2)
|
|
47
|
+
if {c1, c2} <= {ref, alt}:
|
|
48
|
+
return (c1, c2)
|
|
49
|
+
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def export_plink(
    variants: Iterator[Variant],
    prefix: Path,
    build: str,
    ref_alt_map: dict[str, tuple[str, str]] | None = None,
) -> tuple[int, int, int, int]:
    """Write .bed/.bim/.fam from parsed variants.

    The text outputs (.fam, .bim) are written as UTF-8 with ``\\n`` line
    endings on every platform, so the files do not depend on the host's
    locale or newline translation.

    Args:
        variants: Parsed variant iterator (consumed once).
        prefix: Base path for output files.
        build: Genome build label (informational, not used for liftover).
        ref_alt_map: ``{rsid: (ref, alt)}`` from gnomAD coordinate resolution.
            When provided, uses ref/alt to assign A1/A2 for proper allele coding.
            When None, the rsid is missing, or the genotype cannot be oriented
            to ref/alt, falls back to ``A2="0"`` for homozygotes.

    Returns:
        ``(variants_written, no_calls_skipped, indels_skipped, monomorphic_count)``

    Note:
        No-call variants and indels (multi-character alleles) are dropped.
        PLINK1 BIM is SNV-only (single-character A1/A2). Indels would
        produce non-standard BIM rows that downstream tools may reject.
    """
    # NOTE(review): with_suffix() replaces an existing suffix, so a dotted
    # prefix such as "sample.v2" produces "sample.fam" — confirm callers
    # never pass one.
    fam_path = prefix.with_suffix(".fam")
    bim_path = prefix.with_suffix(".bim")
    bed_path = prefix.with_suffix(".bed")

    text_mode = {"encoding": "utf-8", "newline": "\n"}

    # Single-sample .fam: one fixed row.
    with fam_path.open("w", **text_mode) as fam_f:
        fam_f.write("0\tSAMPLE\t0\t0\t0\t-9\n")

    written = 0
    skipped = 0
    indels = 0
    monomorphic = 0

    # 2-bit genotype code indexed by the number of A2 (alt) alleles carried:
    # 0 -> homozygous A1, 1 -> heterozygous, 2 -> homozygous A2.
    alt_count_codes = (0b00, 0b10, 0b11)

    with bim_path.open("w", **text_mode) as bim_f, bed_path.open("wb") as bed_f:
        bed_f.write(_BED_MAGIC)

        for v in variants:
            if v.is_no_call:
                skipped += 1
                continue

            if len(v.allele1) != 1 or len(v.allele2) != 1:
                indels += 1
                continue

            chrom_code = _CHROM_CODES.get(v.chromosome, v.chromosome)
            a1: str
            a2: str
            bed_code: int

            # Try ref/alt-aware coding first; ``resolved`` stays None when no
            # ref/alt is known or the genotype does not fit it.
            resolved = None
            if ref_alt_map and v.rsid in ref_alt_map:
                ref, alt = ref_alt_map[v.rsid]
                resolved = _orient_genotype(v.allele1, v.allele2, ref, alt)

            if resolved is not None:
                a1 = ref
                a2 = alt
                bed_code = alt_count_codes[resolved.count(alt)]
            else:
                a1, a2, bed_code, is_mono = _fallback_coding(v)
                if is_mono:
                    monomorphic += 1

            bim_f.write(f"{chrom_code}\t{v.rsid}\t0\t{v.position}\t{a1}\t{a2}\n")
            # One sample per variant, so each variant occupies exactly one byte.
            bed_f.write(bytes([bed_code]))
            written += 1

    return written, skipped, indels, monomorphic
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _fallback_coding(v: Variant) -> tuple[str, str, int, bool]:
|
|
136
|
+
"""Fallback allele coding when ref/alt is unknown.
|
|
137
|
+
|
|
138
|
+
Returns ``(a1, a2, bed_code, is_monomorphic)``.
|
|
139
|
+
"""
|
|
140
|
+
if v.is_heterozygous:
|
|
141
|
+
alleles = sorted([v.allele1, v.allele2])
|
|
142
|
+
return alleles[0], alleles[1], 0b10, False
|
|
143
|
+
|
|
144
|
+
return v.allele1, "0", 0b00, True
|
allelix/models.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Core data models for genotype variants and reference annotations.
|
|
4
|
+
|
|
5
|
+
Trust boundary: parsers are responsible for validating raw input. Model
|
|
6
|
+
constructors do not enforce chromosome names, position bounds, or allele
|
|
7
|
+
encodings — they trust their caller. If a Variant or Annotation is
|
|
8
|
+
constructed by code outside the `allelix.parsers` package, the caller owns
|
|
9
|
+
the validation.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
|
|
16
|
+
NO_CALL_MARKER = "-"
DEFAULT_BUILD = "GRCh37"


@dataclass
class Variant:
    """One genotype call: the pair of alleles a sample carries at a position.

    Every parser emits this shape, so annotators and reports work with
    Variants only and never touch vendor-specific file formats.

    Attributes:
        rsid: dbSNP reference identifier (e.g., "rs1801133").
        chromosome: Chromosome name. "1"-"22", "X", "Y", or "MT".
        position: 1-based genomic coordinate in the given build.
        allele1: First observed allele. A/T/G/C, multi-base for indels, or "-" for no-call.
        allele2: Second observed allele. Same encoding as allele1.
        build: Reference genome build. "GRCh37" (hg19) or "GRCh38" (hg38).
    """

    rsid: str
    chromosome: str
    position: int
    allele1: str
    allele2: str
    build: str = DEFAULT_BUILD

    @property
    def is_no_call(self) -> bool:
        """Whether at least one allele equals the no-call marker.

        This usually means the assay failed at this position, though the
        exact meaning depends on the source format (some VCFs use `-` for
        indel deletions).
        """
        return NO_CALL_MARKER in (self.allele1, self.allele2)

    @property
    def is_heterozygous(self) -> bool:
        """Whether the call has two different alleles; always False for a no-call."""
        return not self.is_no_call and self.allele1 != self.allele2

    @property
    def genotype(self) -> str:
        """The two alleles joined by a slash (e.g., "C/T")."""
        return "{}/{}".format(self.allele1, self.allele2)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class Annotation:
    """A claim about a variant sourced from a specific reference database.

    Allelix never asserts variant significance directly — every Annotation is
    attributed to its source database. See README § Regulatory Posture.

    Attributes:
        source: Lowercase database identifier (e.g., "clinvar", "pharmgkb").
        rsid: The variant this annotation applies to.
        significance: Source-prefixed classification (e.g., "clinvar_pathogenic").
        category: Coarse filter bucket. Use non-diagnostic labels: "clinical",
            "pharma", "carrier", "trait", "methylation". Never bare medical terms
            like "pathogenic" — those would read as Allelix's own classification.
        magnitude: 0-10 importance score (SNPedia-style).
        description: Human-readable explanation.
        attribution: Display name of the source ("ClinVar", "PharmGKB", ...).
        genotype_match: Which genotype triggers this annotation (e.g., "T/T").
        references: PubMed IDs or URLs supporting the claim.
        condition: Disease or condition name, if applicable.
        gene: Gene symbol, if known.
        review_status: ClinVar review status (CLNREVSTAT), empty for non-ClinVar.
        alt: Empty by default. NOTE(review): presumably the alternate allele
            the annotation refers to — confirm against the annotators.
        is_must_include: Internal flag for GWAS rollup; excluded from public output.
        allele_frequency: None by default. NOTE(review): presumably a
            population allele frequency — confirm source and scale.
        am_pathogenicity: None by default. NOTE(review): presumably an
            AlphaMissense pathogenicity score — confirm.
        am_class: Empty by default. NOTE(review): presumably the AlphaMissense
            class label — confirm.
        cadd_phred: None by default. NOTE(review): presumably a CADD
            PHRED-scaled score — confirm.
    """

    source: str
    rsid: str
    significance: str
    category: str
    magnitude: float
    description: str
    attribution: str
    genotype_match: str
    references: list[str] = field(default_factory=list)
    condition: str = ""
    gene: str = ""
    review_status: str = ""
    alt: str = ""
    is_must_include: bool = False
    allele_frequency: float | None = None
    am_pathogenicity: float | None = None
    am_class: str = ""
    cadd_phred: float | None = None

    @property
    def zygosity(self) -> str:
        """Classify the genotype call as Heterozygous, Homozygous, or No Call.

        An empty ``genotype_match`` or one containing the no-call marker is
        "No Call". A two-part ``"A/B"`` string is compared part by part; any
        other shape (e.g. ``"AA"``) is Homozygous only when every character
        is the same.
        """
        if not self.genotype_match:
            # No genotype recorded: report it as uncalled instead of letting
            # the character-set fallback below label it "Heterozygous".
            return "No Call"
        if NO_CALL_MARKER in self.genotype_match:
            return "No Call"
        parts = self.genotype_match.split("/")
        if len(parts) != 2:
            return "Homozygous" if len(set(self.genotype_match)) == 1 else "Heterozygous"
        return "Heterozygous" if parts[0] != parts[1] else "Homozygous"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Parser registry. Auto-detection tries each registered parser; first match wins."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from allelix.parsers.ancestrydna import AncestryDNAParser
|
|
10
|
+
from allelix.parsers.base import GenotypeParser
|
|
11
|
+
from allelix.parsers.ftdna import FTDNAParser
|
|
12
|
+
from allelix.parsers.livingdna import LivingDNAParser
|
|
13
|
+
from allelix.parsers.myhappygenes import MyHappyGenesParser
|
|
14
|
+
from allelix.parsers.myheritage import MyHeritageParser
|
|
15
|
+
from allelix.parsers.twentythreeandme import TwentyThreeAndMeParser
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
# Registered parser instances. Order is significant: detect_parser() walks
# this list front to back and returns the first parser whose can_parse()
# accepts the file.
PARSERS: list[GenotypeParser] = [
    MyHappyGenesParser(),
    TwentyThreeAndMeParser(),
    AncestryDNAParser(),
    LivingDNAParser(),
    MyHeritageParser(),
    FTDNAParser(),
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ParserNotFoundError(ValueError):
    """Signals that no registered parser matches a file or a requested parser name."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_parser_by_name(name: str) -> GenotypeParser:
    """Return the registered parser whose `name` attribute equals *name*.

    The comparison is exact; the first matching entry in ``PARSERS`` wins.

    Args:
        name: Lowercase parser identifier (e.g., "myhappygenes").

    Raises:
        ParserNotFoundError: If no registered parser has that name. The
            message lists the names that are available.
    """
    found = next((candidate for candidate in PARSERS if candidate.name == name), None)
    if found is not None:
        return found

    known_names = ", ".join(candidate.name for candidate in PARSERS)
    raise ParserNotFoundError(f"Unknown parser {name!r}. Available: {known_names}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def detect_parser(file_path: Path) -> GenotypeParser:
    """Pick the parser for *file_path* by asking each registered parser in turn.

    Parsers are consulted in ``PARSERS`` order and the first one whose
    ``can_parse`` accepts the file is returned; later parsers are not asked.

    Args:
        file_path: Path to the genotype file.

    Raises:
        ParserNotFoundError: If no parser recognizes the format.
    """
    match = next((candidate for candidate in PARSERS if candidate.can_parse(file_path)), None)
    if match is None:
        raise ParserNotFoundError(
            f"No parser recognized {file_path.name!r}. Try forcing a format with --format <name>."
        )
    return match
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Names exported by ``from allelix.parsers import *``.
__all__ = [
    "PARSERS",
    "GenotypeParser",
    "ParserNotFoundError",
    "detect_parser",
    "get_parser_by_name",
]
|