allelix 1.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- allelix/__init__.py +12 -0
- allelix/annotators/__init__.py +90 -0
- allelix/annotators/alphamissense.py +228 -0
- allelix/annotators/base.py +214 -0
- allelix/annotators/cadd.py +283 -0
- allelix/annotators/clinvar.py +404 -0
- allelix/annotators/gnomad.py +212 -0
- allelix/annotators/gwas.py +354 -0
- allelix/annotators/pharmgkb.py +406 -0
- allelix/annotators/snpedia.py +276 -0
- allelix/cli.py +1524 -0
- allelix/compare.py +149 -0
- allelix/config.py +143 -0
- allelix/data/__init__.py +3 -0
- allelix/data/high_value_snps.yaml +64 -0
- allelix/databases/__init__.py +30 -0
- allelix/databases/_versions.py +16 -0
- allelix/databases/alphamissense_loader.py +48 -0
- allelix/databases/cadd_loader.py +49 -0
- allelix/databases/cpic_loader.py +234 -0
- allelix/databases/gnomad_loader.py +49 -0
- allelix/databases/gwas_loader.py +546 -0
- allelix/databases/loader_utils.py +80 -0
- allelix/databases/manager.py +515 -0
- allelix/databases/pharmgkb_loader.py +437 -0
- allelix/databases/schema.py +165 -0
- allelix/databases/snpedia_loader.py +44 -0
- allelix/databases/snpedia_parser.py +342 -0
- allelix/exporters/__init__.py +3 -0
- allelix/exporters/plink.py +144 -0
- allelix/models.py +117 -0
- allelix/parsers/__init__.py +73 -0
- allelix/parsers/_helpers.py +41 -0
- allelix/parsers/ancestrydna.py +130 -0
- allelix/parsers/base.py +97 -0
- allelix/parsers/ftdna.py +129 -0
- allelix/parsers/livingdna.py +121 -0
- allelix/parsers/myhappygenes.py +135 -0
- allelix/parsers/myheritage.py +118 -0
- allelix/parsers/twentythreeandme.py +150 -0
- allelix/py.typed +0 -0
- allelix/reports/__init__.py +40 -0
- allelix/reports/_pipeline.py +497 -0
- allelix/reports/diff.py +169 -0
- allelix/reports/high_value.py +133 -0
- allelix/reports/html.py +1130 -0
- allelix/reports/json_report.py +163 -0
- allelix/reports/methylation.py +50 -0
- allelix/reports/terminal.py +203 -0
- allelix/utils/__init__.py +3 -0
- allelix/utils/allele.py +87 -0
- allelix/utils/build_detect.py +203 -0
- allelix-1.8.1.dist-info/METADATA +276 -0
- allelix-1.8.1.dist-info/RECORD +58 -0
- allelix-1.8.1.dist-info/WHEEL +5 -0
- allelix-1.8.1.dist-info/entry_points.txt +2 -0
- allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
- allelix-1.8.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Shared analysis pipeline used by `analyze`, `methylation`, and `pharmacogenomics`.
|
|
4
|
+
|
|
5
|
+
The CLI builds an `AnalysisResult` once and hands it to a renderer
|
|
6
|
+
(terminal, JSON, HTML). Renderers never query the database or re-iterate
|
|
7
|
+
the parser — they receive a fully-populated value object.
|
|
8
|
+
|
|
9
|
+
ADR-0021: this pipeline owns build detection. Parsers report the
|
|
10
|
+
header-claimed build; the pipeline replaces each variant's `build`
|
|
11
|
+
with the build detected from position data (or the user's `--build`
|
|
12
|
+
override) before annotators see the variant.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import contextlib
|
|
18
|
+
import logging
|
|
19
|
+
from dataclasses import dataclass, field, replace
|
|
20
|
+
from typing import TYPE_CHECKING
|
|
21
|
+
|
|
22
|
+
from allelix.utils.build_detect import (
|
|
23
|
+
BUILD_GRCH36,
|
|
24
|
+
BUILD_GRCH37,
|
|
25
|
+
BUILD_GRCH38,
|
|
26
|
+
KNOWN_SNP_POSITIONS,
|
|
27
|
+
detect_build,
|
|
28
|
+
normalize_build_label,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from collections.abc import Callable, Iterable
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
from allelix.annotators.alphamissense import AlphaMissenseAnnotator
|
|
36
|
+
from allelix.annotators.base import Annotator
|
|
37
|
+
from allelix.annotators.cadd import CaddAnnotator
|
|
38
|
+
from allelix.annotators.gnomad import GnomadAnnotator
|
|
39
|
+
from allelix.models import Annotation, Variant
|
|
40
|
+
from allelix.parsers.base import GenotypeParser
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Maximum number of input variants to buffer while waiting for build
# detection to converge. Detection completes once every entry in
# KNOWN_SNP_POSITIONS has been seen; typical files cover the table within
# the first ~5000 probes. The cap keeps a file with no known SNPs from
# buffering the whole input.
_DETECTION_BUFFER_LIMIT = 100_000
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class BuildDiagnostics:
    """Record of how the genome build for an input file was decided.

    `header_build` is the build the file header claims, normalized to
    GRCh37/GRCh38 via `normalize_build_label`. It is None when the header
    is silent or uses a label that is not recognized.

    `detected_build` is the build inferred from position data. It is None
    when no known SNPs appeared in the input.

    `effective_build` is the build actually used for annotation: a CLI
    `--build` override, the detected build, or a fallback. It is always
    set when the pipeline ran on any data.

    `override` records whether an explicit build override was supplied.

    `matched_count` and `inspected_count` carry the counts reported by the
    build detector.

    `mismatch` is True when the header and the position data disagree and
    no override was supplied. The CLI surfaces this as a warning.
    """

    header_build: str | None
    detected_build: str | None
    effective_build: str
    override: bool
    matched_count: int
    inspected_count: int

    @property
    def mismatch(self) -> bool:
        """Return True when header and detected builds are both known and differ."""
        # An explicit override settles the question; never warn in that case.
        if self.override:
            return False
        # A disagreement needs two known values to compare.
        if self.header_build is None or self.detected_build is None:
            return False
        return self.header_build != self.detected_build
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class AnalysisResult:
    """Fully-populated value object that renderers turn into a report."""

    file_path: Path
    parser_name: str
    parser_display_name: str
    sample_id: str
    build: str
    total_variants: int
    skipped_count: int
    annotators_used: list[tuple[str, str | None]]
    annotations: list[Annotation] = field(default_factory=list)
    build_diagnostics: BuildDiagnostics | None = None

    def filter(
        self,
        *,
        min_magnitude: float = 0.0,
        category: str | None = None,
        genes: Iterable[str] | None = None,
        source_min_magnitudes: dict[str, float] | None = None,
    ) -> list[Annotation]:
        """Return the annotations that pass every filter, in report order.

        The filters are independent and are combined with AND. The result
        is ordered by magnitude descending, then rsid ascending; the sort
        is stable, so the ordering is deterministic.

        `min_magnitude` is the global magnitude floor. `category` keeps
        only annotations with exactly that category. `genes` keeps only
        annotations whose gene matches one of the given names, compared
        case-insensitively; an empty or missing value disables the filter.

        `source_min_magnitudes` replaces the floor for specific sources
        (e.g. ``{"gwas": 9.0, "snpedia": 2.0}``). An entry IS the floor for
        that source, so it can raise or lower the global value. Sources
        without an entry, and must-include annotations, use the global
        floor.
        """
        wanted_genes = {name.upper() for name in genes} if genes else None
        per_source = source_min_magnitudes or {}

        def passes(ann: Annotation) -> bool:
            # Must-include rows ignore the per-source floor on purpose.
            if ann.source in per_source and not ann.is_must_include:
                threshold = per_source[ann.source]
            else:
                threshold = min_magnitude
            if ann.magnitude < threshold:
                return False
            if category is not None and ann.category != category:
                return False
            if wanted_genes is None:
                return True
            # A missing gene is compared as the empty string.
            return (ann.gene or "").upper() in wanted_genes

        kept = [ann for ann in self.annotations if passes(ann)]
        return sorted(kept, key=lambda ann: (-ann.magnitude, ann.rsid))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _gwas_base_trait(description: str) -> str | None:
|
|
144
|
+
"""Extract trait text from a GWAS description, stripping MTAG suffix and PheCode label."""
|
|
145
|
+
marker = "GWAS Catalog: "
|
|
146
|
+
if marker not in description:
|
|
147
|
+
return None
|
|
148
|
+
s = description.split(marker, 1)[1]
|
|
149
|
+
s = s.split(" (p=", 1)[0]
|
|
150
|
+
if s.endswith(" (MTAG)"):
|
|
151
|
+
s = s[: -len(" (MTAG)")]
|
|
152
|
+
s = s.split(" (PheCode ", 1)[0]
|
|
153
|
+
return s.strip().lower()
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _gwas_phecode_parent(description: str) -> str | None:
|
|
157
|
+
"""Extract PheCode parent (numeric prefix before the dot), or None."""
|
|
158
|
+
idx = description.find("(PheCode ")
|
|
159
|
+
if idx == -1:
|
|
160
|
+
return None
|
|
161
|
+
rest = description[idx + len("(PheCode ") :]
|
|
162
|
+
end = rest.find(")")
|
|
163
|
+
if end == -1:
|
|
164
|
+
return None
|
|
165
|
+
code = rest[:end].strip()
|
|
166
|
+
parent = code.split(".", 1)[0]
|
|
167
|
+
return parent if parent.isdigit() else None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _gwas_p_value(description: str) -> float:
|
|
171
|
+
"""Extract p-value from a GWAS description. Returns inf if unparseable."""
|
|
172
|
+
idx = description.find("(p=")
|
|
173
|
+
if idx == -1:
|
|
174
|
+
return float("inf")
|
|
175
|
+
rest = description[idx + len("(p=") :]
|
|
176
|
+
end = rest.find(",")
|
|
177
|
+
if end == -1:
|
|
178
|
+
end = rest.find(")")
|
|
179
|
+
if end == -1:
|
|
180
|
+
return float("inf")
|
|
181
|
+
try:
|
|
182
|
+
return float(rest[:end].strip())
|
|
183
|
+
except ValueError:
|
|
184
|
+
return float("inf")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def rollup_gwas_duplicates(annotations: list[Annotation]) -> list[Annotation]:
    """Collapse GWAS MTAG twins and PheCode parent/child hierarchies.

    Intended for the filtered annotation list (the output of
    AnalysisResult.filter). Rows whose source is not ``"gwas"`` pass
    through untouched, and must-include rows are never dropped.

    Two reductions are applied to the GWAS rows:

    * an MTAG row is dropped when a non-MTAG row exists for the same rsid
      and base trait;
    * rows sharing an rsid and PheCode parent are reduced to the single
      row with the smallest p-value.

    When the input holds no GWAS rows the input list itself is returned.
    Otherwise a new list is returned, ordered by magnitude descending and
    then rsid ascending.

    See ADR-0024 'MTAG and PheCode rollup' for rules.
    """
    non_gwas = [row for row in annotations if row.source != "gwas"]
    gwas = [row for row in annotations if row.source == "gwas"]
    if not gwas:
        return annotations

    def is_mtag(row: Annotation) -> bool:
        return "(MTAG)" in row.description

    def twin_key(row: Annotation) -> tuple[str, str | None]:
        return (row.rsid, _gwas_base_trait(row.description))

    # Keys of the plain rows; an MTAG row matching one of them is a twin.
    plain_keys = {twin_key(row) for row in gwas if not is_mtag(row)}

    ungrouped: list[Annotation] = []
    groups: dict[tuple[str, str], list[Annotation]] = {}
    for row in gwas:
        if not row.is_must_include and is_mtag(row) and twin_key(row) in plain_keys:
            continue
        parent = _gwas_phecode_parent(row.description)
        if row.is_must_include or parent is None:
            ungrouped.append(row)
        else:
            groups.setdefault((row.rsid, parent), []).append(row)

    winners = [
        min(members, key=lambda row: _gwas_p_value(row.description))
        for members in groups.values()
    ]

    # Concatenation order matters: the sort below is stable, so ties keep
    # the order non-GWAS, ungrouped GWAS, then group winners.
    result = non_gwas + ungrouped + winners
    result.sort(key=lambda row: (-row.magnitude, row.rsid))
    return result
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _lookup_user_allele(
|
|
235
|
+
user_alt: str,
|
|
236
|
+
coords: list[tuple[str, int, str, str]],
|
|
237
|
+
scores: dict[tuple[str, int, str, str], float],
|
|
238
|
+
resolve_strand: Callable[[str, str, str], str | None],
|
|
239
|
+
) -> float | None:
|
|
240
|
+
"""Find the CADD score for a specific user allele at a multi-allelic site.
|
|
241
|
+
|
|
242
|
+
Prefers a direct allele match over a complement (minus-strand) match
|
|
243
|
+
to avoid false positives where the complement of the user's allele
|
|
244
|
+
coincidentally equals a different alt at the same position.
|
|
245
|
+
"""
|
|
246
|
+
for chrom, pos, ref, alt in coords:
|
|
247
|
+
if user_alt == alt:
|
|
248
|
+
return scores.get((chrom, pos, ref, alt))
|
|
249
|
+
for chrom, pos, ref, alt in coords:
|
|
250
|
+
resolved = resolve_strand(user_alt, ref, alt)
|
|
251
|
+
if resolved is not None and resolved == alt:
|
|
252
|
+
return scores.get((chrom, pos, ref, alt))
|
|
253
|
+
return None
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _enrich_cadd(
    annotations: list[Annotation],
    gnomad: GnomadAnnotator,
    cadd: CaddAnnotator,
) -> None:
    """Set `cadd_phred` on annotations, resolving rsIDs through gnomAD.

    The rsIDs of `annotations` are resolved to ``(chrom, pos, ref, alt)``
    coordinates with `gnomad.bulk_resolve_coordinates`, and every resolved
    coordinate is looked up in one `cadd.bulk_lookup` call. Annotations
    are modified in place; nothing is returned.

    An annotation that carries an `alt` gets the score for that allele
    (direct match first, then strand-resolved). One without an `alt` gets
    the highest score among its coordinates. If either bulk call returns
    nothing, no annotation is touched.
    """
    from allelix.utils.allele import resolve_strand

    coord_map = gnomad.bulk_resolve_coordinates({ann.rsid for ann in annotations})
    if not coord_map:
        return

    wanted: set[tuple[str, int, str, str]] = {
        (chrom, pos, ref, alt)
        for sites in coord_map.values()
        for chrom, pos, ref, alt in sites
    }
    scores = cadd.bulk_lookup(wanted)
    if not scores:
        return

    for ann in annotations:
        sites = coord_map.get(ann.rsid)
        if not sites:
            continue
        if ann.alt:
            ann.cadd_phred = _lookup_user_allele(ann.alt, sites, scores, resolve_strand)
            continue
        # No specific allele: report the highest score seen at the site.
        candidates = (scores.get((chrom, pos, ref, alt)) for chrom, pos, ref, alt in sites)
        ann.cadd_phred = max(
            (value for value in candidates if value is not None),
            default=None,
        )
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def run_analysis(
    file_path: Path,
    parser: GenotypeParser,
    annotators: list[Annotator],
    skipped_count_provider: Callable[[], int] = lambda: 0,
    *,
    build_override: str | None = None,
    gnomad: GnomadAnnotator | None = None,
    alphamissense: AlphaMissenseAnnotator | None = None,
    cadd: CaddAnnotator | None = None,
) -> AnalysisResult:
    """Stream the file once, annotate every variant, and return the result.

    `build_override` short-circuits build detection: when supplied (e.g.
    from `--build grch37`), every variant gets that build and the
    position-data detector is skipped. When it is None, the head of the
    stream is buffered until detection is confident and is then flushed
    through annotation.

    `annotators` are entered into a `contextlib.ExitStack` so that their
    resources (e.g. SQLite connections) are closed deterministically.

    `gnomad`, `alphamissense` and `cadd` are optional enrichers that are
    applied to the collected annotations after streaming, each only when
    its `is_ready()` returns true. CADD enrichment additionally needs a
    ready `gnomad`.

    `skipped_count_provider` is called once, when the result is built, to
    obtain the skipped-row count.
    """
    metadata = parser.get_metadata(file_path)
    state = _BuildDetectionState(
        override=build_override,
        header_build=normalize_build_label(metadata.get("build")),
    )
    collected: list[Annotation] = []
    seen = 0

    with contextlib.ExitStack() as stack:
        opened = [stack.enter_context(annotator) for annotator in annotators]

        def annotate_batch(batch: Iterable[Variant]) -> None:
            for item in batch:
                for annotator in opened:
                    collected.extend(annotator.annotate(item))

        for variant in parser.parse(file_path):
            seen += 1
            ready, batch = state.feed(variant)
            if ready:
                annotate_batch(batch)
        # End of stream: flush any buffered variants with the best
        # effective build we can resolve (detected → header → default).
        annotate_batch(state.flush())

    if gnomad is not None and gnomad.is_ready():
        alt_keys = {(ann.rsid, ann.alt) for ann in collected if ann.alt}
        bare_rsids = {ann.rsid for ann in collected if not ann.alt}
        freq_by_alt = gnomad.bulk_lookup_by_alt(alt_keys)
        freq_by_rsid = gnomad.bulk_lookup(bare_rsids)
        for ann in collected:
            ann.allele_frequency = (
                freq_by_alt.get((ann.rsid, ann.alt)) if ann.alt else freq_by_rsid.get(ann.rsid)
            )

    if alphamissense is not None and alphamissense.is_ready():
        alt_keys = {(ann.rsid, ann.alt) for ann in collected if ann.alt}
        bare_rsids = {ann.rsid for ann in collected if not ann.alt}
        am_by_alt = alphamissense.bulk_lookup_by_alt(alt_keys)
        am_by_rsid = alphamissense.bulk_lookup(bare_rsids)
        for ann in collected:
            if ann.alt:
                hit = am_by_alt.get((ann.rsid, ann.alt))
            else:
                hit = am_by_rsid.get(ann.rsid)
            if hit is None:
                continue
            ann.am_pathogenicity, ann.am_class = hit

    if cadd is not None and cadd.is_ready() and gnomad is not None and gnomad.is_ready():
        wrong_build_for_full_mode = (
            getattr(cadd, "_full_mode", False) and state.effective_build != BUILD_GRCH38
        )
        if wrong_build_for_full_mode:
            logging.getLogger(__name__).warning(
                "CADD full mode requires GRCh38 coordinates; "
                "detected %s — skipping CADD enrichment",
                state.effective_build,
            )
        else:
            _enrich_cadd(collected, gnomad, cadd)

    # Per-variant annotators first, then each optional enricher that ran.
    used = [(annotator.name, annotator.version()) for annotator in annotators]
    if gnomad is not None and gnomad.is_ready():
        used.append((gnomad.name, gnomad.version()))
    if alphamissense is not None and alphamissense.is_ready():
        used.append((alphamissense.name, alphamissense.version()))
    if cadd is not None and cadd.is_ready():
        used.append((cadd.name, cadd.version()))

    return AnalysisResult(
        file_path=file_path,
        parser_name=parser.name,
        parser_display_name=parser.display_name,
        sample_id=metadata["sample_id"],
        build=state.effective_build,
        total_variants=seen,
        skipped_count=skipped_count_provider(),
        annotators_used=used,
        annotations=collected,
        build_diagnostics=state.diagnostics(),
    )
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class _BuildDetectionState:
    """Buffer-and-flush state machine for build detection during streaming.

    `feed(variant)` returns ``(ready, batch)``. When `ready` is False the
    variant has been buffered and the caller should keep streaming. When
    it is True, `batch` holds one or more variants whose `build` field has
    been set to the effective build, ready to be annotated.

    `flush()` is called at end of stream to drain anything still buffered,
    which only happens when detection never converged.
    """

    def __init__(self, *, override: str | None, header_build: str | None) -> None:
        self.header_build = header_build
        self.override = override
        # With an override the effective build is known up front; otherwise
        # it stays None until detection (or a fallback) decides it.
        self.effective: str | None = override
        self.detected: str | None = None
        self.matched_count = 0
        self.inspected_count = 0
        self._buffer: list[Variant] = []

    @property
    def effective_build(self) -> str:
        """Best-effort effective build: decided build, else header, else GRCh37."""
        for candidate in (self.effective, self.header_build):
            if candidate:
                return candidate
        return BUILD_GRCH37

    def _fallback_build(self, result) -> str:
        """Pick the build to use when detection is not confident."""
        # GRCh36 must fail safe: there is no GRCh36 ClinVar cache, so
        # falling back to GRCh37 would silently query wrong coordinates
        # and bypass the GRCh36 safety guard.
        if result.build == BUILD_GRCH36:
            return BUILD_GRCH36
        return self.header_build or BUILD_GRCH37

    def _record_counts(self, result) -> None:
        """Copy the detector's match statistics onto this state."""
        self.matched_count = result.matched
        self.inspected_count = result.inspected

    def _drain(self) -> list[Variant]:
        """Empty the buffer, returning its variants stamped with the effective build."""
        stamped = [replace(v, build=self.effective) for v in self._buffer]
        self._buffer.clear()
        return stamped

    def feed(self, variant: Variant) -> tuple[bool, list[Variant]]:
        """Accept one variant; return ``(ready, batch)`` as described on the class."""
        if self.effective is not None:
            return True, [replace(variant, build=self.effective)]

        # Buffering until detection converges or we hit the cap.
        self._buffer.append(variant)
        if variant.rsid in KNOWN_SNP_POSITIONS:
            result = detect_build(self._buffer)
            if result.is_confident:
                self.detected = result.build
                self._record_counts(result)
                self.effective = result.build
                return True, self._drain()

        if len(self._buffer) < _DETECTION_BUFFER_LIMIT:
            return False, []

        # Buffer full before detection converged. Run partial detection
        # so the GRCh36 safety guard can fire (same logic as flush()).
        result = detect_build(self._buffer)
        if result.build is not None:
            self.detected = result.build
        self._record_counts(result)
        self.effective = self._fallback_build(result)
        return True, self._drain()

    def flush(self) -> list[Variant]:
        """Drain the buffer at end of stream, deciding a build if none was set."""
        if not self._buffer:
            return []

        # Detection never converged. Re-run on the full buffer to capture
        # partial counts even if not confident.
        result = detect_build(self._buffer)
        if result.is_confident:
            self.detected = result.build
            self.effective = result.build
        else:
            if result.build is not None:
                self.detected = result.build
            self.effective = self._fallback_build(result)
        self._record_counts(result)
        return self._drain()

    def diagnostics(self) -> BuildDiagnostics:
        """Snapshot the current state as a `BuildDiagnostics` value."""
        return BuildDiagnostics(
            header_build=self.header_build,
            detected_build=self.detected,
            effective_build=self.effective_build,
            override=self.override is not None,
            matched_count=self.matched_count,
            inspected_count=self.inspected_count,
        )
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
# Public names of this module. The two build constants are re-exported
# from allelix.utils.build_detect.
__all__ = [
    "BUILD_GRCH37",
    "BUILD_GRCH38",
    "AnalysisResult",
    "BuildDiagnostics",
    "rollup_gwas_duplicates",
    "run_analysis",
]
|
allelix/reports/diff.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 dial481
|
|
3
|
+
"""Report diff engine for comparing analysis runs.
|
|
4
|
+
|
|
5
|
+
Compares a current analysis run against a previous JSON report to surface
|
|
6
|
+
new, removed, and changed annotations. Primary use cases: regression
|
|
7
|
+
detection after code changes, QA after database refreshes, and user
|
|
8
|
+
version-to-version comparison.
|
|
9
|
+
|
|
10
|
+
Diff key: ``(source, rsid, condition, description)``. This groups annotations so that
|
|
11
|
+
reclassifications (significance changes) appear as "changed" rather than
|
|
12
|
+
"removed + added." ``genotype_match`` is excluded because the typical
|
|
13
|
+
diff workflow reruns the same genotype file against updated databases.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
from collections import Counter
|
|
20
|
+
from dataclasses import asdict, dataclass, field
|
|
21
|
+
from typing import TYPE_CHECKING
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
from allelix.models import Annotation
|
|
27
|
+
|
|
28
|
+
# Values of a previous report's "schema_version" field that
# load_previous_report accepts; any other value is rejected.
_SUPPORTED_SCHEMA_VERSIONS = {"1", "2", "3", "4"}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class ChangedAnnotation:
    """Pairs a current annotation with the values it had in the previous report.

    `current` is the annotation from the current run. The two `previous_*`
    fields hold the significance and magnitude recorded for the same diff
    key in the previous report.
    """

    current: Annotation
    previous_significance: str
    previous_magnitude: float
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class DiffResult:
    """Outcome of comparing the current annotations with a previous report.

    `new` holds current annotations absent from the previous report,
    `removed` holds previous-report entries (as dicts) absent from the
    current run, and `changed` holds annotations present in both whose
    significance or magnitude differs. `previous_generated_at` is the
    timestamp string supplied for the previous report.
    """

    new: list[Annotation] = field(default_factory=list)
    removed: list[dict] = field(default_factory=list)
    changed: list[ChangedAnnotation] = field(default_factory=list)
    previous_generated_at: str = ""

    @property
    def has_changes(self) -> bool:
        """Return True when at least one annotation was added, removed, or changed."""
        return any((self.new, self.removed, self.changed))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _diff_key_from_annotation(a: Annotation) -> tuple[str, str, str, str]:
|
|
56
|
+
return (a.source, a.rsid, a.condition, a.description)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _diff_key_from_dict(d: dict) -> tuple[str, str, str, str]:
|
|
60
|
+
return (d["source"], d["rsid"], d.get("condition", ""), d.get("description", ""))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def load_previous_report(path: Path) -> dict:
    """Load and validate a previous JSON report.

    The file is read as UTF-8 and must contain a JSON object with a
    supported ``schema_version`` string and an ``annotations`` key. The
    parsed object is returned unchanged.

    Raises ValueError when the file is not valid JSON, when the top-level
    value is not an object, when the schema version is unsupported, or
    when the ``annotations`` key is missing.
    """
    text = path.read_text(encoding="utf-8")
    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        msg = f"Cannot parse {path.name} as JSON: {exc}"
        raise ValueError(msg) from exc

    # Valid JSON can still be a list, string, number or null; without this
    # check the `.get` below would fail with AttributeError instead of the
    # documented ValueError.
    if not isinstance(data, dict):
        msg = (
            f"{path.name} must contain a JSON object at the top level, "
            f"not {type(data).__name__}."
        )
        raise ValueError(msg)

    version = data.get("schema_version")
    # The isinstance guard also keeps unhashable values (lists, objects)
    # from raising TypeError in the set membership test.
    if not isinstance(version, str) or version not in _SUPPORTED_SCHEMA_VERSIONS:
        msg = (
            f"Cannot diff against schema version {version!r} "
            f"(expected one of {sorted(_SUPPORTED_SCHEMA_VERSIONS)}). "
            "Re-generate the baseline report with the current version of Allelix."
        )
        raise ValueError(msg)

    if "annotations" not in data:
        msg = f"{path.name} has no 'annotations' key."
        raise ValueError(msg)

    return data
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def compute_diff(
    current: list[Annotation],
    previous_annotations: list[dict],
    previous_generated_at: str,
) -> DiffResult:
    """Compare current annotations with a previous report's annotation list.

    Both sides are indexed by diff key; when a key occurs more than once
    on one side, the last occurrence is the one compared. An annotation
    whose key is absent from the previous report is "new", a previous
    entry whose key is absent from the current run is "removed", and a key
    present on both sides is "changed" when its significance or magnitude
    differs.

    `new` and `removed` are ordered by magnitude descending, then rsid
    ascending. `changed` keeps the order of `current`.
    `previous_generated_at` is passed through to the result unchanged.
    """
    prev_by_key: dict[tuple[str, str, str, str], dict] = {
        _diff_key_from_dict(entry): entry for entry in previous_annotations
    }
    curr_by_key: dict[tuple[str, str, str, str], Annotation] = {
        _diff_key_from_annotation(ann): ann for ann in current
    }

    added: list[Annotation] = []
    altered: list[ChangedAnnotation] = []
    for key, ann in curr_by_key.items():
        if key not in prev_by_key:
            added.append(ann)
            continue
        before = prev_by_key[key]
        same_significance = ann.significance == before.get("significance")
        same_magnitude = ann.magnitude == before.get("magnitude")
        if same_significance and same_magnitude:
            continue
        altered.append(
            ChangedAnnotation(
                current=ann,
                previous_significance=before.get("significance", ""),
                previous_magnitude=before.get("magnitude", 0.0),
            )
        )

    dropped = [entry for key, entry in prev_by_key.items() if key not in curr_by_key]

    added.sort(key=lambda ann: (-ann.magnitude, ann.rsid))
    dropped.sort(key=lambda entry: (-entry.get("magnitude", 0.0), entry.get("rsid", "")))

    return DiffResult(
        new=added,
        removed=dropped,
        changed=altered,
        previous_generated_at=previous_generated_at,
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def summarize_diff(diff: DiffResult) -> str:
    """Return a one-line, human-readable summary of a diff.

    New annotations are broken down by their `attribution`, most frequent
    first. Changed and removed annotations are reported as plain counts.
    When the previous report's timestamp is known, its first ten
    characters are included as the reference date.
    """
    parts: list[str] = []

    if diff.new:
        tally: Counter[str] = Counter(ann.attribution for ann in diff.new)
        breakdown = ", ".join(f"{count} {src}" for src, count in tally.most_common())
        parts.append(f"{len(diff.new)} new ({breakdown})")

    for label, items in (("changed", diff.changed), ("removed", diff.removed)):
        if items:
            parts.append(f"{len(items)} {label}")

    if not parts:
        return "No changes since previous report."

    summary = "; ".join(parts)
    date_str = diff.previous_generated_at[:10] if diff.previous_generated_at else ""
    if not date_str:
        return f"Changes: {summary}."
    return f"Changes since {date_str}: {summary}."
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def diff_annotation_to_dict(a: ChangedAnnotation) -> dict:
    """Serialize a ChangedAnnotation for JSON output.

    The result holds every field of the current annotation except
    ``is_must_include``, followed by the previous significance and
    magnitude under ``previous_significance`` and ``previous_magnitude``.
    """
    payload = asdict(a.current)
    # Internal flag; not part of the serialized report.
    payload.pop("is_must_include", None)
    payload["previous_significance"] = a.previous_significance
    payload["previous_magnitude"] = a.previous_magnitude
    return payload
|