allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,497 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Shared analysis pipeline used by `analyze`, `methylation`, and `pharmacogenomics`.
4
+
5
+ The CLI builds an `AnalysisResult` once and hands it to a renderer
6
+ (terminal, JSON, HTML). Renderers never query the database or re-iterate
7
+ the parser — they receive a fully-populated value object.
8
+
9
+ ADR-0021: this pipeline owns build detection. Parsers report the
10
+ header-claimed build; the pipeline replaces each variant's `build`
11
+ with the build detected from position data (or the user's `--build`
12
+ override) before annotators see the variant.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import contextlib
18
+ import logging
19
+ from dataclasses import dataclass, field, replace
20
+ from typing import TYPE_CHECKING
21
+
22
+ from allelix.utils.build_detect import (
23
+ BUILD_GRCH36,
24
+ BUILD_GRCH37,
25
+ BUILD_GRCH38,
26
+ KNOWN_SNP_POSITIONS,
27
+ detect_build,
28
+ normalize_build_label,
29
+ )
30
+
31
+ if TYPE_CHECKING:
32
+ from collections.abc import Callable, Iterable
33
+ from pathlib import Path
34
+
35
+ from allelix.annotators.alphamissense import AlphaMissenseAnnotator
36
+ from allelix.annotators.base import Annotator
37
+ from allelix.annotators.cadd import CaddAnnotator
38
+ from allelix.annotators.gnomad import GnomadAnnotator
39
+ from allelix.models import Annotation, Variant
40
+ from allelix.parsers.base import GenotypeParser
41
+
42
+
43
# Maximum number of input variants buffered while waiting for build
# detection to converge. Detection completes once every entry in
# KNOWN_SNP_POSITIONS has been seen; typical files cover the table within
# the first ~5000 probes. The cap keeps a file with no known SNPs from
# buffering the whole input.
_DETECTION_BUFFER_LIMIT = 100_000
49
+
50
+
51
@dataclass
class BuildDiagnostics:
    """Record of how the pipeline settled on a genome build for one file.

    Fields:
        header_build: build claimed by the file header after
            `normalize_build_label`; None when the header is silent or
            uses a label that could not be normalized.
        detected_build: build inferred from position data; None when no
            known SNPs appeared in the input.
        effective_build: build actually applied to variants before
            annotation (CLI `--build` override, detected build, or a
            fallback).
        override: True when a CLI `--build` override was supplied.
        matched_count / inspected_count: counters reported by the build
            detector.

    The derived `mismatch` property flags a disagreement between header
    and detection that was not silenced by an override; the CLI surfaces
    it as a warning.
    """

    header_build: str | None
    detected_build: str | None
    effective_build: str
    override: bool
    matched_count: int
    inspected_count: int

    @property
    def mismatch(self) -> bool:
        """Return True when header and detected builds disagree without an override."""
        # An explicit override makes any disagreement irrelevant.
        if self.override:
            return False
        # A disagreement needs both sides to be known.
        if self.header_build is None or self.detected_build is None:
            return False
        return self.header_build != self.detected_build
85
+
86
+
87
@dataclass
class AnalysisResult:
    """Fully-populated value object handed to a report renderer."""

    file_path: Path
    parser_name: str
    parser_display_name: str
    sample_id: str
    build: str
    total_variants: int
    skipped_count: int
    annotators_used: list[tuple[str, str | None]]
    annotations: list[Annotation] = field(default_factory=list)
    build_diagnostics: BuildDiagnostics | None = None

    def filter(
        self,
        *,
        min_magnitude: float = 0.0,
        category: str | None = None,
        genes: Iterable[str] | None = None,
        source_min_magnitudes: dict[str, float] | None = None,
    ) -> list[Annotation]:
        """Return the annotations that pass every filter, in report order.

        All filters combine with AND:

        * magnitude floor — `min_magnitude` by default. When
          `source_min_magnitudes` has an entry for the annotation's source
          (e.g. ``{"gwas": 9.0, "snpedia": 2.0}``) that entry replaces the
          global floor for that source, whether higher or lower. Rows
          flagged `is_must_include` ignore the per-source entry and are
          held to the global floor instead.
        * `category` — exact match when given.
        * `genes` — case-insensitive membership when given; annotations
          with no gene never match a gene filter.

        The result is sorted by magnitude descending, then rsid ascending;
        the sort is stable, so ties keep their original relative order.
        """
        wanted_genes = {g.upper() for g in genes} if genes else None

        def passes(ann: Annotation) -> bool:
            per_source = (
                bool(source_min_magnitudes)
                and ann.source in source_min_magnitudes
                and not ann.is_must_include
            )
            threshold = source_min_magnitudes[ann.source] if per_source else min_magnitude
            if ann.magnitude < threshold:
                return False
            if category is not None and ann.category != category:
                return False
            return wanted_genes is None or (ann.gene or "").upper() in wanted_genes

        kept = [ann for ann in self.annotations if passes(ann)]
        return sorted(kept, key=lambda ann: (-ann.magnitude, ann.rsid))
141
+
142
+
143
+ def _gwas_base_trait(description: str) -> str | None:
144
+ """Extract trait text from a GWAS description, stripping MTAG suffix and PheCode label."""
145
+ marker = "GWAS Catalog: "
146
+ if marker not in description:
147
+ return None
148
+ s = description.split(marker, 1)[1]
149
+ s = s.split(" (p=", 1)[0]
150
+ if s.endswith(" (MTAG)"):
151
+ s = s[: -len(" (MTAG)")]
152
+ s = s.split(" (PheCode ", 1)[0]
153
+ return s.strip().lower()
154
+
155
+
156
+ def _gwas_phecode_parent(description: str) -> str | None:
157
+ """Extract PheCode parent (numeric prefix before the dot), or None."""
158
+ idx = description.find("(PheCode ")
159
+ if idx == -1:
160
+ return None
161
+ rest = description[idx + len("(PheCode ") :]
162
+ end = rest.find(")")
163
+ if end == -1:
164
+ return None
165
+ code = rest[:end].strip()
166
+ parent = code.split(".", 1)[0]
167
+ return parent if parent.isdigit() else None
168
+
169
+
170
+ def _gwas_p_value(description: str) -> float:
171
+ """Extract p-value from a GWAS description. Returns inf if unparseable."""
172
+ idx = description.find("(p=")
173
+ if idx == -1:
174
+ return float("inf")
175
+ rest = description[idx + len("(p=") :]
176
+ end = rest.find(",")
177
+ if end == -1:
178
+ end = rest.find(")")
179
+ if end == -1:
180
+ return float("inf")
181
+ try:
182
+ return float(rest[:end].strip())
183
+ except ValueError:
184
+ return float("inf")
185
+
186
+
187
def rollup_gwas_duplicates(annotations: list[Annotation]) -> list[Annotation]:
    """Collapse GWAS MTAG twins and PheCode parent/child hierarchies.

    Intended for the already-filtered list produced by
    `AnalysisResult.filter`. Rows whose source is not ``"gwas"`` are kept
    as they are, and rows flagged `is_must_include` are never dropped.

    Two passes run over the GWAS rows:

    1. An ``(MTAG)`` row is dropped when a non-MTAG row with the same rsid
       and base trait exists.
    2. Remaining rows sharing an rsid and PheCode parent are reduced to
       the one with the smallest p-value (the first such row on ties).

    When the input holds no GWAS rows the input list itself is returned;
    otherwise a new list sorted by magnitude descending, then rsid
    ascending, is returned. See ADR-0024 'MTAG and PheCode rollup'.
    """
    gwas = [row for row in annotations if row.source == "gwas"]
    if not gwas:
        return annotations
    result = [row for row in annotations if row.source != "gwas"]

    def is_mtag(row: Annotation) -> bool:
        return "(MTAG)" in row.description

    def trait_key(row: Annotation) -> tuple[str, str | None]:
        return (row.rsid, _gwas_base_trait(row.description))

    # Pass 1: drop MTAG rows that duplicate a plain row.
    plain = {trait_key(row) for row in gwas if not is_mtag(row)}
    deduped = [
        row
        for row in gwas
        if row.is_must_include or not is_mtag(row) or trait_key(row) not in plain
    ]

    # Pass 2: keep one row per (rsid, PheCode parent) group.
    groups: dict[tuple[str, str], list[Annotation]] = {}
    kept: list[Annotation] = []
    for row in deduped:
        parent = _gwas_phecode_parent(row.description)
        if parent is None or row.is_must_include:
            kept.append(row)
            continue
        groups.setdefault((row.rsid, parent), []).append(row)
    kept.extend(
        min(members, key=lambda row: _gwas_p_value(row.description))
        for members in groups.values()
    )

    result.extend(kept)
    result.sort(key=lambda row: (-row.magnitude, row.rsid))
    return result
232
+
233
+
234
+ def _lookup_user_allele(
235
+ user_alt: str,
236
+ coords: list[tuple[str, int, str, str]],
237
+ scores: dict[tuple[str, int, str, str], float],
238
+ resolve_strand: Callable[[str, str, str], str | None],
239
+ ) -> float | None:
240
+ """Find the CADD score for a specific user allele at a multi-allelic site.
241
+
242
+ Prefers a direct allele match over a complement (minus-strand) match
243
+ to avoid false positives where the complement of the user's allele
244
+ coincidentally equals a different alt at the same position.
245
+ """
246
+ for chrom, pos, ref, alt in coords:
247
+ if user_alt == alt:
248
+ return scores.get((chrom, pos, ref, alt))
249
+ for chrom, pos, ref, alt in coords:
250
+ resolved = resolve_strand(user_alt, ref, alt)
251
+ if resolved is not None and resolved == alt:
252
+ return scores.get((chrom, pos, ref, alt))
253
+ return None
254
+
255
+
256
def _enrich_cadd(
    annotations: list[Annotation],
    gnomad: GnomadAnnotator,
    cadd: CaddAnnotator,
) -> None:
    """Set `cadd_phred` on annotations using coordinates resolved via gnomAD.

    The rsIDs of all annotations are resolved to ``(chrom, pos, ref, alt)``
    sites with `gnomad.bulk_resolve_coordinates`, and those sites are
    scored with `cadd.bulk_lookup`. If either call returns nothing, no
    annotation is touched.

    For an annotation whose rsid resolved to sites:

    * with an `alt`, the score for that allele is looked up through
      `_lookup_user_allele` (None when it cannot be matched);
    * without an `alt`, the highest score across its sites is used
      (None when no site has a score).
    """
    from allelix.utils.allele import resolve_strand

    sites_by_rsid = gnomad.bulk_resolve_coordinates({ann.rsid for ann in annotations})
    if not sites_by_rsid:
        return

    wanted: set[tuple[str, int, str, str]] = {
        (chrom, pos, ref, alt)
        for sites in sites_by_rsid.values()
        for chrom, pos, ref, alt in sites
    }
    scores = cadd.bulk_lookup(wanted)
    if not scores:
        return

    for ann in annotations:
        sites = sites_by_rsid.get(ann.rsid)
        if not sites:
            continue
        if ann.alt:
            ann.cadd_phred = _lookup_user_allele(ann.alt, sites, scores, resolve_strand)
            continue
        site_scores = (scores.get((chrom, pos, ref, alt)) for chrom, pos, ref, alt in sites)
        ann.cadd_phred = max((s for s in site_scores if s is not None), default=None)
295
+
296
+
297
def run_analysis(
    file_path: Path,
    parser: GenotypeParser,
    annotators: list[Annotator],
    skipped_count_provider: Callable[[], int] = lambda: 0,
    *,
    build_override: str | None = None,
    gnomad: GnomadAnnotator | None = None,
    alphamissense: AlphaMissenseAnnotator | None = None,
    cadd: CaddAnnotator | None = None,
) -> AnalysisResult:
    """Parse the file once, annotate every variant, and build the result.

    Variants from `parser.parse` pass through a `_BuildDetectionState`
    before annotation. With `build_override` (e.g. from `--build grch37`)
    every variant is stamped with that build immediately. Without it, the
    head of the stream is buffered until the state machine releases a
    batch; whatever is still buffered at end of stream is flushed with
    the best build it can resolve.

    The `annotators` are entered into a `contextlib.ExitStack` for the
    duration of the streaming loop so their resources are released
    deterministically.

    After streaming, three optional enrichers run when supplied and ready:

    * `gnomad` sets `allele_frequency`, looked up by (rsid, alt) when the
      annotation has an alt and by rsid otherwise;
    * `alphamissense` sets `am_pathogenicity` / `am_class` on a hit;
    * `cadd` (which also needs `gnomad`) sets `cadd_phred`, unless the
      CADD annotator's `_full_mode` flag is set and the effective build is
      not GRCh38, in which case a warning is logged instead.

    `skipped_count_provider` is called once, while the result is built.
    """
    metadata = parser.get_metadata(file_path)
    state = _BuildDetectionState(
        override=build_override,
        header_build=normalize_build_label(metadata.get("build")),
    )
    collected: list[Annotation] = []
    seen = 0

    with contextlib.ExitStack() as stack:
        active = [stack.enter_context(a) for a in annotators]

        def annotate_all(variants: Iterable[Variant]) -> None:
            for v in variants:
                for annotator in active:
                    collected.extend(annotator.annotate(v))

        for variant in parser.parse(file_path):
            seen += 1
            ready, batch = state.feed(variant)
            if ready:
                annotate_all(batch)
        # End of stream: drain anything still buffered, stamped with the
        # best effective build available (detected → header → default).
        annotate_all(state.flush())

    if gnomad is not None and gnomad.is_ready():
        alt_keys = {(a.rsid, a.alt) for a in collected if a.alt}
        bare_rsids = {a.rsid for a in collected if not a.alt}
        freq_by_alt = gnomad.bulk_lookup_by_alt(alt_keys)
        freq_by_rsid = gnomad.bulk_lookup(bare_rsids)
        for a in collected:
            a.allele_frequency = (
                freq_by_alt.get((a.rsid, a.alt)) if a.alt else freq_by_rsid.get(a.rsid)
            )

    if alphamissense is not None and alphamissense.is_ready():
        alt_keys = {(a.rsid, a.alt) for a in collected if a.alt}
        bare_rsids = {a.rsid for a in collected if not a.alt}
        am_by_alt = alphamissense.bulk_lookup_by_alt(alt_keys)
        am_by_rsid = alphamissense.bulk_lookup(bare_rsids)
        for a in collected:
            if a.alt:
                hit = am_by_alt.get((a.rsid, a.alt))
            else:
                hit = am_by_rsid.get(a.rsid)
            if hit is None:
                continue
            a.am_pathogenicity, a.am_class = hit

    if cadd is not None and cadd.is_ready() and gnomad is not None and gnomad.is_ready():
        full_mode = getattr(cadd, "_full_mode", False)
        if not full_mode or state.effective_build == BUILD_GRCH38:
            _enrich_cadd(collected, gnomad, cadd)
        else:
            logging.getLogger(__name__).warning(
                "CADD full mode requires GRCh38 coordinates; "
                "detected %s — skipping CADD enrichment",
                state.effective_build,
            )

    used = [(a.name, a.version()) for a in annotators]
    for enricher in (gnomad, alphamissense, cadd):
        if enricher is not None and enricher.is_ready():
            used.append((enricher.name, enricher.version()))

    return AnalysisResult(
        file_path=file_path,
        parser_name=parser.name,
        parser_display_name=parser.display_name,
        sample_id=metadata["sample_id"],
        build=state.effective_build,
        total_variants=seen,
        skipped_count=skipped_count_provider(),
        annotators_used=used,
        annotations=collected,
        build_diagnostics=state.diagnostics(),
    )
393
+
394
+
395
+ class _BuildDetectionState:
396
+ """Buffer-and-flush state machine for build detection during streaming.
397
+
398
+ `feed(variant)` returns (ready, batch). When `ready` is False, the
399
+ variant has been buffered and the caller should keep streaming.
400
+ When True, `batch` contains one or more variants with their build
401
+ field set to the effective build, ready to be annotated.
402
+
403
+ `flush()` is called at end of stream to drain anything still
404
+ buffered (which only happens when detection never converged).
405
+ """
406
+
407
+ def __init__(self, *, override: str | None, header_build: str | None) -> None:
408
+ self.header_build = header_build
409
+ self.override = override
410
+ # Effective build: starts as override (if given), else None until detection runs.
411
+ self.effective: str | None = override
412
+ self.detected: str | None = None
413
+ self.matched_count = 0
414
+ self.inspected_count = 0
415
+ self._buffer: list[Variant] = []
416
+
417
+ @property
418
+ def effective_build(self) -> str:
419
+ """Best-effort effective build at flush time."""
420
+ return self.effective or self.header_build or BUILD_GRCH37
421
+
422
+ def feed(self, variant: Variant) -> tuple[bool, list[Variant]]:
423
+ if self.effective is not None:
424
+ return True, [replace(variant, build=self.effective)]
425
+ # Buffering until detection converges or we hit the cap.
426
+ self._buffer.append(variant)
427
+ if variant.rsid in KNOWN_SNP_POSITIONS:
428
+ result = detect_build(self._buffer)
429
+ if result.is_confident:
430
+ self.detected = result.build
431
+ self.matched_count = result.matched
432
+ self.inspected_count = result.inspected
433
+ self.effective = result.build
434
+ batch = [replace(v, build=result.build) for v in self._buffer]
435
+ self._buffer.clear()
436
+ return True, batch
437
+ if len(self._buffer) >= _DETECTION_BUFFER_LIMIT:
438
+ # Buffer full before detection converged. Run partial detection
439
+ # so the GRCh36 safety guard can fire (same logic as flush()).
440
+ result = detect_build(self._buffer)
441
+ if result.build is not None:
442
+ self.detected = result.build
443
+ self.matched_count = result.matched
444
+ self.inspected_count = result.inspected
445
+ if result.build == BUILD_GRCH36:
446
+ self.effective = BUILD_GRCH36
447
+ else:
448
+ self.effective = self.header_build or BUILD_GRCH37
449
+ batch = [replace(v, build=self.effective) for v in self._buffer]
450
+ self._buffer.clear()
451
+ return True, batch
452
+ return False, []
453
+
454
+ def flush(self) -> list[Variant]:
455
+ if not self._buffer:
456
+ return []
457
+ # Detection never converged. Re-run on the full buffer to capture
458
+ # partial counts even if not confident.
459
+ result = detect_build(self._buffer)
460
+ if result.is_confident:
461
+ self.detected = result.build
462
+ self.effective = result.build
463
+ else:
464
+ if result.build is not None:
465
+ self.detected = result.build
466
+ # GRCh36 must fail safe: there is no GRCh36 ClinVar cache,
467
+ # so falling back to GRCh37 would silently query wrong
468
+ # coordinates and bypass the GRCh36 safety guard.
469
+ if result.build == BUILD_GRCH36:
470
+ self.effective = BUILD_GRCH36
471
+ else:
472
+ self.effective = self.header_build or BUILD_GRCH37
473
+ self.matched_count = result.matched
474
+ self.inspected_count = result.inspected
475
+ out = [replace(v, build=self.effective) for v in self._buffer]
476
+ self._buffer.clear()
477
+ return out
478
+
479
+ def diagnostics(self) -> BuildDiagnostics:
480
+ return BuildDiagnostics(
481
+ header_build=self.header_build,
482
+ detected_build=self.detected,
483
+ effective_build=self.effective_build,
484
+ override=self.override is not None,
485
+ matched_count=self.matched_count,
486
+ inspected_count=self.inspected_count,
487
+ )
488
+
489
+
490
# Public names of this module. BUILD_GRCH37 and BUILD_GRCH38 are imported
# from allelix.utils.build_detect and re-exported here.
__all__ = [
    "BUILD_GRCH37",
    "BUILD_GRCH38",
    "AnalysisResult",
    "BuildDiagnostics",
    "rollup_gwas_duplicates",
    "run_analysis",
]
@@ -0,0 +1,169 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Report diff engine for comparing analysis runs.
4
+
5
+ Compares a current analysis run against a previous JSON report to surface
6
+ new, removed, and changed annotations. Primary use cases: regression
7
+ detection after code changes, QA after database refreshes, and user
8
+ version-to-version comparison.
9
+
10
+ Diff key: ``(source, rsid, condition, description)``. This groups annotations so that
11
+ reclassifications (significance changes) appear as "changed" rather than
12
+ "removed + added." ``genotype_match`` is excluded because the typical
13
+ diff workflow reruns the same genotype file against updated databases.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ from collections import Counter
20
+ from dataclasses import asdict, dataclass, field
21
+ from typing import TYPE_CHECKING
22
+
23
+ if TYPE_CHECKING:
24
+ from pathlib import Path
25
+
26
+ from allelix.models import Annotation
27
+
28
# Values of a report's "schema_version" field that load_previous_report
# accepts. Versions are matched as strings.
_SUPPORTED_SCHEMA_VERSIONS = {"1", "2", "3", "4"}
29
+
30
+
31
@dataclass
class ChangedAnnotation:
    """An annotation whose significance or magnitude changed between runs."""

    # The annotation as produced by the current run.
    current: Annotation
    # Significance stored for the same diff key in the previous report.
    previous_significance: str
    # Magnitude stored for the same diff key in the previous report.
    previous_magnitude: float
38
+
39
+
40
@dataclass
class DiffResult:
    """Outcome of diffing the current annotations against a previous report.

    `new` holds current annotations absent from the previous report,
    `removed` holds previous-report entries (plain dicts) absent from the
    current run, and `changed` holds entries present in both whose
    significance or magnitude differs. `previous_generated_at` is the
    previous report's timestamp string, or empty when unknown.
    """

    new: list[Annotation] = field(default_factory=list)
    removed: list[dict] = field(default_factory=list)
    changed: list[ChangedAnnotation] = field(default_factory=list)
    previous_generated_at: str = ""

    @property
    def has_changes(self) -> bool:
        """Return True when at least one of the three change lists is non-empty."""
        return any((self.new, self.removed, self.changed))
53
+
54
+
55
+ def _diff_key_from_annotation(a: Annotation) -> tuple[str, str, str, str]:
56
+ return (a.source, a.rsid, a.condition, a.description)
57
+
58
+
59
+ def _diff_key_from_dict(d: dict) -> tuple[str, str, str, str]:
60
+ return (d["source"], d["rsid"], d.get("condition", ""), d.get("description", ""))
61
+
62
+
63
def load_previous_report(path: Path) -> dict:
    """Load and validate a previous JSON report.

    Args:
        path: location of the report file; it is read as UTF-8.

    Returns:
        The decoded report, a dict with a supported ``schema_version``
        and an ``annotations`` list.

    Raises:
        ValueError: the file is not valid JSON, its top-level value is not
            an object, ``schema_version`` is not one of the supported
            version strings, or ``annotations`` is missing or not a list.
    """
    text = path.read_text(encoding="utf-8")
    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        msg = f"Cannot parse {path.name} as JSON: {exc}"
        raise ValueError(msg) from exc

    # Valid JSON may still be a list, string, number, or null; those have
    # no `.get` and would otherwise surface as an AttributeError.
    if not isinstance(data, dict):
        msg = f"{path.name} is not a JSON object (got {type(data).__name__})."
        raise ValueError(msg)

    version = data.get("schema_version")
    # The isinstance check also covers unhashable values (lists, objects),
    # for which the set membership test would raise TypeError.
    if not isinstance(version, str) or version not in _SUPPORTED_SCHEMA_VERSIONS:
        msg = (
            f"Cannot diff against schema version {version!r} "
            f"(expected one of {sorted(_SUPPORTED_SCHEMA_VERSIONS)}). "
            "Re-generate the baseline report with the current version of Allelix."
        )
        raise ValueError(msg)

    if "annotations" not in data:
        msg = f"{path.name} has no 'annotations' key."
        raise ValueError(msg)

    if not isinstance(data["annotations"], list):
        msg = f"{path.name} has an 'annotations' value that is not a list."
        raise ValueError(msg)

    return data
89
+
90
+
91
def compute_diff(
    current: list[Annotation],
    previous_annotations: list[dict],
    previous_generated_at: str,
) -> DiffResult:
    """Compare current annotations against a previous report's annotation list.

    Entries are matched on the diff key built by `_diff_key_from_annotation`
    and `_diff_key_from_dict`. When several entries on one side share a
    key, the last one wins.

    Args:
        current: annotations from the current run.
        previous_annotations: the ``annotations`` list of the previous report.
        previous_generated_at: timestamp of the previous report, passed
            through to the result unchanged.

    Returns:
        A `DiffResult` whose `new` and `removed` lists are sorted by
        magnitude descending then rsid ascending, and whose `changed` list
        follows the order of the current annotations.
    """
    prev_by_key: dict[tuple[str, str, str, str], dict] = {
        _diff_key_from_dict(p): p for p in previous_annotations
    }
    curr_by_key: dict[tuple[str, str, str, str], Annotation] = {
        _diff_key_from_annotation(c): c for c in current
    }

    new = [c for key, c in curr_by_key.items() if key not in prev_by_key]
    removed = [p for key, p in prev_by_key.items() if key not in curr_by_key]

    changed: list[ChangedAnnotation] = []
    for key, c in curr_by_key.items():
        if key not in prev_by_key:
            continue
        p = prev_by_key[key]
        # Compare against the same values that get recorded as "previous".
        # Comparing against a bare `.get()` would treat an entry that lacks
        # the key as changed while reporting a previous value equal to the
        # default.
        prev_significance = p.get("significance", "")
        prev_magnitude = p.get("magnitude", 0.0)
        if c.significance != prev_significance or c.magnitude != prev_magnitude:
            changed.append(
                ChangedAnnotation(
                    current=c,
                    previous_significance=prev_significance,
                    previous_magnitude=prev_magnitude,
                )
            )

    new.sort(key=lambda a: (-a.magnitude, a.rsid))
    # `or 0.0` keeps a JSON-null magnitude from breaking the negation.
    removed.sort(key=lambda d: (-(d.get("magnitude") or 0.0), d.get("rsid", "")))

    return DiffResult(
        new=new,
        removed=removed,
        changed=changed,
        previous_generated_at=previous_generated_at,
    )
132
+
133
+
134
def summarize_diff(diff: DiffResult) -> str:
    """Return a one-line, human-readable summary of a diff.

    New annotations are counted overall and broken down by their
    `attribution`, most frequent first; changed and removed entries are
    counted. When the previous report's timestamp is known, its first ten
    characters are used as the date in the summary.
    """
    segments: list[str] = []

    if diff.new:
        per_source: Counter[str] = Counter(a.attribution for a in diff.new)
        breakdown = ", ".join(f"{n} {src}" for src, n in per_source.most_common())
        segments.append(f"{len(diff.new)} new ({breakdown})")
    if diff.changed:
        segments.append(f"{len(diff.changed)} changed")
    if diff.removed:
        segments.append(f"{len(diff.removed)} removed")

    if not segments:
        return "No changes since previous report."

    summary = "; ".join(segments)
    since = diff.previous_generated_at[:10] if diff.previous_generated_at else ""
    if not since:
        return f"Changes: {summary}."
    return f"Changes since {since}: {summary}."
162
+
163
+
164
def diff_annotation_to_dict(a: ChangedAnnotation) -> dict:
    """Serialize a ChangedAnnotation for JSON output.

    The result holds the fields of the current annotation except
    ``is_must_include``, followed by ``previous_significance`` and
    ``previous_magnitude``.
    """
    payload = asdict(a.current)
    payload.pop("is_must_include", None)
    payload["previous_significance"] = a.previous_significance
    payload["previous_magnitude"] = a.previous_magnitude
    return payload