allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,163 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """JSON report renderer.
4
+
5
+ Output schema (versioned via `schema_version`):
6
+
7
+ {
8
+ "schema_version": "4",
9
+ "allelix_version": "1.1.0",
10
+ "generated_at": "2026-05-11T12:34:56+00:00",
11
+ "regulatory_notice": "...",
12
+ "input": {
13
+ "file": "genotype.txt",
14
+ "format": "myhappygenes",
15
+ "sample_id": "MHG000001",
16
+ "build": "GRCh37",
17
+ "total_variants": 2015,
18
+ "skipped_lines": 0
19
+ },
20
+ "annotators": [
21
+ {"name": "clinvar", "version": "20260101"}
22
+ ],
23
+ "filters": {
24
+ "min_magnitude": 5.0,
25
+ "category": null,
26
+ "genes": null
27
+ },
28
+ "annotations": [ ... ]
29
+ }
30
+
31
+ Every annotation is source-attributed (ADR-0003); the renderer never adds
32
+ or strips that field.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import json
38
+ from dataclasses import asdict
39
+ from datetime import UTC, datetime
40
+ from typing import TYPE_CHECKING
41
+
42
+ from allelix import __version__
43
+ from allelix.reports import REGULATORY_NOTICE, atomic_write_text
44
+ from allelix.reports._pipeline import rollup_gwas_duplicates
45
+
46
+ if TYPE_CHECKING:
47
+ from collections.abc import Iterable
48
+ from pathlib import Path
49
+
50
+ from allelix.models import Annotation
51
+ from allelix.reports._pipeline import AnalysisResult
52
+ from allelix.reports.diff import DiffResult
53
+
54
+
55
# Version tag emitted as the report's top-level "schema_version" value.
SCHEMA_VERSION = "4"
56
+
57
+
58
+ def _annotation_dict(a: Annotation) -> dict:
59
+ """Serialize an annotation, adding AM caveat for non-protein sources."""
60
+ d = {k: v for k, v in asdict(a).items() if k != "is_must_include"}
61
+ d["zygosity"] = a.zygosity
62
+ if a.am_pathogenicity is not None and a.source == "pharmgkb":
63
+ d["am_caveat"] = "protein structure impact only"
64
+ return d
65
+
66
+
67
# Names exported by this module; REGULATORY_NOTICE is imported from
# allelix.reports and re-exported here.
__all__ = ["REGULATORY_NOTICE", "SCHEMA_VERSION", "render_json"]
68
+
69
+
70
def _license_attributions(
    annotators_used: list[tuple[str, str | None]],
) -> list[dict[str, str]]:
    """Build one license-attribution record per annotator that was used.

    Each `(name, version)` pair is resolved to its annotator class through
    `get_annotator_class`; the record is filled from that class's
    `display_name` and `license` descriptor. Names that resolve to no class
    are logged as a warning and left out. The version element is not used.
    """
    import logging

    from allelix.annotators import get_annotator_class

    log = logging.getLogger(__name__)
    attributions: list[dict[str, str]] = []

    for annotator_name, _ in annotators_used:
        annotator_cls = get_annotator_class(annotator_name)
        if annotator_cls is not None:
            descriptor = annotator_cls.license
            record: dict[str, str] = {
                "source": annotator_cls.display_name,
                # Fall back to the license URL when no source URL is set.
                "source_url": descriptor.source_url or descriptor.license_url,
                "license": descriptor.spdx,
                "license_url": descriptor.license_url,
                "attribution": descriptor.attribution_text,
            }
            # The citation key is only present when a citation exists.
            if descriptor.citation:
                record["citation"] = descriptor.citation
            attributions.append(record)
            continue
        log.warning("No annotator class found for '%s' — attribution omitted", annotator_name)

    return attributions
97
+
98
+
99
def render_json(
    result: AnalysisResult,
    *,
    output_path: Path,
    min_magnitude: float = 0.0,
    category: str | None = None,
    genes: Iterable[str] | None = None,
    source_min_magnitudes: dict[str, float] | None = None,
    diff: DiffResult | None = None,
    high_value_no_calls: list[dict[str, str]] | None = None,
) -> int:
    """Write a JSON report to `output_path`. Returns the number of annotations included.

    Args:
        result: Analysis output; narrowed with `result.filter` and then passed
            through `rollup_gwas_duplicates` before serialization.
        output_path: Destination file, written via `atomic_write_text`.
        min_magnitude: Forwarded to `result.filter` and echoed under "filters".
        category: Forwarded to `result.filter` and echoed under "filters".
        genes: Forwarded to `result.filter`; echoed sorted under "filters"
            (null when empty or omitted). One-shot iterators are accepted.
        source_min_magnitudes: Forwarded to `result.filter`; not echoed in
            the payload.
        diff: When given, a "diff" section (previous report timestamp,
            summary, new / changed / removed entries) is added.
        high_value_no_calls: When non-empty, written as-is under
            "high_value_no_calls".

    Returns:
        The number of annotations written under "annotations".
    """
    from collections.abc import Collection

    # `genes` is read twice: once by the filter and once for the "filters"
    # echo below. A one-shot iterator would be drained by the first read (and
    # is always truthy), so the echo would record an empty list. Snapshot such
    # inputs once; re-iterable collections are passed through untouched.
    if genes is not None and not isinstance(genes, Collection):
        genes = tuple(genes)

    filtered = result.filter(
        min_magnitude=min_magnitude,
        category=category,
        genes=genes,
        source_min_magnitudes=source_min_magnitudes,
    )
    filtered = rollup_gwas_duplicates(filtered)
    payload: dict = {
        "schema_version": SCHEMA_VERSION,
        "allelix_version": __version__,
        "generated_at": datetime.now(UTC).isoformat(),
        "regulatory_notice": REGULATORY_NOTICE,
        "input": {
            # Only the file name is recorded, not the full path.
            "file": result.file_path.name,
            "format": result.parser_name,
            "sample_id": result.sample_id,
            "build": result.build,
            "total_variants": result.total_variants,
            "skipped_lines": result.skipped_count,
        },
        "annotators": [
            {"name": name, "version": version} for name, version in result.annotators_used
        ],
        "filters": {
            "min_magnitude": min_magnitude,
            "category": category,
            "genes": sorted(genes) if genes else None,
        },
        "annotations": [_annotation_dict(a) for a in filtered],
    }

    # Optional sections are only emitted when they have content.
    license_attrs = _license_attributions(result.annotators_used)
    if license_attrs:
        payload["license_attributions"] = license_attrs

    if high_value_no_calls:
        payload["high_value_no_calls"] = high_value_no_calls

    if diff is not None:
        from allelix.reports.diff import diff_annotation_to_dict, summarize_diff

        payload["diff"] = {
            "previous_report": diff.previous_generated_at,
            "summary": summarize_diff(diff),
            "new": [
                {k: v for k, v in asdict(a).items() if k != "is_must_include"} for a in diff.new
            ],
            "changed": [diff_annotation_to_dict(c) for c in diff.changed],
            "removed": diff.removed,
        }

    atomic_write_text(output_path, json.dumps(payload, indent=2, sort_keys=False) + "\n")
    return len(filtered)
@@ -0,0 +1,50 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Methylation pathway gene panel.
4
+
5
+ The set is intentionally small and curated — covering the one-carbon /
6
+ folate / methylation cycle genes most often discussed in consumer methylation
7
+ reports. Not medical guidance; see ADR-0003 (regulatory posture).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
# Gene symbols that make up the methylation panel (folate / one-carbon /
# methylation cycle). One symbol per line, alphabetical, so additions show up
# as single-line diffs. Expanding the panel goes through a PR plus an ADR.
METHYLATION_PANEL_GENES: frozenset[str] = frozenset(
    (
        "ACAT1",
        "AHCY",
        "BHMT",
        "BHMT2",
        "CBS",
        "COMT",
        "DHFR",
        "DNMT1",
        "DNMT3A",
        "DNMT3B",
        "FOLR1",
        "FOLR2",
        "FUT2",
        "GNMT",
        "GSTM1",
        "GSTP1",
        "MAOA",
        "MAT1A",
        "MAT2A",
        "MAT2B",
        "MTHFD1",
        "MTHFD1L",
        "MTHFR",
        "MTR",
        "MTRR",
        "NOS3",
        "PEMT",
        "SHMT1",
        "SHMT2",
        "SLC19A1",
        "SUOX",
        "TCN1",
        "TCN2",
        "VDR",
    )
)
@@ -0,0 +1,203 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Terminal report rendering for `allelix analyze`."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ from rich.table import Table
10
+
11
+ from allelix.reports._pipeline import rollup_gwas_duplicates
12
+
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Iterable
15
+
16
+ from rich.console import Console
17
+
18
+ from allelix.models import Annotation
19
+ from allelix.reports._pipeline import AnalysisResult
20
+ from allelix.reports.diff import DiffResult
21
+
22
+
23
def render_terminal(
    result: AnalysisResult,
    console: Console,
    *,
    min_magnitude: float = 0.0,
    category: str | None = None,
    genes: Iterable[str] | None = None,
    source_min_magnitudes: dict[str, float] | None = None,
) -> int:
    """Print the filtered annotations of `result` as a Rich table.

    The keyword filters are handed to `result.filter`, the outcome goes
    through `rollup_gwas_duplicates`, and the remaining rows are printed on
    `console`. Returns how many annotations were printed.

    Per ADR-0003 (regulatory posture), source attribution has its own
    column on every row, so no rendered claim is unattributed.
    """
    rows = rollup_gwas_duplicates(
        result.filter(
            min_magnitude=min_magnitude,
            category=category,
            genes=genes,
            source_min_magnitudes=source_min_magnitudes,
        )
    )
    _print_table(rows, console)
    return len(rows)
46
+
47
+
48
def render_terminal_diff(
    diff: DiffResult,
    console: Console,
) -> int:
    """Print the diff headline and one table per non-empty bucket.

    Buckets are rendered in the order new, changed, removed. When
    `diff.has_changes` is false only the headline is printed (in green) and
    0 is returned; otherwise the return value is the number of rows printed
    across all tables.
    """
    from allelix.reports.diff import summarize_diff

    def _make_table(title, columns):
        # Each column is (header, keyword options for Table.add_column).
        grid = Table(title=title)
        for header, options in columns:
            grid.add_column(header, **options)
        return grid

    headline = summarize_diff(diff)
    if not diff.has_changes:
        console.print(f"[green]{headline}[/green]")
        return 0
    console.print(f"[bold]{headline}[/bold]")

    shown = 0

    if diff.new:
        added = _make_table(
            f"New Annotations ({len(diff.new)})",
            (
                ("rsID", {"style": "cyan", "no_wrap": True}),
                ("Gene", {"style": "magenta", "no_wrap": True}),
                ("Source", {"style": "blue", "no_wrap": True}),
                ("Significance", {"style": "yellow"}),
                ("Review Status", {"style": "dim"}),
                ("Magnitude", {"justify": "right"}),
                ("Genotype", {"no_wrap": True}),
                ("Condition", {"overflow": "fold"}),
            ),
        )
        for ann in diff.new:
            added.add_row(
                ann.rsid,
                ann.gene or "—",
                ann.attribution,
                ann.significance,
                ann.review_status or "—",
                f"{ann.magnitude:.1f}",
                ann.genotype_match,
                ann.condition or "—",
            )
        console.print(added)
        shown += len(diff.new)

    if diff.changed:
        modified = _make_table(
            f"Changed Annotations ({len(diff.changed)})",
            (
                ("rsID", {"style": "cyan", "no_wrap": True}),
                ("Gene", {"style": "magenta", "no_wrap": True}),
                ("Source", {"style": "blue", "no_wrap": True}),
                ("Old Sig", {"style": "dim"}),
                ("New Sig", {"style": "yellow"}),
                ("Review Status", {"style": "dim"}),
                ("Old Mag", {"justify": "right", "style": "dim"}),
                ("New Mag", {"justify": "right"}),
                ("Condition", {"overflow": "fold"}),
            ),
        )
        for change in diff.changed:
            now = change.current
            modified.add_row(
                now.rsid,
                now.gene or "—",
                now.attribution,
                change.previous_significance,
                now.significance,
                now.review_status or "—",
                f"{change.previous_magnitude:.1f}",
                f"{now.magnitude:.1f}",
                now.condition or "—",
            )
        console.print(modified)
        shown += len(diff.changed)

    if diff.removed:
        dropped = _make_table(
            f"Removed Annotations ({len(diff.removed)})",
            (
                ("rsID", {"style": "dim cyan", "no_wrap": True}),
                ("Gene", {"style": "dim magenta", "no_wrap": True}),
                ("Source", {"style": "dim blue", "no_wrap": True}),
                ("Significance", {"style": "dim"}),
                ("Review Status", {"style": "dim"}),
                ("Magnitude", {"justify": "right", "style": "dim"}),
                ("Condition", {"overflow": "fold", "style": "dim"}),
            ),
        )
        # Removed entries are plain dicts, so every field is read with a default.
        for entry in diff.removed:
            dropped.add_row(
                entry.get("rsid", ""),
                entry.get("gene", "") or "—",
                entry.get("attribution", ""),
                entry.get("significance", ""),
                entry.get("review_status", "") or "—",
                f"{entry.get('magnitude', 0.0):.1f}",
                entry.get("condition", "") or "—",
            )
        console.print(dropped)
        shown += len(diff.removed)

    return shown
136
+
137
+
138
+ def _format_freq(af: float | None) -> str:
139
+ if af is None:
140
+ return "—"
141
+ pct = af * 100
142
+ if pct < 0.01:
143
+ return "<0.01%"
144
+ return f"{pct:.2f}%"
145
+
146
+
147
def _print_table(filtered: list[Annotation], console: Console) -> None:
    """Print `filtered` as one Rich table on `console`.

    An empty list prints a yellow notice instead of a table. The Freq, AM and
    CADD columns are only added when at least one row has a value for them.
    AM scores on "pharmgkb" rows are dimmed and starred, and a footnote
    explaining the star is printed after the table.
    """
    if not filtered:
        console.print("[yellow]No annotations matched the current filters.[/yellow]")
        return

    show_freq = any(ann.allele_frequency is not None for ann in filtered)
    show_am = any(ann.am_pathogenicity is not None for ann in filtered)
    show_am_footnote = any(
        ann.am_pathogenicity is not None and ann.source == "pharmgkb" for ann in filtered
    )
    show_cadd = any(ann.cadd_phred is not None for ann in filtered)

    # Column order matters: the optional columns sit between Zygosity and
    # Condition, and the per-row cells below are appended in the same order.
    columns = [
        ("rsID", {"style": "cyan", "no_wrap": True}),
        ("Gene", {"style": "magenta", "no_wrap": True}),
        ("Source", {"style": "blue", "no_wrap": True}),
        ("Significance", {"style": "yellow"}),
        ("Review Status", {"style": "dim"}),
        ("Magnitude", {"justify": "right"}),
        ("Genotype", {"no_wrap": True}),
        ("Zygosity", {"no_wrap": True}),
    ]
    if show_freq:
        columns.append(("Freq", {"justify": "right", "no_wrap": True}))
    if show_am:
        columns.append(("AM", {"justify": "right", "no_wrap": True}))
    if show_cadd:
        columns.append(("CADD", {"justify": "right", "no_wrap": True}))
    columns.append(("Condition", {"overflow": "fold"}))

    grid = Table(title=f"Annotations ({len(filtered)})")
    for header, options in columns:
        grid.add_column(header, **options)

    for ann in filtered:
        cells = [
            ann.rsid,
            ann.gene or "—",
            ann.attribution,
            ann.significance,
            ann.review_status or "—",
            f"{ann.magnitude:.1f}",
            ann.genotype_match,
            ann.zygosity,
        ]
        if show_freq:
            cells.append(_format_freq(ann.allele_frequency))
        if show_am:
            score = ann.am_pathogenicity
            if score is None:
                cells.append("—")
            else:
                score_text = f"{score:.3f}"
                # Star + dim marks the rows the footnote refers to.
                if ann.source == "pharmgkb":
                    score_text = f"[dim]{score_text}*[/dim]"
                cells.append(score_text)
        if show_cadd:
            cells.append("—" if ann.cadd_phred is None else f"{ann.cadd_phred:.1f}")
        cells.append(ann.condition or "—")
        grid.add_row(*cells)

    console.print(grid)
    if show_am_footnote:
        console.print("[dim]* AM score on drug-response row — protein structure impact only[/dim]")
@@ -0,0 +1,3 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Utility modules: strand flipping, allele complement, etc."""
@@ -0,0 +1,87 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """Strand flipping, complement logic, and ambiguous-SNP detection.
4
+
5
+ A SNP read on the reverse strand has its alleles complemented (A↔T, C↔G).
6
+ Two databases reporting the "same" variant on opposite strands will list
7
+ opposite allele letters. For most SNPs this is unambiguous and reversible.
8
+ For A/T and C/G SNPs (palindromic), the complement equals the alternative —
9
+ so a strand-flip is undetectable from sequence alone and is best handled by
10
+ extra information (allele frequency, surrounding context).
11
+
12
+ ADR-0010 documents the design.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from allelix.models import NO_CALL_MARKER
18
+
19
+ _COMPLEMENT: dict[str, str] = {"A": "T", "T": "A", "C": "G", "G": "C"}
20
+
21
+ # A/T and C/G SNPs are palindromic; their complement equals the alternative,
22
+ # so strand orientation cannot be inferred from the alleles alone.
23
+ _AMBIGUOUS_PAIRS: frozenset[frozenset[str]] = frozenset(
24
+ {frozenset({"A", "T"}), frozenset({"C", "G"})}
25
+ )
26
+
27
+
28
def complement(allele: str) -> str:
    """Return the reverse-complement of one allele string.

    Each base is swapped through `_COMPLEMENT` (A<->T, C<->G) and the result
    is emitted in reverse order, which is what makes multi-base alleles
    (indels) come out right. An empty value and the no-call marker are handed
    back untouched, and characters with no entry in `_COMPLEMENT` are kept
    as they are.
    """
    if allele == NO_CALL_MARKER or not allele:
        return allele
    # A single base is just the one-element case of the reversed walk.
    return "".join(_COMPLEMENT.get(base, base) for base in allele[::-1])
40
+
41
+
42
def flip_genotype(allele1: str, allele2: str) -> tuple[str, str]:
    """Return the genotype as read from the opposite strand.

    Each allele is passed through `complement`; the order of the two alleles
    is kept.
    """
    flipped_first = complement(allele1)
    flipped_second = complement(allele2)
    return flipped_first, flipped_second
45
+
46
+
47
# Ordered (ref, alt) pairs whose two alleles are each other's complement.
_PALINDROMIC = frozenset({("A", "T"), ("T", "A"), ("C", "G"), ("G", "C")})


def resolve_strand(user_allele: str, gnomad_ref: str, gnomad_alt: str) -> str | None:
    """Return reference-forward allele, or None if ambiguous.

    Maps an array-reported allele to its reference-forward equivalent
    using gnomAD's ref/alt as the ground truth.

    Resolution order for single-base alleles:

    1. Palindromic sites (A/T or C/G ref/alt pairs) return None. At such a
       site an allele and its complement are both valid ref/alt letters, so
       a direct match says nothing about the strand.
    2. If the user allele matches ref or alt directly, it is already forward
       and is returned unchanged.
    3. If its complement matches ref or alt, the array was minus-strand and
       the complement is returned.
    4. Anything else (unknown letter, no match on either strand) returns None.

    Only operates on single-base alleles. Multi-base alleles (indels)
    pass through as-is — array indels are rare and not minus-strand
    reported.
    """
    if len(user_allele) != 1:
        return user_allele
    # This check must come before the direct match: for a palindromic pair,
    # every allele whose complement is in (ref, alt) is itself in (ref, alt),
    # so testing it later can never fire.
    if (gnomad_ref, gnomad_alt) in _PALINDROMIC:
        return None
    if user_allele in (gnomad_ref, gnomad_alt):
        return user_allele
    comp = _COMPLEMENT.get(user_allele)
    if comp is not None and comp in (gnomad_ref, gnomad_alt):
        return comp
    return None
75
+
76
+
77
def is_strand_ambiguous(ref: str, alt: str) -> bool:
    """Tell whether (ref, alt) is an A/T or C/G pair, i.e. strand is not inferable.

    Returns False when either allele is not exactly one character long or is
    not a key of `_COMPLEMENT` (multi-base indels, no-calls, unknown letters).
    """
    both_single_base = len(ref) == 1 and len(alt) == 1
    if not both_single_base:
        return False
    both_known = ref in _COMPLEMENT and alt in _COMPLEMENT
    return both_known and frozenset((ref, alt)) in _AMBIGUOUS_PAIRS