allelix 1.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. allelix/__init__.py +12 -0
  2. allelix/annotators/__init__.py +90 -0
  3. allelix/annotators/alphamissense.py +228 -0
  4. allelix/annotators/base.py +214 -0
  5. allelix/annotators/cadd.py +283 -0
  6. allelix/annotators/clinvar.py +404 -0
  7. allelix/annotators/gnomad.py +212 -0
  8. allelix/annotators/gwas.py +354 -0
  9. allelix/annotators/pharmgkb.py +406 -0
  10. allelix/annotators/snpedia.py +276 -0
  11. allelix/cli.py +1524 -0
  12. allelix/compare.py +149 -0
  13. allelix/config.py +143 -0
  14. allelix/data/__init__.py +3 -0
  15. allelix/data/high_value_snps.yaml +64 -0
  16. allelix/databases/__init__.py +30 -0
  17. allelix/databases/_versions.py +16 -0
  18. allelix/databases/alphamissense_loader.py +48 -0
  19. allelix/databases/cadd_loader.py +49 -0
  20. allelix/databases/cpic_loader.py +234 -0
  21. allelix/databases/gnomad_loader.py +49 -0
  22. allelix/databases/gwas_loader.py +546 -0
  23. allelix/databases/loader_utils.py +80 -0
  24. allelix/databases/manager.py +515 -0
  25. allelix/databases/pharmgkb_loader.py +437 -0
  26. allelix/databases/schema.py +165 -0
  27. allelix/databases/snpedia_loader.py +44 -0
  28. allelix/databases/snpedia_parser.py +342 -0
  29. allelix/exporters/__init__.py +3 -0
  30. allelix/exporters/plink.py +144 -0
  31. allelix/models.py +117 -0
  32. allelix/parsers/__init__.py +73 -0
  33. allelix/parsers/_helpers.py +41 -0
  34. allelix/parsers/ancestrydna.py +130 -0
  35. allelix/parsers/base.py +97 -0
  36. allelix/parsers/ftdna.py +129 -0
  37. allelix/parsers/livingdna.py +121 -0
  38. allelix/parsers/myhappygenes.py +135 -0
  39. allelix/parsers/myheritage.py +118 -0
  40. allelix/parsers/twentythreeandme.py +150 -0
  41. allelix/py.typed +0 -0
  42. allelix/reports/__init__.py +40 -0
  43. allelix/reports/_pipeline.py +497 -0
  44. allelix/reports/diff.py +169 -0
  45. allelix/reports/high_value.py +133 -0
  46. allelix/reports/html.py +1130 -0
  47. allelix/reports/json_report.py +163 -0
  48. allelix/reports/methylation.py +50 -0
  49. allelix/reports/terminal.py +203 -0
  50. allelix/utils/__init__.py +3 -0
  51. allelix/utils/allele.py +87 -0
  52. allelix/utils/build_detect.py +203 -0
  53. allelix-1.8.1.dist-info/METADATA +276 -0
  54. allelix-1.8.1.dist-info/RECORD +58 -0
  55. allelix-1.8.1.dist-info/WHEEL +5 -0
  56. allelix-1.8.1.dist-info/entry_points.txt +2 -0
  57. allelix-1.8.1.dist-info/licenses/LICENSE +671 -0
  58. allelix-1.8.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,212 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """gnomAD population frequency enrichment.
4
+
5
+ gnomAD is not a clinical annotator — it does not produce Annotation
6
+ objects. It enriches existing annotations with population allele
7
+ frequency context. The pipeline calls ``bulk_lookup()`` after all
8
+ annotators have run, and stamps each annotation's ``allele_frequency``
9
+ field.
10
+
11
+ License: ODbL v1.0 (Open Database License). We extract only rsID +
12
+ allele frequencies (no SpliceAI or other restrictively licensed fields).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import sqlite3
19
+ from typing import TYPE_CHECKING, ClassVar
20
+
21
+ from allelix.annotators.base import Annotator, LicenseDescriptor
22
+ from allelix.databases._versions import GNOMAD_SCHEMA_VERSION
23
+ from allelix.databases.gnomad_loader import (
24
+ GNOMAD_CACHE_URL,
25
+ GNOMAD_DB_FILENAME,
26
+ GNOMAD_EXPECTED_SHA256,
27
+ install_prebuilt_cache,
28
+ )
29
+ from allelix.databases.manager import (
30
+ download,
31
+ get_database_info,
32
+ verify_file_hash,
33
+ )
34
+
35
+ if TYPE_CHECKING:
36
+ from pathlib import Path
37
+
38
+ from allelix.models import Annotation, Variant
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ _BULK_BATCH_SIZE = 900  # max rsIDs bound per batched query; bulk lookups slice their input by this (by-alt lookups use half, since each key binds two parameters)
43
+
44
+
45
class GnomadAnnotator(Annotator):
    """Population frequency enrichment from gnomAD.

    Subclasses Annotator for ``db update`` / ``db status`` / ``is_ready()``
    integration. ``annotate()`` always returns ``[]`` — gnomAD does not
    participate in the per-variant annotation loop. Frequencies and
    coordinates are read from the local SQLite cache through ``lookup``
    and the ``bulk_*`` methods.
    """

    name: ClassVar[str] = "gnomad"
    display_name: ClassVar[str] = "gnomAD"
    attribution: ClassVar[str] = "gnomAD"
    requires_download: ClassVar[bool] = True
    server_driven_freshness: ClassVar[bool] = False
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="ODbL-1.0",
        license_url="https://opendatacommons.org/licenses/odbl/1-0/",
        attribution_text=("Population frequencies sourced from gnomAD, used under ODbL v1.0."),
        source_url="https://gnomad.broadinstitute.org",
        commercial_ok=True,
    )

    def __init__(self, data_dir: Path) -> None:
        """Bind to the data directory; the SQLite connection is opened lazily."""
        super().__init__(data_dir)
        self._db_path = data_dir / GNOMAD_DB_FILENAME
        self._conn: sqlite3.Connection | None = None

    def _connection(self) -> sqlite3.Connection:
        """Return the cached connection, opening it on first use.

        Raises:
            FileNotFoundError: the cache file does not exist yet. Checked
                up front because ``sqlite3.connect`` would otherwise create
                an empty database at that path.
        """
        if self._conn is None:
            if not self._db_path.exists():
                raise FileNotFoundError(
                    f"gnomAD cache not found at {self._db_path}. Run `allelix db update` first."
                )
            self._conn = sqlite3.connect(self._db_path)
        return self._conn

    def setup(self) -> None:
        """Download the pre-built gnomAD exome frequency cache from HuggingFace.

        The archive is staged as ``gnomad.sqlite.gz`` in the data directory,
        checked against the pinned SHA-256, and handed to
        ``install_prebuilt_cache``. The staged archive is removed afterwards
        whether or not verification/installation succeeded, so a failed
        update does not leave the download behind.
        """
        gz_path = self.data_dir / "gnomad.sqlite.gz"
        download(GNOMAD_CACHE_URL, gz_path)
        try:
            verify_file_hash(gz_path, "sha256", GNOMAD_EXPECTED_SHA256)
            install_prebuilt_cache(
                gz_path,
                self._db_path,
                source_url=GNOMAD_CACHE_URL,
            )
        finally:
            # Best-effort cleanup: a file that is already gone is fine, any
            # other failure is only worth a warning.
            try:
                gz_path.unlink()
            except FileNotFoundError:
                pass
            except OSError:
                logger.warning("Could not remove staged file at %s", gz_path)

    def is_ready(self) -> bool:
        """True when the gnomAD SQLite cache exists with current schema version.

        A cache whose ``local_version_tag`` is missing or empty is accepted
        as ready; only a non-empty tag that differs from the current schema
        tag makes the cache stale.
        """
        info = get_database_info(self._db_path, "gnomad")
        if info is None:
            return False
        tag = info.get("local_version_tag") or ""
        return tag == f"sv:{GNOMAD_SCHEMA_VERSION}" or not tag

    def version(self) -> str | None:
        """Return the cached database version, or None."""
        info = get_database_info(self._db_path, "gnomad")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Return the number of rsIDs in the cache, or None."""
        info = get_database_info(self._db_path, "gnomad")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection if open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Code-driven source — no runtime freshness probe (ADR-0030)."""
        return None

    def cached_remote_signal(self) -> str | None:
        """Code-driven source — no cached signal to compare (ADR-0030)."""
        return None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Not used — gnomAD enriches, does not annotate. Always returns []."""
        return []

    def lookup(self, rsid: str) -> float | None:
        """Return global allele frequency for a single rsID, or None.

        Uses ``MAX(af)`` across all rows stored for the rsID.
        """
        conn = self._connection()
        row = conn.execute(
            "SELECT MAX(af) FROM gnomad_frequencies WHERE rsid = ?", (rsid,)
        ).fetchone()
        return row[0] if row else None

    def bulk_lookup(self, rsids: set[str]) -> dict[str, float]:
        """Return ``{rsid: af}`` for all rsIDs found in the cache.

        Fallback for annotations without a known alt allele. Uses MAX to
        resolve multi-allelic sites. Prefer ``bulk_lookup_by_alt`` when alt
        is available.

        Batches into chunks of 900 to stay within SQLite's variable limit.
        rsIDs whose aggregated frequency is NULL are left out of the result.
        """
        if not rsids:
            return {}
        conn = self._connection()
        result: dict[str, float] = {}
        rsid_list = list(rsids)
        for i in range(0, len(rsid_list), _BULK_BATCH_SIZE):
            batch = rsid_list[i : i + _BULK_BATCH_SIZE]
            # Only "?" markers are interpolated; the values stay bound parameters.
            placeholders = ",".join("?" * len(batch))
            rows = conn.execute(
                f"SELECT rsid, MAX(af) FROM gnomad_frequencies"
                f" WHERE rsid IN ({placeholders}) GROUP BY rsid",
                batch,
            ).fetchall()
            for rsid, af in rows:
                if af is not None:
                    result[rsid] = af
        return result

    def bulk_resolve_coordinates(
        self, rsids: set[str]
    ) -> dict[str, list[tuple[str, int, str, str]]]:
        """Return ``{rsid: [(chrom, pos, ref, alt), ...]}`` from the gnomAD cache.

        Maps rsIDs to genomic coordinates for coordinate-based lookups
        (CADD, future VCF-keyed sources). Multi-allelic sites return
        multiple tuples per rsid.
        """
        if not rsids:
            return {}
        conn = self._connection()
        result: dict[str, list[tuple[str, int, str, str]]] = {}
        rsid_list = list(rsids)
        for i in range(0, len(rsid_list), _BULK_BATCH_SIZE):
            batch = rsid_list[i : i + _BULK_BATCH_SIZE]
            placeholders = ",".join("?" * len(batch))
            rows = conn.execute(
                f"SELECT rsid, chrom, pos, ref, alt FROM gnomad_frequencies"
                f" WHERE rsid IN ({placeholders})",
                batch,
            ).fetchall()
            for rsid, chrom, pos, ref, alt in rows:
                result.setdefault(rsid, []).append((chrom, pos, ref, alt))
        return result

    def bulk_lookup_by_alt(self, keys: set[tuple[str, str]]) -> dict[tuple[str, str], float]:
        """Return ``{(rsid, alt): af}`` for exact allele matches.

        Rows with a NULL frequency are skipped.
        """
        if not keys:
            return {}
        conn = self._connection()
        result: dict[tuple[str, str], float] = {}
        key_list = list(keys)
        # Each key binds two parameters, so halve the batch to keep the
        # per-query parameter count at or below _BULK_BATCH_SIZE.
        batch_size = _BULK_BATCH_SIZE // 2
        for i in range(0, len(key_list), batch_size):
            batch = key_list[i : i + batch_size]
            clauses = " OR ".join(["(rsid = ? AND alt = ?)"] * len(batch))
            params = [v for rsid, alt in batch for v in (rsid, alt)]
            rows = conn.execute(
                f"SELECT rsid, alt, af FROM gnomad_frequencies WHERE {clauses}",
                params,
            ).fetchall()
            for rsid, alt, af in rows:
                if af is not None:
                    result[(rsid, alt)] = af
        return result
@@ -0,0 +1,354 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 dial481
3
+ """GWAS Catalog annotator. Source-attributed trait associations."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import contextlib
8
+ import logging
9
+ import sqlite3
10
+ import zipfile
11
+ from typing import TYPE_CHECKING, ClassVar
12
+
13
+ from allelix.annotators.base import Annotator, LicenseDescriptor
14
+ from allelix.databases.gwas_loader import (
15
+ _CATEGORIZER_VERSION,
16
+ _REQUIRED_GWAS_COLUMNS,
17
+ GWAS_CATALOG_URL,
18
+ GWAS_DB_FILENAME,
19
+ load_gwas_tsv,
20
+ schema_is_current,
21
+ )
22
+ from allelix.databases.manager import (
23
+ _ensure_local_version_tag_column,
24
+ download,
25
+ get_database_info,
26
+ head_request_headers,
27
+ )
28
+ from allelix.models import Annotation
29
+
30
+ _EXCLUDED_TRAIT_CATEGORIES = frozenset(  # trait_category values that annotate() skips while trait filtering is enabled
31
+ {
32
+ "body_measurement",
33
+ "lipid_measurement",
34
+ "hematological_measurement",
35
+ "other_measurement",
36
+ "behavioral",
37
+ }
38
+ )
39
+
40
+ _MUST_INCLUDE_RSIDS = frozenset(  # rsIDs whose annotations are emitted with is_must_include=True
41
+ {
42
+ "rs10737680", # CFH — age-related macular degeneration
43
+ "rs11209026", # IL23R — inflammatory bowel disease
44
+ "rs9271366", # HLA-DRB1 — multiple sclerosis
45
+ }
46
+ )
47
+
48
+ if TYPE_CHECKING:
49
+ from pathlib import Path
50
+
51
+ from allelix.models import Variant
52
+
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
def _magnitude(p_value: float | None, or_beta: float | None) -> float:
    """Score an association from its p-value, nudged by effect size.

    The p-value picks a base tier (2.0 when absent or not below any
    cutoff). A positive ``or_beta`` far from 1 in either direction adds
    0.5 or 1.0 on top, with the total capped at 9.0.
    """
    # Cutoffs ordered from most to least significant; first hit wins.
    tiers = (
        (5e-100, 8.0),
        (5e-20, 7.0),
        (5e-8, 6.0),
        (5e-6, 4.0),
        (5e-4, 3.0),
    )
    score = 2.0
    if p_value is not None:
        for cutoff, tier in tiers:
            if p_value < cutoff:
                score = tier
                break

    # Missing or non-positive effect sizes leave the base score untouched.
    if or_beta is None or not or_beta > 0:
        return score

    if or_beta >= 3.0 or or_beta <= 0.33:
        bonus = 1.0
    elif or_beta >= 2.0 or or_beta <= 0.5:
        bonus = 0.5
    else:
        return score
    return min(score + bonus, 9.0)
80
+
81
+
82
+ _UNKNOWN_RISK_ALLELE_MAG_CAP = 3.0  # ceiling applied via min() to the magnitude of rows that carry no risk allele
83
+
84
+
85
class GWASCatalogAnnotator(Annotator):
    """Annotates variants with GWAS Catalog trait associations.

    Associations are read from a local SQLite cache built from the GWAS
    Catalog associations TSV. The extracted TSV is kept in the data
    directory so ``is_ready()`` can re-ingest it without a new download.
    """

    name: ClassVar[str] = "gwas"
    display_name: ClassVar[str] = "GWAS Catalog"
    attribution: ClassVar[str] = "GWAS Catalog"
    requires_download: ClassVar[bool] = True
    license: ClassVar[LicenseDescriptor] = LicenseDescriptor(
        spdx="custom-embl-ebi",
        license_url="https://www.ebi.ac.uk/gwas/docs/about",
        attribution_text=(
            "GWAS Catalog data sourced from NHGRI-EBI GWAS Catalog,"
            " available under EMBL-EBI Terms of Use."
        ),
        source_url="https://www.ebi.ac.uk/gwas/",
        commercial_ok=True,
    )

    def __init__(self, data_dir: Path, *, filter_traits: bool = True) -> None:
        """Initialize with path to the data directory.

        Args:
            data_dir: directory holding the GWAS cache and staged downloads.
            filter_traits: when True, ``annotate()`` skips rows whose
                ``trait_category`` is in ``_EXCLUDED_TRAIT_CATEGORIES``.
        """
        super().__init__(data_dir)
        self._db_path = data_dir / GWAS_DB_FILENAME
        self._conn: sqlite3.Connection | None = None
        self._filter_traits = filter_traits

    def _connection(self) -> sqlite3.Connection:
        """Return the cached SQLite connection, opening it on first use."""
        if self._conn is None:
            self._conn = sqlite3.connect(self._db_path)
        return self._conn

    def setup(self) -> None:
        """Download GWAS Catalog associations ZIP, extract TSV, and ingest.

        Raises:
            RuntimeError: the remote freshness signal could not be obtained,
                or the downloaded archive contains no ``.tsv`` member.
        """
        url = GWAS_CATALOG_URL
        signal = self.fetch_remote_signal()
        if signal is None:
            msg = (
                "gwas: cannot verify remote freshness signal. "
                "Refresh aborted to avoid persisting an incomplete cache stamp. "
                "Retry, or pass --force if you accept that next `db update` "
                "will re-download to re-establish the signal."
            )
            raise RuntimeError(msg)
        zip_path = self.data_dir / "gwas_catalog_associations.zip"
        tsv_path = self.data_dir / "gwas_catalog_associations.tsv"
        # No content-hash verification: EBI publishes no checksum for this
        # file and the content is mutable, so there is nothing to pin or
        # fetch. TLS + Content-Length truncation guard only. See ADR-0029.
        download(url, zip_path)
        try:
            with zipfile.ZipFile(zip_path) as zf:
                tsv_names = [n for n in zf.namelist() if n.endswith(".tsv")]
                if not tsv_names:
                    msg = f"No .tsv file found in {zip_path}"
                    raise RuntimeError(msg)
                zf.extract(tsv_names[0], self.data_dir)
            extracted = self.data_dir / tsv_names[0]
            if extracted != tsv_path:
                # replace(), not rename(): the TSV from a previous update is
                # kept on disk, and rename() refuses to overwrite an existing
                # target on Windows.
                extracted.replace(tsv_path)
            load_gwas_tsv(tsv_path, self._db_path, source_url=url, remote_signal=signal)
        finally:
            # The ZIP is only a staging artifact; drop it on success and failure.
            try:
                zip_path.unlink()
            except FileNotFoundError:
                pass
            except OSError:
                logger.warning("Could not remove staged file at %s", zip_path)

    def is_ready(self) -> bool:
        """Return True when the local GWAS cache exists and has current schema.

        Handles three states:

        1. Current tag in ``local_version_tag`` — ready.
        2. No tag (legacy cache or ``|cv:`` still in ``remote_signal``) —
           one-shot migration via ``_stamp_existing_gwas_cache``.
        3. Stale tag (categorizer bumped) — auto-reingest from cached TSV
           if still on disk.
        """
        info = get_database_info(self._db_path, "gwas")
        if info is None:
            return False
        tag = info.get("local_version_tag") or ""
        if tag == f"cv:{_CATEGORIZER_VERSION}":
            return _has_current_gwas_columns(self._db_path)
        if not tag and _stamp_existing_gwas_cache(self._db_path):
            return _has_current_gwas_columns(self._db_path)
        tsv_path = self.data_dir / "gwas_catalog_associations.tsv"
        if tsv_path.exists():
            print(
                "GWAS categorizer changed — re-ingesting from cached TSV...",
                flush=True,
            )
            try:
                load_gwas_tsv(
                    tsv_path,
                    self._db_path,
                    source_url=GWAS_CATALOG_URL,
                    remote_signal=self.cached_remote_signal(),
                )
            except Exception:
                # Readiness probe: a failed re-ingest means "not ready",
                # it must not crash the caller.
                logger.warning("Auto-reingest from cached TSV failed", exc_info=True)
                return False
            return schema_is_current(self._db_path)
        return False

    def version(self) -> str | None:
        """Return the cached database version string, or None."""
        info = get_database_info(self._db_path, "gwas")
        return info["version"] if info else None

    def record_count(self) -> int | None:
        """Return the number of cached GWAS association records, or None."""
        info = get_database_info(self._db_path, "gwas")
        return info["record_count"] if info else None

    def close(self) -> None:
        """Close the SQLite connection if open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def fetch_remote_signal(self) -> str | None:
        """Probe the GWAS Catalog URL for ETag or Last-Modified.

        Returns ``"etag:<value>"`` when an ETag header is present, otherwise
        ``"lm:<value>"`` for Last-Modified, otherwise None.
        """
        headers = head_request_headers(GWAS_CATALOG_URL)
        if headers is None:
            return None
        etag = headers.get("ETag") or headers.get("Etag")
        last_modified = headers.get("Last-Modified") or headers.get("Last-modified")
        if etag:
            return f"etag:{etag.strip()}"
        if last_modified:
            return f"lm:{last_modified.strip()}"
        return None

    def cached_remote_signal(self) -> str | None:
        """Return the remote signal stored during the last successful ingest.

        Missing metadata and an empty stored signal both yield None.
        """
        info = get_database_info(self._db_path, "gwas")
        if not info:
            return None
        return info["remote_signal"] or None

    def annotate(self, variant: Variant) -> list[Annotation]:
        """Return GWAS Catalog annotations for variants the user carries.

        Carrier matching uses the risk allele when specified. When the
        GWAS entry doesn't specify a risk allele, the annotation fires
        on rsid match alone with a magnitude penalty.
        """
        if variant.is_no_call:
            return []

        sql = (
            "SELECT risk_allele, trait, p_value, or_beta, gene, "
            "study_accession, pubmed_id, trait_category "
            "FROM gwas_associations WHERE rsid = ?"
        )
        rows = self._connection().execute(sql, (variant.rsid,)).fetchall()
        annotations: list[Annotation] = []
        user_diploid = _user_diploid(variant)

        for row in rows:
            (
                risk_allele,
                trait,
                p_value,
                or_beta,
                gene,
                study_accession,
                pubmed_id,
                trait_category,
            ) = row

            if self._filter_traits and trait_category in _EXCLUDED_TRAIT_CATEGORIES:
                continue

            if risk_allele is not None:
                # Carrier rule: at least one of the user's alleles must be
                # the study's risk allele.
                if variant.allele1 != risk_allele and variant.allele2 != risk_allele:
                    continue
                mag = _magnitude(p_value, or_beta)
                risk_note = ""
            else:
                # ADR-0024: unknown risk allele fires on rsID match alone
                # but capped at 3.0 so it doesn't pass typical --min-magnitude
                # thresholds. Without knowing which allele is the risk allele,
                # we can't apply the carrier rule (ADR-0007).
                mag = min(_magnitude(p_value, or_beta), _UNKNOWN_RISK_ALLELE_MAG_CAP)
                risk_note = " (risk allele not specified in study)"

            p_str = f"p={p_value:.1e}" if p_value is not None else "p=N/A"
            gene_str = gene or "—"
            description = f"GWAS Catalog: {trait} ({p_str}, gene: {gene_str}){risk_note}"

            references: list[str] = []
            if pubmed_id:
                references.append(f"pubmed:{pubmed_id}")
            if study_accession:
                references.append(f"gwas:{study_accession}")

            annotations.append(
                Annotation(
                    source=self.name,
                    rsid=variant.rsid,
                    significance="gwas_association",
                    category="trait",
                    magnitude=mag,
                    description=description,
                    attribution=self.attribution,
                    genotype_match=user_diploid,
                    references=references,
                    condition=trait,
                    gene=gene or "",
                    alt="",
                    is_must_include=variant.rsid in _MUST_INCLUDE_RSIDS,
                )
            )
        return annotations
301
+
302
+
303
def _user_diploid(variant: Variant) -> str:
    """Render the user's genotype as a string.

    Two single-character alleles are joined in sorted order with no
    separator; anything else is rendered as ``allele1/allele2`` in the
    original order.
    """
    first = variant.allele1
    second = variant.allele2
    is_snv_pair = len(first) == 1 and len(second) == 1
    if not is_snv_pair:
        return f"{first}/{second}"
    ordered = sorted((first, second))
    return "".join(ordered)
309
+
310
+
311
def _has_current_gwas_columns(db_path: Path) -> bool:
    """Report whether ``gwas_associations`` has every required column.

    Any SQLite database error while opening or inspecting the file is
    treated as "columns not present" and yields False.
    """
    try:
        with contextlib.closing(sqlite3.connect(db_path)) as db:
            table_info = db.execute("PRAGMA table_info(gwas_associations)").fetchall()
    except sqlite3.DatabaseError:
        return False
    # Column name is the second field of each table_info row.
    present = {column[1] for column in table_info}
    return _REQUIRED_GWAS_COLUMNS.issubset(present)
319
+
320
+
321
def _stamp_existing_gwas_cache(db_path: Path) -> bool:
    """One-shot migration: stamp ``local_version_tag`` on a GWAS cache.

    Handles legacy caches with ``|cv:N`` baked into ``remote_signal``
    by moving the tag and cleaning the signal. Returns True if the
    current categorizer version is now stamped.

    Returns False when the file or its ``gwas`` metadata row is missing,
    when a different non-empty tag is already stamped, or when SQLite
    reports a database error.
    """
    if not db_path.exists():
        return False
    tag = f"cv:{_CATEGORIZER_VERSION}"
    try:
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            _ensure_local_version_tag_column(conn)
            row = conn.execute(
                "SELECT remote_signal, local_version_tag FROM database_versions WHERE name='gwas'"
            ).fetchone()
            if not row:
                return False
            sig, existing_tag = row
            if existing_tag == tag:
                return True
            # Only a real, different tag blocks the migration. An empty
            # string counts as "no tag", matching how is_ready() decides
            # whether to call this function.
            if existing_tag:
                return False
            # Legacy signals carry the categorizer version as a "|cv:N"
            # suffix; keep only the part before it.
            clean_signal = (sig or "").split("|cv:")[0]
            conn.execute(
                "UPDATE database_versions "
                "SET remote_signal = ?, local_version_tag = ? "
                "WHERE name = 'gwas'",
                (clean_signal, tag),
            )
            conn.commit()
            return True
    except sqlite3.DatabaseError:
        # OperationalError is a DatabaseError subclass, so this covers both.
        return False