vcfclick 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ """Annotation service. Single tier, single license.
2
+
3
+ from annotations import position_for_gene, gene_at, clinvar_lookup
4
+
5
+ The transcript / exon / CDS / UTR hierarchy lives in
6
+ annotations/transcripts.py (Phase 2; not yet implemented).
7
+ """
8
+
9
+ from annotations.db import (
10
+ GeneRange,
11
+ ClinVarRecord,
12
+ position_for_gene,
13
+ gene_at,
14
+ clinvar_lookup,
15
+ )
16
+
17
+ __all__ = [
18
+ "GeneRange",
19
+ "ClinVarRecord",
20
+ "position_for_gene",
21
+ "gene_at",
22
+ "clinvar_lookup",
23
+ ]
annotations/db.py ADDED
@@ -0,0 +1,144 @@
1
+ """Annotation service backed by DuckDB.
2
+
3
+ Holds STATIC PUBLIC reference data: gene coordinates (GENCODE GFF3) and
4
+ ClinVar significance. Transcript / exon / CDS hierarchy lands in
5
+ annotations/transcripts.py (Phase 2). All tiers ship under the same
6
+ OSS license as the engine.
7
+
8
+ Architectural role: the two stores (ClickHouse for sample data, DuckDB
9
+ for reference data) are intentionally separated so the MCP server
10
+ composes across them at query time. Reference data updates monthly
11
+ (ClinVar) without touching the sample store; sample data grows without
12
+ touching the reference store. This separation is *operationally* useful
13
+ for everyone, and would also be the basis for a security boundary if
14
+ the hosted tier ever takes on regulated workloads.
15
+
16
+ DuckDB is the right tool here for three reasons:
17
+ 1. Embedded — no extra process, no network hop from the MCP server.
18
+ 2. Vectorised — gene overlap queries and ClinVar joins are fast.
19
+ 3. Public data — the curated DuckDB file can be shipped as a
20
+ downloadable artefact alongside the OSS package.
21
+
22
+ Data source: GENCODE GFF3 + ClinVar VCF, GRCh38 only. Loaders live
23
+ under annotations/loaders/ (not yet written) and are idempotent batch
24
+ jobs that run on a cron, not part of the query path.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ from dataclasses import dataclass
30
+ from pathlib import Path
31
+
32
+ import duckdb
33
+
34
+
35
# The DuckDB file sits next to this module inside the package directory.
DUCKDB_PATH = Path(__file__).parent / "annotations.duckdb"


# Schema for the reference store. Every statement is IF NOT EXISTS, so the
# whole script can be re-run on each connection without side effects.
# The column comments describe what the GENCODE gene loader actually writes.
SCHEMA_DDL = """
CREATE TABLE IF NOT EXISTS refseq_genes (
    gene_symbol  VARCHAR PRIMARY KEY,  -- HGNC symbol, e.g. 'BRCA1'
    chrom        VARCHAR NOT NULL,     -- 'chr17' style
    start_pos    UINTEGER NOT NULL,    -- 1-based, inclusive
    end_pos      UINTEGER NOT NULL,    -- 1-based, inclusive
    strand       VARCHAR,              -- '+' or '-'
    refseq_id    VARCHAR,              -- source gene ID; the GENCODE loader stores the Ensembl gene_id here
    description  VARCHAR               -- the GENCODE loader stores gene_type here
);

CREATE INDEX IF NOT EXISTS idx_genes_range
    ON refseq_genes (chrom, start_pos, end_pos);

CREATE TABLE IF NOT EXISTS clinvar_variants (
    chrom          VARCHAR NOT NULL,
    pos            UINTEGER NOT NULL,
    ref            VARCHAR NOT NULL,
    alt            VARCHAR NOT NULL,
    clin_sig       VARCHAR,            -- 'Pathogenic', 'Likely_pathogenic', ...
    review_status  VARCHAR,            -- ClinVar gold-stars equivalent
    clinvar_id     VARCHAR,            -- VCV accession
    condition      VARCHAR,            -- semicolon-joined trait names
    PRIMARY KEY (chrom, pos, ref, alt)
);
"""
64
+
65
+
66
def get_connection() -> duckdb.DuckDBPyConnection:
    """Open the DuckDB annotation store, creating the schema if needed.

    Connects to the database file at ``DUCKDB_PATH`` and runs ``SCHEMA_DDL``.
    Every statement in the DDL uses ``IF NOT EXISTS``, so calling this
    repeatedly is safe.

    Returns:
        An open connection. The caller owns it and should close it.

    Raises:
        Whatever ``duckdb.connect`` or ``execute`` raises. If the DDL fails,
        the connection is closed before the error propagates.
    """
    conn = duckdb.connect(str(DUCKDB_PATH))
    try:
        conn.execute(SCHEMA_DDL)
    except Exception:
        # Don't leak the handle when initialisation fails.
        conn.close()
        raise
    return conn
71
+
72
+
73
@dataclass(frozen=True)
class GeneRange:
    """Immutable coordinates of one gene, as read from ``refseq_genes``.

    Field order matches the SELECT column order used by the lookup
    functions, which build instances positionally from result rows.
    """

    gene_symbol: str    # gene symbol, e.g. 'BRCA1'
    chrom: str          # contig name, e.g. 'chr17'
    start_pos: int      # first base of the gene (1-based, inclusive)
    end_pos: int        # last base of the gene (1-based, inclusive)
    strand: str | None  # '+' or '-'; the column is nullable
80
+
81
+
82
@dataclass(frozen=True)
class ClinVarRecord:
    """Immutable ClinVar entry for one allele, as read from ``clinvar_variants``.

    The first four fields form the table's primary key. The rest are
    nullable annotation columns. Field order matches the SELECT column
    order in ``clinvar_lookup``, which builds instances positionally.
    """

    chrom: str                 # contig name
    pos: int                   # variant position
    ref: str                   # reference allele
    alt: str                   # alternate allele
    clin_sig: str | None       # e.g. 'Pathogenic', 'Likely_pathogenic'
    review_status: str | None  # ClinVar review status
    clinvar_id: str | None     # VCV accession
    condition: str | None      # semicolon-joined trait names
92
+
93
+
94
def position_for_gene(symbol: str) -> GeneRange | None:
    """Translate a gene symbol to GRCh38 coordinates.

    Used by the LLM to convert questions like "calls in BRCA1" into a
    range filter on the ClickHouse genotypes table.

    The match is case-insensitive on both sides. Symbols are stored exactly
    as the loader read them, and some official symbols are mixed-case
    (e.g. 'C1orf112'), so upper-casing only the query would never find
    them. If several stored symbols differ only by case, an exact-case
    match is preferred.

    Args:
        symbol: Gene symbol in any letter case; surrounding whitespace is
            ignored.

    Returns:
        The gene's range, or ``None`` when the symbol is not in the table.
    """
    wanted = symbol.strip()
    conn = get_connection()
    try:
        row = conn.execute(
            """
            SELECT gene_symbol, chrom, start_pos, end_pos, strand
            FROM refseq_genes
            WHERE upper(gene_symbol) = ?
            ORDER BY (gene_symbol = ?) DESC, gene_symbol
            LIMIT 1
            """,
            [wanted.upper(), wanted],
        ).fetchone()
    finally:
        # One connection per call: always release it, even on query errors.
        conn.close()
    return GeneRange(*row) if row else None
110
+
111
+
112
def gene_at(chrom: str, pos: int) -> list[GeneRange]:
    """Return every gene whose range contains a single position.

    Multiple results are possible (overlapping or antisense genes).

    Args:
        chrom: Contig name, compared verbatim against the ``chrom`` column.
        pos: Position tested against each gene's inclusive
            ``start_pos``..``end_pos`` range.

    Returns:
        Matching genes ordered by ``start_pos``; an empty list when none
        overlap.
    """
    conn = get_connection()
    try:
        rows = conn.execute(
            """
            SELECT gene_symbol, chrom, start_pos, end_pos, strand
            FROM refseq_genes
            WHERE chrom = ? AND ? BETWEEN start_pos AND end_pos
            ORDER BY start_pos
            """,
            [chrom, pos],
        ).fetchall()
    finally:
        # One connection per call: always release it, even on query errors.
        conn.close()
    return [GeneRange(*r) for r in rows]
126
+
127
+
128
def clinvar_lookup(chrom: str, pos: int, ref: str, alt: str) -> ClinVarRecord | None:
    """Look up ClinVar significance for a specific allele.

    All four arguments are matched exactly against the table's primary key
    ``(chrom, pos, ref, alt)``.

    Returns:
        The matching record, or ``None`` if the variant is not in ClinVar.
        The caller should distinguish ``None`` from "benign" in user-facing
        output.
    """
    conn = get_connection()
    try:
        row = conn.execute(
            """
            SELECT chrom, pos, ref, alt, clin_sig, review_status,
                   clinvar_id, condition
            FROM clinvar_variants
            WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ?
            """,
            [chrom, pos, ref, alt],
        ).fetchone()
    finally:
        # One connection per call: always release it, even on query errors.
        conn.close()
    return ClinVarRecord(*row) if row else None
@@ -0,0 +1 @@
1
+ """Reference-data loaders (GENCODE genes, ClinVar, etc.) → DuckDB."""
@@ -0,0 +1,167 @@
1
+ """GENCODE GFF3 → DuckDB refseq_genes table.
2
+
3
+ GENCODE is used (not RefSeq's NCBI GFF) because:
4
+ - It's GRCh38-native with 'chr'-prefixed contig names — matches our
5
+ sample-data convention without any contig-name remapping.
6
+ - Single download, clean attribute format, ubiquitous in research
7
+ bioinformatics.
8
+ - Same gene symbols as HGNC (which is what bioinformaticians type).
9
+
10
+ Only `gene` feature rows are extracted — we need name + coordinates.
11
+ Transcripts/exons/CDS land in `annotations/transcripts.py` in Phase 2
12
+ when that depth of annotation matters.
13
+
14
+ Usage:
15
+ uv run python -m annotations.loaders.gencode_genes
16
+ uv run python -m annotations.loaders.gencode_genes --gff path/to/local.gff3.gz
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import gzip
23
+ import time
24
+ import urllib.request
25
+ from pathlib import Path
26
+
27
+ from annotations.db import get_connection
28
+
29
# GENCODE release to fetch; it appears in both the directory and the file name.
GENCODE_VERSION = "45"
GENCODE_URL = (
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/"
    f"release_{GENCODE_VERSION}/gencode.v{GENCODE_VERSION}.annotation.gff3.gz"
)

# Downloads are cached next to this module under the remote file's own name.
CACHE_DIR = Path(__file__).with_name("_cache")
CACHED_GFF = CACHE_DIR / GENCODE_URL.rsplit("/", 1)[-1]
37
+
38
+
39
def download_gencode() -> Path:
    """Download the GENCODE GFF3 once and cache it.

    The file is fetched to a temporary ``.part`` name and renamed into
    place only after the transfer completes. An interrupted or truncated
    download therefore never leaves a file at ``CACHED_GFF`` that a later
    run would mistake for a valid cache.

    Returns:
        The local path of the cached file (``CACHED_GFF``).

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on network failure
        or a short read; the partial file is removed first.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    if CACHED_GFF.exists():
        print(f"[gencode] already cached: {CACHED_GFF}")
        return CACHED_GFF
    print(f"[gencode] downloading {GENCODE_URL} → {CACHED_GFF}")
    started = time.time()
    partial = CACHED_GFF.with_name(CACHED_GFF.name + ".part")
    try:
        urllib.request.urlretrieve(GENCODE_URL, partial)
        # Same-directory rename: the cache path appears only when complete.
        partial.replace(CACHED_GFF)
    finally:
        # No-op after a successful rename; cleans up after a failed download.
        partial.unlink(missing_ok=True)
    size_mb = CACHED_GFF.stat().st_size / 1_000_000
    print(f"[gencode] done ({size_mb:.0f} MB in {time.time() - started:.1f}s)")
    return CACHED_GFF
51
+
52
+
53
def parse_attributes(attrs: str) -> dict[str, str]:
    """Parse a GFF3 attribute column into a dict.

    The column is a semicolon-delimited list of ``key=value`` pairs. Only
    the first ``=`` in a pair separates key from value. Pairs without an
    ``=`` are skipped, and surrounding whitespace is stripped. Keys and
    values are percent-decoded, because GFF3 escapes reserved characters
    (``;``, ``=``, ``,`` ...) as ``%XX``.

    Args:
        attrs: Raw text of GFF3 column 9.

    Returns:
        Mapping of attribute name to decoded value. If a key repeats, the
        last occurrence wins.
    """
    from urllib.parse import unquote

    parsed: dict[str, str] = {}
    for pair in attrs.split(";"):
        key, sep, value = pair.partition("=")
        if sep:
            parsed[unquote(key.strip())] = unquote(value.strip())
    return parsed
61
+
62
+
63
# Only the primary GRCh38 contigs are kept: the autosomes chr1-chr22 plus
# chrX, chrY and chrM. Alt-locus (`_alt`), patch (`_fix`), random
# (`_random`) and unplaced (`chrUn_`) scaffolds are excluded so that a gene
# symbol maps to exactly one canonical coordinate range. Research queries
# almost never want alt-locus coordinates; a Phase 2 schema change can
# lift this restriction when needed.
PRIMARY_CONTIGS = frozenset(
    {f"chr{n}" for n in range(1, 23)} | {"chrX", "chrY", "chrM"}
)
71
+
72
+
73
def iter_genes(gff_path: Path):
    """Stream a GFF3 file and yield one tuple per gene on a primary contig.

    Each yielded tuple is ``(gene_symbol, chrom, start, end, strand,
    gene_id, gene_type)``, in the column order of the ``refseq_genes``
    INSERT. ``start`` and ``end`` are converted to ``int``. ``gene_id``
    and ``gene_type`` are ``None`` when the attribute is absent.

    Rows are skipped when they are comments, have fewer than nine columns,
    are not ``gene`` features, lie outside ``PRIMARY_CONTIGS``, or carry no
    ``gene_name`` attribute.

    A gene symbol can occur more than once among the primary contigs (rare;
    usually PAR genes like XG that appear on both chrX and chrY). Only the
    first occurrence in file order is yielded; PAR resolution is a Phase 2
    concern.

    Args:
        gff_path: Path to a GFF3 file; a ``.gz`` suffix selects gzip
            decompression.
    """
    opener = gzip.open if gff_path.suffix == ".gz" else open
    seen: set[str] = set()
    with opener(gff_path, "rt", encoding="utf-8") as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            # Strip "\r" as well, so CRLF files don't leak it into column 9.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 9 or fields[2] != "gene":
                continue
            # Slice to nine columns so a stray trailing tab can't break the unpack.
            chrom, _, _, start, end, _, strand, _, attrs_str = fields[:9]
            if chrom not in PRIMARY_CONTIGS:
                continue
            attrs = parse_attributes(attrs_str)
            # gene_name is the HGNC symbol (e.g. 'BRCA1'). gene_id is the
            # Ensembl ID (e.g. 'ENSG00000012048.24'). We prefer the name.
            symbol = attrs.get("gene_name")
            if not symbol or symbol in seen:
                continue
            seen.add(symbol)
            yield (
                symbol,
                chrom,
                int(start),
                int(end),
                strand,
                attrs.get("gene_id"),
                attrs.get("gene_type"),
            )
110
+
111
+
112
def load(gff_path: Path | None = None, replace: bool = True) -> int:
    """Populate the refseq_genes DuckDB table from a GENCODE GFF3 file.

    The GFF is parsed completely before the table is touched, so an
    unreadable or empty file can no longer wipe previously loaded data.

    Args:
        gff_path: Local GFF3 (optionally gzipped). When ``None``, the file
            is obtained via ``download_gencode()``.
        replace: Delete all existing rows before inserting. Pass ``False``
            to keep prior rows (DuckDB will error on duplicate primary keys).

    Returns:
        The number of gene rows written.

    Raises:
        RuntimeError: if no gene features could be parsed from the file.
    """
    if gff_path is None:
        gff_path = download_gencode()

    # Parse first: any failure here leaves the existing table untouched.
    rows = list(iter_genes(gff_path))
    if not rows:
        raise RuntimeError(f"No gene features parsed from {gff_path}")

    conn = get_connection()
    try:
        # DELETE and INSERT are deliberately not wrapped in one explicit
        # transaction: DuckDB can report a spurious primary-key violation
        # when a key is deleted and re-inserted within the same transaction.
        if replace:
            conn.execute("DELETE FROM refseq_genes")

        conn.executemany(
            "INSERT INTO refseq_genes "
            "(gene_symbol, chrom, start_pos, end_pos, strand, refseq_id, description) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            rows,
        )
        print(f"[gencode] loaded {len(rows):,} genes into refseq_genes")

        # Quick verification — the demo gene should be reachable.
        brca1 = conn.execute(
            "SELECT chrom, start_pos, end_pos FROM refseq_genes WHERE gene_symbol = 'BRCA1'"
        ).fetchone()
        if brca1:
            print(f"[gencode] BRCA1 → {brca1[0]}:{brca1[1]}-{brca1[2]}")
        else:
            print("[gencode] WARNING: BRCA1 not found in loaded data")
    finally:
        # Release the connection whether or not the load succeeded.
        conn.close()

    return len(rows)
147
+
148
+
149
def main() -> None:
    """Command-line entry point: parse the flags and run ``load``.

    ``--gff`` selects a local GFF3 file; without it the loader downloads
    one. ``--keep-existing`` turns off the default delete-before-insert.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--gff",
        default=None,
        type=Path,
        help="Local GENCODE GFF3 (.gff3 or .gff3.gz). Downloads from EBI if omitted.",
    )
    parser.add_argument(
        "--keep-existing",
        action="store_true",
        help="Don't truncate refseq_genes before loading.",
    )
    opts = parser.parse_args()
    keep_rows = opts.keep_existing
    load(opts.gff, replace=not keep_rows)


if __name__ == "__main__":
    main()
@@ -0,0 +1,93 @@
1
+ """Transcript / exon / CDS / UTR annotation hierarchy.
2
+
3
+ Phase 2 work. Same license as the engine. No tier boundary.
4
+
5
+ Ships open alongside the rest. The stubs below capture the planned API
6
+ so the design is visible in the repo, and so anyone reading the code
7
+ can see what's coming without having to dig through a roadmap doc.
8
+
9
+ Why transcript-level matters:
10
+ - "non-ref in BRCA1" includes deep intronic + UTR calls that are
11
+ usually noise for downstream interpretation.
12
+ - "non-ref in BRCA1 CDS, AF < 0.01" is the question a research
13
+ bioinformatician actually wants.
14
+ - Canonical transcript (MANE Select) avoids spurious hits on rare
15
+ isoforms.
16
+ - Splice-site distance is needed for any consequence prediction.
17
+
18
+ Data source: GTF/GFF (RefSeq GFF3 + Ensembl GTF, both GRCh38).
19
+ NCBI's GFF3 is public domain; Ensembl GTF is Apache 2 / open data.
20
+ The integration — curated, GRCh38-pinned, canonical-tagged DuckDB
21
+ artefact — is shipped under the same OSS license as the engine.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+
27
# DDL for the planned transcript / exon / CDS tables. Nothing in this module
# executes it yet; the query functions defined below are stubs.
# Every statement is IF NOT EXISTS.
TRANSCRIPTS_SCHEMA_DDL = """
CREATE TABLE IF NOT EXISTS transcripts (
    transcript_id  VARCHAR PRIMARY KEY,    -- e.g. 'NM_007294.4'
    gene_symbol    VARCHAR NOT NULL,
    chrom          VARCHAR NOT NULL,
    start_pos      UINTEGER NOT NULL,
    end_pos        UINTEGER NOT NULL,
    strand         VARCHAR,
    biotype        VARCHAR,                -- 'protein_coding', 'lncRNA', ...
    is_canonical   BOOLEAN DEFAULT FALSE,  -- MANE Select tag
    is_mane_plus   BOOLEAN DEFAULT FALSE   -- MANE Plus Clinical
);

CREATE INDEX IF NOT EXISTS idx_transcripts_gene
    ON transcripts (gene_symbol);

CREATE TABLE IF NOT EXISTS exons (
    transcript_id  VARCHAR NOT NULL,
    exon_number    USMALLINT NOT NULL,     -- 1-indexed in transcription order
    chrom          VARCHAR NOT NULL,
    start_pos      UINTEGER NOT NULL,
    end_pos        UINTEGER NOT NULL,
    PRIMARY KEY (transcript_id, exon_number)
);

CREATE INDEX IF NOT EXISTS idx_exons_range
    ON exons (chrom, start_pos, end_pos);

CREATE TABLE IF NOT EXISTS cds (
    transcript_id  VARCHAR NOT NULL,
    exon_number    USMALLINT NOT NULL,
    chrom          VARCHAR NOT NULL,
    start_pos      UINTEGER NOT NULL,      -- coding-only, excludes UTR
    end_pos        UINTEGER NOT NULL,
    phase          USMALLINT,              -- 0, 1, or 2
    PRIMARY KEY (transcript_id, exon_number)
);

CREATE INDEX IF NOT EXISTS idx_cds_range
    ON cds (chrom, start_pos, end_pos);
"""
68
+
69
+
70
def transcripts_for_gene(symbol: str) -> list:
    """Return all transcripts of a gene (planned API, not implemented yet).

    Raises:
        NotImplementedError: always; this is a Phase 2 stub.
    """
    reason = "Phase 2."
    raise NotImplementedError(reason)
73
+
74
+
75
def canonical_transcript(symbol: str):
    """Return a gene's MANE Select transcript, if one is defined (planned API).

    Raises:
        NotImplementedError: always; this is a Phase 2 stub.
    """
    reason = "Phase 2."
    raise NotImplementedError(reason)
78
+
79
+
80
def cds_regions_for_gene(symbol: str) -> list:
    """Return disjoint CDS ranges for a gene, usable as a SQL range filter.

    Planned as the clinically meaningful counterpart of
    position_for_gene(). Not implemented yet.

    Raises:
        NotImplementedError: always; this is a Phase 2 stub.
    """
    reason = "Phase 2."
    raise NotImplementedError(reason)
84
+
85
+
86
def exon_at(chrom: str, pos: int) -> list:
    """Return all (transcript_id, exon_number) pairs containing a position.

    Planned API, not implemented yet.

    Raises:
        NotImplementedError: always; this is a Phase 2 stub.
    """
    reason = "Phase 2."
    raise NotImplementedError(reason)
89
+
90
+
91
def splice_site_distance(chrom: str, pos: int) -> int | None:
    """Return the signed distance in bp to the nearest exon/intron boundary.

    Planned API, not implemented yet.

    Raises:
        NotImplementedError: always; this is a Phase 2 stub.
    """
    reason = "Phase 2."
    raise NotImplementedError(reason)
cli/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """vcfclick command-line interface."""