PyPI - vcfclick - Versions diffs - 0.1.0__py3-none-any.whl - Mend

vcfclick 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

annotations/__init__.py +23 -0
annotations/db.py +144 -0
annotations/loaders/__init__.py +1 -0
annotations/loaders/gencode_genes.py +167 -0
annotations/transcripts.py +93 -0
cli/__init__.py +1 -0
cli/main.py +374 -0
export/__init__.py +1 -0
export/parquet.py +101 -0
ingest/__init__.py +1 -0
ingest/_arrow.py +166 -0
ingest/_tabix.py +146 -0
ingest/parallel.py +372 -0
ingest/vcf_load.py +410 -0
schema/01_variants.sql +79 -0
schema/02_genotypes.sql +67 -0
schema/03_samples.sql +36 -0
storage/__init__.py +23 -0
storage/db.py +169 -0
vcfclick-0.1.0.dist-info/METADATA +261 -0
vcfclick-0.1.0.dist-info/RECORD +25 -0
vcfclick-0.1.0.dist-info/WHEEL +4 -0
vcfclick-0.1.0.dist-info/entry_points.txt +2 -0
vcfclick_mcp/__init__.py +1 -0
vcfclick_mcp/server.py +216 -0

annotations/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Annotation service. Single tier, single license.
+    from annotations import position_for_gene, gene_at, clinvar_lookup
+The transcript / exon / CDS / UTR hierarchy lives in
+annotations/transcripts.py (Phase 2; not yet implemented).
+"""
+from annotations.db import (
+    GeneRange,
+    ClinVarRecord,
+    position_for_gene,
+    gene_at,
+    clinvar_lookup,
+)
+__all__ = [
+    "GeneRange",
+    "ClinVarRecord",
+    "position_for_gene",
+    "gene_at",
+    "clinvar_lookup",
+]

annotations/db.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""Annotation service backed by DuckDB.
+Holds STATIC PUBLIC reference data: gene coordinates (RefSeq GFF3) and
+ClinVar significance. Transcript / exon / CDS hierarchy lands in
+annotations/transcripts.py (Phase 2). All tiers ship under the same
+OSS license as the engine.
+Architectural role: the two stores (ClickHouse for sample data, DuckDB
+for reference data) are intentionally separated so the MCP server
+composes across them at query time. Reference data updates monthly
+(ClinVar) without touching the sample store; sample data grows without
+touching the reference store. This separation is *operationally* useful
+for everyone, and would also be the basis for a security boundary if
+the hosted tier ever takes on regulated workloads.
+DuckDB is the right tool here for three reasons:
+  1. Embedded — no extra process, no network hop from the MCP server.
+  2. Vectorised — gene overlap queries and ClinVar joins are fast.
+  3. Public data — the curated DuckDB file can be shipped as a
+     downloadable artefact alongside the OSS package.
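+Data source: GENCODE GFF3 (gene coordinates, stored in the
+refseq_genes table) + ClinVar VCF, GRCh38 only. Loaders live under
+annotations/loaders/ (the GENCODE gene loader is
+annotations/loaders/gencode_genes.py) and are idempotent batch
+jobs that run on a cron, not part of the query path.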
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+import duckdb
+DUCKDB_PATH = Path(__file__).parent / "annotations.duckdb"
+SCHEMA_DDL = """
+CREATE TABLE IF NOT EXISTS refseq_genes (
+    gene_symbol  VARCHAR PRIMARY KEY,    -- HGNC symbol, e.g. 'BRCA1'
+    chrom        VARCHAR NOT NULL,       -- 'chr17' style
+    start_pos    UINTEGER NOT NULL,      -- 1-based, inclusive
+    end_pos      UINTEGER NOT NULL,      -- 1-based, inclusive
+    strand       VARCHAR,                -- '+' or '-'
+    refseq_id    VARCHAR,                -- NCBI Gene ID
+    description  VARCHAR
+);
+CREATE INDEX IF NOT EXISTS idx_genes_range
+    ON refseq_genes (chrom, start_pos, end_pos);
+CREATE TABLE IF NOT EXISTS clinvar_variants (
+    chrom         VARCHAR NOT NULL,
+    pos           UINTEGER NOT NULL,
+    ref           VARCHAR NOT NULL,
+    alt           VARCHAR NOT NULL,
+    clin_sig      VARCHAR,           -- 'Pathogenic', 'Likely_pathogenic', ...
+    review_status VARCHAR,           -- ClinVar gold-stars equivalent
+    clinvar_id    VARCHAR,           -- VCV accession
+    condition     VARCHAR,           -- semicolon-joined trait names
+    PRIMARY KEY (chrom, pos, ref, alt)
+);
+"""
def get_connection() -> duckdb.DuckDBPyConnection:
    """Open the DuckDB annotation store and make sure its schema exists.

    A new connection to ``DUCKDB_PATH`` is opened on every call and
    ``SCHEMA_DDL`` is run against it. Every statement in that DDL is a
    ``CREATE ... IF NOT EXISTS``, so running it against an already
    initialised store leaves the existing tables untouched.

    Returns the open connection; the caller owns it and is responsible
    for closing it. Any error raised while opening the file or creating
    the schema is propagated unchanged.
    """
    conn = duckdb.connect(str(DUCKDB_PATH))
    try:
        conn.execute(SCHEMA_DDL)
    except Exception:
        # Don't hand back (or leak) a half-initialised connection.
        conn.close()
        raise
    return conn
@dataclass(frozen=True)
class GeneRange:
    """Immutable coordinate record for a single gene.

    The field order matches the column order selected from
    ``refseq_genes`` by the lookup functions in this module, which
    build instances positionally with ``GeneRange(*row)``. Reordering
    the fields would silently mis-assign values.
    """

    # Gene symbol, e.g. 'BRCA1'.
    gene_symbol: str
    # Contig name in 'chr17' style.
    chrom: str
    # First base of the gene; 1-based, inclusive.
    start_pos: int
    # Last base of the gene; 1-based, inclusive.
    end_pos: int
    # '+' or '-'; None when the strand column is NULL.
    strand: str | None
@dataclass(frozen=True)
class ClinVarRecord:
    """Immutable ClinVar entry for one specific allele.

    The field order matches the column order selected from
    ``clinvar_variants`` by ``clinvar_lookup``, which builds instances
    positionally with ``ClinVarRecord(*row)``.
    """

    # Allele key: contig, position, reference and alternate alleles.
    chrom: str
    pos: int
    ref: str
    alt: str
    # Clinical significance, e.g. 'Pathogenic', 'Likely_pathogenic'.
    clin_sig: str | None
    # ClinVar review status (the gold-stars equivalent).
    review_status: str | None
    # VCV accession.
    clinvar_id: str | None
    # Semicolon-joined trait names.
    condition: str | None
def position_for_gene(symbol: str) -> GeneRange | None:
    """Translate a gene symbol to GRCh38 coordinates.

    Used by the LLM to convert questions like "calls in BRCA1" into a
    range filter on the ClickHouse genotypes table.

    The lookup is case-insensitive on both sides. Symbols are stored
    exactly as the loader read them, so upper-casing only the query (as
    an earlier version did) made mixed-case symbols such as 'C1orf43'
    unreachable. If several stored symbols differ only by case, the
    one matching ``symbol`` exactly is preferred.

    Returns the matching GeneRange, or None when the symbol is unknown.
    """
    conn = get_connection()
    try:
        row = conn.execute(
            """
            SELECT gene_symbol, chrom, start_pos, end_pos, strand
            FROM refseq_genes
            WHERE upper(gene_symbol) = upper(?)
            ORDER BY (gene_symbol = ?) DESC
            LIMIT 1
            """,
            [symbol, symbol],
        ).fetchone()
    finally:
        # get_connection() opens a fresh connection per call; release it
        # once the row has been fetched.
        conn.close()
    return GeneRange(*row) if row else None
def gene_at(chrom: str, pos: int) -> list[GeneRange]:
    """Return all genes overlapping a single position.

    Multiple results are possible (overlapping transcripts, antisense
    genes). A gene matches when ``pos`` lies between its start and end
    coordinates, both ends inclusive. Results are ordered by start
    position; the list is empty when nothing overlaps.
    """
    conn = get_connection()
    try:
        rows = conn.execute(
            """
            SELECT gene_symbol, chrom, start_pos, end_pos, strand
            FROM refseq_genes
            WHERE chrom = ? AND ? BETWEEN start_pos AND end_pos
            ORDER BY start_pos
            """,
            [chrom, pos],
        ).fetchall()
    finally:
        # get_connection() opens a fresh connection per call; release it
        # once the rows have been fetched.
        conn.close()
    return [GeneRange(*r) for r in rows]
def clinvar_lookup(chrom: str, pos: int, ref: str, alt: str) -> ClinVarRecord | None:
    """Look up ClinVar significance for a specific allele.

    The match is exact on all four key columns (chrom, pos, ref, alt).

    Returns None if the variant is not in ClinVar — which the caller
    should distinguish from "benign" in user-facing output.
    """
    conn = get_connection()
    try:
        row = conn.execute(
            """
            SELECT chrom, pos, ref, alt, clin_sig, review_status,
                   clinvar_id, condition
            FROM clinvar_variants
            WHERE chrom = ? AND pos = ? AND ref = ? AND alt = ?
            """,
            [chrom, pos, ref, alt],
        ).fetchone()
    finally:
        # get_connection() opens a fresh connection per call; release it
        # once the row has been fetched.
        conn.close()
    return ClinVarRecord(*row) if row else None

annotations/loaders/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Reference-data loaders (GENCODE genes, ClinVar, etc.) → DuckDB."""

annotations/loaders/gencode_genes.py ADDED Viewed

@@ -0,0 +1,167 @@
+"""GENCODE GFF3 → DuckDB refseq_genes table.
+GENCODE is used (not RefSeq's NCBI GFF) because:
+  - It's GRCh38-native with 'chr'-prefixed contig names — matches our
+    sample-data convention without any contig-name remapping.
+  - Single download, clean attribute format, ubiquitous in research
+    bioinformatics.
+  - Same gene symbols as HGNC (which is what bioinformaticians type).
+Only `gene` feature rows are extracted — we need name + coordinates.
+Transcripts/exons/CDS land in `annotations/transcripts.py` in Phase 2
+when that depth of annotation matters.
+Usage:
+    uv run python -m annotations.loaders.gencode_genes
+    uv run python -m annotations.loaders.gencode_genes --gff path/to/local.gff3.gz
+"""
+from __future__ import annotations
+import argparse
+import gzip
+import time
+import urllib.request
+from pathlib import Path
+from annotations.db import get_connection
+GENCODE_VERSION = "45"
+GENCODE_URL = (
+    f"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/"
+    f"release_{GENCODE_VERSION}/gencode.v{GENCODE_VERSION}.annotation.gff3.gz"
+)
+CACHE_DIR = Path(__file__).parent / "_cache"
+CACHED_GFF = CACHE_DIR / f"gencode.v{GENCODE_VERSION}.annotation.gff3.gz"
def download_gencode() -> Path:
    """Download the GENCODE GFF3 once and cache it. Returns the local path.

    If ``CACHED_GFF`` already exists it is returned without any network
    access. Otherwise the file is fetched from ``GENCODE_URL`` into a
    sibling ``.part`` file and renamed onto ``CACHED_GFF`` only after
    the download has finished. An interrupted download therefore never
    leaves a truncated file that later calls would mistake for a valid
    cache entry.

    Errors raised by the download are propagated after the partial file
    has been removed.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    if CACHED_GFF.exists():
        print(f"[gencode] already cached: {CACHED_GFF}")
        return CACHED_GFF
    print(f"[gencode] downloading {GENCODE_URL} → {CACHED_GFF}")
    started = time.time()
    partial = CACHED_GFF.with_name(CACHED_GFF.name + ".part")
    try:
        urllib.request.urlretrieve(GENCODE_URL, partial)
    except BaseException:
        # Covers Ctrl-C as well as network errors: drop whatever was
        # written so the next run starts from scratch.
        if partial.exists():
            partial.unlink()
        raise
    partial.replace(CACHED_GFF)
    size_mb = CACHED_GFF.stat().st_size / 1_000_000
    print(f"[gencode] done ({size_mb:.0f} MB in {time.time() - started:.1f}s)")
    return CACHED_GFF
def parse_attributes(attrs: str) -> dict[str, str]:
    """Parse a GFF3 attribute column into a dict.

    The column is a semicolon-delimited list of ``key=value`` pairs.
    Each pair is split on its first ``=``; surrounding whitespace is
    stripped from key and value; segments without ``=`` (including the
    empty segment left by a trailing ``;``) are ignored. When a key is
    repeated, the last value wins.

    GFF3 percent-encodes reserved characters (``;``, ``=``, ``,``, tab,
    ...) inside keys and values, so both are percent-decoded after
    splitting. A literal ``+`` is left as is.
    """
    # Function-scope import keeps the block self-contained.
    from urllib.parse import unquote

    parsed: dict[str, str] = {}
    for pair in attrs.split(";"):
        key, sep, value = pair.partition("=")
        if not sep:
            continue
        parsed[unquote(key.strip())] = unquote(value.strip())
    return parsed
# Canonical GRCh38 chromosomes only: the autosomes chr1-chr22 plus chrX,
# chrY and chrM. Anything else — alt-locus (`_alt`), patch (`_fix`),
# random (`_random`) and unplaced (`chrUn_`) scaffolds — is left out so
# that a gene symbol maps to exactly one canonical coordinate range.
# Research-bioinformatics queries almost never want alt-locus
# coordinates; a Phase 2 schema change can lift this when needed.
PRIMARY_CONTIGS = frozenset(
    {f"chr{n}" for n in range(1, 23)} | {"chrX", "chrY", "chrM"}
)
def iter_genes(gff_path: Path):
    """Stream a GFF3 file, yielding one tuple per gene on a primary contig.

    Each tuple is ``(gene_symbol, chrom, start, end, strand, refseq_id,
    description)``, in the column order of the ``refseq_genes`` insert.
    The ``refseq_id`` slot carries the GFF ``gene_id`` attribute and the
    ``description`` slot carries ``gene_type``.

    Files whose name ends in ``.gz`` are read through gzip; anything
    else is read as plain text. Comment lines, rows with fewer than
    nine columns, non-``gene`` features, contigs outside
    ``PRIMARY_CONTIGS`` and genes without a ``gene_name`` are skipped.

    The same gene symbol can appear more than once in a file (rare;
    usually PAR genes like XG that appear on both chrX and chrY).
    We keep the first occurrence and drop the rest; PAR resolution is a
    Phase 2 concern.

    Raises ValueError if a gene row has a non-integer start or end.
    """
    opener = gzip.open if gff_path.suffix == ".gz" else open
    seen: set[str] = set()
    # Explicit encoding: don't let the platform locale decide how the
    # annotation file is decoded.
    with opener(gff_path, "rt", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            # maxsplit=8 caps the result at nine fields, so a stray tab
            # inside the attribute column cannot break the unpack below.
            fields = line.rstrip("\r\n").split("\t", 8)
            if len(fields) < 9 or fields[2] != "gene":
                continue
            chrom, _, _, start, end, _, strand, _, attrs_str = fields
            if chrom not in PRIMARY_CONTIGS:
                continue
            attrs = parse_attributes(attrs_str)
            # gene_name is the HGNC symbol (e.g. 'BRCA1'). gene_id is the
            # Ensembl ID (e.g. 'ENSG00000012048.24'). We prefer the name.
            symbol = attrs.get("gene_name")
            if not symbol or symbol in seen:
                continue
            seen.add(symbol)
            yield (
                symbol,
                chrom,
                int(start),
                int(end),
                strand,
                attrs.get("gene_id"),
                attrs.get("gene_type"),
            )
def load(gff_path: Path | None = None, replace: bool = True) -> int:
    """Populate the refseq_genes DuckDB table from a GENCODE GFF3 file.

    When ``gff_path`` is None the file is obtained via
    ``download_gencode()``. The GFF is parsed completely *before* the
    table is touched, so an unreadable or empty file raises without
    wiping rows loaded by a previous run.

    Returns the number of gene rows written. Pass `replace=False` to
    keep prior rows (DuckDB will error on duplicate primary keys).

    Raises RuntimeError if no gene features could be parsed.
    """
    if gff_path is None:
        gff_path = download_gencode()
    rows = list(iter_genes(gff_path))
    if not rows:
        raise RuntimeError(f"No gene features parsed from {gff_path}")
    conn = get_connection()
    try:
        # DELETE and INSERT stay separate statements rather than one
        # explicit transaction. NOTE(review): DuckDB documents unique
        # constraint errors when deleted keys are re-inserted inside the
        # same transaction — confirm before wrapping these in BEGIN/COMMIT.
        if replace:
            conn.execute("DELETE FROM refseq_genes")
        conn.executemany(
            "INSERT INTO refseq_genes "
            "(gene_symbol, chrom, start_pos, end_pos, strand, refseq_id, description) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            rows,
        )
        print(f"[gencode] loaded {len(rows):,} genes into refseq_genes")
        # Quick verification — the demo gene should be reachable.
        brca1 = conn.execute(
            "SELECT chrom, start_pos, end_pos FROM refseq_genes WHERE gene_symbol = 'BRCA1'"
        ).fetchone()
        if brca1:
            print(f"[gencode] BRCA1 → {brca1[0]}:{brca1[1]}-{brca1[2]}")
        else:
            print("[gencode] WARNING: BRCA1 not found in loaded data")
    finally:
        # Release the connection whether or not the load succeeded.
        conn.close()
    return len(rows)
def main() -> None:
    """Command-line entry point for the GENCODE gene loader.

    Reads ``--gff`` (optional local GFF3 path) and ``--keep-existing``
    from the command line and hands them to ``load``. The table is
    replaced unless ``--keep-existing`` is given.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--gff",
        default=None,
        type=Path,
        help="Local GENCODE GFF3 (.gff3 or .gff3.gz). Downloads from EBI if omitted.",
    )
    parser.add_argument(
        "--keep-existing",
        action="store_true",
        help="Don't truncate refseq_genes before loading.",
    )
    opts = parser.parse_args()
    keep_rows = opts.keep_existing
    load(opts.gff, replace=not keep_rows)
+if __name__ == "__main__":
+    main()

annotations/transcripts.py ADDED Viewed

@@ -0,0 +1,93 @@
+"""Transcript / exon / CDS / UTR annotation hierarchy.
+Phase 2 work. Same license as the engine. No tier boundary.
+Ships open alongside the rest. The stubs below capture the planned API
+so the design is visible in the repo, and so anyone reading the code
+can see what's coming without having to dig through a roadmap doc.
+Why transcript-level matters:
+  - "non-ref in BRCA1" includes deep intronic + UTR calls that are
+    usually noise for downstream interpretation.
+  - "non-ref in BRCA1 CDS, AF < 0.01" is the question a research
+    bioinformatician actually wants.
+  - Canonical transcript (MANE Select) avoids spurious hits on rare
+    isoforms.
+  - Splice-site distance is needed for any consequence prediction.
+Data source: GTF/GFF (RefSeq GFF3 + Ensembl GTF, both GRCh38).
+NCBI's GFF3 is public domain; Ensembl GTF is Apache 2 / open data.
+The integration — curated, GRCh38-pinned, canonical-tagged DuckDB
+artefact — is shipped under the same OSS license as the engine.
+"""
+from __future__ import annotations
+TRANSCRIPTS_SCHEMA_DDL = """
+CREATE TABLE IF NOT EXISTS transcripts (
+    transcript_id    VARCHAR PRIMARY KEY,    -- e.g. 'NM_007294.4'
+    gene_symbol      VARCHAR NOT NULL,
+    chrom            VARCHAR NOT NULL,
+    start_pos        UINTEGER NOT NULL,
+    end_pos          UINTEGER NOT NULL,
+    strand           VARCHAR,
+    biotype          VARCHAR,                -- 'protein_coding', 'lncRNA', ...
+    is_canonical     BOOLEAN DEFAULT FALSE,  -- MANE Select tag
+    is_mane_plus     BOOLEAN DEFAULT FALSE   -- MANE Plus Clinical
+);
+CREATE INDEX IF NOT EXISTS idx_transcripts_gene
+    ON transcripts (gene_symbol);
+CREATE TABLE IF NOT EXISTS exons (
+    transcript_id    VARCHAR NOT NULL,
+    exon_number      USMALLINT NOT NULL,     -- 1-indexed in transcription order
+    chrom            VARCHAR NOT NULL,
+    start_pos        UINTEGER NOT NULL,
+    end_pos          UINTEGER NOT NULL,
+    PRIMARY KEY (transcript_id, exon_number)
+);
+CREATE INDEX IF NOT EXISTS idx_exons_range
+    ON exons (chrom, start_pos, end_pos);
+CREATE TABLE IF NOT EXISTS cds (
+    transcript_id    VARCHAR NOT NULL,
+    exon_number      USMALLINT NOT NULL,
+    chrom            VARCHAR NOT NULL,
+    start_pos        UINTEGER NOT NULL,      -- coding-only, excludes UTR
+    end_pos          UINTEGER NOT NULL,
+    phase            USMALLINT,              -- 0, 1, or 2
+    PRIMARY KEY (transcript_id, exon_number)
+);
+CREATE INDEX IF NOT EXISTS idx_cds_range
+    ON cds (chrom, start_pos, end_pos);
+"""
def transcripts_for_gene(symbol: str) -> list:
    """Return every transcript of the gene ``symbol`` (planned API).

    Not implemented yet: calling this always raises NotImplementedError.
    """
    raise NotImplementedError("Phase 2.")
def canonical_transcript(symbol: str):
    """Return the MANE Select transcript of ``symbol``, if one is defined (planned API).

    Not implemented yet: calling this always raises NotImplementedError.
    """
    raise NotImplementedError("Phase 2.")
def cds_regions_for_gene(symbol: str) -> list:
    """Return the disjoint CDS ranges of ``symbol`` for use as a SQL range filter (planned API).

    Intended as the clinically meaningful counterpart of
    position_for_gene(), restricted to coding sequence.

    Not implemented yet: calling this always raises NotImplementedError.
    """
    raise NotImplementedError("Phase 2.")
def exon_at(chrom: str, pos: int) -> list:
    """Return every (transcript_id, exon_number) pair whose exon contains the position (planned API).

    Not implemented yet: calling this always raises NotImplementedError.
    """
    raise NotImplementedError("Phase 2.")
def splice_site_distance(chrom: str, pos: int) -> int | None:
    """Return the signed distance in bp from the position to the nearest exon/intron boundary (planned API).

    Not implemented yet: calling this always raises NotImplementedError.
    """
    raise NotImplementedError("Phase 2.")

cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """vcfclick command-line interface."""