vcfclick 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vcfclick-0.1.0/.gitignore +51 -0
- vcfclick-0.1.0/LICENSING.md +15 -0
- vcfclick-0.1.0/PKG-INFO +261 -0
- vcfclick-0.1.0/README.md +228 -0
- vcfclick-0.1.0/annotations/__init__.py +23 -0
- vcfclick-0.1.0/annotations/db.py +144 -0
- vcfclick-0.1.0/annotations/loaders/__init__.py +1 -0
- vcfclick-0.1.0/annotations/loaders/gencode_genes.py +167 -0
- vcfclick-0.1.0/annotations/transcripts.py +93 -0
- vcfclick-0.1.0/bench/BENCHMARK.md +179 -0
- vcfclick-0.1.0/cli/__init__.py +1 -0
- vcfclick-0.1.0/cli/main.py +374 -0
- vcfclick-0.1.0/export/__init__.py +1 -0
- vcfclick-0.1.0/export/parquet.py +101 -0
- vcfclick-0.1.0/ingest/__init__.py +1 -0
- vcfclick-0.1.0/ingest/_arrow.py +166 -0
- vcfclick-0.1.0/ingest/_tabix.py +146 -0
- vcfclick-0.1.0/ingest/parallel.py +372 -0
- vcfclick-0.1.0/ingest/vcf_load.py +410 -0
- vcfclick-0.1.0/pyproject.toml +83 -0
- vcfclick-0.1.0/schema/01_variants.sql +79 -0
- vcfclick-0.1.0/schema/02_genotypes.sql +67 -0
- vcfclick-0.1.0/schema/03_samples.sql +36 -0
- vcfclick-0.1.0/storage/__init__.py +23 -0
- vcfclick-0.1.0/storage/db.py +169 -0
- vcfclick-0.1.0/vcfclick_mcp/__init__.py +1 -0
- vcfclick-0.1.0/vcfclick_mcp/server.py +216 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
.pytest_cache/
|
|
7
|
+
.mypy_cache/
|
|
8
|
+
.ruff_cache/
|
|
9
|
+
|
|
10
|
+
# Virtualenv (managed by uv)
|
|
11
|
+
.venv/
|
|
12
|
+
|
|
13
|
+
# chDB persistent state (regenerated by ingestion)
|
|
14
|
+
.chdb/
|
|
15
|
+
|
|
16
|
+
# DuckDB annotation database (downloaded reference data, regenerated by loaders)
|
|
17
|
+
annotations/*.duckdb
|
|
18
|
+
annotations/*.duckdb.wal
|
|
19
|
+
|
|
20
|
+
# Cached reference-data downloads (regenerated by loaders)
|
|
21
|
+
annotations/loaders/_cache/
|
|
22
|
+
|
|
23
|
+
# Benchmark working files (per-sample VCFs, TileDB-VCF array, sample lists).
|
|
24
|
+
# Keep BENCHMARK.md; ignore the GBs of intermediate data.
|
|
25
|
+
bench/per_sample_vcfs/
|
|
26
|
+
bench/tiledbvcf_out/
|
|
27
|
+
bench/samples_list*.txt
|
|
28
|
+
bench/.tiledbvcf-venv/
|
|
29
|
+
|
|
30
|
+
# VCF inputs — too large for git, fetched on demand. See README.
|
|
31
|
+
data/*.vcf
|
|
32
|
+
data/*.vcf.gz
|
|
33
|
+
data/*.vcf.gz.tbi
|
|
34
|
+
data/*.bcf
|
|
35
|
+
data/*.bcf.csi
|
|
36
|
+
data/*.parquet
|
|
37
|
+
|
|
38
|
+
# Editor / OS
|
|
39
|
+
.vscode/
|
|
40
|
+
.idea/
|
|
41
|
+
*.swp
|
|
42
|
+
.DS_Store
|
|
43
|
+
|
|
44
|
+
# Marketing/landing site — lives on disk, not in the OSS repo.
|
|
45
|
+
web/
|
|
46
|
+
|
|
47
|
+
# Browser-automation artefacts (Playwright MCP screenshots etc.)
|
|
48
|
+
.playwright-mcp/
|
|
49
|
+
*.png
|
|
50
|
+
*.jpeg
|
|
51
|
+
*.jpg
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# License
|
|
2
|
+
|
|
3
|
+
The open-source license has not been finalised. The candidates under
|
|
4
|
+
consideration are AGPL-3.0 and BSL.
|
|
5
|
+
|
|
6
|
+
- **AGPL-3.0** — strong copyleft; network-deployment changes must be
|
|
7
|
+
shared back. Some commercial users avoid AGPL on principle.
|
|
8
|
+
- **BSL** (Business Source License) — time-delayed conversion to a permissive license (e.g.,
|
|
9
|
+
Apache 2 after a fixed term). Pioneered by MariaDB, used by Sentry
|
|
10
|
+
and CockroachDB.
|
|
11
|
+
|
|
12
|
+
A standard `LICENSE` file will be added before the first tagged release.
|
|
13
|
+
|
|
14
|
+
Until then: use in private / internal projects is welcome. Please do
|
|
15
|
+
not redistribute or build commercial derivatives.
|
vcfclick-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vcfclick
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Small VCF databases. One per cohort. Embedded ClickHouse engine, embedded DuckDB annotations, MCP natural-language layer.
|
|
5
|
+
Project-URL: Homepage, https://github.com/nuin/vcfclick
|
|
6
|
+
Project-URL: Repository, https://github.com/nuin/vcfclick
|
|
7
|
+
Project-URL: Issues, https://github.com/nuin/vcfclick/issues
|
|
8
|
+
Project-URL: Benchmark, https://github.com/nuin/vcfclick/blob/main/bench/BENCHMARK.md
|
|
9
|
+
Author-email: nuin <nuin@genedrift.org>
|
|
10
|
+
Keywords: bioinformatics,clickhouse,duckdb,embedded-database,genomics,mcp,model-context-protocol,natural-language-sql,vcf
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: Other/Proprietary License
|
|
16
|
+
Classifier: Operating System :: MacOS
|
|
17
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
+
Classifier: Topic :: Database
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Requires-Dist: chdb>=4.1.8
|
|
27
|
+
Requires-Dist: click>=8.1.0
|
|
28
|
+
Requires-Dist: cyvcf2>=0.31.0
|
|
29
|
+
Requires-Dist: duckdb>=1.0.0
|
|
30
|
+
Requires-Dist: mcp>=1.0.0
|
|
31
|
+
Requires-Dist: pyarrow>=15.0.0
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# vcfclick
|
|
35
|
+
|
|
36
|
+
A modern VCF database for research labs and bioinformatics teams.
|
|
37
|
+
Embedded chDB (ClickHouse engine, no server) for sample data, embedded
|
|
38
|
+
DuckDB for reference annotations, and a natural-language query layer
|
|
39
|
+
that turns plain English into SQL you can read.
|
|
40
|
+
|
|
41
|
+
Single binary. `uv run vcfclick`. No Docker, no port, no server, no
|
|
42
|
+
Gatekeeper dialog. The headline demo runs from a clean `git clone`.
|
|
43
|
+
|
|
44
|
+
Status: research preview. Architecture validated against real 1000 Genomes data.
|
|
45
|
+
|
|
46
|
+
## Why
|
|
47
|
+
|
|
48
|
+
Two complaints heard repeatedly in research bioinformatics:
|
|
49
|
+
|
|
50
|
+
1. *"My cohort grew and `bcftools | pandas` stopped scaling."* When
|
|
51
|
+
you have 500+ samples, ad-hoc cohort correlation queries become
|
|
52
|
+
painfully slow. The standard answer is "go install Hail," which is
|
|
53
|
+
correct and operationally expensive.
|
|
54
|
+
|
|
55
|
+
2. *"I can write the SQL, but I shouldn't have to type the boilerplate
|
|
56
|
+
every time — and when it's written for me, I want to see it."*
|
|
57
|
+
Bioinformaticians don't want SQL hidden. They want it generated and
|
|
58
|
+
visible, because trust comes from being able to read what ran.
|
|
59
|
+
|
|
60
|
+
vcfclick addresses both:
|
|
61
|
+
|
|
62
|
+
- **chDB** (ClickHouse embedded as a library) handles cohort scale.
|
|
63
|
+
We've measured ~963 variants/sec single-process ingest, 6% sparse
|
|
64
|
+
compression vs dense, in-process Native query speed.
|
|
65
|
+
- The **MCP server** lets any LLM client translate plain English into
|
|
66
|
+
the SQL underneath. The generated SQL is shown alongside the result —
|
|
67
|
+
it's *part* of the answer, not a debug trace.
|
|
68
|
+
|
|
69
|
+
## Architecture
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
┌────────────────────────────────────┐
|
|
73
|
+
│ Tiny web UI (separate repo) │ English in → SQL + result out
|
|
74
|
+
└────────────────┬───────────────────┘
|
|
75
|
+
│
|
|
76
|
+
┌────────────────▼───────────────────┐
|
|
77
|
+
│ MCP server (Python) │ Composes the two embedded stores
|
|
78
|
+
│ Tools: get_schema, run_sql, │
|
|
79
|
+
│ position_for_gene, gene_at, │
|
|
80
|
+
│ clinvar_lookup │
|
|
81
|
+
└────┬─────────────────────────┬─────┘
|
|
82
|
+
│ │
|
|
83
|
+
┌────▼──────────────┐ ┌───────▼────────────┐
|
|
84
|
+
│ chDB │ │ DuckDB │
|
|
85
|
+
│ (embedded) │ │ (embedded) │
|
|
86
|
+
│ sample data │ │ reference data │
|
|
87
|
+
│ - variants │ │ - genes (RefSeq) │
|
|
88
|
+
│ - genotypes │ │ - clinvar_* │
|
|
89
|
+
│ - samples │ │ │
|
|
90
|
+
│ - ingestions │ │ │
|
|
91
|
+
└───────────────────┘ └────────────────────┘
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Two embedded stores, distinct purposes:
|
|
95
|
+
|
|
96
|
+
- **chDB** holds sample data: wide pre-declared schema for VCF 4.3
|
|
97
|
+
reserved + common GATK INFO/FORMAT fields, with
|
|
98
|
+
`Map(String, String)` overflow for anything else. **Same SQL surface,
|
|
99
|
+
same MergeTree engines, same projections as full ClickHouse — no
|
|
100
|
+
server.** Persistent on disk under `.chdb/`.
|
|
101
|
+
- **DuckDB** holds reference data: RefSeq genes, ClinVar. Embedded,
|
|
102
|
+
swappable, monthly refresh. Never touches sample data.
|
|
103
|
+
|
|
104
|
+
The MCP server composes across them at query time. Annotation lookups
|
|
105
|
+
happen first (DuckDB), then their results parameterise the sample
|
|
106
|
+
query (chDB). The chain of reasoning is visible in the UI.
|
|
107
|
+
|
|
108
|
+
## Using vcfclick
|
|
109
|
+
|
|
110
|
+
Each cohort / study / VCF lives in its own small database under
|
|
111
|
+
`~/.vcfclick/dbs/<name>/`. The `vcfclick` CLI manages them.
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Normalise the VCF (one-time per file)
|
|
115
|
+
bcftools norm -m - input.vcf.gz | bgzip > normalised.vcf.gz
|
|
116
|
+
|
|
117
|
+
# Create a database for this cohort
|
|
118
|
+
vcfclick db create my-cohort
|
|
119
|
+
|
|
120
|
+
# Ingest the VCF into it
|
|
121
|
+
vcfclick db ingest my-cohort normalised.vcf.gz \
|
|
122
|
+
--cohort demo --ingest-id batch_a
|
|
123
|
+
|
|
124
|
+
# Inspect what's in it
|
|
125
|
+
vcfclick db info my-cohort
|
|
126
|
+
|
|
127
|
+
# Run SQL directly
|
|
128
|
+
vcfclick db query my-cohort "SELECT count() FROM variants"
|
|
129
|
+
|
|
130
|
+
# Export the whole database as Parquet (interop with DuckDB,
|
|
131
|
+
# Snowflake, BigQuery, Spark, Iceberg)
|
|
132
|
+
vcfclick db dump my-cohort --out my-cohort-export/
|
|
133
|
+
|
|
134
|
+
# Bundle a database as a single tar.gz for sharing
|
|
135
|
+
vcfclick db push my-cohort /path/to/my-cohort.tar.gz
|
|
136
|
+
|
|
137
|
+
# Restore from a bundle — local file or HTTPS URL
|
|
138
|
+
vcfclick db pull other-cohort https://example.com/other-cohort.tar.gz
|
|
139
|
+
|
|
140
|
+
# List, remove
|
|
141
|
+
vcfclick db list
|
|
142
|
+
vcfclick db rm my-cohort
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Each database is a self-contained chDB session — the on-disk format is
|
|
146
|
+
byte-identical to a full ClickHouse server. Multiple databases sit side
|
|
147
|
+
by side; each is cheap to create, dump, share, or delete.
|
|
148
|
+
|
|
149
|
+
The ingester prints a classification of the VCF's INFO/FORMAT fields
|
|
150
|
+
on startup — what landed in typed columns vs. the overflow Maps. That
|
|
151
|
+
log line is the "adapts to any VCF" claim made literally visible.
|
|
152
|
+
|
|
153
|
+
**Per-ingestion identity inside a database.** Every row carries
|
|
154
|
+
`ingest_id`. Rows are NOT merged across uploads — the same
|
|
155
|
+
`(chrom, pos, ref, alt)` observed in two different VCFs is two rows,
|
|
156
|
+
because annotations and QC origin can differ. Re-running with the same
|
|
157
|
+
`--ingest-id` is idempotent (silently replaces prior rows via
|
|
158
|
+
`ReplacingMergeTree`). Using a new `--ingest-id` appends.
|
|
159
|
+
|
|
160
|
+
**Parallel ingestion** is the default; pass `--serial` to force the
|
|
161
|
+
single-process loader. The parallel splitter does a single-pass count
|
|
162
|
+
of variants per 100Kb position bucket via the tabix `.tbi` index (~1 ms)
|
|
163
|
+
and greedy-splits each contig into ranges of approximately equal
|
|
164
|
+
variant count — so dense subregions (gene panels, exomes) don't leave
|
|
165
|
+
N–1 workers idle.
|
|
166
|
+
|
|
167
|
+
### Pointing the MCP server at a specific database
|
|
168
|
+
|
|
169
|
+
In your Claude Desktop / MCP-client config, set `VCFCLICK_DB_NAME` to
|
|
170
|
+
the database you want the LLM to talk to:
|
|
171
|
+
|
|
172
|
+
```jsonc
|
|
173
|
+
"vcfclick": {
|
|
174
|
+
"command": "/path/to/vcfclick/.venv/bin/python",
|
|
175
|
+
"args": ["-m", "vcfclick_mcp.server"],
|
|
176
|
+
"cwd": "/path/to/vcfclick",
|
|
177
|
+
"env": {
|
|
178
|
+
"PYTHONPATH": "/path/to/vcfclick",
|
|
179
|
+
"VCFCLICK_DB_NAME": "my-cohort"
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Register multiple `vcfclick-<dbname>` entries if you want the LLM to be
|
|
185
|
+
able to switch between cohorts in a single Claude Desktop session.
|
|
186
|
+
|
|
187
|
+
### Legacy Python-module entry points
|
|
188
|
+
|
|
189
|
+
The pre-CLI module commands still work for scripted use:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
# Single database at ./.chdb/ (no CLI involvement)
|
|
193
|
+
uv run python -m ingest.parallel normalised.vcf.gz \
|
|
194
|
+
--cohort demo --ingest-id batch --workers 4
|
|
195
|
+
|
|
196
|
+
uv run python -m export.parquet variants /path/out.parquet
|
|
197
|
+
uv run python -m export.parquet --all /path/output_dir/
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
These commands write to / read from `./.chdb/` (or the `VCFCLICK_DB`-pointed
|
|
201
|
+
directory) and ignore the named-DB layout.
|
|
202
|
+
|
|
203
|
+
## Layout
|
|
204
|
+
|
|
205
|
+
- `schema/` — ClickHouse DDL (chDB applies it unchanged).
|
|
206
|
+
- `storage/db.py` — chDB session singleton; `apply_schema()` helper.
|
|
207
|
+
- `ingest/vcf_load.py` — serial cyvcf2-based ingester.
|
|
208
|
+
- `ingest/parallel.py` — multi-process version of the ingester; Parquet staging.
|
|
209
|
+
- `ingest/_arrow.py` — pyarrow schemas matching the ClickHouse tables.
|
|
210
|
+
- `export/parquet.py` — table → Parquet export CLI.
|
|
211
|
+
- `annotations/db.py` — DuckDB annotation API (gene, ClinVar).
|
|
212
|
+
- `annotations/transcripts.py` — transcript/exon/CDS API stubs (Phase 2).
|
|
213
|
+
- `vcfclick_mcp/server.py` — MCP server (chDB + DuckDB tool surface).
|
|
214
|
+
Renamed from `mcp/` so the directory does not shadow the upstream
|
|
215
|
+
`mcp` Python SDK.
|
|
216
|
+
- `data/` — VCF inputs (gitignored).
|
|
217
|
+
|
|
218
|
+
## Validated against real data
|
|
219
|
+
|
|
220
|
+
| Workload | Vars | Samples | Calls stored | Throughput |
|
|
221
|
+
|---|---|---|---|---|
|
|
222
|
+
| BRCA1 region (1000G 30x) | 1,863 | 3,202 | 369,776 | small-VCF baseline |
|
|
223
|
+
| 10 Mb chr17 (1000G 30x) — serial | 235,768 | 3,202 | **44,986,737** | 952 v/s |
|
|
224
|
+
| 10 Mb chr17 (1000G 30x) — parallel 4 workers | 235,768 | 3,202 | **44,986,737** | 1,983 v/s (2.1×) |
|
|
225
|
+
| 10 Mb chr17 (1000G 30x) — parallel 8 workers | 235,768 | 3,202 | **44,986,737** | 2,466 v/s (2.6×) |
|
|
226
|
+
|
|
227
|
+
Parallel speedup comes from the variant-count-aware splitter — each
|
|
228
|
+
worker gets approximately equal work regardless of where the data
|
|
229
|
+
actually lives along the chromosome. Sparse-table compression
|
|
230
|
+
is empirically 6.2% of the dense theoretical max.
|
|
231
|
+
|
|
232
|
+
## TileDB-VCF comparison
|
|
233
|
+
|
|
234
|
+
End-to-end on the same 235k-variant / 3,202-sample workload, native
|
|
235
|
+
arm64 (vcfclick) vs Rosetta-emulated linux/amd64 (TileDB-VCF Docker):
|
|
236
|
+
|
|
237
|
+
| | vcfclick | TileDB-VCF |
|
|
238
|
+
|---|---|---|
|
|
239
|
+
| Source VCF format | joint VCF ingested directly | per-sample VCFs only ("Combined VCFs are currently not supported") |
|
|
240
|
+
| Pre-processing | none | bcftools +split + tabix × 3,202 ≈ 8+ min |
|
|
241
|
+
| Source VCF disk | 114 MB | 15.1 GB (132× inflation) |
|
|
242
|
+
| Ingest, best stable config | **69 s** (parallel-8) | **~79 min** projected (single-thread, multi-thread failed) |
|
|
243
|
+
| End-to-end | **~1 min** | **~87 min** |
|
|
244
|
+
|
|
245
|
+
Full methodology, caveats (including the Rosetta penalty), and
|
|
246
|
+
reproduction commands: [`bench/BENCHMARK.md`](bench/BENCHMARK.md).
|
|
247
|
+
|
|
248
|
+
## License
|
|
249
|
+
|
|
250
|
+
License choice (AGPL-3.0 vs BSL) is pending. A standard `LICENSE` file
|
|
251
|
+
will be added before the first tagged release. See
|
|
252
|
+
[LICENSING.md](LICENSING.md).
|
|
253
|
+
|
|
254
|
+
## Open work
|
|
255
|
+
|
|
256
|
+
- VCF schema auto-discovery utility (`vcf-discover`).
|
|
257
|
+
- ClinVar VCF loader under `annotations/loaders/` (the GENCODE gene
|
|
258
|
+
loader is in; ClinVar significance lookup is still stubbed).
|
|
259
|
+
- Phase 2: transcript / exon / CDS hierarchy + corresponding MCP tools.
|
|
260
|
+
- End-to-end MCP integration test with a real LLM client — the
|
|
261
|
+
`SCHEMA_DESCRIPTION` prompt is theoretical until it's stress-tested.
|
vcfclick-0.1.0/README.md
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# vcfclick
|
|
2
|
+
|
|
3
|
+
A modern VCF database for research labs and bioinformatics teams.
|
|
4
|
+
Embedded chDB (ClickHouse engine, no server) for sample data, embedded
|
|
5
|
+
DuckDB for reference annotations, and a natural-language query layer
|
|
6
|
+
that turns plain English into SQL you can read.
|
|
7
|
+
|
|
8
|
+
Single binary. `uv run vcfclick`. No Docker, no port, no server, no
|
|
9
|
+
Gatekeeper dialog. The headline demo runs from a clean `git clone`.
|
|
10
|
+
|
|
11
|
+
Status: research preview. Architecture validated against real 1000 Genomes data.
|
|
12
|
+
|
|
13
|
+
## Why
|
|
14
|
+
|
|
15
|
+
Two complaints heard repeatedly in research bioinformatics:
|
|
16
|
+
|
|
17
|
+
1. *"My cohort grew and `bcftools | pandas` stopped scaling."* When
|
|
18
|
+
you have 500+ samples, ad-hoc cohort correlation queries become
|
|
19
|
+
painfully slow. The standard answer is "go install Hail," which is
|
|
20
|
+
correct and operationally expensive.
|
|
21
|
+
|
|
22
|
+
2. *"I can write the SQL, but I shouldn't have to type the boilerplate
|
|
23
|
+
every time — and when it's written for me, I want to see it."*
|
|
24
|
+
Bioinformaticians don't want SQL hidden. They want it generated and
|
|
25
|
+
visible, because trust comes from being able to read what ran.
|
|
26
|
+
|
|
27
|
+
vcfclick addresses both:
|
|
28
|
+
|
|
29
|
+
- **chDB** (ClickHouse embedded as a library) handles cohort scale.
|
|
30
|
+
We've measured ~963 variants/sec single-process ingest, 6% sparse
|
|
31
|
+
compression vs dense, in-process Native query speed.
|
|
32
|
+
- The **MCP server** lets any LLM client translate plain English into
|
|
33
|
+
the SQL underneath. The generated SQL is shown alongside the result —
|
|
34
|
+
it's *part* of the answer, not a debug trace.
|
|
35
|
+
|
|
36
|
+
## Architecture
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
┌────────────────────────────────────┐
|
|
40
|
+
│ Tiny web UI (separate repo) │ English in → SQL + result out
|
|
41
|
+
└────────────────┬───────────────────┘
|
|
42
|
+
│
|
|
43
|
+
┌────────────────▼───────────────────┐
|
|
44
|
+
│ MCP server (Python) │ Composes the two embedded stores
|
|
45
|
+
│ Tools: get_schema, run_sql, │
|
|
46
|
+
│ position_for_gene, gene_at, │
|
|
47
|
+
│ clinvar_lookup │
|
|
48
|
+
└────┬─────────────────────────┬─────┘
|
|
49
|
+
│ │
|
|
50
|
+
┌────▼──────────────┐ ┌───────▼────────────┐
|
|
51
|
+
│ chDB │ │ DuckDB │
|
|
52
|
+
│ (embedded) │ │ (embedded) │
|
|
53
|
+
│ sample data │ │ reference data │
|
|
54
|
+
│ - variants │ │ - genes (RefSeq) │
|
|
55
|
+
│ - genotypes │ │ - clinvar_* │
|
|
56
|
+
│ - samples │ │ │
|
|
57
|
+
│ - ingestions │ │ │
|
|
58
|
+
└───────────────────┘ └────────────────────┘
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Two embedded stores, distinct purposes:
|
|
62
|
+
|
|
63
|
+
- **chDB** holds sample data: wide pre-declared schema for VCF 4.3
|
|
64
|
+
reserved + common GATK INFO/FORMAT fields, with
|
|
65
|
+
`Map(String, String)` overflow for anything else. **Same SQL surface,
|
|
66
|
+
same MergeTree engines, same projections as full ClickHouse — no
|
|
67
|
+
server.** Persistent on disk under `.chdb/`.
|
|
68
|
+
- **DuckDB** holds reference data: RefSeq genes, ClinVar. Embedded,
|
|
69
|
+
swappable, monthly refresh. Never touches sample data.
|
|
70
|
+
|
|
71
|
+
The MCP server composes across them at query time. Annotation lookups
|
|
72
|
+
happen first (DuckDB), then their results parameterise the sample
|
|
73
|
+
query (chDB). The chain of reasoning is visible in the UI.
|
|
74
|
+
|
|
75
|
+
## Using vcfclick
|
|
76
|
+
|
|
77
|
+
Each cohort / study / VCF lives in its own small database under
|
|
78
|
+
`~/.vcfclick/dbs/<name>/`. The `vcfclick` CLI manages them.
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Normalise the VCF (one-time per file)
|
|
82
|
+
bcftools norm -m - input.vcf.gz | bgzip > normalised.vcf.gz
|
|
83
|
+
|
|
84
|
+
# Create a database for this cohort
|
|
85
|
+
vcfclick db create my-cohort
|
|
86
|
+
|
|
87
|
+
# Ingest the VCF into it
|
|
88
|
+
vcfclick db ingest my-cohort normalised.vcf.gz \
|
|
89
|
+
--cohort demo --ingest-id batch_a
|
|
90
|
+
|
|
91
|
+
# Inspect what's in it
|
|
92
|
+
vcfclick db info my-cohort
|
|
93
|
+
|
|
94
|
+
# Run SQL directly
|
|
95
|
+
vcfclick db query my-cohort "SELECT count() FROM variants"
|
|
96
|
+
|
|
97
|
+
# Export the whole database as Parquet (interop with DuckDB,
|
|
98
|
+
# Snowflake, BigQuery, Spark, Iceberg)
|
|
99
|
+
vcfclick db dump my-cohort --out my-cohort-export/
|
|
100
|
+
|
|
101
|
+
# Bundle a database as a single tar.gz for sharing
|
|
102
|
+
vcfclick db push my-cohort /path/to/my-cohort.tar.gz
|
|
103
|
+
|
|
104
|
+
# Restore from a bundle — local file or HTTPS URL
|
|
105
|
+
vcfclick db pull other-cohort https://example.com/other-cohort.tar.gz
|
|
106
|
+
|
|
107
|
+
# List, remove
|
|
108
|
+
vcfclick db list
|
|
109
|
+
vcfclick db rm my-cohort
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Each database is a self-contained chDB session — the on-disk format is
|
|
113
|
+
byte-identical to a full ClickHouse server. Multiple databases sit side
|
|
114
|
+
by side; each is cheap to create, dump, share, or delete.
|
|
115
|
+
|
|
116
|
+
The ingester prints a classification of the VCF's INFO/FORMAT fields
|
|
117
|
+
on startup — what landed in typed columns vs. the overflow Maps. That
|
|
118
|
+
log line is the "adapts to any VCF" claim made literally visible.
|
|
119
|
+
|
|
120
|
+
**Per-ingestion identity inside a database.** Every row carries
|
|
121
|
+
`ingest_id`. Rows are NOT merged across uploads — the same
|
|
122
|
+
`(chrom, pos, ref, alt)` observed in two different VCFs is two rows,
|
|
123
|
+
because annotations and QC origin can differ. Re-running with the same
|
|
124
|
+
`--ingest-id` is idempotent (silently replaces prior rows via
|
|
125
|
+
`ReplacingMergeTree`). Using a new `--ingest-id` appends.
|
|
126
|
+
|
|
127
|
+
**Parallel ingestion** is the default; pass `--serial` to force the
|
|
128
|
+
single-process loader. The parallel splitter does a single-pass count
|
|
129
|
+
of variants per 100Kb position bucket via the tabix `.tbi` index (~1 ms)
|
|
130
|
+
and greedy-splits each contig into ranges of approximately equal
|
|
131
|
+
variant count — so dense subregions (gene panels, exomes) don't leave
|
|
132
|
+
N–1 workers idle.
|
|
133
|
+
|
|
134
|
+
### Pointing the MCP server at a specific database
|
|
135
|
+
|
|
136
|
+
In your Claude Desktop / MCP-client config, set `VCFCLICK_DB_NAME` to
|
|
137
|
+
the database you want the LLM to talk to:
|
|
138
|
+
|
|
139
|
+
```jsonc
|
|
140
|
+
"vcfclick": {
|
|
141
|
+
"command": "/path/to/vcfclick/.venv/bin/python",
|
|
142
|
+
"args": ["-m", "vcfclick_mcp.server"],
|
|
143
|
+
"cwd": "/path/to/vcfclick",
|
|
144
|
+
"env": {
|
|
145
|
+
"PYTHONPATH": "/path/to/vcfclick",
|
|
146
|
+
"VCFCLICK_DB_NAME": "my-cohort"
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Register multiple `vcfclick-<dbname>` entries if you want the LLM to be
|
|
152
|
+
able to switch between cohorts in a single Claude Desktop session.
|
|
153
|
+
|
|
154
|
+
### Legacy Python-module entry points
|
|
155
|
+
|
|
156
|
+
The pre-CLI module commands still work for scripted use:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# Single database at ./.chdb/ (no CLI involvement)
|
|
160
|
+
uv run python -m ingest.parallel normalised.vcf.gz \
|
|
161
|
+
--cohort demo --ingest-id batch --workers 4
|
|
162
|
+
|
|
163
|
+
uv run python -m export.parquet variants /path/out.parquet
|
|
164
|
+
uv run python -m export.parquet --all /path/output_dir/
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
These commands write to / read from `./.chdb/` (or the `VCFCLICK_DB`-pointed
|
|
168
|
+
directory) and ignore the named-DB layout.
|
|
169
|
+
|
|
170
|
+
## Layout
|
|
171
|
+
|
|
172
|
+
- `schema/` — ClickHouse DDL (chDB applies it unchanged).
|
|
173
|
+
- `storage/db.py` — chDB session singleton; `apply_schema()` helper.
|
|
174
|
+
- `ingest/vcf_load.py` — serial cyvcf2-based ingester.
|
|
175
|
+
- `ingest/parallel.py` — multi-process version of the ingester; Parquet staging.
|
|
176
|
+
- `ingest/_arrow.py` — pyarrow schemas matching the ClickHouse tables.
|
|
177
|
+
- `export/parquet.py` — table → Parquet export CLI.
|
|
178
|
+
- `annotations/db.py` — DuckDB annotation API (gene, ClinVar).
|
|
179
|
+
- `annotations/transcripts.py` — transcript/exon/CDS API stubs (Phase 2).
|
|
180
|
+
- `vcfclick_mcp/server.py` — MCP server (chDB + DuckDB tool surface).
|
|
181
|
+
Renamed from `mcp/` so the directory does not shadow the upstream
|
|
182
|
+
`mcp` Python SDK.
|
|
183
|
+
- `data/` — VCF inputs (gitignored).
|
|
184
|
+
|
|
185
|
+
## Validated against real data
|
|
186
|
+
|
|
187
|
+
| Workload | Vars | Samples | Calls stored | Throughput |
|
|
188
|
+
|---|---|---|---|---|
|
|
189
|
+
| BRCA1 region (1000G 30x) | 1,863 | 3,202 | 369,776 | small-VCF baseline |
|
|
190
|
+
| 10 Mb chr17 (1000G 30x) — serial | 235,768 | 3,202 | **44,986,737** | 952 v/s |
|
|
191
|
+
| 10 Mb chr17 (1000G 30x) — parallel 4 workers | 235,768 | 3,202 | **44,986,737** | 1,983 v/s (2.1×) |
|
|
192
|
+
| 10 Mb chr17 (1000G 30x) — parallel 8 workers | 235,768 | 3,202 | **44,986,737** | 2,466 v/s (2.6×) |
|
|
193
|
+
|
|
194
|
+
Parallel speedup comes from the variant-count-aware splitter — each
|
|
195
|
+
worker gets approximately equal work regardless of where the data
|
|
196
|
+
actually lives along the chromosome. Sparse-table compression
|
|
197
|
+
is empirically 6.2% of the dense theoretical max.
|
|
198
|
+
|
|
199
|
+
## TileDB-VCF comparison
|
|
200
|
+
|
|
201
|
+
End-to-end on the same 235k-variant / 3,202-sample workload, native
|
|
202
|
+
arm64 (vcfclick) vs Rosetta-emulated linux/amd64 (TileDB-VCF Docker):
|
|
203
|
+
|
|
204
|
+
| | vcfclick | TileDB-VCF |
|
|
205
|
+
|---|---|---|
|
|
206
|
+
| Source VCF format | joint VCF ingested directly | per-sample VCFs only ("Combined VCFs are currently not supported") |
|
|
207
|
+
| Pre-processing | none | bcftools +split + tabix × 3,202 ≈ 8+ min |
|
|
208
|
+
| Source VCF disk | 114 MB | 15.1 GB (132× inflation) |
|
|
209
|
+
| Ingest, best stable config | **69 s** (parallel-8) | **~79 min** projected (single-thread, multi-thread failed) |
|
|
210
|
+
| End-to-end | **~1 min** | **~87 min** |
|
|
211
|
+
|
|
212
|
+
Full methodology, caveats (including the Rosetta penalty), and
|
|
213
|
+
reproduction commands: [`bench/BENCHMARK.md`](bench/BENCHMARK.md).
|
|
214
|
+
|
|
215
|
+
## License
|
|
216
|
+
|
|
217
|
+
License choice (AGPL-3.0 vs BSL) is pending. A standard `LICENSE` file
|
|
218
|
+
will be added before the first tagged release. See
|
|
219
|
+
[LICENSING.md](LICENSING.md).
|
|
220
|
+
|
|
221
|
+
## Open work
|
|
222
|
+
|
|
223
|
+
- VCF schema auto-discovery utility (`vcf-discover`).
|
|
224
|
+
- ClinVar VCF loader under `annotations/loaders/` (the GENCODE gene
|
|
225
|
+
loader is in; ClinVar significance lookup is still stubbed).
|
|
226
|
+
- Phase 2: transcript / exon / CDS hierarchy + corresponding MCP tools.
|
|
227
|
+
- End-to-end MCP integration test with a real LLM client — the
|
|
228
|
+
`SCHEMA_DESCRIPTION` prompt is theoretical until it's stress-tested.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Annotation service. Single tier, single license.
|
|
2
|
+
|
|
3
|
+
from annotations import position_for_gene, gene_at, clinvar_lookup
|
|
4
|
+
|
|
5
|
+
The transcript / exon / CDS / UTR hierarchy lives in
|
|
6
|
+
annotations/transcripts.py (Phase 2; not yet implemented).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from annotations.db import (
|
|
10
|
+
GeneRange,
|
|
11
|
+
ClinVarRecord,
|
|
12
|
+
position_for_gene,
|
|
13
|
+
gene_at,
|
|
14
|
+
clinvar_lookup,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"GeneRange",
|
|
19
|
+
"ClinVarRecord",
|
|
20
|
+
"position_for_gene",
|
|
21
|
+
"gene_at",
|
|
22
|
+
"clinvar_lookup",
|
|
23
|
+
]
|