tablebridge 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: ["3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v5
20
+
21
+ - name: Create venv (Python ${{ matrix.python-version }})
22
+ run: uv venv -p ${{ matrix.python-version }}
23
+
24
+ - name: Install project (with dev deps)
25
+ run: uv pip install -e ".[dev]"
26
+
27
+ - name: Lint
28
+ run: uv run ruff check .
29
+
30
+ - name: Test
31
+ run: uv run pytest --cov=tablebridge --cov-report=term-missing
@@ -0,0 +1,25 @@
1
+ name: Publish to MCP Registry
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ tags: ["v*"]
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ id-token: write
13
+ contents: read
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Install mcp-publisher
18
+ run: |
19
+ curl -L "https://github.com/modelcontextprotocol/registry/releases/latest/download/mcp-publisher_linux_amd64.tar.gz" | tar xz mcp-publisher
20
+
21
+ - name: Login to the registry (GitHub OIDC)
22
+ run: ./mcp-publisher login github-oidc
23
+
24
+ - name: Publish server.json
25
+ run: ./mcp-publisher publish
@@ -0,0 +1,36 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - name: Install uv
13
+ uses: astral-sh/setup-uv@v5
14
+ - name: Build sdist and wheel
15
+ run: uv build
16
+ - name: Upload dist artifact
17
+ uses: actions/upload-artifact@v4
18
+ with:
19
+ name: dist
20
+ path: dist/
21
+
22
+ publish:
23
+ needs: build
24
+ runs-on: ubuntu-latest
25
+ # Trusted Publishing (OIDC) — no API token stored in the repo.
26
+ environment: pypi
27
+ permissions:
28
+ id-token: write
29
+ steps:
30
+ - name: Download dist artifact
31
+ uses: actions/download-artifact@v4
32
+ with:
33
+ name: dist
34
+ path: dist/
35
+ - name: Publish to PyPI
36
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,21 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+
11
+ # Tooling
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .coverage
15
+ htmlcov/
16
+ .mypy_cache/
17
+
18
+ # Editors / OS
19
+ .idea/
20
+ .vscode/
21
+ .DS_Store
@@ -0,0 +1,21 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/), and this project adheres to
5
+ [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.0] - 2026-06-21
8
+
9
+ ### Added
10
+ - Initial release.
11
+ - Exposes a directory of CSV/TSV/Parquet/JSON/NDJSON files as DuckDB tables.
12
+ - Tools: `list_sources`, `describe`, `preview`, `query`, `refresh`, `server_info`.
13
+ - Read-only, sandboxed security model: materialized in-memory tables, external
14
+ filesystem access disabled post-load, single read-only statement validation
15
+ rejecting writes and raw file-access functions.
16
+ - Configurable via `TABLEBRIDGE_DATA_DIR`, `TABLEBRIDGE_MAX_ROWS`,
17
+ `TABLEBRIDGE_RECURSIVE`.
18
+ - Test suite using real DuckDB over temp files; CI on Python 3.11 and 3.12.
19
+ - MCP registry manifest, PyPI publish + registry-publish workflows.
20
+
21
+ [0.1.0]: https://github.com/Michael-WhiteCapData/tablebridge-mcp/releases/tag/v0.1.0
@@ -0,0 +1,33 @@
1
+ # Contributing to tablebridge
2
+
3
+ Thanks for your interest! This server stays small, focused, and read-only-safe — contributions that keep it that way merge easiest.
4
+
5
+ ## Getting set up
6
+
7
+ ```bash
8
+ git clone https://github.com/Michael-WhiteCapData/tablebridge-mcp
9
+ cd tablebridge-mcp
10
+ uv pip install -e ".[dev]"
11
+ ```
12
+
13
+ ## Before opening a PR
14
+
15
+ - `ruff check .` passes (`ruff check --fix .` to autofix).
16
+ - `pytest` passes. Tests use real DuckDB over temp files — no external services.
17
+ - New behavior comes with a test.
18
+ - **Security:** any change to `query`/SQL handling must keep the read-only guarantees — single statement, no writes, no raw file-access functions, no escaping the data directory. Add a test proving the new path stays sandboxed.
19
+
20
+ ## Architecture
21
+
22
+ - `config.py` — env-driven config (data dir, row cap) + supported formats.
23
+ - `db.py` — DuckDB engine: materializes files as tables, validates SQL, runs queries.
24
+ - `server.py` — the MCP tool layer (thin; delegates to `TableBridge`).
25
+
26
+ ## Ideas welcome
27
+
28
+ - More input formats (e.g. Excel via the DuckDB `excel` extension).
29
+ - A `schema_summary` tool that profiles columns.
30
+
31
+ ## Code of conduct
32
+
33
+ Be decent, assume good faith, keep it constructive.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Michael Tierney
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,140 @@
1
+ Metadata-Version: 2.4
2
+ Name: tablebridge
3
+ Version: 0.1.0
4
+ Summary: An MCP server that turns a folder of CSV / Parquet / JSON files into one SQL-queryable source for your AI agent.
5
+ Project-URL: Homepage, https://github.com/Michael-WhiteCapData/tablebridge-mcp
6
+ Project-URL: Repository, https://github.com/Michael-WhiteCapData/tablebridge-mcp
7
+ Project-URL: Issues, https://github.com/Michael-WhiteCapData/tablebridge-mcp/issues
8
+ Author: Michael Tierney
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: claude,csv,data-integration,duckdb,mcp,model-context-protocol,parquet,sql
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Database
18
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: duckdb>=1.0
21
+ Requires-Dist: mcp>=1.2
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
24
+ Requires-Dist: pytest>=8; extra == 'dev'
25
+ Requires-Dist: ruff>=0.6; extra == 'dev'
26
+ Description-Content-Type: text/markdown
27
+
28
+ <!-- mcp-name: io.github.Michael-WhiteCapData/tablebridge-mcp -->
29
+
30
+ # tablebridge
31
+
32
+ **Turn a folder of CSV / Parquet / JSON files into one SQL-queryable source for your AI agent.**
33
+
34
+ [![CI](https://github.com/Michael-WhiteCapData/tablebridge-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/Michael-WhiteCapData/tablebridge-mcp/actions/workflows/ci.yml)
35
+ [![PyPI](https://img.shields.io/pypi/v/tablebridge?color=3775A9&logo=pypi&logoColor=white)](https://pypi.org/project/tablebridge/)
36
+ [![Python](https://img.shields.io/badge/python-3.11%2B-3776AB?logo=python&logoColor=white)](https://www.python.org/)
37
+ [![MCP](https://img.shields.io/badge/MCP-server-D97757)](https://modelcontextprotocol.io/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
39
+
40
+ Small businesses don't have a data warehouse — they have a folder full of exports: `customers.csv`, last month's `orders.xlsx`, a `regions.json` someone emailed over. `tablebridge` is an [MCP](https://modelcontextprotocol.io/) server that points [DuckDB](https://duckdb.org/) at that folder, exposes **each file as a SQL table**, and lets your agent run **read-only SQL — including JOINs across files** — to answer questions over all of them at once. Scattered spreadsheets become one queryable source of truth.
41
+
42
+ It's **read-only and sandboxed**: files are loaded into an in-memory database, the data directory is the only thing it can see, and queries are validated so an agent can't write, escape to other paths, or call raw file functions.
43
+
44
+ ---
45
+
46
+ ## Why you'd want this
47
+
48
+ - 🔗 **One source over many files.** JOIN `orders.csv` to `customers.csv` to `regions.json` in a single query — no ETL, no database to stand up.
49
+ - 🦆 **DuckDB-powered.** Fast analytical SQL over CSV, TSV, Parquet, JSON/NDJSON.
50
+ - 🔒 **Safe by design.** Files are materialized into memory; queries are validated read-only; raw file-access functions and out-of-sandbox paths are rejected.
51
+ - 🤖 **Agent-friendly.** `list_sources` → `describe` → `query` is a natural flow the agent can follow on its own.
52
+ - 🪶 **Two dependencies** (`mcp`, `duckdb`), fully typed and tested.
53
+
54
+ ## Install
55
+
56
+ ```bash
57
+ uvx tablebridge # run directly
58
+ # or
59
+ pip install tablebridge # then run: tablebridge
60
+ ```
61
+
62
+ ### Claude Code
63
+
64
+ ```bash
65
+ TABLEBRIDGE_DATA_DIR=/path/to/your/data claude mcp add tablebridge -- uvx tablebridge
66
+ ```
67
+
68
+ ### Claude Desktop / Cursor
69
+
70
+ ```jsonc
71
+ {
72
+ "mcpServers": {
73
+ "tablebridge": {
74
+ "command": "uvx",
75
+ "args": ["tablebridge"],
76
+ "env": { "TABLEBRIDGE_DATA_DIR": "/path/to/your/data" }
77
+ }
78
+ }
79
+ }
80
+ ```
81
+
82
+ ## Tools
83
+
84
+ | Tool | Description |
85
+ | --- | --- |
86
+ | `list_sources` | List the tables (one per data file) with column counts — start here |
87
+ | `describe` | A table's columns and types |
88
+ | `preview` | First N rows of a table |
89
+ | `query` | Run read-only SQL (DuckDB dialect) across the tables, JOINs included |
90
+ | `refresh` | Re-scan the data directory for added/changed files |
91
+ | `server_info` | Effective config (data dir, row cap, supported formats) |
92
+
93
+ ## Example
94
+
95
+ With a folder containing `customers.csv`, `orders.csv`, and `regions.json`:
96
+
97
+ > **You:** Who are my top 3 customers by total spend, and what region are they in?
98
+ >
99
+ > **Agent:** *(calls `list_sources`, then `query`)*
100
+ > ```sql
101
+ > SELECT c.name, r.region, SUM(o.total) AS spend
102
+ > FROM customers c
103
+ > JOIN orders o ON o.customer_id = c.id
104
+ > JOIN regions r ON r.customer_id = c.id
105
+ > GROUP BY c.name, r.region
106
+ > ORDER BY spend DESC
107
+ > LIMIT 3;
108
+ > ```
109
+
110
+ ## Configuration
111
+
112
+ | Variable | Default | Description |
113
+ | --- | --- | --- |
114
+ | `TABLEBRIDGE_DATA_DIR` | `.` | Directory of files to expose (the sandbox boundary) |
115
+ | `TABLEBRIDGE_MAX_ROWS` | `1000` | Max rows returned per query/preview |
116
+ | `TABLEBRIDGE_RECURSIVE` | `1` | Scan subdirectories too |
117
+
118
+ Supported formats: `.csv`, `.tsv`, `.parquet`, `.json`, `.ndjson`.
119
+
120
+ ## Security model
121
+
122
+ 1. **Sandboxed** to `TABLEBRIDGE_DATA_DIR` — only files under it are loaded.
123
+ 2. **Materialized** into an in-memory DuckDB, then external filesystem access is disabled — queries can't reach other paths.
124
+ 3. **Validated SQL** — a single read-only statement only; writes and raw file-reader functions are rejected.
125
+
126
+ ## Development
127
+
128
+ ```bash
129
+ git clone https://github.com/Michael-WhiteCapData/tablebridge-mcp
130
+ cd tablebridge-mcp
131
+ uv pip install -e ".[dev]"
132
+ ruff check .
133
+ pytest # uses real DuckDB over temp files
134
+ ```
135
+
136
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
137
+
138
+ ## License
139
+
140
+ [MIT](LICENSE) © Michael Tierney
@@ -0,0 +1,113 @@
1
+ <!-- mcp-name: io.github.Michael-WhiteCapData/tablebridge-mcp -->
2
+
3
+ # tablebridge
4
+
5
+ **Turn a folder of CSV / Parquet / JSON files into one SQL-queryable source for your AI agent.**
6
+
7
+ [![CI](https://github.com/Michael-WhiteCapData/tablebridge-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/Michael-WhiteCapData/tablebridge-mcp/actions/workflows/ci.yml)
8
+ [![PyPI](https://img.shields.io/pypi/v/tablebridge?color=3775A9&logo=pypi&logoColor=white)](https://pypi.org/project/tablebridge/)
9
+ [![Python](https://img.shields.io/badge/python-3.11%2B-3776AB?logo=python&logoColor=white)](https://www.python.org/)
10
+ [![MCP](https://img.shields.io/badge/MCP-server-D97757)](https://modelcontextprotocol.io/)
11
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
12
+
13
+ Small businesses don't have a data warehouse — they have a folder full of exports: `customers.csv`, last month's `orders.csv`, a `regions.json` someone emailed over. `tablebridge` is an [MCP](https://modelcontextprotocol.io/) server that points [DuckDB](https://duckdb.org/) at that folder, exposes **each file as a SQL table**, and lets your agent run **read-only SQL — including JOINs across files** — to answer questions over all of them at once. Scattered exports become one queryable source of truth.
14
+
15
+ It's **read-only and sandboxed**: files are loaded into an in-memory database, the data directory is the only thing it can see, and queries are validated so an agent can't write, escape to other paths, or call raw file functions.
16
+
17
+ ---
18
+
19
+ ## Why you'd want this
20
+
21
+ - 🔗 **One source over many files.** JOIN `orders.csv` to `customers.csv` to `regions.json` in a single query — no ETL, no database to stand up.
22
+ - 🦆 **DuckDB-powered.** Fast analytical SQL over CSV, TSV, Parquet, JSON/NDJSON.
23
+ - 🔒 **Safe by design.** Files are materialized into memory; queries are validated read-only; raw file-access functions and out-of-sandbox paths are rejected.
24
+ - 🤖 **Agent-friendly.** `list_sources` → `describe` → `query` is a natural flow the agent can follow on its own.
25
+ - 🪶 **Two dependencies** (`mcp`, `duckdb`), fully typed and tested.
26
+
27
+ ## Install
28
+
29
+ ```bash
30
+ uvx tablebridge # run directly
31
+ # or
32
+ pip install tablebridge # then run: tablebridge
33
+ ```
34
+
35
+ ### Claude Code
36
+
37
+ ```bash
38
+ TABLEBRIDGE_DATA_DIR=/path/to/your/data claude mcp add tablebridge -- uvx tablebridge
39
+ ```
40
+
41
+ ### Claude Desktop / Cursor
42
+
43
+ ```jsonc
44
+ {
45
+ "mcpServers": {
46
+ "tablebridge": {
47
+ "command": "uvx",
48
+ "args": ["tablebridge"],
49
+ "env": { "TABLEBRIDGE_DATA_DIR": "/path/to/your/data" }
50
+ }
51
+ }
52
+ }
53
+ ```
54
+
55
+ ## Tools
56
+
57
+ | Tool | Description |
58
+ | --- | --- |
59
+ | `list_sources` | List the tables (one per data file) with column counts — start here |
60
+ | `describe` | A table's columns and types |
61
+ | `preview` | First N rows of a table |
62
+ | `query` | Run read-only SQL (DuckDB dialect) across the tables, JOINs included |
63
+ | `refresh` | Re-scan the data directory for added/changed files |
64
+ | `server_info` | Effective config (data dir, row cap, supported formats) |
65
+
66
+ ## Example
67
+
68
+ With a folder containing `customers.csv`, `orders.csv`, and `regions.json`:
69
+
70
+ > **You:** Who are my top 3 customers by total spend, and what region are they in?
71
+ >
72
+ > **Agent:** *(calls `list_sources`, then `query`)*
73
+ > ```sql
74
+ > SELECT c.name, r.region, SUM(o.total) AS spend
75
+ > FROM customers c
76
+ > JOIN orders o ON o.customer_id = c.id
77
+ > JOIN regions r ON r.customer_id = c.id
78
+ > GROUP BY c.name, r.region
79
+ > ORDER BY spend DESC
80
+ > LIMIT 3;
81
+ > ```
82
+
83
+ ## Configuration
84
+
85
+ | Variable | Default | Description |
86
+ | --- | --- | --- |
87
+ | `TABLEBRIDGE_DATA_DIR` | `.` | Directory of files to expose (the sandbox boundary) |
88
+ | `TABLEBRIDGE_MAX_ROWS` | `1000` | Max rows returned per query/preview |
89
+ | `TABLEBRIDGE_RECURSIVE` | `1` | Scan subdirectories too |
90
+
91
+ Supported formats: `.csv`, `.tsv`, `.parquet`, `.json`, `.ndjson`.
92
+
93
+ ## Security model
94
+
95
+ 1. **Sandboxed** to `TABLEBRIDGE_DATA_DIR` — only files under it are loaded.
96
+ 2. **Materialized** into an in-memory DuckDB, then external filesystem access is disabled — queries can't reach other paths.
97
+ 3. **Validated SQL** — a single read-only statement only; writes and raw file-reader functions are rejected.
98
+
99
+ ## Development
100
+
101
+ ```bash
102
+ git clone https://github.com/Michael-WhiteCapData/tablebridge-mcp
103
+ cd tablebridge-mcp
104
+ uv pip install -e ".[dev]"
105
+ ruff check .
106
+ pytest # uses real DuckDB over temp files
107
+ ```
108
+
109
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
110
+
111
+ ## License
112
+
113
+ [MIT](LICENSE) © Michael Tierney
@@ -0,0 +1,51 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "tablebridge"
7
+ version = "0.1.0"
8
+ description = "An MCP server that turns a folder of CSV / Parquet / JSON files into one SQL-queryable source for your AI agent."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.11"
12
+ authors = [{ name = "Michael Tierney" }]
13
+ keywords = ["mcp", "model-context-protocol", "duckdb", "sql", "csv", "parquet", "data-integration", "claude"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Database",
21
+ "Topic :: Scientific/Engineering :: Information Analysis",
22
+ ]
23
+ dependencies = [
24
+ "mcp>=1.2",
25
+ "duckdb>=1.0",
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/Michael-WhiteCapData/tablebridge-mcp"
30
+ Repository = "https://github.com/Michael-WhiteCapData/tablebridge-mcp"
31
+ Issues = "https://github.com/Michael-WhiteCapData/tablebridge-mcp/issues"
32
+
33
+ [project.scripts]
34
+ tablebridge = "tablebridge.server:main"
35
+
36
+ [project.optional-dependencies]
37
+ dev = ["pytest>=8", "pytest-cov>=5", "ruff>=0.6"]
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/tablebridge"]
41
+
42
+ [tool.pytest.ini_options]
43
+ addopts = "-q"
44
+ testpaths = ["tests"]
45
+
46
+ [tool.ruff]
47
+ line-length = 110
48
+ target-version = "py311"
49
+
50
+ [tool.ruff.lint]
51
+ select = ["E", "F", "I", "UP", "B", "SIM", "PLC"]
@@ -0,0 +1,38 @@
1
+ {
2
+ "$schema": "https://static.modelcontextprotocol.io/schemas/2025-09-29/server.schema.json",
3
+ "name": "io.github.Michael-WhiteCapData/tablebridge-mcp",
4
+ "description": "Query a folder of CSV / Parquet / JSON files with SQL — one queryable source, read-only.",
5
+ "repository": {
6
+ "url": "https://github.com/Michael-WhiteCapData/tablebridge-mcp",
7
+ "source": "github"
8
+ },
9
+ "version": "0.1.0",
10
+ "packages": [
11
+ {
12
+ "registryType": "pypi",
13
+ "identifier": "tablebridge",
14
+ "version": "0.1.0",
15
+ "transport": { "type": "stdio" },
16
+ "environmentVariables": [
17
+ {
18
+ "name": "TABLEBRIDGE_DATA_DIR",
19
+ "description": "Directory of data files to expose as SQL tables (sandboxed).",
20
+ "default": ".",
21
+ "isRequired": false
22
+ },
23
+ {
24
+ "name": "TABLEBRIDGE_MAX_ROWS",
25
+ "description": "Maximum rows returned per query/preview.",
26
+ "default": "1000",
27
+ "isRequired": false
28
+ },
29
+ {
30
+ "name": "TABLEBRIDGE_RECURSIVE",
31
+ "description": "Scan subdirectories too (1/0).",
32
+ "default": "1",
33
+ "isRequired": false
34
+ }
35
+ ]
36
+ }
37
+ ]
38
+ }
@@ -0,0 +1,13 @@
1
+ """tablebridge — query your scattered CSV / Parquet / JSON files with SQL, via MCP.
2
+
3
+ Points a DuckDB engine at a directory of tabular files, loads each one as an
4
+ in-memory SQL table, and lets an agent run read-only SQL (including JOINs across
5
+ files) — so a pile of exports becomes one queryable source of truth. Sandboxed
6
+ to a single data directory and always read-only.
7
+ """
8
+
9
+ from .config import Config
10
+ from .db import TableBridge, TableBridgeError
11
+
12
+ __all__ = ["Config", "TableBridge", "TableBridgeError", "__version__"]
13
+ __version__ = "0.1.0"
@@ -0,0 +1,43 @@
1
+ """Environment-driven configuration for the tablebridge server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
DEFAULT_MAX_ROWS = 1000
# File extensions we load as in-memory SQL tables, mapped to the DuckDB reader
# function used to parse each one.
READERS = {
    ".csv": "read_csv_auto",
    ".tsv": "read_csv_auto",
    ".parquet": "read_parquet",
    ".json": "read_json_auto",
    ".ndjson": "read_json_auto",
}

# TABLEBRIDGE_RECURSIVE values (compared case-insensitively) that turn recursion off.
_RECURSIVE_OFF = ("0", "false", "no")


def _parse_max_rows(raw: str) -> int:
    """Parse TABLEBRIDGE_MAX_ROWS, insisting on a positive integer.

    Raises:
        ValueError: if ``raw`` is not an integer or is smaller than 1. A cap of
            zero or below would make every result come back empty or mis-sliced.
    """
    try:
        value = int(raw)
    except ValueError:
        raise ValueError(f"TABLEBRIDGE_MAX_ROWS must be a positive integer, got {raw!r}") from None
    if value < 1:
        raise ValueError(f"TABLEBRIDGE_MAX_ROWS must be a positive integer, got {raw!r}")
    return value


@dataclass(frozen=True)
class Config:
    """Effective server configuration, sourced from the environment."""

    # Root directory whose files are exposed; also the sandbox boundary.
    data_dir: Path = Path(".")
    # Upper bound on rows returned by a single query or preview.
    max_rows: int = DEFAULT_MAX_ROWS
    # Whether subdirectories of ``data_dir`` are scanned as well.
    recursive: bool = True

    @classmethod
    def from_env(cls, env: dict[str, str] | None = None) -> Config:
        """Build a config from ``env`` (defaults to ``os.environ``).

        Reads ``TABLEBRIDGE_DATA_DIR``, ``TABLEBRIDGE_MAX_ROWS`` and
        ``TABLEBRIDGE_RECURSIVE``; anything unset falls back to its default.

        Raises:
            ValueError: if ``TABLEBRIDGE_MAX_ROWS`` is not a positive integer.
        """
        src = os.environ if env is None else env
        # Surrounding whitespace is stripped so a padded "0 " still disables recursion.
        recursive_flag = src.get("TABLEBRIDGE_RECURSIVE", "1").strip().lower()
        return cls(
            data_dir=Path(src.get("TABLEBRIDGE_DATA_DIR", ".")).expanduser().resolve(),
            max_rows=_parse_max_rows(src.get("TABLEBRIDGE_MAX_ROWS", str(DEFAULT_MAX_ROWS))),
            recursive=recursive_flag not in _RECURSIVE_OFF,
        )

    def as_dict(self) -> dict[str, object]:
        """Return a JSON-friendly snapshot of the config plus the supported extensions."""
        return {
            "data_dir": str(self.data_dir),
            "max_rows": self.max_rows,
            "recursive": self.recursive,
            "supported_extensions": sorted(READERS),
        }
@@ -0,0 +1,161 @@
1
+ """DuckDB engine: load a directory of tabular files as in-memory tables and run
2
+ read-only SQL over them.
3
+
4
+ Security posture:
5
+ - Files are **materialized** into in-memory tables at scan time, so queries never
6
+ touch the filesystem afterward.
7
+ - Query SQL is validated to be a single read-only statement, and raw file-reader
8
+ functions (read_csv, read_parquet, glob, copy, attach, …) are rejected — an
9
+ agent cannot read a path outside the configured data directory.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import contextlib
15
+ import re
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from .config import READERS, Config
20
+
21
+
22
class TableBridgeError(RuntimeError):
    """A user-facing error (bad SQL, unknown table, load failure)."""


# Keywords a statement must start with to be considered read-only.
_ALLOWED_START = {
    "SELECT", "WITH", "FROM", "DESCRIBE", "SUMMARIZE", "SHOW", "EXPLAIN", "VALUES", "TABLE",
}
# What EXPLAIN may wrap. EXPLAIN ANALYZE *executes* its statement, so the wrapped
# statement has to be a plain read as well.
_EXPLAINABLE = {"SELECT", "WITH", "FROM", "VALUES", "TABLE"}
# File/extension access. The check is purely textual, so every reader variant
# has to be listed: ``\b`` does not split on ``_`` and ``read_csv`` alone would
# not match ``read_csv_auto``.
_FORBIDDEN = re.compile(
    r"\b(read_csv|read_csv_auto|sniff_csv|read_parquet|read_json|read_json_auto|"
    r"read_json_objects|read_json_objects_auto|read_ndjson|read_ndjson_auto|read_ndjson_objects|"
    r"read_text|read_blob|read_xlsx|parquet_scan|parquet_metadata|parquet_schema|"
    r"parquet_file_metadata|parquet_kv_metadata|sqlite_scan|postgres_scan|mysql_scan|"
    r"iceberg_scan|delta_scan|glob|copy|attach|detach|install|load|export|import)\b",
    re.IGNORECASE,
)
# Data- and schema-modifying keywords. A statement can begin with an allowed
# keyword and still write, e.g. ``WITH t AS (...) DELETE FROM x``.
_WRITES = re.compile(
    r"\b(insert|update|delete|merge|truncate|create|drop|alter)\b",
    re.IGNORECASE,
)
_EXPLAIN_PREFIX = re.compile(r"\Aexplain\s*(?:analyze\b\s*)?(?:\([^()]*\)\s*)?", re.IGNORECASE)
_LEADING_WORD = re.compile(r"[A-Za-z_]+")
_IDENT = re.compile(r"\W+")


def _table_name(path: Path, taken: set[str]) -> str:
    """Derive a table name from ``path``'s stem that is not already in ``taken``.

    Runs of non-word characters become ``_``, the result is lower-cased, and a
    numeric suffix (``_2``, ``_3``, …) is appended on collision. The caller is
    responsible for adding the returned name to ``taken``.
    """
    base = _IDENT.sub("_", path.stem).strip("_").lower() or "table"
    name, i = base, 2
    while name in taken:
        name, i = f"{base}_{i}", i + 1
    return name


def _leading_keyword(sql: str) -> str:
    """Return the upper-cased first keyword of ``sql`` (``SELECT`` for ``select*``)."""
    match = _LEADING_WORD.match(sql)
    if match:
        return match.group(0).upper()
    # No leading word (e.g. "(SELECT 1)"): report the raw first token in the error.
    parts = sql.split(None, 1)
    return parts[0].upper() if parts else ""


def validate_sql(sql: str) -> str:
    """Return the SQL if it is a single safe read-only statement, else raise.

    The checks are deliberately textual and conservative: forbidden words are
    rejected wherever they appear, including inside string literals, because
    reliably masking literals would require a full SQL lexer.

    Returns:
        The statement with surrounding whitespace and any trailing ``;`` removed.

    Raises:
        TableBridgeError: if there is not exactly one statement, it does not start
            with a read-only keyword, it contains a write keyword, or it references
            a raw file-access function.
    """
    stmts = [s for s in (part.strip() for part in sql.split(";")) if s]
    if len(stmts) != 1:
        raise TableBridgeError("Provide exactly one SQL statement.")
    stmt = stmts[0]
    first = _leading_keyword(stmt)
    if first not in _ALLOWED_START:
        raise TableBridgeError(
            f"Only read-only queries are allowed (got '{first or '?'}'). "
            "Use SELECT / WITH / DESCRIBE / SUMMARIZE / SHOW."
        )
    if first == "EXPLAIN":
        # Strip "EXPLAIN [ANALYZE] [(options)]" and vet what is being explained.
        inner = _leading_keyword(_EXPLAIN_PREFIX.sub("", stmt, count=1))
        if inner not in _EXPLAINABLE:
            raise TableBridgeError(
                f"EXPLAIN may only wrap a read-only query (got '{inner or '?'}')."
            )
    if _WRITES.search(stmt):
        raise TableBridgeError(
            "Only read-only queries are allowed: INSERT / UPDATE / DELETE / CREATE / DROP / "
            "ALTER and similar keywords are rejected anywhere in the statement."
        )
    if _FORBIDDEN.search(stmt):
        raise TableBridgeError(
            "Raw file access functions are not allowed. Query the registered tables "
            "by name (see list_sources)."
        )
    return stmt
63
+
64
+
65
+ class TableBridge:
66
+ """Loads a data directory into DuckDB and answers read-only queries."""
67
+
68
+ def __init__(self, config: Config, con: Any = None) -> None:
69
+ self._config = config
70
+ self._registry: dict[str, dict[str, str]] = {}
71
+ self._own_con = con is None
72
+ self._con = con if con is not None else self._new_con()
73
+ self.scan()
74
+
75
+ def _new_con(self) -> Any:
76
+ import duckdb # noqa: PLC0415
77
+
78
+ return duckdb.connect(":memory:")
79
+
80
+ @property
81
+ def config(self) -> Config:
82
+ return self._config
83
+
84
+ # -- loading -------------------------------------------------------------
85
+
86
+ def scan(self) -> int:
87
+ """(Re)load supported files under the data dir as in-memory tables.
88
+
89
+ Reconnects first (when we own the connection) so a prior scan's
90
+ ``enable_external_access=false`` lock is reset and files can be read again.
91
+ """
92
+ if self._own_con:
93
+ self._con = self._new_con()
94
+ self._registry.clear()
95
+ pattern = "**/*" if self._config.recursive else "*"
96
+ taken: set[str] = set()
97
+ for path in sorted(self._config.data_dir.glob(pattern)):
98
+ reader = READERS.get(path.suffix.lower())
99
+ if not path.is_file() or reader is None:
100
+ continue
101
+ name = _table_name(path, taken)
102
+ taken.add(name)
103
+ try:
104
+ self._con.execute(
105
+ f'CREATE OR REPLACE TABLE "{name}" AS SELECT * FROM {reader}(?)',
106
+ [str(path)],
107
+ )
108
+ except Exception as exc: # noqa: BLE001 - surface load errors per file
109
+ raise TableBridgeError(f"Failed to load {path.name}: {exc}") from exc
110
+ rel = str(path.relative_to(self._config.data_dir))
111
+ self._registry[name] = {"file": rel, "kind": path.suffix.lower().lstrip(".")}
112
+ # Defense in depth: once data is materialized, forbid further file access.
113
+ with contextlib.suppress(Exception):
114
+ self._con.execute("SET enable_external_access=false")
115
+ return len(self._registry)
116
+
117
+ # -- introspection -------------------------------------------------------
118
+
119
+ def list_sources(self) -> list[dict[str, Any]]:
120
+ out = []
121
+ for name, meta in self._registry.items():
122
+ cols = self._con.execute(f'SELECT * FROM "{name}" LIMIT 0').description
123
+ out.append({"table": name, "file": meta["file"], "kind": meta["kind"], "columns": len(cols)})
124
+ return out
125
+
126
+ def describe(self, table: str) -> list[dict[str, str]]:
127
+ self._require(table)
128
+ rows = self._con.execute(f'DESCRIBE "{table}"').fetchall()
129
+ return [{"column": r[0], "type": r[1]} for r in rows]
130
+
131
+ def preview(self, table: str, n: int = 20) -> dict[str, Any]:
132
+ self._require(table)
133
+ n = max(1, min(n, self._config.max_rows))
134
+ return self._fetch(f'SELECT * FROM "{table}" LIMIT {n}')
135
+
136
+ def query(self, sql: str) -> dict[str, Any]:
137
+ return self._fetch(validate_sql(sql))
138
+
139
+ # -- helpers -------------------------------------------------------------
140
+
141
+ def _require(self, table: str) -> None:
142
+ if table not in self._registry:
143
+ known = ", ".join(self._registry) or "(none)"
144
+ raise TableBridgeError(f"Unknown table '{table}'. Available: {known}")
145
+
146
+ def _fetch(self, sql: str) -> dict[str, Any]:
147
+ try:
148
+ cur = self._con.execute(sql)
149
+ except Exception as exc: # noqa: BLE001 - return query errors to the agent
150
+ raise TableBridgeError(f"Query failed: {exc}") from exc
151
+ columns = [d[0] for d in cur.description] if cur.description else []
152
+ cap = self._config.max_rows
153
+ rows = cur.fetchmany(cap + 1)
154
+ truncated = len(rows) > cap
155
+ rows = rows[:cap]
156
+ return {
157
+ "columns": columns,
158
+ "rows": [dict(zip(columns, r, strict=False)) for r in rows],
159
+ "row_count": len(rows),
160
+ "truncated": truncated,
161
+ }
@@ -0,0 +1,91 @@
1
+ """The tablebridge MCP server.
2
+
3
+ Tools return JSON so the agent gets structured results. Everything is read-only
4
+ and sandboxed to the configured data directory.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from typing import Any
11
+
12
+ from mcp.server.fastmcp import FastMCP
13
+
14
+ from .config import Config
15
+ from .db import TableBridge
16
+
17
+ mcp = FastMCP("tablebridge")
18
+
19
+ _bridge: TableBridge | None = None
20
+
21
+
22
def get_bridge() -> TableBridge:
    """Return the module-level bridge, building it from the environment on first use."""
    global _bridge
    if _bridge is not None:
        return _bridge
    # First call: read the config from the environment and build the bridge.
    _bridge = TableBridge(Config.from_env())
    return _bridge
27
+
28
+
29
def set_bridge(bridge: TableBridge) -> None:
    """Replace the module-level bridge (used by tests).

    The given object is stored as-is and is what ``get_bridge()`` returns from
    then on; nothing restores the previous bridge.
    """
    global _bridge
    _bridge = bridge
33
+
34
+
35
def _json(data: Any) -> str:
    """Serialize ``data`` as indented JSON, stringifying values json cannot encode."""
    options = {"indent": 2, "default": str}
    return json.dumps(data, **options)
37
+
38
+
39
@mcp.tool()
def list_sources() -> str:
    """List the tables available to query (one per data file) with column counts.

    Start here: each CSV/Parquet/JSON file under the data directory is exposed as
    a table you can SELECT from and JOIN across.
    """
    # Ask the shared bridge for its tables, then serialize for the client.
    sources = get_bridge().list_sources()
    return _json(sources)
47
+
48
+
49
@mcp.tool()
def describe(table: str) -> str:
    """Show a table's columns and types."""
    # The bridge raises for unknown tables; that error propagates unchanged.
    columns = get_bridge().describe(table)
    return _json(columns)
53
+
54
+
55
@mcp.tool()
def preview(table: str, n: int = 20) -> str:
    """Return the first ``n`` rows of a table (capped by TABLEBRIDGE_MAX_ROWS)."""
    # Clamping of ``n`` happens inside the bridge, not here.
    sample = get_bridge().preview(table, n)
    return _json(sample)
59
+
60
+
61
@mcp.tool()
def query(sql: str) -> str:
    """Run a read-only SQL query (DuckDB dialect) across the loaded tables.

    Supports SELECT / WITH / DESCRIBE / SUMMARIZE and JOINs across files. Writes
    and raw file-access functions are rejected. Results are capped at
    TABLEBRIDGE_MAX_ROWS; a ``truncated`` flag indicates when more rows exist.
    """
    # Validation and execution both live in the bridge; this layer only serializes.
    result = get_bridge().query(sql)
    return _json(result)
70
+
71
+
72
@mcp.tool()
def refresh() -> str:
    """Re-scan the data directory (pick up added/changed files) and report the count."""
    payload = {"reloaded_tables": get_bridge().scan()}
    return _json(payload)
77
+
78
+
79
@mcp.tool()
def server_info() -> str:
    """Report the effective configuration (data dir, row cap, supported formats)."""
    settings = get_bridge().config
    return _json(settings.as_dict())
83
+
84
+
85
def main() -> None:
    """Console-script entry point: run the server over stdio."""
    # Hands control to the FastMCP instance defined at module level.
    mcp.run()
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
@@ -0,0 +1,26 @@
1
+ """Fixtures: a temp data directory with real CSV/JSON files + a live bridge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from tablebridge.config import Config
8
+ from tablebridge.db import TableBridge
9
+
10
+
11
@pytest.fixture
def data_dir(tmp_path):
    """Populate a temp directory with two CSV files and one JSON file, then return it."""
    fixtures = {
        "customers.csv": "id,name\n1,Alice\n2,Bob\n",
        "orders.csv": "id,customer_id,total\n10,1,99.5\n11,1,5.0\n12,2,20.0\n",
        "regions.json": '[{"customer_id":1,"region":"NY"},{"customer_id":2,"region":"CA"}]',
    }
    for filename, content in fixtures.items():
        (tmp_path / filename).write_text(content)
    return tmp_path
21
+
22
+
23
@pytest.fixture
def bridge(data_dir):
    """A live bridge over the fixture directory with a deliberately tiny row cap."""
    # max_rows=2 so truncation behavior is exercised.
    config = Config(data_dir=data_dir, max_rows=2)
    return TableBridge(config)
@@ -0,0 +1,49 @@
1
+ import json
2
+
3
+ from tablebridge import server
4
+ from tablebridge.config import Config
5
+
6
+
7
def test_config_from_env(tmp_path):
    """``Config.from_env`` picks up the data dir and row cap; ``recursive`` stays True."""
    env = {
        "TABLEBRIDGE_DATA_DIR": str(tmp_path),
        "TABLEBRIDGE_MAX_ROWS": "50",
    }
    cfg = Config.from_env(env=env)
    assert cfg.data_dir == tmp_path.resolve()
    assert cfg.max_rows == 50
    assert cfg.recursive is True
12
+
13
+
14
def test_config_recursive_off():
    """``TABLEBRIDGE_RECURSIVE`` set to ``"0"`` yields ``recursive is False``."""
    cfg = Config.from_env(env={"TABLEBRIDGE_RECURSIVE": "0"})
    assert cfg.recursive is False
16
+
17
+
18
class StubBridge:
    """Stand-in bridge for the server tool tests; every method returns a canned value."""

    def __init__(self):
        # SQL strings received by query(), in call order.
        self.calls = []
        self.config = Config()

    def list_sources(self):
        """Return a fixed listing containing a single ``customers`` table."""
        source = {"table": "customers", "columns": 2}
        return [source]

    def query(self, sql):
        """Record ``sql`` in ``self.calls`` and return a fixed one-row result."""
        self.calls.append(sql)
        row = {"x": 1}
        return {"columns": ["x"], "rows": [row], "row_count": 1, "truncated": False}

    def scan(self):
        """Return a fixed table count of 3."""
        return 3
32
+
33
+
34
def test_list_sources_tool_returns_json():
    """The ``list_sources`` tool returns JSON whose first entry names the stub's table."""
    server.set_bridge(StubBridge())
    sources = json.loads(server.list_sources())
    assert sources[0]["table"] == "customers"
37
+
38
+
39
def test_query_tool_delegates():
    """The ``query`` tool passes the SQL to the bridge and returns its result as JSON."""
    fake = StubBridge()
    server.set_bridge(fake)
    result = json.loads(server.query("SELECT 1"))
    assert result["row_count"] == 1
    assert fake.calls == ["SELECT 1"]
45
+
46
+
47
def test_refresh_tool_reports_count():
    """The ``refresh`` tool reports the bridge's scan count under ``reloaded_tables``."""
    server.set_bridge(StubBridge())
    payload = json.loads(server.refresh())
    assert payload["reloaded_tables"] == 3
@@ -0,0 +1,52 @@
1
+ import pytest
2
+
3
+ from tablebridge.db import TableBridgeError
4
+
5
+
6
+ def test_scan_registers_tables(bridge):
7
+ tables = {s["table"] for s in bridge.list_sources()}
8
+ assert {"customers", "orders", "regions"} <= tables
9
+
10
+
11
+ def test_describe_columns(bridge):
12
+ cols = {c["column"] for c in bridge.describe("customers")}
13
+ assert cols == {"id", "name"}
14
+
15
+
16
+ def test_preview_respects_max_rows(bridge):
17
+ out = bridge.preview("orders", n=20) # max_rows=2
18
+ assert out["row_count"] == 2
19
+
20
+
21
+ def test_query_join_across_files(bridge):
22
+ out = bridge.query(
23
+ "SELECT c.name, SUM(o.total) AS spend FROM customers c "
24
+ "JOIN orders o ON o.customer_id = c.id GROUP BY c.name ORDER BY c.name"
25
+ )
26
+ assert out["columns"] == ["name", "spend"]
27
+ # max_rows=2 caps the two groups; both fit
28
+ names = {r["name"] for r in out["rows"]}
29
+ assert "Alice" in names
30
+
31
+
32
+ def test_query_truncation_flag(bridge):
33
+ out = bridge.query("SELECT * FROM orders") # 3 rows, cap 2
34
+ assert out["row_count"] == 2
35
+ assert out["truncated"] is True
36
+
37
+
38
+ def test_query_blocks_file_escape(bridge):
39
+ with pytest.raises(TableBridgeError):
40
+ bridge.query("SELECT * FROM read_csv_auto('/etc/passwd')")
41
+
42
+
43
+ def test_unknown_table_errors(bridge):
44
+ with pytest.raises(TableBridgeError, match="Unknown table"):
45
+ bridge.describe("nope")
46
+
47
+
48
+ def test_refresh_picks_up_new_file(bridge, data_dir):
49
+ (data_dir / "extra.csv").write_text("a\n1\n")
50
+ count = bridge.scan()
51
+ assert "extra" in {s["table"] for s in bridge.list_sources()}
52
+ assert count >= 4
@@ -0,0 +1,48 @@
1
+ import pytest
2
+
3
+ from tablebridge.db import TableBridgeError, validate_sql
4
+
5
+
6
def test_allows_select_and_with():
    """Plain SELECT and WITH (CTE) statements pass validation."""
    assert validate_sql("SELECT 1") == "SELECT 1"
    validated_cte = validate_sql("WITH t AS (SELECT 1) SELECT * FROM t")
    assert validated_cte.startswith("WITH")
9
+
10
+
11
def test_strips_trailing_semicolon():
    """A single trailing semicolon is removed from the validated statement."""
    cleaned = validate_sql("SELECT 1;")
    assert cleaned == "SELECT 1"
13
+
14
+
15
# Statements that validate_sql is expected to reject (writes, DDL, COPY, ATTACH, SET).
_REJECTED_STATEMENTS = [
    "INSERT INTO t VALUES (1)",
    "UPDATE t SET x=1",
    "DELETE FROM t",
    "DROP TABLE t",
    "CREATE TABLE t (x int)",
    "COPY t TO 'out.csv'",
    "ATTACH 'x.db'",
    "SET enable_external_access=true",
]


@pytest.mark.parametrize("sql", _REJECTED_STATEMENTS)
def test_rejects_writes_and_dangerous(sql):
    """Each listed statement makes ``validate_sql`` raise ``TableBridgeError``."""
    with pytest.raises(TableBridgeError):
        validate_sql(sql)
31
+
32
+
33
# Queries that call file-reading table functions directly instead of loaded tables.
_RAW_FILE_QUERIES = [
    "SELECT * FROM read_csv_auto('/etc/passwd')",
    "SELECT * FROM read_parquet('x')",
    "SELECT * FROM glob('/**')",
]


@pytest.mark.parametrize("sql", _RAW_FILE_QUERIES)
def test_rejects_raw_file_readers(sql):
    """Each listed query is rejected with a "Raw file access" error."""
    with pytest.raises(TableBridgeError, match="Raw file access"):
        validate_sql(sql)
44
+
45
+
46
def test_rejects_multiple_statements():
    """Two statements in one string are rejected with an "exactly one" error."""
    two_statements = "SELECT 1; SELECT 2"
    with pytest.raises(TableBridgeError, match="exactly one"):
        validate_sql(two_statements)