tablebridge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ """tablebridge — query your scattered CSV / Parquet / JSON files with SQL, via MCP.
2
+
3
+ Points a DuckDB engine at a directory of tabular files, loads each as a SQL
4
+ table, and lets an agent run read-only SQL (including JOINs across files) — so a
5
+ pile of exports becomes one queryable source of truth. Sandboxed to a single
6
+ data directory and read-only by default.
7
+ """
8
+
9
+ from .config import Config
10
+ from .db import TableBridge, TableBridgeError
11
+
12
+ __all__ = ["Config", "TableBridge", "TableBridgeError", "__version__"]
13
+ __version__ = "0.1.0"
tablebridge/config.py ADDED
@@ -0,0 +1,43 @@
1
+ """Environment-driven configuration for the tablebridge server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
DEFAULT_MAX_ROWS = 1000
# File extensions we load as SQL tables, mapped to the DuckDB reader function.
READERS = {
    ".csv": "read_csv_auto",
    ".tsv": "read_csv_auto",
    ".parquet": "read_parquet",
    ".json": "read_json_auto",
    ".ndjson": "read_json_auto",
}

# Values of TABLEBRIDGE_RECURSIVE (compared case-insensitively, after trimming
# whitespace) that turn recursive scanning off. Anything else leaves it on.
_FALSY = frozenset({"0", "false", "no", "off"})


@dataclass(frozen=True)
class Config:
    """Effective server configuration, sourced from the environment.

    Attributes are documented inline below. Instances are immutable.

    Raises:
        ValueError: if ``max_rows`` is smaller than 1.
    """

    # Directory whose files are loaded; defaults to the current directory.
    data_dir: Path = Path(".")
    # Upper bound on the number of rows returned by a query or preview.
    max_rows: int = DEFAULT_MAX_ROWS
    # Whether files in subdirectories of ``data_dir`` are loaded as well.
    recursive: bool = True

    def __post_init__(self) -> None:
        # A cap below 1 would make every result empty (or slice rows from the
        # wrong end for negative values), so reject it up front.
        if self.max_rows < 1:
            raise ValueError(
                f"max_rows must be a positive integer, got {self.max_rows!r}"
            )

    @classmethod
    def from_env(cls, env: dict[str, str] | None = None) -> Config:
        """Build a configuration from environment variables.

        Reads ``TABLEBRIDGE_DATA_DIR``, ``TABLEBRIDGE_MAX_ROWS`` and
        ``TABLEBRIDGE_RECURSIVE``. Missing variables fall back to the class
        defaults; a blank ``TABLEBRIDGE_MAX_ROWS`` is treated as unset.

        Args:
            env: Mapping to read from instead of ``os.environ``.

        Raises:
            ValueError: if ``TABLEBRIDGE_MAX_ROWS`` is not a positive integer.
        """
        src = os.environ if env is None else env

        raw_rows = src.get("TABLEBRIDGE_MAX_ROWS", "").strip()
        try:
            max_rows = int(raw_rows) if raw_rows else DEFAULT_MAX_ROWS
        except ValueError as exc:
            raise ValueError(
                f"TABLEBRIDGE_MAX_ROWS must be a positive integer, got {raw_rows!r}"
            ) from exc

        raw_recursive = src.get("TABLEBRIDGE_RECURSIVE", "1").strip().lower()
        return cls(
            data_dir=Path(src.get("TABLEBRIDGE_DATA_DIR", ".")).expanduser().resolve(),
            max_rows=max_rows,
            recursive=raw_recursive not in _FALSY,
        )

    def as_dict(self) -> dict[str, object]:
        """Return the configuration as a JSON-friendly dictionary."""
        return {
            "data_dir": str(self.data_dir),
            "max_rows": self.max_rows,
            "recursive": self.recursive,
            "supported_extensions": sorted(READERS),
        }
tablebridge/db.py ADDED
@@ -0,0 +1,161 @@
1
+ """DuckDB engine: load a directory of tabular files as in-memory tables and run
2
+ read-only SQL over them.
3
+
4
+ Security posture:
5
+ - Files are **materialized** into in-memory tables at scan time, so queries never
6
+ touch the filesystem afterward.
7
+ - Query SQL is validated to be a single read-only statement, and raw file-reader
8
+ functions (read_csv, read_parquet, glob, copy, attach, …) are rejected — an
9
+ agent cannot read a path outside the configured data directory.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import contextlib
15
+ import re
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from .config import READERS, Config
20
+
21
+
22
+ class TableBridgeError(RuntimeError):
23
+ """A user-facing error (bad SQL, unknown table, load failure)."""
24
+
25
+
26
+ _ALLOWED_START = {
27
+ "SELECT", "WITH", "FROM", "DESCRIBE", "SUMMARIZE", "SHOW", "EXPLAIN", "VALUES", "TABLE",
28
+ }
29
+ _FORBIDDEN = re.compile(
30
+ r"\b(read_csv|read_csv_auto|read_parquet|read_json|read_json_auto|read_ndjson|"
31
+ r"read_text|read_blob|parquet_scan|glob|copy|attach|detach|install|load|export|import)\b",
32
+ re.IGNORECASE,
33
+ )
34
+ _IDENT = re.compile(r"\W+")
35
+
36
+
37
+ def _table_name(path: Path, taken: set[str]) -> str:
38
+ base = _IDENT.sub("_", path.stem).strip("_").lower() or "table"
39
+ name, i = base, 2
40
+ while name in taken:
41
+ name, i = f"{base}_{i}", i + 1
42
+ return name
43
+
44
+
45
+ def validate_sql(sql: str) -> str:
46
+ """Return the SQL if it is a single safe read-only statement, else raise."""
47
+ stmts = [s for s in (part.strip() for part in sql.split(";")) if s]
48
+ if len(stmts) != 1:
49
+ raise TableBridgeError("Provide exactly one SQL statement.")
50
+ stmt = stmts[0]
51
+ first = stmt.split(None, 1)[0].upper() if stmt.split() else ""
52
+ if first not in _ALLOWED_START:
53
+ raise TableBridgeError(
54
+ f"Only read-only queries are allowed (got '{first or '?'}'). "
55
+ "Use SELECT / WITH / DESCRIBE / SUMMARIZE / SHOW."
56
+ )
57
+ if _FORBIDDEN.search(stmt):
58
+ raise TableBridgeError(
59
+ "Raw file access functions are not allowed. Query the registered tables "
60
+ "by name (see list_sources)."
61
+ )
62
+ return stmt
63
+
64
+
65
+ class TableBridge:
66
+ """Loads a data directory into DuckDB and answers read-only queries."""
67
+
68
+ def __init__(self, config: Config, con: Any = None) -> None:
69
+ self._config = config
70
+ self._registry: dict[str, dict[str, str]] = {}
71
+ self._own_con = con is None
72
+ self._con = con if con is not None else self._new_con()
73
+ self.scan()
74
+
75
+ def _new_con(self) -> Any:
76
+ import duckdb # noqa: PLC0415
77
+
78
+ return duckdb.connect(":memory:")
79
+
80
+ @property
81
+ def config(self) -> Config:
82
+ return self._config
83
+
84
+ # -- loading -------------------------------------------------------------
85
+
86
+ def scan(self) -> int:
87
+ """(Re)load supported files under the data dir as in-memory tables.
88
+
89
+ Reconnects first (when we own the connection) so a prior scan's
90
+ ``enable_external_access=false`` lock is reset and files can be read again.
91
+ """
92
+ if self._own_con:
93
+ self._con = self._new_con()
94
+ self._registry.clear()
95
+ pattern = "**/*" if self._config.recursive else "*"
96
+ taken: set[str] = set()
97
+ for path in sorted(self._config.data_dir.glob(pattern)):
98
+ reader = READERS.get(path.suffix.lower())
99
+ if not path.is_file() or reader is None:
100
+ continue
101
+ name = _table_name(path, taken)
102
+ taken.add(name)
103
+ try:
104
+ self._con.execute(
105
+ f'CREATE OR REPLACE TABLE "{name}" AS SELECT * FROM {reader}(?)',
106
+ [str(path)],
107
+ )
108
+ except Exception as exc: # noqa: BLE001 - surface load errors per file
109
+ raise TableBridgeError(f"Failed to load {path.name}: {exc}") from exc
110
+ rel = str(path.relative_to(self._config.data_dir))
111
+ self._registry[name] = {"file": rel, "kind": path.suffix.lower().lstrip(".")}
112
+ # Defense in depth: once data is materialized, forbid further file access.
113
+ with contextlib.suppress(Exception):
114
+ self._con.execute("SET enable_external_access=false")
115
+ return len(self._registry)
116
+
117
+ # -- introspection -------------------------------------------------------
118
+
119
+ def list_sources(self) -> list[dict[str, Any]]:
120
+ out = []
121
+ for name, meta in self._registry.items():
122
+ cols = self._con.execute(f'SELECT * FROM "{name}" LIMIT 0').description
123
+ out.append({"table": name, "file": meta["file"], "kind": meta["kind"], "columns": len(cols)})
124
+ return out
125
+
126
+ def describe(self, table: str) -> list[dict[str, str]]:
127
+ self._require(table)
128
+ rows = self._con.execute(f'DESCRIBE "{table}"').fetchall()
129
+ return [{"column": r[0], "type": r[1]} for r in rows]
130
+
131
+ def preview(self, table: str, n: int = 20) -> dict[str, Any]:
132
+ self._require(table)
133
+ n = max(1, min(n, self._config.max_rows))
134
+ return self._fetch(f'SELECT * FROM "{table}" LIMIT {n}')
135
+
136
+ def query(self, sql: str) -> dict[str, Any]:
137
+ return self._fetch(validate_sql(sql))
138
+
139
+ # -- helpers -------------------------------------------------------------
140
+
141
+ def _require(self, table: str) -> None:
142
+ if table not in self._registry:
143
+ known = ", ".join(self._registry) or "(none)"
144
+ raise TableBridgeError(f"Unknown table '{table}'. Available: {known}")
145
+
146
+ def _fetch(self, sql: str) -> dict[str, Any]:
147
+ try:
148
+ cur = self._con.execute(sql)
149
+ except Exception as exc: # noqa: BLE001 - return query errors to the agent
150
+ raise TableBridgeError(f"Query failed: {exc}") from exc
151
+ columns = [d[0] for d in cur.description] if cur.description else []
152
+ cap = self._config.max_rows
153
+ rows = cur.fetchmany(cap + 1)
154
+ truncated = len(rows) > cap
155
+ rows = rows[:cap]
156
+ return {
157
+ "columns": columns,
158
+ "rows": [dict(zip(columns, r, strict=False)) for r in rows],
159
+ "row_count": len(rows),
160
+ "truncated": truncated,
161
+ }
tablebridge/server.py ADDED
@@ -0,0 +1,91 @@
1
+ """The tablebridge MCP server.
2
+
3
+ Tools return JSON so the agent gets structured results. Everything is read-only
4
+ and sandboxed to the configured data directory.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from typing import Any
11
+
12
+ from mcp.server.fastmcp import FastMCP
13
+
14
+ from .config import Config
15
+ from .db import TableBridge
16
+
17
mcp = FastMCP("tablebridge")

# Process-wide bridge instance; created on first use by get_bridge() and
# replaceable through set_bridge().
_bridge: TableBridge | None = None


def get_bridge() -> TableBridge:
    """Return the shared bridge, building it from the environment on first use."""
    global _bridge
    if _bridge is not None:
        return _bridge
    _bridge = TableBridge(Config.from_env())
    return _bridge


def set_bridge(bridge: TableBridge) -> None:
    """Replace the module-level bridge (used by tests)."""
    global _bridge
    _bridge = bridge
33
+
34
+
35
+ def _json(data: Any) -> str:
36
+ return json.dumps(data, indent=2, default=str)
37
+
38
+
39
# NOTE(review): the docstrings of the tool functions below are presumably what
# FastMCP publishes to the agent as tool descriptions, so they are runtime
# text — confirm against the MCP SDK before rewording them.


@mcp.tool()
def list_sources() -> str:
    """List the tables available to query (one per data file) with column counts.

    Start here: each CSV/Parquet/JSON file under the data directory is exposed as
    a table you can SELECT from and JOIN across.
    """
    sources = get_bridge().list_sources()
    return _json(sources)


@mcp.tool()
def describe(table: str) -> str:
    """Show a table's columns and types."""
    schema = get_bridge().describe(table)
    return _json(schema)


@mcp.tool()
def preview(table: str, n: int = 20) -> str:
    """Return the first ``n`` rows of a table (capped by TABLEBRIDGE_MAX_ROWS)."""
    sample = get_bridge().preview(table, n)
    return _json(sample)


@mcp.tool()
def query(sql: str) -> str:
    """Run a read-only SQL query (DuckDB dialect) across the loaded tables.

    Supports SELECT / WITH / DESCRIBE / SUMMARIZE and JOINs across files. Writes
    and raw file-access functions are rejected. Results are capped at
    TABLEBRIDGE_MAX_ROWS; a ``truncated`` flag indicates when more rows exist.
    """
    result = get_bridge().query(sql)
    return _json(result)


@mcp.tool()
def refresh() -> str:
    """Re-scan the data directory (pick up added/changed files) and report the count."""
    summary = {"reloaded_tables": get_bridge().scan()}
    return _json(summary)


@mcp.tool()
def server_info() -> str:
    """Report the effective configuration (data dir, row cap, supported formats)."""
    settings = get_bridge().config.as_dict()
    return _json(settings)


def main() -> None:
    """Console-script entry point: run the server over stdio."""
    mcp.run()


# Allow running the module directly, in addition to the console script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,140 @@
1
+ Metadata-Version: 2.4
2
+ Name: tablebridge
3
+ Version: 0.1.0
4
+ Summary: An MCP server that turns a folder of CSV / Parquet / JSON files into one SQL-queryable source for your AI agent.
5
+ Project-URL: Homepage, https://github.com/Michael-WhiteCapData/tablebridge-mcp
6
+ Project-URL: Repository, https://github.com/Michael-WhiteCapData/tablebridge-mcp
7
+ Project-URL: Issues, https://github.com/Michael-WhiteCapData/tablebridge-mcp/issues
8
+ Author: Michael Tierney
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: claude,csv,data-integration,duckdb,mcp,model-context-protocol,parquet,sql
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Database
18
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: duckdb>=1.0
21
+ Requires-Dist: mcp>=1.2
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
24
+ Requires-Dist: pytest>=8; extra == 'dev'
25
+ Requires-Dist: ruff>=0.6; extra == 'dev'
26
+ Description-Content-Type: text/markdown
27
+
28
+ <!-- mcp-name: io.github.Michael-WhiteCapData/tablebridge-mcp -->
29
+
30
+ # tablebridge
31
+
32
+ **Turn a folder of CSV / Parquet / JSON files into one SQL-queryable source for your AI agent.**
33
+
34
+ [![CI](https://github.com/Michael-WhiteCapData/tablebridge-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/Michael-WhiteCapData/tablebridge-mcp/actions/workflows/ci.yml)
35
+ [![PyPI](https://img.shields.io/pypi/v/tablebridge?color=3775A9&logo=pypi&logoColor=white)](https://pypi.org/project/tablebridge/)
36
+ [![Python](https://img.shields.io/badge/python-3.11%2B-3776AB?logo=python&logoColor=white)](https://www.python.org/)
37
+ [![MCP](https://img.shields.io/badge/MCP-server-D97757)](https://modelcontextprotocol.io/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
39
+
40
+ Small businesses don't have a data warehouse — they have a folder full of exports: `customers.csv`, last month's `orders.tsv`, a `regions.json` someone emailed over. `tablebridge` is an [MCP](https://modelcontextprotocol.io/) server that points [DuckDB](https://duckdb.org/) at that folder, exposes **each file as a SQL table**, and lets your agent run **read-only SQL — including JOINs across files** — to answer questions over all of them at once. Scattered spreadsheets become one queryable source of truth.
41
+
42
+ It's **read-only and sandboxed**: files are loaded into an in-memory database, the data directory is the only thing it can see, and queries are validated so an agent can't write, escape to other paths, or call raw file functions.
43
+
44
+ ---
45
+
46
+ ## Why you'd want this
47
+
48
+ - 🔗 **One source over many files.** JOIN `orders.csv` to `customers.csv` to `regions.json` in a single query — no ETL, no database to stand up.
49
+ - 🦆 **DuckDB-powered.** Fast analytical SQL over CSV, TSV, Parquet, JSON/NDJSON.
50
+ - 🔒 **Safe by design.** Files are materialized into memory; queries are validated read-only; raw file-access functions and out-of-sandbox paths are rejected.
51
+ - 🤖 **Agent-friendly.** `list_sources` → `describe` → `query` is a natural flow the agent can follow on its own.
52
+ - 🪶 **Two dependencies** (`mcp`, `duckdb`), fully typed and tested.
53
+
54
+ ## Install
55
+
56
+ ```bash
57
+ uvx tablebridge # run directly
58
+ # or
59
+ pip install tablebridge # then run: tablebridge
60
+ ```
61
+
62
+ ### Claude Code
63
+
64
+ ```bash
65
+ TABLEBRIDGE_DATA_DIR=/path/to/your/data claude mcp add tablebridge -- uvx tablebridge
66
+ ```
67
+
68
+ ### Claude Desktop / Cursor
69
+
70
+ ```jsonc
71
+ {
72
+ "mcpServers": {
73
+ "tablebridge": {
74
+ "command": "uvx",
75
+ "args": ["tablebridge"],
76
+ "env": { "TABLEBRIDGE_DATA_DIR": "/path/to/your/data" }
77
+ }
78
+ }
79
+ }
80
+ ```
81
+
82
+ ## Tools
83
+
84
+ | Tool | Description |
85
+ | --- | --- |
86
+ | `list_sources` | List the tables (one per data file) with column counts — start here |
87
+ | `describe` | A table's columns and types |
88
+ | `preview` | First N rows of a table |
89
+ | `query` | Run read-only SQL (DuckDB dialect) across the tables, JOINs included |
90
+ | `refresh` | Re-scan the data directory for added/changed files |
91
+ | `server_info` | Effective config (data dir, row cap, supported formats) |
92
+
93
+ ## Example
94
+
95
+ With a folder containing `customers.csv`, `orders.csv`, and `regions.json`:
96
+
97
+ > **You:** Who are my top 3 customers by total spend, and what region are they in?
98
+ >
99
+ > **Agent:** *(calls `list_sources`, then `query`)*
100
+ > ```sql
101
+ > SELECT c.name, r.region, SUM(o.total) AS spend
102
+ > FROM customers c
103
+ > JOIN orders o ON o.customer_id = c.id
104
+ > JOIN regions r ON r.customer_id = c.id
105
+ > GROUP BY c.name, r.region
106
+ > ORDER BY spend DESC
107
+ > LIMIT 3;
108
+ > ```
109
+
110
+ ## Configuration
111
+
112
+ | Variable | Default | Description |
113
+ | --- | --- | --- |
114
+ | `TABLEBRIDGE_DATA_DIR` | `.` | Directory of files to expose (the sandbox boundary) |
115
+ | `TABLEBRIDGE_MAX_ROWS` | `1000` | Max rows returned per query/preview |
116
+ | `TABLEBRIDGE_RECURSIVE` | `1` | Scan subdirectories too |
117
+
118
+ Supported formats: `.csv`, `.tsv`, `.parquet`, `.json`, `.ndjson`.
119
+
120
+ ## Security model
121
+
122
+ 1. **Sandboxed** to `TABLEBRIDGE_DATA_DIR` — only files under it are loaded.
123
+ 2. **Materialized** into an in-memory DuckDB, then external filesystem access is disabled — queries can't reach other paths.
124
+ 3. **Validated SQL** — a single read-only statement only; writes and raw file-reader functions are rejected.
125
+
126
+ ## Development
127
+
128
+ ```bash
129
+ git clone https://github.com/Michael-WhiteCapData/tablebridge-mcp
130
+ cd tablebridge-mcp
131
+ uv pip install -e ".[dev]"
132
+ ruff check .
133
+ pytest # uses real DuckDB over temp files
134
+ ```
135
+
136
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
137
+
138
+ ## License
139
+
140
+ [MIT](LICENSE) © Michael Tierney
@@ -0,0 +1,9 @@
1
+ tablebridge/__init__.py,sha256=FeRW4GxElndLCVFghZAdt93o3iaxhaS7FJD0t0pF9C0,537
2
+ tablebridge/config.py,sha256=l-MdDOkqO1yXBCkRnwCmSfyptRgQHRanIzXqxTBjQI4,1335
3
+ tablebridge/db.py,sha256=akNVP54hJGvTL3RH5VW9JdRAgtHgtQbu71yxmPv_Dhs,6155
4
+ tablebridge/server.py,sha256=sOaRV2SyTGTSfohhl8SMyucL7fd2YWNqBPgDJdv_WcE,2325
5
+ tablebridge-0.1.0.dist-info/METADATA,sha256=lhFfD0qFgSTud64_hGF0CzfYd-fUEn--ZAST3WlaK_E,5658
6
+ tablebridge-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
7
+ tablebridge-0.1.0.dist-info/entry_points.txt,sha256=snZv5v1d7GzimKXzbeSp0JqKMpzT4YC5qpeVw1ZlbX4,56
8
+ tablebridge-0.1.0.dist-info/licenses/LICENSE,sha256=CY7xjvDIH4rbWyhYFOZZaAfXsrsdo5apgxDnsY-xq8g,1072
9
+ tablebridge-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tablebridge = tablebridge.server:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Michael Tierney
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.