tableshot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tableshot-0.1.0/.github/workflows/ci.yml +60 -0
- tableshot-0.1.0/.gitignore +36 -0
- tableshot-0.1.0/CLAUDE.md +11 -0
- tableshot-0.1.0/LICENSE +21 -0
- tableshot-0.1.0/PKG-INFO +293 -0
- tableshot-0.1.0/README.md +247 -0
- tableshot-0.1.0/SPEC.md +687 -0
- tableshot-0.1.0/benchmarks/competitor-data.md +83 -0
- tableshot-0.1.0/benchmarks/outputs/empty_page.md +6 -0
- tableshot-0.1.0/benchmarks/outputs/multi_page.md +14 -0
- tableshot-0.1.0/benchmarks/outputs/multi_table.md +14 -0
- tableshot-0.1.0/benchmarks/outputs/simple_bordered.md +8 -0
- tableshot-0.1.0/benchmarks/outputs/single_row.md +5 -0
- tableshot-0.1.0/benchmarks/outputs/special_chars.md +8 -0
- tableshot-0.1.0/benchmarks/outputs/wide_table.md +7 -0
- tableshot-0.1.0/benchmarks/results.json +192 -0
- tableshot-0.1.0/benchmarks/results.md +146 -0
- tableshot-0.1.0/benchmarks/run_benchmarks.py +351 -0
- tableshot-0.1.0/pyproject.toml +80 -0
- tableshot-0.1.0/src/tableshot/__init__.py +3 -0
- tableshot-0.1.0/src/tableshot/__main__.py +5 -0
- tableshot-0.1.0/src/tableshot/backends/__init__.py +0 -0
- tableshot-0.1.0/src/tableshot/backends/ml_backend.py +475 -0
- tableshot-0.1.0/src/tableshot/backends/pdfplumber_backend.py +113 -0
- tableshot-0.1.0/src/tableshot/formatter.py +142 -0
- tableshot-0.1.0/src/tableshot/input_handler.py +136 -0
- tableshot-0.1.0/src/tableshot/pipeline.py +243 -0
- tableshot-0.1.0/src/tableshot/server.py +105 -0
- tableshot-0.1.0/src/tableshot/utils.py +98 -0
- tableshot-0.1.0/tests/__init__.py +0 -0
- tableshot-0.1.0/tests/conftest.py +23 -0
- tableshot-0.1.0/tests/fixtures/empty_page.pdf +113 -0
- tableshot-0.1.0/tests/fixtures/multi_page.pdf +0 -0
- tableshot-0.1.0/tests/fixtures/multi_table.pdf +87 -0
- tableshot-0.1.0/tests/fixtures/simple_bordered.pdf +0 -0
- tableshot-0.1.0/tests/fixtures/single_row.pdf +84 -0
- tableshot-0.1.0/tests/fixtures/special_chars.pdf +0 -0
- tableshot-0.1.0/tests/fixtures/wide_table.pdf +0 -0
- tableshot-0.1.0/tests/generate_fixtures.py +231 -0
- tableshot-0.1.0/tests/test_formats.py +80 -0
- tableshot-0.1.0/tests/test_input_handler.py +100 -0
- tableshot-0.1.0/tests/test_ml_backend.py +275 -0
- tableshot-0.1.0/tests/test_pipeline.py +395 -0
- tableshot-0.1.0/tests/test_server.py +79 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, master]
|
|
6
|
+
tags: ["v*"]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main, master]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Install dependencies
|
|
26
|
+
run: |
|
|
27
|
+
python -m pip install --upgrade pip
|
|
28
|
+
pip install -e ".[dev]"
|
|
29
|
+
pip install fpdf2
|
|
30
|
+
|
|
31
|
+
- name: Generate test fixtures
|
|
32
|
+
run: python tests/generate_fixtures.py
|
|
33
|
+
|
|
34
|
+
- name: Lint with ruff
|
|
35
|
+
run: ruff check src/ tests/
|
|
36
|
+
|
|
37
|
+
- name: Run tests with coverage
|
|
38
|
+
run: pytest --cov=tableshot --cov-report=term-missing --cov-fail-under=80
|
|
39
|
+
|
|
40
|
+
publish:
|
|
41
|
+
needs: test
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
44
|
+
permissions:
|
|
45
|
+
id-token: write
|
|
46
|
+
steps:
|
|
47
|
+
- uses: actions/checkout@v4
|
|
48
|
+
|
|
49
|
+
- name: Set up Python
|
|
50
|
+
uses: actions/setup-python@v5
|
|
51
|
+
with:
|
|
52
|
+
python-version: "3.12"
|
|
53
|
+
|
|
54
|
+
- name: Build
|
|
55
|
+
run: |
|
|
56
|
+
pip install build
|
|
57
|
+
python -m build
|
|
58
|
+
|
|
59
|
+
- name: Publish to PyPI
|
|
60
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
|
|
22
|
+
# Testing
|
|
23
|
+
.pytest_cache/
|
|
24
|
+
.coverage
|
|
25
|
+
htmlcov/
|
|
26
|
+
|
|
27
|
+
# OS
|
|
28
|
+
.DS_Store
|
|
29
|
+
Thumbs.db
|
|
30
|
+
|
|
31
|
+
# Large test files (manual/real-world PDFs)
|
|
32
|
+
tests/by hand/
|
|
33
|
+
|
|
34
|
+
# Temp files
|
|
35
|
+
*.tmp
|
|
36
|
+
*.bak
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# TableShot
|
|
2
|
+
|
|
3
|
+
Read SPEC.md before doing anything. It contains the full build specification,
|
|
4
|
+
architecture decisions, tech stack, and sprint plan.
|
|
5
|
+
|
|
6
|
+
Key constraints:
|
|
7
|
+
- Base install must be <50MB (pdfplumber + pypdfium2 + mcp SDK only)
|
|
8
|
+
- MIT license — no AGPL dependencies (no PyMuPDF)
|
|
9
|
+
- Python 3.10+, hatchling build system
|
|
10
|
+
- 2 MCP tools for v1: extract_tables, list_tables
|
|
11
|
+
- All code in src/tableshot/
|
tableshot-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Andrew
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tableshot-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tableshot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract tables from PDFs into clean, structured data -- instantly. An MCP server for AI assistants.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Bespoke34/tableshot
|
|
6
|
+
Project-URL: Repository, https://github.com/Bespoke34/tableshot
|
|
7
|
+
Project-URL: Issues, https://github.com/Bespoke34/tableshot/issues
|
|
8
|
+
Author: Andrew Makris
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: document-ai,mcp,model-context-protocol,pdf,structured-data,table-extraction
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Text Processing :: General
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: mcp>=1.0
|
|
23
|
+
Requires-Dist: pdfplumber>=0.10
|
|
24
|
+
Requires-Dist: pillow>=10.0
|
|
25
|
+
Requires-Dist: pypdfium2>=4.0
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: onnxtr[cpu]>=0.5; extra == 'all'
|
|
28
|
+
Requires-Dist: timm>=0.9; extra == 'all'
|
|
29
|
+
Requires-Dist: torch>=2.0; extra == 'all'
|
|
30
|
+
Requires-Dist: torchvision>=0.15; extra == 'all'
|
|
31
|
+
Requires-Dist: transformers>=4.30; extra == 'all'
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: fpdf2>=2.7; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
38
|
+
Provides-Extra: ml
|
|
39
|
+
Requires-Dist: timm>=0.9; extra == 'ml'
|
|
40
|
+
Requires-Dist: torch>=2.0; extra == 'ml'
|
|
41
|
+
Requires-Dist: torchvision>=0.15; extra == 'ml'
|
|
42
|
+
Requires-Dist: transformers>=4.30; extra == 'ml'
|
|
43
|
+
Provides-Extra: ocr
|
|
44
|
+
Requires-Dist: onnxtr[cpu]>=0.5; extra == 'ocr'
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# TableShot
|
|
48
|
+
|
|
49
|
+
**The only MCP server for PDF table extraction.** Give any AI assistant the ability to read tables from PDFs -- no other tool does this.
|
|
50
|
+
|
|
51
|
+
[![PyPI version](https://img.shields.io/pypi/v/tableshot)](https://pypi.org/project/tableshot/)
|
|
52
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
53
|
+
[![CI](https://github.com/Bespoke34/tableshot/actions/workflows/ci.yml/badge.svg)](https://github.com/Bespoke34/tableshot/actions)
|
|
54
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tableshot)](https://pypi.org/project/tableshot/)
|
|
55
|
+
|
|
56
|
+
Camelot, Tabula-py, and Table Transformer are Python libraries -- they require a developer to write code. TableShot is an MCP server: Claude Desktop, Cursor, and Windsurf can use it directly with zero code.
|
|
57
|
+
|
|
58
|
+
~33MB install. No model downloads. No API keys. Results in <100ms.
|
|
59
|
+
|
|
60
|
+
<!-- TODO: Replace with actual demo GIF -->
|
|
61
|
+
<!--  -->
|
|
62
|
+
|
|
63
|
+
## The Problem
|
|
64
|
+
|
|
65
|
+
Ask any AI assistant to read a table from a PDF. It can't -- you get word soup:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
Sales Report Q1 2024 Product Price Quantity Total Widget A $10.00 100
|
|
69
|
+
$1,000.00 Widget B $25.50 50 $1,275.00 Widget C $5.99 200 $1,198.00
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
TableShot gives you this:
|
|
73
|
+
|
|
74
|
+
| Product | Price | Quantity | Total |
|
|
75
|
+
|----------|---------|----------|-----------|
|
|
76
|
+
| Widget A | $10.00 | 100 | $1,000.00 |
|
|
77
|
+
| Widget B | $25.50 | 50 | $1,275.00 |
|
|
78
|
+
| Widget C | $5.99 | 200 | $1,198.00 |
|
|
79
|
+
| Widget D | $149.00 | 10 | $1,490.00 |
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
### Claude Desktop / Cursor / Windsurf
|
|
84
|
+
|
|
85
|
+
Add to your MCP config:
|
|
86
|
+
|
|
87
|
+
```json
|
|
88
|
+
{
|
|
89
|
+
"mcpServers": {
|
|
90
|
+
"tableshot": {
|
|
91
|
+
"command": "uvx",
|
|
92
|
+
"args": ["tableshot"]
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Then just ask: *"Extract the tables from /path/to/report.pdf"*
|
|
99
|
+
|
|
100
|
+
### pip
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install tableshot
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Run as a standalone MCP server:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
tableshot # stdio transport (for MCP clients)
|
|
110
|
+
python -m tableshot # same thing
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Tools
|
|
114
|
+
|
|
115
|
+
| Tool | What it does |
|
|
116
|
+
|------|-------------|
|
|
117
|
+
| `extract_tables` | Extract all tables as Markdown, CSV, JSON, or HTML |
|
|
118
|
+
| `list_tables` | Quick scan -- preview tables before extracting |
|
|
119
|
+
|
|
120
|
+
### `extract_tables`
|
|
121
|
+
|
|
122
|
+
```
|
|
123
|
+
source: str # File path or URL to a PDF (or image with [ml] extra)
|
|
124
|
+
pages: str = "all" # "all", "1", "1-3", "1,3,5"
|
|
125
|
+
format: str = "markdown" # "markdown", "csv", "json", "html"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### `list_tables`
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
source: str # File path or URL to a PDF
|
|
132
|
+
pages: str = "all" # "all", "1", "1-3", "1,3,5"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Returns table count, dimensions, headers, and a preview row for each table found.
|
|
136
|
+
|
|
137
|
+
## Examples
|
|
138
|
+
|
|
139
|
+
### Financial report (bordered table)
|
|
140
|
+
|
|
141
|
+
**Input:** BlackRock-style quarterly earnings PDF
|
|
142
|
+
|
|
143
|
+
**Output (markdown):**
|
|
144
|
+
```
|
|
145
|
+
| | Q3 2023 | Q3 2022 | 9M 2023 | 9M 2022 |
|
|
146
|
+
| ------------------------------------ | ---------- | ---------- | ---------- | ---------- |
|
|
147
|
+
| Total revenue | $4,522 | $4,311 | $13,228 | $13,536 |
|
|
148
|
+
| Total expense | 2,885 | 2,785 | 8,538 | 8,578 |
|
|
149
|
+
| Operating income | $1,637 | $1,526 | $4,690 | $4,958 |
|
|
150
|
+
| Operating margin | 36.2% | 35.4% | 35.5% | 36.6% |
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Extracted in **25ms**.
|
|
154
|
+
|
|
155
|
+
### Multi-table document
|
|
156
|
+
|
|
157
|
+
**Input:** PDF with employee directory + budget summary on the same page
|
|
158
|
+
|
|
159
|
+
**Output:** Both tables extracted separately with correct headers:
|
|
160
|
+
```
|
|
161
|
+
Table 1: 3 rows x 3 cols (Name, Department, Email)
|
|
162
|
+
Table 2: 4 rows x 2 cols (Category, Amount)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Wide table (8 columns, landscape)
|
|
166
|
+
|
|
167
|
+
```
|
|
168
|
+
| ID | Name | Q1 | Q2 | Q3 | Q4 | Total | Status |
|
|
169
|
+
| --- | ----- | --- | --- | --- | --- | ----- | ------ |
|
|
170
|
+
| 1 | Alpha | 100 | 150 | 200 | 250 | 700 | Active |
|
|
171
|
+
| 2 | Beta | 90 | 110 | 130 | 170 | 500 | Active |
|
|
172
|
+
| 3 | Gamma | 0 | 0 | 50 | 80 | 130 | New |
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
All 4 output formats (Markdown, CSV, JSON, HTML) available for every extraction.
|
|
176
|
+
|
|
177
|
+
## Benchmarks
|
|
178
|
+
|
|
179
|
+
Tested on 10 PDFs covering bordered tables, multi-table pages, multi-page documents,
|
|
180
|
+
special characters, wide tables, and real financial statements.
|
|
181
|
+
|
|
182
|
+
| Metric | Result |
|
|
183
|
+
|--------|--------|
|
|
184
|
+
| **Bordered table accuracy** | 8/8 exact match |
|
|
185
|
+
| **Speed (bordered tables)** | 4-25ms per extraction |
|
|
186
|
+
| **Speed (3-page financial PDF)** | 182ms |
|
|
187
|
+
| **Output format validity** | 36/36 pass (9 PDFs x 4 formats) |
|
|
188
|
+
|
|
189
|
+
### Test Data
|
|
190
|
+
|
|
191
|
+
Generated fixtures — click **Source** to see the input PDF, **Output** to see what TableShot extracts:
|
|
192
|
+
|
|
193
|
+
| Fixture | Description | Source | Output | Speed |
|
|
194
|
+
|---------|-------------|--------|--------|-------|
|
|
195
|
+
| simple_bordered | 4-column sales report (Product, Price, Quantity, Total) | [PDF](tests/fixtures/simple_bordered.pdf) | [Extracted](benchmarks/outputs/simple_bordered.md) | 10ms |
|
|
196
|
+
| multi_table | Two tables on one page: employee directory + budget summary | [PDF](tests/fixtures/multi_table.pdf) | [Extracted](benchmarks/outputs/multi_table.md) | 10ms |
|
|
197
|
+
| single_row | Minimal table — header + one data row | [PDF](tests/fixtures/single_row.pdf) | [Extracted](benchmarks/outputs/single_row.md) | 4ms |
|
|
198
|
+
| multi_page | One table per page across 2 pages | [PDF](tests/fixtures/multi_page.pdf) | [Extracted](benchmarks/outputs/multi_page.md) | 9ms |
|
|
199
|
+
| empty_page | Page 1 text only; page 2 has a table | [PDF](tests/fixtures/empty_page.pdf) | [Extracted](benchmarks/outputs/empty_page.md) | 6ms |
|
|
200
|
+
| special_chars | Cells with `$`, `:`, `"`, `&`, `<>` | [PDF](tests/fixtures/special_chars.pdf) | [Extracted](benchmarks/outputs/special_chars.md) | 6ms |
|
|
201
|
+
| wide_table | 8-column landscape table (Q1–Q4, Total, Status) | [PDF](tests/fixtures/wide_table.pdf) | [Extracted](benchmarks/outputs/wide_table.md) | 11ms |
|
|
202
|
+
|
|
203
|
+
Additional PDFs, including real-world documents (not included in repo due to size/licensing):
|
|
204
|
+
|
|
205
|
+
| PDF | Description | Tables | Speed |
|
|
206
|
+
|-----|-------------|--------|-------|
|
|
207
|
+
| BlackRock mock | Generated mock of a BlackRock quarterly earnings statement (5 columns) | 1 table, 11 rows | 25ms |
|
|
208
|
+
| Sample Financial Statements | 3-page financial statement with complex visual formatting (155KB) | 3 tables, 75 rows | 182ms |
|
|
209
|
+
| NHM table | Large 56-page document with 55 tables (25MB) | 55 tables, 2321 rows | 5.8s |
|
|
210
|
+
|
|
211
|
+
Full machine-readable results in [benchmarks/results.json](benchmarks/results.json). Detailed before/after comparisons in [benchmarks/results.md](benchmarks/results.md).
|
|
212
|
+
|
|
213
|
+
### vs Other Tools
|
|
214
|
+
|
|
215
|
+
| | TableShot | Camelot | Tabula-py | Table Transformer |
|
|
216
|
+
|---|---|---|---|---|
|
|
217
|
+
| **Install** | ~33MB, nothing else | Needs Ghostscript | Needs Java (100-300MB) | Needs PyTorch (700MB-5GB) |
|
|
218
|
+
| **Speed** | ~10ms/table | >20s worst case | Variable (JVM startup) | 2-5s/page |
|
|
219
|
+
| **Bordered tables** | Excellent | Excellent | Good | Excellent |
|
|
220
|
+
| **Borderless** | Good (text fallback) | Poor | Better detection | Best |
|
|
221
|
+
| **MCP support** | Native | None | None | None |
|
|
222
|
+
| **Maintained** | Active | ~5 years stale | Active | Active |
|
|
223
|
+
|
|
224
|
+
*Competitor data from [Adhikari & Agarwal 2024](https://arxiv.org/abs/2410.09871), OpenNews 2024 review, and published GitHub metrics. Full results in [benchmarks/results.md](benchmarks/results.md).*
|
|
225
|
+
|
|
226
|
+
## Need Scanned PDFs or Images?
|
|
227
|
+
|
|
228
|
+
The base install handles native PDFs with text layers (90%+ of real-world use cases).
|
|
229
|
+
For scanned documents and images:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
pip install tableshot[ml] # Table Transformer for image-based tables
|
|
233
|
+
pip install tableshot[ocr] # OCR for scanned documents (ONNX, no PyTorch)
|
|
234
|
+
pip install tableshot[all] # Everything
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
With `[ml]` installed, TableShot automatically detects whether a PDF has a text layer:
|
|
238
|
+
- **Text layer present** -- uses pdfplumber (fast, ~10ms)
|
|
239
|
+
- **Scanned / no text layer** -- uses Table Transformer for detection, pdfplumber for text extraction
|
|
240
|
+
- **Image files** (PNG, JPEG) -- uses Table Transformer + OCR (requires `[ocr]`)
|
|
241
|
+
|
|
242
|
+
You can also force the ML backend: `extract_tables("/path/to/scan.pdf", backend="ml")`
|
|
243
|
+
|
|
244
|
+
## How It Works
|
|
245
|
+
|
|
246
|
+
```
|
|
247
|
+
PDF/Image ──> Smart Router ──> Table Detection ──> Cell Extraction ──> Formatted Output
|
|
248
|
+
| |
|
|
249
|
+
| PDF with text layer: | Markdown
|
|
250
|
+
| pdfplumber (lines → text fallback) | CSV
|
|
251
|
+
| | JSON
|
|
252
|
+
| Scanned PDF / Image (with [ml]): | HTML
|
|
253
|
+
| Table Transformer → pdfplumber text / OCR |
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
- **pdfplumber** handles PDF parsing and table detection (MIT)
|
|
257
|
+
- **pypdfium2** renders PDF pages to images for ML backend (Apache-2.0)
|
|
258
|
+
- **Table Transformer** (optional `[ml]`) detects tables in images (MIT)
|
|
259
|
+
- **MCP SDK** exposes tools to AI assistants via stdio transport (MIT)
|
|
260
|
+
|
|
261
|
+
Total base install: ~33MB. No model downloads. No GPU required.
|
|
262
|
+
|
|
263
|
+
## Known Limitations
|
|
264
|
+
|
|
265
|
+
All rule-based PDF table extractors (including Camelot and Tabula) share these limits:
|
|
266
|
+
|
|
267
|
+
- **Financial statements with visual formatting** -- amounts positioned by whitespace rather than cell borders can fragment across columns
|
|
268
|
+
- **Scanned PDFs / images** -- no OCR in base install (use `tableshot[ml]` or `tableshot[ocr]`)
|
|
269
|
+
- **Scientific papers with equations** -- inline math breaks table boundary detection
|
|
270
|
+
- **Complex borderless tables** -- ambiguous column alignment can cause misdetection
|
|
271
|
+
|
|
272
|
+
We're honest about these. For edge cases, `tableshot[ml]` adds Table Transformer support.
|
|
273
|
+
|
|
274
|
+
## Contributing
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
git clone https://github.com/Bespoke34/tableshot.git
|
|
278
|
+
cd tableshot
|
|
279
|
+
pip install -e ".[dev]"
|
|
280
|
+
pip install fpdf2 # for generating test fixtures (already included in the [dev] extra)
|
|
281
|
+
python tests/generate_fixtures.py # create test PDFs
|
|
282
|
+
pytest -m "not slow" # run 160 tests (skip ML tests)
|
|
283
|
+
pytest # run all 167 tests (needs [ml] extra)
|
|
284
|
+
ruff check src/ tests/ # lint
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
- 95% test coverage target (CI fails below 80%), all tests must pass
|
|
288
|
+
- Ruff clean, no lint warnings
|
|
289
|
+
- MIT license -- all dependencies must be MIT/Apache-2.0/BSD compatible
|
|
290
|
+
|
|
291
|
+
## License
|
|
292
|
+
|
|
293
|
+
MIT
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
# TableShot
|
|
2
|
+
|
|
3
|
+
**The only MCP server for PDF table extraction.** Give any AI assistant the ability to read tables from PDFs -- no other tool does this.
|
|
4
|
+
|
|
5
|
+
[![PyPI version](https://img.shields.io/pypi/v/tableshot)](https://pypi.org/project/tableshot/)
|
|
6
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
7
|
+
[![CI](https://github.com/Bespoke34/tableshot/actions/workflows/ci.yml/badge.svg)](https://github.com/Bespoke34/tableshot/actions)
|
|
8
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tableshot)](https://pypi.org/project/tableshot/)
|
|
9
|
+
|
|
10
|
+
Camelot, Tabula-py, and Table Transformer are Python libraries -- they require a developer to write code. TableShot is an MCP server: Claude Desktop, Cursor, and Windsurf can use it directly with zero code.
|
|
11
|
+
|
|
12
|
+
~33MB install. No model downloads. No API keys. Results in <100ms.
|
|
13
|
+
|
|
14
|
+
<!-- TODO: Replace with actual demo GIF -->
|
|
15
|
+
<!--  -->
|
|
16
|
+
|
|
17
|
+
## The Problem
|
|
18
|
+
|
|
19
|
+
Ask any AI assistant to read a table from a PDF. It can't -- you get word soup:
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
Sales Report Q1 2024 Product Price Quantity Total Widget A $10.00 100
|
|
23
|
+
$1,000.00 Widget B $25.50 50 $1,275.00 Widget C $5.99 200 $1,198.00
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
TableShot gives you this:
|
|
27
|
+
|
|
28
|
+
| Product | Price | Quantity | Total |
|
|
29
|
+
|----------|---------|----------|-----------|
|
|
30
|
+
| Widget A | $10.00 | 100 | $1,000.00 |
|
|
31
|
+
| Widget B | $25.50 | 50 | $1,275.00 |
|
|
32
|
+
| Widget C | $5.99 | 200 | $1,198.00 |
|
|
33
|
+
| Widget D | $149.00 | 10 | $1,490.00 |
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
### Claude Desktop / Cursor / Windsurf
|
|
38
|
+
|
|
39
|
+
Add to your MCP config:
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"mcpServers": {
|
|
44
|
+
"tableshot": {
|
|
45
|
+
"command": "uvx",
|
|
46
|
+
"args": ["tableshot"]
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Then just ask: *"Extract the tables from /path/to/report.pdf"*
|
|
53
|
+
|
|
54
|
+
### pip
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install tableshot
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Run as a standalone MCP server:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
tableshot # stdio transport (for MCP clients)
|
|
64
|
+
python -m tableshot # same thing
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Tools
|
|
68
|
+
|
|
69
|
+
| Tool | What it does |
|
|
70
|
+
|------|-------------|
|
|
71
|
+
| `extract_tables` | Extract all tables as Markdown, CSV, JSON, or HTML |
|
|
72
|
+
| `list_tables` | Quick scan -- preview tables before extracting |
|
|
73
|
+
|
|
74
|
+
### `extract_tables`
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
source: str # File path or URL to a PDF (or image with [ml] extra)
|
|
78
|
+
pages: str = "all" # "all", "1", "1-3", "1,3,5"
|
|
79
|
+
format: str = "markdown" # "markdown", "csv", "json", "html"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### `list_tables`
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
source: str # File path or URL to a PDF
|
|
86
|
+
pages: str = "all" # "all", "1", "1-3", "1,3,5"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Returns table count, dimensions, headers, and a preview row for each table found.
|
|
90
|
+
|
|
91
|
+
## Examples
|
|
92
|
+
|
|
93
|
+
### Financial report (bordered table)
|
|
94
|
+
|
|
95
|
+
**Input:** BlackRock-style quarterly earnings PDF
|
|
96
|
+
|
|
97
|
+
**Output (markdown):**
|
|
98
|
+
```
|
|
99
|
+
| | Q3 2023 | Q3 2022 | 9M 2023 | 9M 2022 |
|
|
100
|
+
| ------------------------------------ | ---------- | ---------- | ---------- | ---------- |
|
|
101
|
+
| Total revenue | $4,522 | $4,311 | $13,228 | $13,536 |
|
|
102
|
+
| Total expense | 2,885 | 2,785 | 8,538 | 8,578 |
|
|
103
|
+
| Operating income | $1,637 | $1,526 | $4,690 | $4,958 |
|
|
104
|
+
| Operating margin | 36.2% | 35.4% | 35.5% | 36.6% |
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Extracted in **25ms**.
|
|
108
|
+
|
|
109
|
+
### Multi-table document
|
|
110
|
+
|
|
111
|
+
**Input:** PDF with employee directory + budget summary on the same page
|
|
112
|
+
|
|
113
|
+
**Output:** Both tables extracted separately with correct headers:
|
|
114
|
+
```
|
|
115
|
+
Table 1: 3 rows x 3 cols (Name, Department, Email)
|
|
116
|
+
Table 2: 4 rows x 2 cols (Category, Amount)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Wide table (8 columns, landscape)
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
| ID | Name | Q1 | Q2 | Q3 | Q4 | Total | Status |
|
|
123
|
+
| --- | ----- | --- | --- | --- | --- | ----- | ------ |
|
|
124
|
+
| 1 | Alpha | 100 | 150 | 200 | 250 | 700 | Active |
|
|
125
|
+
| 2 | Beta | 90 | 110 | 130 | 170 | 500 | Active |
|
|
126
|
+
| 3 | Gamma | 0 | 0 | 50 | 80 | 130 | New |
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
All 4 output formats (Markdown, CSV, JSON, HTML) available for every extraction.
|
|
130
|
+
|
|
131
|
+
## Benchmarks
|
|
132
|
+
|
|
133
|
+
Tested on 10 PDFs covering bordered tables, multi-table pages, multi-page documents,
|
|
134
|
+
special characters, wide tables, and real financial statements.
|
|
135
|
+
|
|
136
|
+
| Metric | Result |
|
|
137
|
+
|--------|--------|
|
|
138
|
+
| **Bordered table accuracy** | 8/8 exact match |
|
|
139
|
+
| **Speed (bordered tables)** | 4-25ms per extraction |
|
|
140
|
+
| **Speed (3-page financial PDF)** | 182ms |
|
|
141
|
+
| **Output format validity** | 36/36 pass (9 PDFs x 4 formats) |
|
|
142
|
+
|
|
143
|
+
### Test Data
|
|
144
|
+
|
|
145
|
+
Generated fixtures — click **Source** to see the input PDF, **Output** to see what TableShot extracts:
|
|
146
|
+
|
|
147
|
+
| Fixture | Description | Source | Output | Speed |
|
|
148
|
+
|---------|-------------|--------|--------|-------|
|
|
149
|
+
| simple_bordered | 4-column sales report (Product, Price, Quantity, Total) | [PDF](tests/fixtures/simple_bordered.pdf) | [Extracted](benchmarks/outputs/simple_bordered.md) | 10ms |
|
|
150
|
+
| multi_table | Two tables on one page: employee directory + budget summary | [PDF](tests/fixtures/multi_table.pdf) | [Extracted](benchmarks/outputs/multi_table.md) | 10ms |
|
|
151
|
+
| single_row | Minimal table — header + one data row | [PDF](tests/fixtures/single_row.pdf) | [Extracted](benchmarks/outputs/single_row.md) | 4ms |
|
|
152
|
+
| multi_page | One table per page across 2 pages | [PDF](tests/fixtures/multi_page.pdf) | [Extracted](benchmarks/outputs/multi_page.md) | 9ms |
|
|
153
|
+
| empty_page | Page 1 text only; page 2 has a table | [PDF](tests/fixtures/empty_page.pdf) | [Extracted](benchmarks/outputs/empty_page.md) | 6ms |
|
|
154
|
+
| special_chars | Cells with `$`, `:`, `"`, `&`, `<>` | [PDF](tests/fixtures/special_chars.pdf) | [Extracted](benchmarks/outputs/special_chars.md) | 6ms |
|
|
155
|
+
| wide_table | 8-column landscape table (Q1–Q4, Total, Status) | [PDF](tests/fixtures/wide_table.pdf) | [Extracted](benchmarks/outputs/wide_table.md) | 11ms |
|
|
156
|
+
|
|
157
|
+
Additional PDFs, including real-world documents (not included in repo due to size/licensing):
|
|
158
|
+
|
|
159
|
+
| PDF | Description | Tables | Speed |
|
|
160
|
+
|-----|-------------|--------|-------|
|
|
161
|
+
| BlackRock mock | Generated mock of a BlackRock quarterly earnings statement (5 columns) | 1 table, 11 rows | 25ms |
|
|
162
|
+
| Sample Financial Statements | 3-page financial statement with complex visual formatting (155KB) | 3 tables, 75 rows | 182ms |
|
|
163
|
+
| NHM table | Large 56-page document with 55 tables (25MB) | 55 tables, 2321 rows | 5.8s |
|
|
164
|
+
|
|
165
|
+
Full machine-readable results in [benchmarks/results.json](benchmarks/results.json). Detailed before/after comparisons in [benchmarks/results.md](benchmarks/results.md).
|
|
166
|
+
|
|
167
|
+
### vs Other Tools
|
|
168
|
+
|
|
169
|
+
| | TableShot | Camelot | Tabula-py | Table Transformer |
|
|
170
|
+
|---|---|---|---|---|
|
|
171
|
+
| **Install** | ~33MB, nothing else | Needs Ghostscript | Needs Java (100-300MB) | Needs PyTorch (700MB-5GB) |
|
|
172
|
+
| **Speed** | ~10ms/table | >20s worst case | Variable (JVM startup) | 2-5s/page |
|
|
173
|
+
| **Bordered tables** | Excellent | Excellent | Good | Excellent |
|
|
174
|
+
| **Borderless** | Good (text fallback) | Poor | Better detection | Best |
|
|
175
|
+
| **MCP support** | Native | None | None | None |
|
|
176
|
+
| **Maintained** | Active | ~5 years stale | Active | Active |
|
|
177
|
+
|
|
178
|
+
*Competitor data from [Adhikari & Agarwal 2024](https://arxiv.org/abs/2410.09871), OpenNews 2024 review, and published GitHub metrics. Full results in [benchmarks/results.md](benchmarks/results.md).*
|
|
179
|
+
|
|
180
|
+
## Need Scanned PDFs or Images?
|
|
181
|
+
|
|
182
|
+
The base install handles native PDFs with text layers (90%+ of real-world use cases).
|
|
183
|
+
For scanned documents and images:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install tableshot[ml] # Table Transformer for image-based tables
|
|
187
|
+
pip install tableshot[ocr] # OCR for scanned documents (ONNX, no PyTorch)
|
|
188
|
+
pip install tableshot[all] # Everything
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
With `[ml]` installed, TableShot automatically detects whether a PDF has a text layer:
|
|
192
|
+
- **Text layer present** -- uses pdfplumber (fast, ~10ms)
|
|
193
|
+
- **Scanned / no text layer** -- uses Table Transformer for detection, pdfplumber for text extraction
|
|
194
|
+
- **Image files** (PNG, JPEG) -- uses Table Transformer + OCR (requires `[ocr]`)
|
|
195
|
+
|
|
196
|
+
You can also force the ML backend: `extract_tables("/path/to/scan.pdf", backend="ml")`
|
|
197
|
+
|
|
198
|
+
## How It Works
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
PDF/Image ──> Smart Router ──> Table Detection ──> Cell Extraction ──> Formatted Output
|
|
202
|
+
| |
|
|
203
|
+
| PDF with text layer: | Markdown
|
|
204
|
+
| pdfplumber (lines → text fallback) | CSV
|
|
205
|
+
| | JSON
|
|
206
|
+
| Scanned PDF / Image (with [ml]): | HTML
|
|
207
|
+
| Table Transformer → pdfplumber text / OCR |
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
- **pdfplumber** handles PDF parsing and table detection (MIT)
|
|
211
|
+
- **pypdfium2** renders PDF pages to images for ML backend (Apache-2.0)
|
|
212
|
+
- **Table Transformer** (optional `[ml]`) detects tables in images (MIT)
|
|
213
|
+
- **MCP SDK** exposes tools to AI assistants via stdio transport (MIT)
|
|
214
|
+
|
|
215
|
+
Total base install: ~33MB. No model downloads. No GPU required.
|
|
216
|
+
|
|
217
|
+
## Known Limitations
|
|
218
|
+
|
|
219
|
+
All rule-based PDF table extractors (including Camelot and Tabula) share these limits:
|
|
220
|
+
|
|
221
|
+
- **Financial statements with visual formatting** -- amounts positioned by whitespace rather than cell borders can fragment across columns
|
|
222
|
+
- **Scanned PDFs / images** -- no OCR in base install (use `tableshot[ml]` or `tableshot[ocr]`)
|
|
223
|
+
- **Scientific papers with equations** -- inline math breaks table boundary detection
|
|
224
|
+
- **Complex borderless tables** -- ambiguous column alignment can cause misdetection
|
|
225
|
+
|
|
226
|
+
We're honest about these. For edge cases, `tableshot[ml]` adds Table Transformer support.
|
|
227
|
+
|
|
228
|
+
## Contributing
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
git clone https://github.com/Bespoke34/tableshot.git
|
|
232
|
+
cd tableshot
|
|
233
|
+
pip install -e ".[dev]"
|
|
234
|
+
pip install fpdf2 # for generating test fixtures (already included in the [dev] extra)
|
|
235
|
+
python tests/generate_fixtures.py # create test PDFs
|
|
236
|
+
pytest -m "not slow" # run 160 tests (skip ML tests)
|
|
237
|
+
pytest # run all 167 tests (needs [ml] extra)
|
|
238
|
+
ruff check src/ tests/ # lint
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
- 95% test coverage target (CI fails below 80%), all tests must pass
|
|
242
|
+
- Ruff clean, no lint warnings
|
|
243
|
+
- MIT license -- all dependencies must be MIT/Apache-2.0/BSD compatible
|
|
244
|
+
|
|
245
|
+
## License
|
|
246
|
+
|
|
247
|
+
MIT
|