tableshot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. tableshot-0.1.0/.github/workflows/ci.yml +60 -0
  2. tableshot-0.1.0/.gitignore +36 -0
  3. tableshot-0.1.0/CLAUDE.md +11 -0
  4. tableshot-0.1.0/LICENSE +21 -0
  5. tableshot-0.1.0/PKG-INFO +293 -0
  6. tableshot-0.1.0/README.md +247 -0
  7. tableshot-0.1.0/SPEC.md +687 -0
  8. tableshot-0.1.0/benchmarks/competitor-data.md +83 -0
  9. tableshot-0.1.0/benchmarks/outputs/empty_page.md +6 -0
  10. tableshot-0.1.0/benchmarks/outputs/multi_page.md +14 -0
  11. tableshot-0.1.0/benchmarks/outputs/multi_table.md +14 -0
  12. tableshot-0.1.0/benchmarks/outputs/simple_bordered.md +8 -0
  13. tableshot-0.1.0/benchmarks/outputs/single_row.md +5 -0
  14. tableshot-0.1.0/benchmarks/outputs/special_chars.md +8 -0
  15. tableshot-0.1.0/benchmarks/outputs/wide_table.md +7 -0
  16. tableshot-0.1.0/benchmarks/results.json +192 -0
  17. tableshot-0.1.0/benchmarks/results.md +146 -0
  18. tableshot-0.1.0/benchmarks/run_benchmarks.py +351 -0
  19. tableshot-0.1.0/pyproject.toml +80 -0
  20. tableshot-0.1.0/src/tableshot/__init__.py +3 -0
  21. tableshot-0.1.0/src/tableshot/__main__.py +5 -0
  22. tableshot-0.1.0/src/tableshot/backends/__init__.py +0 -0
  23. tableshot-0.1.0/src/tableshot/backends/ml_backend.py +475 -0
  24. tableshot-0.1.0/src/tableshot/backends/pdfplumber_backend.py +113 -0
  25. tableshot-0.1.0/src/tableshot/formatter.py +142 -0
  26. tableshot-0.1.0/src/tableshot/input_handler.py +136 -0
  27. tableshot-0.1.0/src/tableshot/pipeline.py +243 -0
  28. tableshot-0.1.0/src/tableshot/server.py +105 -0
  29. tableshot-0.1.0/src/tableshot/utils.py +98 -0
  30. tableshot-0.1.0/tests/__init__.py +0 -0
  31. tableshot-0.1.0/tests/conftest.py +23 -0
  32. tableshot-0.1.0/tests/fixtures/empty_page.pdf +113 -0
  33. tableshot-0.1.0/tests/fixtures/multi_page.pdf +0 -0
  34. tableshot-0.1.0/tests/fixtures/multi_table.pdf +87 -0
  35. tableshot-0.1.0/tests/fixtures/simple_bordered.pdf +0 -0
  36. tableshot-0.1.0/tests/fixtures/single_row.pdf +84 -0
  37. tableshot-0.1.0/tests/fixtures/special_chars.pdf +0 -0
  38. tableshot-0.1.0/tests/fixtures/wide_table.pdf +0 -0
  39. tableshot-0.1.0/tests/generate_fixtures.py +231 -0
  40. tableshot-0.1.0/tests/test_formats.py +80 -0
  41. tableshot-0.1.0/tests/test_input_handler.py +100 -0
  42. tableshot-0.1.0/tests/test_ml_backend.py +275 -0
  43. tableshot-0.1.0/tests/test_pipeline.py +395 -0
  44. tableshot-0.1.0/tests/test_server.py +79 -0
@@ -0,0 +1,60 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, master]
6
+ tags: ["v*"]
7
+ pull_request:
8
+ branches: [main, master]
9
+
10
+ jobs:
11
+ test:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12"]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Set up Python ${{ matrix.python-version }}
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: ${{ matrix.python-version }}
24
+
25
+ - name: Install dependencies
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ pip install -e ".[dev]"
29
+ pip install fpdf2
30
+
31
+ - name: Generate test fixtures
32
+ run: python tests/generate_fixtures.py
33
+
34
+ - name: Lint with ruff
35
+ run: ruff check src/ tests/
36
+
37
+ - name: Run tests with coverage
38
+ run: pytest --cov=tableshot --cov-report=term-missing --cov-fail-under=80
39
+
40
+ publish:
41
+ needs: test
42
+ runs-on: ubuntu-latest
43
+ if: startsWith(github.ref, 'refs/tags/v')
44
+ permissions:
45
+ id-token: write
46
+ steps:
47
+ - uses: actions/checkout@v4
48
+
49
+ - name: Set up Python
50
+ uses: actions/setup-python@v5
51
+ with:
52
+ python-version: "3.12"
53
+
54
+ - name: Build
55
+ run: |
56
+ pip install build
57
+ python -m build
58
+
59
+ - name: Publish to PyPI
60
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,36 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+ *.whl
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # IDE
17
+ .idea/
18
+ .vscode/
19
+ *.swp
20
+ *.swo
21
+
22
+ # Testing
23
+ .pytest_cache/
24
+ .coverage
25
+ htmlcov/
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # Large test files (manual/real-world PDFs)
32
+ tests/by hand/
33
+
34
+ # Temp files
35
+ *.tmp
36
+ *.bak
@@ -0,0 +1,11 @@
1
+ # TableShot
2
+
3
+ Read SPEC.md before doing anything. It contains the full build specification,
4
+ architecture decisions, tech stack, and sprint plan.
5
+
6
+ Key constraints:
7
+ - Base install must be <50MB (pdfplumber + pypdfium2 + mcp SDK only)
8
+ - MIT license — no AGPL dependencies (no PyMuPDF)
9
+ - Python 3.10+, hatchling build system
10
+ - 2 MCP tools for v1: extract_tables, list_tables
11
+ - All code in src/tableshot/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Andrew
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,293 @@
1
+ Metadata-Version: 2.4
2
+ Name: tableshot
3
+ Version: 0.1.0
4
+ Summary: Extract tables from PDFs into clean, structured data -- instantly. An MCP server for AI assistants.
5
+ Project-URL: Homepage, https://github.com/Bespoke34/tableshot
6
+ Project-URL: Repository, https://github.com/Bespoke34/tableshot
7
+ Project-URL: Issues, https://github.com/Bespoke34/tableshot/issues
8
+ Author: Andrew Makris
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: document-ai,mcp,model-context-protocol,pdf,structured-data,table-extraction
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Text Processing :: General
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: mcp>=1.0
23
+ Requires-Dist: pdfplumber>=0.10
24
+ Requires-Dist: pillow>=10.0
25
+ Requires-Dist: pypdfium2>=4.0
26
+ Provides-Extra: all
27
+ Requires-Dist: onnxtr[cpu]>=0.5; extra == 'all'
28
+ Requires-Dist: timm>=0.9; extra == 'all'
29
+ Requires-Dist: torch>=2.0; extra == 'all'
30
+ Requires-Dist: torchvision>=0.15; extra == 'all'
31
+ Requires-Dist: transformers>=4.30; extra == 'all'
32
+ Provides-Extra: dev
33
+ Requires-Dist: fpdf2>=2.7; extra == 'dev'
34
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
35
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
36
+ Requires-Dist: pytest>=8.0; extra == 'dev'
37
+ Requires-Dist: ruff>=0.4; extra == 'dev'
38
+ Provides-Extra: ml
39
+ Requires-Dist: timm>=0.9; extra == 'ml'
40
+ Requires-Dist: torch>=2.0; extra == 'ml'
41
+ Requires-Dist: torchvision>=0.15; extra == 'ml'
42
+ Requires-Dist: transformers>=4.30; extra == 'ml'
43
+ Provides-Extra: ocr
44
+ Requires-Dist: onnxtr[cpu]>=0.5; extra == 'ocr'
45
+ Description-Content-Type: text/markdown
46
+
47
+ # TableShot
48
+
49
+ **The only MCP server for PDF table extraction.** Give any AI assistant the ability to read tables from PDFs -- no other tool does this.
50
+
51
+ [![PyPI](https://img.shields.io/pypi/v/tableshot)](https://pypi.org/project/tableshot/)
52
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
53
+ [![Tests](https://img.shields.io/github/actions/workflow/status/Bespoke34/tableshot/ci.yml?label=tests)](https://github.com/Bespoke34/tableshot/actions)
54
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://pypi.org/project/tableshot/)
55
+
56
+ Camelot, Tabula, and Table Transformer are Python libraries -- they require a developer to write code. TableShot is an MCP server: Claude Desktop, Cursor, and Windsurf can use it directly with zero code.
57
+
58
+ ~33MB install. No model downloads. No API keys. Typical results in <100ms (large multi-page PDFs take longer).
59
+
60
+ <!-- TODO: Replace with actual demo GIF -->
61
+ <!-- ![Demo](assets/demo.gif) -->
62
+
63
+ ## The Problem
64
+
65
+ Ask any AI assistant to read a table from a PDF. It can't -- you get word soup:
66
+
67
+ ```
68
+ Sales Report Q1 2024 Product Price Quantity Total Widget A $10.00 100
69
+ $1,000.00 Widget B $25.50 50 $1,275.00 Widget C $5.99 200 $1,198.00
70
+ ```
71
+
72
+ TableShot gives you this:
73
+
74
+ | Product | Price | Quantity | Total |
75
+ |----------|---------|----------|-----------|
76
+ | Widget A | $10.00 | 100 | $1,000.00 |
77
+ | Widget B | $25.50 | 50 | $1,275.00 |
78
+ | Widget C | $5.99 | 200 | $1,198.00 |
79
+ | Widget D | $149.00 | 10 | $1,490.00 |
80
+
81
+ ## Quick Start
82
+
83
+ ### Claude Desktop / Cursor / Windsurf
84
+
85
+ Add to your MCP config:
86
+
87
+ ```json
88
+ {
89
+ "mcpServers": {
90
+ "tableshot": {
91
+ "command": "uvx",
92
+ "args": ["tableshot"]
93
+ }
94
+ }
95
+ }
96
+ ```
97
+
98
+ Then just ask: *"Extract the tables from /path/to/report.pdf"*
99
+
100
+ ### pip
101
+
102
+ ```bash
103
+ pip install tableshot
104
+ ```
105
+
106
+ Run as a standalone MCP server:
107
+
108
+ ```bash
109
+ tableshot # stdio transport (for MCP clients)
110
+ python -m tableshot # same thing
111
+ ```
112
+
113
+ ## Tools
114
+
115
+ | Tool | What it does |
116
+ |------|-------------|
117
+ | `extract_tables` | Extract all tables as Markdown, CSV, JSON, or HTML |
118
+ | `list_tables` | Quick scan -- preview tables before extracting |
119
+
120
+ ### `extract_tables`
121
+
122
+ ```
123
+ source: str # File path or URL to a PDF (or image with [ml] and [ocr] extras)
124
+ pages: str = "all" # "all", "1", "1-3", "1,3,5"
125
+ format: str = "markdown" # "markdown", "csv", "json", "html"
126
+ ```
127
+
128
+ ### `list_tables`
129
+
130
+ ```
131
+ source: str # File path or URL to a PDF
132
+ pages: str = "all" # "all", "1", "1-3", "1,3,5"
133
+ ```
134
+
135
+ Returns table count, dimensions, headers, and a preview row for each table found.
136
+
137
+ ## Examples
138
+
139
+ ### Financial report (bordered table)
140
+
141
+ **Input:** BlackRock-style quarterly earnings PDF
142
+
143
+ **Output (markdown):**
144
+ ```
145
+ | | Q3 2023 | Q3 2022 | 9M 2023 | 9M 2022 |
146
+ | ------------------------------------ | ---------- | ---------- | ---------- | ---------- |
147
+ | Total revenue | $4,522 | $4,311 | $13,228 | $13,536 |
148
+ | Total expense | 2,885 | 2,785 | 8,538 | 8,578 |
149
+ | Operating income | $1,637 | $1,526 | $4,690 | $4,958 |
150
+ | Operating margin | 36.2% | 35.4% | 35.5% | 36.6% |
151
+ ```
152
+
153
+ Extracted in **25ms**.
154
+
155
+ ### Multi-table document
156
+
157
+ **Input:** PDF with employee directory + budget summary on the same page
158
+
159
+ **Output:** Both tables extracted separately with correct headers:
160
+ ```
161
+ Table 1: 3 rows x 3 cols (Name, Department, Email)
162
+ Table 2: 4 rows x 2 cols (Category, Amount)
163
+ ```
164
+
165
+ ### Wide table (8 columns, landscape)
166
+
167
+ ```
168
+ | ID | Name | Q1 | Q2 | Q3 | Q4 | Total | Status |
169
+ | --- | ----- | --- | --- | --- | --- | ----- | ------ |
170
+ | 1 | Alpha | 100 | 150 | 200 | 250 | 700 | Active |
171
+ | 2 | Beta | 90 | 110 | 130 | 170 | 500 | Active |
172
+ | 3 | Gamma | 0 | 0 | 50 | 80 | 130 | New |
173
+ ```
174
+
175
+ All 4 output formats (Markdown, CSV, JSON, HTML) available for every extraction.
176
+
177
+ ## Benchmarks
178
+
179
+ Tested on 10 PDFs covering bordered tables, multi-table pages, multi-page documents,
180
+ special characters, wide tables, and real financial statements.
181
+
182
+ | Metric | Result |
183
+ |--------|--------|
184
+ | **Bordered table accuracy** | 8/8 exact match |
185
+ | **Speed (bordered tables)** | 4-25ms per extraction |
186
+ | **Speed (3-page financial PDF)** | 182ms |
187
+ | **Output format validity** | 36/36 pass (9 PDFs x 4 formats) |
188
+
189
+ ### Test Data
190
+
191
+ Generated fixtures — click **Source** to see the input PDF, **Output** to see what TableShot extracts:
192
+
193
+ | Fixture | Description | Source | Output | Speed |
194
+ |---------|-------------|--------|--------|-------|
195
+ | simple_bordered | 4-column sales report (Product, Price, Quantity, Total) | [PDF](tests/fixtures/simple_bordered.pdf) | [Extracted](benchmarks/outputs/simple_bordered.md) | 10ms |
196
+ | multi_table | Two tables on one page: employee directory + budget summary | [PDF](tests/fixtures/multi_table.pdf) | [Extracted](benchmarks/outputs/multi_table.md) | 10ms |
197
+ | single_row | Minimal table — header + one data row | [PDF](tests/fixtures/single_row.pdf) | [Extracted](benchmarks/outputs/single_row.md) | 4ms |
198
+ | multi_page | One table per page across 2 pages | [PDF](tests/fixtures/multi_page.pdf) | [Extracted](benchmarks/outputs/multi_page.md) | 9ms |
199
+ | empty_page | Page 1 text only; page 2 has a table | [PDF](tests/fixtures/empty_page.pdf) | [Extracted](benchmarks/outputs/empty_page.md) | 6ms |
200
+ | special_chars | Cells with `$`, `:`, `"`, `&`, `<>` | [PDF](tests/fixtures/special_chars.pdf) | [Extracted](benchmarks/outputs/special_chars.md) | 6ms |
201
+ | wide_table | 8-column landscape table (Q1–Q4, Total, Status) | [PDF](tests/fixtures/wide_table.pdf) | [Extracted](benchmarks/outputs/wide_table.md) | 11ms |
202
+
203
+ Additional and real-world PDFs (not included in repo due to size/licensing):
204
+
205
+ | PDF | Description | Tables | Speed |
206
+ |-----|-------------|--------|-------|
207
+ | BlackRock mock | Generated mock of a BlackRock quarterly earnings statement (5 columns) | 1 table, 11 rows | 25ms |
208
+ | Sample Financial Statements | 3-page financial statement with complex visual formatting (155KB) | 3 tables, 75 rows | 182ms |
209
+ | NHM table | Large 56-page document with 55 tables (25MB) | 55 tables, 2321 rows | 5.8s |
210
+
211
+ Full machine-readable results in [benchmarks/results.json](benchmarks/results.json). Detailed before/after comparisons in [benchmarks/results.md](benchmarks/results.md).
212
+
213
+ ### vs Other Tools
214
+
215
+ | | TableShot | Camelot | Tabula-py | Table Transformer |
216
+ |---|---|---|---|---|
217
+ | **Install** | ~33MB, nothing else | Needs Ghostscript | Needs Java (100-300MB) | Needs PyTorch (700MB-5GB) |
218
+ | **Speed** | ~10ms/table | >20s worst case | Variable (JVM startup) | 2-5s/page |
219
+ | **Bordered tables** | Excellent | Excellent | Good | Excellent |
220
+ | **Borderless** | Good (text fallback) | Poor | Better detection | Best |
221
+ | **MCP support** | Native | None | None | None |
222
+ | **Maintained** | Active | ~5 years stale | Active | Active |
223
+
224
+ *Competitor data from [Adhikari & Agarwal 2024](https://arxiv.org/abs/2410.09871), OpenNews 2024 review, and published GitHub metrics. Full results in [benchmarks/results.md](benchmarks/results.md).*
225
+
226
+ ## Need Scanned PDFs or Images?
227
+
228
+ The base install handles native PDFs with text layers (90%+ of real-world use cases).
229
+ For scanned documents and images:
230
+
231
+ ```bash
232
+ pip install tableshot[ml] # Table Transformer for image-based tables
233
+ pip install tableshot[ocr] # OCR for scanned documents (ONNX, no PyTorch)
234
+ pip install tableshot[all] # Everything
235
+ ```
236
+
237
+ With `[ml]` installed, TableShot automatically detects whether a PDF has a text layer:
238
+ - **Text layer present** -- uses pdfplumber (fast, ~10ms)
239
+ - **Scanned / no text layer** -- uses Table Transformer for detection, pdfplumber for text extraction
240
+ - **Image files** (PNG, JPEG) -- uses Table Transformer + OCR (requires `[ocr]`)
241
+
242
+ You can also force the ML backend: `extract_tables("/path/to/scan.pdf", backend="ml")`
243
+
244
+ ## How It Works
245
+
246
+ ```
247
+ PDF/Image ──> Smart Router ──> Table Detection ──> Cell Extraction ──> Formatted Output
248
+ | |
249
+ | PDF with text layer: | Markdown
250
+ | pdfplumber (lines → text fallback) | CSV
251
+ | | JSON
252
+ | Scanned PDF / Image (with [ml]): | HTML
253
+ | Table Transformer → pdfplumber text / OCR |
254
+ ```
255
+
256
+ - **pdfplumber** handles PDF parsing and table detection (MIT)
257
+ - **pypdfium2** renders PDF pages to images for ML backend (Apache-2.0)
258
+ - **Table Transformer** (optional `[ml]`) detects tables in images (MIT)
259
+ - **MCP SDK** exposes tools to AI assistants via stdio transport (MIT)
260
+
261
+ Total base install: ~33MB. No model downloads. No GPU required.
262
+
263
+ ## Known Limitations
264
+
265
+ All rule-based PDF table extractors (including Camelot and Tabula) share these limits:
266
+
267
+ - **Financial statements with visual formatting** -- amounts positioned by whitespace rather than cell borders can fragment across columns
268
+ - **Scanned PDFs / images** -- no OCR in base install (use `tableshot[ml]` or `tableshot[ocr]`)
269
+ - **Scientific papers with equations** -- inline math breaks table boundary detection
270
+ - **Complex borderless tables** -- ambiguous column alignment can cause misdetection
271
+
272
+ We're honest about these. For edge cases, `tableshot[ml]` adds Table Transformer support.
273
+
274
+ ## Contributing
275
+
276
+ ```bash
277
+ git clone https://github.com/Bespoke34/tableshot.git
278
+ cd tableshot
279
+ pip install -e ".[dev]"
280
+ pip install fpdf2 # for generating test fixtures (already included in the [dev] extra)
281
+ python tests/generate_fixtures.py # create test PDFs
282
+ pytest -m "not slow" # run 160 tests (skip ML tests)
283
+ pytest # run all 167 tests (needs [ml] extra)
284
+ ruff check src/ tests/ # lint
285
+ ```
286
+
287
+ - 95% test coverage (CI enforces a minimum of 80% via --cov-fail-under), all tests must pass
288
+ - Ruff clean, no lint warnings
289
+ - MIT license -- all dependencies must be MIT/Apache-2.0/BSD compatible
290
+
291
+ ## License
292
+
293
+ MIT
@@ -0,0 +1,247 @@
1
+ # TableShot
2
+
3
+ **The only MCP server for PDF table extraction.** Give any AI assistant the ability to read tables from PDFs -- no other tool does this.
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/tableshot)](https://pypi.org/project/tableshot/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
7
+ [![Tests](https://img.shields.io/github/actions/workflow/status/Bespoke34/tableshot/ci.yml?label=tests)](https://github.com/Bespoke34/tableshot/actions)
8
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://pypi.org/project/tableshot/)
9
+
10
+ Camelot, Tabula, and Table Transformer are Python libraries -- they require a developer to write code. TableShot is an MCP server: Claude Desktop, Cursor, and Windsurf can use it directly with zero code.
11
+
12
+ ~33MB install. No model downloads. No API keys. Typical results in <100ms (large multi-page PDFs take longer).
13
+
14
+ <!-- TODO: Replace with actual demo GIF -->
15
+ <!-- ![Demo](assets/demo.gif) -->
16
+
17
+ ## The Problem
18
+
19
+ Ask any AI assistant to read a table from a PDF. It can't -- you get word soup:
20
+
21
+ ```
22
+ Sales Report Q1 2024 Product Price Quantity Total Widget A $10.00 100
23
+ $1,000.00 Widget B $25.50 50 $1,275.00 Widget C $5.99 200 $1,198.00
24
+ ```
25
+
26
+ TableShot gives you this:
27
+
28
+ | Product | Price | Quantity | Total |
29
+ |----------|---------|----------|-----------|
30
+ | Widget A | $10.00 | 100 | $1,000.00 |
31
+ | Widget B | $25.50 | 50 | $1,275.00 |
32
+ | Widget C | $5.99 | 200 | $1,198.00 |
33
+ | Widget D | $149.00 | 10 | $1,490.00 |
34
+
35
+ ## Quick Start
36
+
37
+ ### Claude Desktop / Cursor / Windsurf
38
+
39
+ Add to your MCP config:
40
+
41
+ ```json
42
+ {
43
+ "mcpServers": {
44
+ "tableshot": {
45
+ "command": "uvx",
46
+ "args": ["tableshot"]
47
+ }
48
+ }
49
+ }
50
+ ```
51
+
52
+ Then just ask: *"Extract the tables from /path/to/report.pdf"*
53
+
54
+ ### pip
55
+
56
+ ```bash
57
+ pip install tableshot
58
+ ```
59
+
60
+ Run as a standalone MCP server:
61
+
62
+ ```bash
63
+ tableshot # stdio transport (for MCP clients)
64
+ python -m tableshot # same thing
65
+ ```
66
+
67
+ ## Tools
68
+
69
+ | Tool | What it does |
70
+ |------|-------------|
71
+ | `extract_tables` | Extract all tables as Markdown, CSV, JSON, or HTML |
72
+ | `list_tables` | Quick scan -- preview tables before extracting |
73
+
74
+ ### `extract_tables`
75
+
76
+ ```
77
+ source: str # File path or URL to a PDF (or image with [ml] and [ocr] extras)
78
+ pages: str = "all" # "all", "1", "1-3", "1,3,5"
79
+ format: str = "markdown" # "markdown", "csv", "json", "html"
80
+ ```
81
+
82
+ ### `list_tables`
83
+
84
+ ```
85
+ source: str # File path or URL to a PDF
86
+ pages: str = "all" # "all", "1", "1-3", "1,3,5"
87
+ ```
88
+
89
+ Returns table count, dimensions, headers, and a preview row for each table found.
90
+
91
+ ## Examples
92
+
93
+ ### Financial report (bordered table)
94
+
95
+ **Input:** BlackRock-style quarterly earnings PDF
96
+
97
+ **Output (markdown):**
98
+ ```
99
+ | | Q3 2023 | Q3 2022 | 9M 2023 | 9M 2022 |
100
+ | ------------------------------------ | ---------- | ---------- | ---------- | ---------- |
101
+ | Total revenue | $4,522 | $4,311 | $13,228 | $13,536 |
102
+ | Total expense | 2,885 | 2,785 | 8,538 | 8,578 |
103
+ | Operating income | $1,637 | $1,526 | $4,690 | $4,958 |
104
+ | Operating margin | 36.2% | 35.4% | 35.5% | 36.6% |
105
+ ```
106
+
107
+ Extracted in **25ms**.
108
+
109
+ ### Multi-table document
110
+
111
+ **Input:** PDF with employee directory + budget summary on the same page
112
+
113
+ **Output:** Both tables extracted separately with correct headers:
114
+ ```
115
+ Table 1: 3 rows x 3 cols (Name, Department, Email)
116
+ Table 2: 4 rows x 2 cols (Category, Amount)
117
+ ```
118
+
119
+ ### Wide table (8 columns, landscape)
120
+
121
+ ```
122
+ | ID | Name | Q1 | Q2 | Q3 | Q4 | Total | Status |
123
+ | --- | ----- | --- | --- | --- | --- | ----- | ------ |
124
+ | 1 | Alpha | 100 | 150 | 200 | 250 | 700 | Active |
125
+ | 2 | Beta | 90 | 110 | 130 | 170 | 500 | Active |
126
+ | 3 | Gamma | 0 | 0 | 50 | 80 | 130 | New |
127
+ ```
128
+
129
+ All 4 output formats (Markdown, CSV, JSON, HTML) available for every extraction.
130
+
131
+ ## Benchmarks
132
+
133
+ Tested on 10 PDFs covering bordered tables, multi-table pages, multi-page documents,
134
+ special characters, wide tables, and real financial statements.
135
+
136
+ | Metric | Result |
137
+ |--------|--------|
138
+ | **Bordered table accuracy** | 8/8 exact match |
139
+ | **Speed (bordered tables)** | 4-25ms per extraction |
140
+ | **Speed (3-page financial PDF)** | 182ms |
141
+ | **Output format validity** | 36/36 pass (9 PDFs x 4 formats) |
142
+
143
+ ### Test Data
144
+
145
+ Generated fixtures — click **Source** to see the input PDF, **Output** to see what TableShot extracts:
146
+
147
+ | Fixture | Description | Source | Output | Speed |
148
+ |---------|-------------|--------|--------|-------|
149
+ | simple_bordered | 4-column sales report (Product, Price, Quantity, Total) | [PDF](tests/fixtures/simple_bordered.pdf) | [Extracted](benchmarks/outputs/simple_bordered.md) | 10ms |
150
+ | multi_table | Two tables on one page: employee directory + budget summary | [PDF](tests/fixtures/multi_table.pdf) | [Extracted](benchmarks/outputs/multi_table.md) | 10ms |
151
+ | single_row | Minimal table — header + one data row | [PDF](tests/fixtures/single_row.pdf) | [Extracted](benchmarks/outputs/single_row.md) | 4ms |
152
+ | multi_page | One table per page across 2 pages | [PDF](tests/fixtures/multi_page.pdf) | [Extracted](benchmarks/outputs/multi_page.md) | 9ms |
153
+ | empty_page | Page 1 text only; page 2 has a table | [PDF](tests/fixtures/empty_page.pdf) | [Extracted](benchmarks/outputs/empty_page.md) | 6ms |
154
+ | special_chars | Cells with `$`, `:`, `"`, `&`, `<>` | [PDF](tests/fixtures/special_chars.pdf) | [Extracted](benchmarks/outputs/special_chars.md) | 6ms |
155
+ | wide_table | 8-column landscape table (Q1–Q4, Total, Status) | [PDF](tests/fixtures/wide_table.pdf) | [Extracted](benchmarks/outputs/wide_table.md) | 11ms |
156
+
157
+ Additional and real-world PDFs (not included in repo due to size/licensing):
158
+
159
+ | PDF | Description | Tables | Speed |
160
+ |-----|-------------|--------|-------|
161
+ | BlackRock mock | Generated mock of a BlackRock quarterly earnings statement (5 columns) | 1 table, 11 rows | 25ms |
162
+ | Sample Financial Statements | 3-page financial statement with complex visual formatting (155KB) | 3 tables, 75 rows | 182ms |
163
+ | NHM table | Large 56-page document with 55 tables (25MB) | 55 tables, 2321 rows | 5.8s |
164
+
165
+ Full machine-readable results in [benchmarks/results.json](benchmarks/results.json). Detailed before/after comparisons in [benchmarks/results.md](benchmarks/results.md).
166
+
167
+ ### vs Other Tools
168
+
169
+ | | TableShot | Camelot | Tabula-py | Table Transformer |
170
+ |---|---|---|---|---|
171
+ | **Install** | ~33MB, nothing else | Needs Ghostscript | Needs Java (100-300MB) | Needs PyTorch (700MB-5GB) |
172
+ | **Speed** | ~10ms/table | >20s worst case | Variable (JVM startup) | 2-5s/page |
173
+ | **Bordered tables** | Excellent | Excellent | Good | Excellent |
174
+ | **Borderless** | Good (text fallback) | Poor | Better detection | Best |
175
+ | **MCP support** | Native | None | None | None |
176
+ | **Maintained** | Active | ~5 years stale | Active | Active |
177
+
178
+ *Competitor data from [Adhikari & Agarwal 2024](https://arxiv.org/abs/2410.09871), OpenNews 2024 review, and published GitHub metrics. Full results in [benchmarks/results.md](benchmarks/results.md).*
179
+
180
+ ## Need Scanned PDFs or Images?
181
+
182
+ The base install handles native PDFs with text layers (90%+ of real-world use cases).
183
+ For scanned documents and images:
184
+
185
+ ```bash
186
+ pip install tableshot[ml] # Table Transformer for image-based tables
187
+ pip install tableshot[ocr] # OCR for scanned documents (ONNX, no PyTorch)
188
+ pip install tableshot[all] # Everything
189
+ ```
190
+
191
+ With `[ml]` installed, TableShot automatically detects whether a PDF has a text layer:
192
+ - **Text layer present** -- uses pdfplumber (fast, ~10ms)
193
+ - **Scanned / no text layer** -- uses Table Transformer for detection, pdfplumber for text extraction
194
+ - **Image files** (PNG, JPEG) -- uses Table Transformer + OCR (requires `[ocr]`)
195
+
196
+ You can also force the ML backend: `extract_tables("/path/to/scan.pdf", backend="ml")`
197
+
198
+ ## How It Works
199
+
200
+ ```
201
+ PDF/Image ──> Smart Router ──> Table Detection ──> Cell Extraction ──> Formatted Output
202
+ | |
203
+ | PDF with text layer: | Markdown
204
+ | pdfplumber (lines → text fallback) | CSV
205
+ | | JSON
206
+ | Scanned PDF / Image (with [ml]): | HTML
207
+ | Table Transformer → pdfplumber text / OCR |
208
+ ```
209
+
210
+ - **pdfplumber** handles PDF parsing and table detection (MIT)
211
+ - **pypdfium2** renders PDF pages to images for ML backend (Apache-2.0)
212
+ - **Table Transformer** (optional `[ml]`) detects tables in images (MIT)
213
+ - **MCP SDK** exposes tools to AI assistants via stdio transport (MIT)
214
+
215
+ Total base install: ~33MB. No model downloads. No GPU required.
216
+
217
+ ## Known Limitations
218
+
219
+ All rule-based PDF table extractors (including Camelot and Tabula) share these limits:
220
+
221
+ - **Financial statements with visual formatting** -- amounts positioned by whitespace rather than cell borders can fragment across columns
222
+ - **Scanned PDFs / images** -- no OCR in base install (use `tableshot[ml]` or `tableshot[ocr]`)
223
+ - **Scientific papers with equations** -- inline math breaks table boundary detection
224
+ - **Complex borderless tables** -- ambiguous column alignment can cause misdetection
225
+
226
+ We're honest about these. For edge cases, `tableshot[ml]` adds Table Transformer support.
227
+
228
+ ## Contributing
229
+
230
+ ```bash
231
+ git clone https://github.com/Bespoke34/tableshot.git
232
+ cd tableshot
233
+ pip install -e ".[dev]"
234
+ pip install fpdf2 # for generating test fixtures (already included in the [dev] extra)
235
+ python tests/generate_fixtures.py # create test PDFs
236
+ pytest -m "not slow" # run 160 tests (skip ML tests)
237
+ pytest # run all 167 tests (needs [ml] extra)
238
+ ruff check src/ tests/ # lint
239
+ ```
240
+
241
+ - 95% test coverage (CI enforces a minimum of 80% via --cov-fail-under), all tests must pass
242
+ - Ruff clean, no lint warnings
243
+ - MIT license -- all dependencies must be MIT/Apache-2.0/BSD compatible
244
+
245
+ ## License
246
+
247
+ MIT