pdf-porter 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_porter-0.1.0/.claude/settings.local.json +10 -0
- pdf_porter-0.1.0/.gitattributes +2 -0
- pdf_porter-0.1.0/.gitignore +181 -0
- pdf_porter-0.1.0/CLAUDE.md +85 -0
- pdf_porter-0.1.0/CLAUDE_CODE_PROMPTS.md +212 -0
- pdf_porter-0.1.0/IMPLEMENTATION_PLAN.md +181 -0
- pdf_porter-0.1.0/LICENSE +21 -0
- pdf_porter-0.1.0/NAMING.md +85 -0
- pdf_porter-0.1.0/PKG-INFO +67 -0
- pdf_porter-0.1.0/README.md +54 -0
- pdf_porter-0.1.0/generate_fixtures.py +112 -0
- pdf_porter-0.1.0/pdf_porter/__init__.py +1 -0
- pdf_porter-0.1.0/pdf_porter/server.py +64 -0
- pdf_porter-0.1.0/pyproject.toml +22 -0
- pdf_porter-0.1.0/tests/__init__.py +0 -0
- pdf_porter-0.1.0/tests/fixtures/simple.pdf +74 -0
- pdf_porter-0.1.0/tests/fixtures/table.pdf +74 -0
- pdf_porter-0.1.0/tests/fixtures/two_column.pdf +68 -0
- pdf_porter-0.1.0/tests/test_server.py +52 -0
- pdf_porter-0.1.0/uv.lock +2893 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
|
|
110
|
+
# pdm
|
|
111
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
112
|
+
#pdm.lock
|
|
113
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
114
|
+
# in version control.
|
|
115
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
|
116
|
+
.pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
121
|
+
__pypackages__/
|
|
122
|
+
|
|
123
|
+
# Celery stuff
|
|
124
|
+
celerybeat-schedule
|
|
125
|
+
celerybeat.pid
|
|
126
|
+
|
|
127
|
+
# SageMath parsed files
|
|
128
|
+
*.sage.py
|
|
129
|
+
|
|
130
|
+
# Environments
|
|
131
|
+
.env
|
|
132
|
+
.venv
|
|
133
|
+
env/
|
|
134
|
+
venv/
|
|
135
|
+
ENV/
|
|
136
|
+
env.bak/
|
|
137
|
+
venv.bak/
|
|
138
|
+
|
|
139
|
+
# Spyder project settings
|
|
140
|
+
.spyderproject
|
|
141
|
+
.spyproject
|
|
142
|
+
|
|
143
|
+
# Rope project settings
|
|
144
|
+
.ropeproject
|
|
145
|
+
|
|
146
|
+
# mkdocs documentation
|
|
147
|
+
/site
|
|
148
|
+
|
|
149
|
+
# mypy
|
|
150
|
+
.mypy_cache/
|
|
151
|
+
.dmypy.json
|
|
152
|
+
dmypy.json
|
|
153
|
+
|
|
154
|
+
# Pyre type checker
|
|
155
|
+
.pyre/
|
|
156
|
+
|
|
157
|
+
# pytype static type analyzer
|
|
158
|
+
.pytype/
|
|
159
|
+
|
|
160
|
+
# Cython debug symbols
|
|
161
|
+
cython_debug/
|
|
162
|
+
|
|
163
|
+
# PyCharm
|
|
164
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
165
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
166
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
167
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
168
|
+
#.idea/
|
|
169
|
+
|
|
170
|
+
# Ruff stuff:
|
|
171
|
+
.ruff_cache/
|
|
172
|
+
|
|
173
|
+
# PyPI configuration file
|
|
174
|
+
.pypirc
|
|
175
|
+
|
|
176
|
+
# Cursor
|
|
177
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
178
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data;
|
|
179
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
180
|
+
.cursorignore
|
|
181
|
+
.cursorindexingignore
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project
|
|
6
|
+
`pdf-porter`: a minimal MCP server exposing one tool — `pdf_to_markdown(path, save_output)` — for layout-aware PDF→Markdown conversion via Docling. Pitch: one tool, zero config, five minutes.
|
|
7
|
+
|
|
8
|
+
**Note:** `docling-mcp` (the obvious name) is taken by IBM Research Zurich, the Docling authors themselves, at v1.3.2. This package is intentionally minimal — one tool, no config. The README should reference IBM's `docling-mcp` as the full-featured alternative: "if you want the full Docling toolset, see docling-mcp."
|
|
9
|
+
|
|
10
|
+
## Stack
|
|
11
|
+
- Python 3.11+
|
|
12
|
+
- `mcp[cli]` (FastMCP)
|
|
13
|
+
- `docling`
|
|
14
|
+
- `uv` for packaging and running
|
|
15
|
+
- `reportlab` (dev only, for generating test fixtures)
|
|
16
|
+
|
|
17
|
+
## Structure
|
|
18
|
+
```
|
|
19
|
+
pdf_porter/
|
|
20
|
+
__init__.py # __version__ = "0.1.0"
|
|
21
|
+
server.py # FastMCP app, tool definitions, module-level converter singleton
|
|
22
|
+
tests/
|
|
23
|
+
test_server.py # integration tests (no mocking)
|
|
24
|
+
fixtures/ # simple.pdf, two_column.pdf, table.pdf (generated by generate_fixtures.py)
|
|
25
|
+
generate_fixtures.py # standalone script to regenerate test PDFs via reportlab
|
|
26
|
+
pyproject.toml
|
|
27
|
+
README.md
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Commands
|
|
31
|
+
```bash
|
|
32
|
+
uv run python -m pdf_porter # start server
|
|
33
|
+
uv run pytest # run tests
|
|
34
|
+
uv run mcp dev pdf_porter/server.py # MCP inspector
|
|
35
|
+
uv run pytest tests/test_server.py::test_table_pdf # run a single test
|
|
36
|
+
python generate_fixtures.py # regenerate test fixture PDFs
|
|
37
|
+
uv build # build wheel + sdist into dist/
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Publishing to PyPI
|
|
41
|
+
|
|
42
|
+
`PYPI_TOKEN` must be set in the environment before publishing. Claude Code's Bash tool does **not** inherit variables exported in the user's interactive shell — the user must publish manually from their own terminal:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# In your terminal (not via Claude Code):
|
|
46
|
+
uv build
|
|
47
|
+
uv publish --token $PYPI_TOKEN
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
After publishing, verify the release:
|
|
51
|
+
```bash
|
|
52
|
+
curl -s https://pypi.org/pypi/pdf-porter/json | python -m json.tool | grep '"version"'
|
|
53
|
+
uvx pdf-porter & # should start cleanly; stop it with `kill %1` (Ctrl-C does not reach a background job)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Architecture
|
|
57
|
+
|
|
58
|
+
`server.py` holds a module-level `DocumentConverter` singleton (instantiated once, not per-call) and a boolean flag to log "Loading Docling models..." to stderr on the first conversion only. The `pdf_to_markdown` tool validates path existence and `.pdf` extension before calling Docling, and wraps all exceptions as `"Error: {e}"` strings — never raw tracebacks.
|
|
59
|
+
|
|
60
|
+
Entry point: `pdf_porter.server:main` → calls `mcp.run()`. Also handles `if __name__ == "__main__"`.
|
|
61
|
+
|
|
62
|
+
## Tool signature
|
|
63
|
+
```python
|
|
64
|
+
pdf_to_markdown(path: str, save_output: bool = False) -> str
|
|
65
|
+
```
|
|
66
|
+
- Returns markdown string directly, or `"Saved to: {output_path}"` when `save_output=True`
|
|
67
|
+
- Returns `"Error: ..."` on any failure (bad path, wrong extension, conversion error)
|
|
68
|
+
|
|
69
|
+
## Rules
|
|
70
|
+
- No global state beyond the converter singleton and the first-run flag — document both in server.py.
|
|
71
|
+
- All errors surface as readable strings, never raw tracebacks to the LLM.
|
|
72
|
+
- Tests are integration tests — do not mock `DocumentConverter`.
|
|
73
|
+
- Tests must cover: simple PDF, two-column PDF, table PDF (asserts `|` present), bad path, non-PDF extension, `save_output=True`.
|
|
74
|
+
- Do not add dependencies beyond `mcp`, `docling`, `pytest`, and `reportlab` (dev).
|
|
75
|
+
- README must include a working `claude_desktop_config.json` snippet.
|
|
76
|
+
|
|
77
|
+
## Known limitations (inform users)
|
|
78
|
+
- First run downloads Docling model weights (~500MB); subsequent runs are fast.
|
|
79
|
+
- OCR mode not enabled in v0.1 — scanned/image-only PDFs will return empty text.
|
|
80
|
+
- Windows path handling not validated.
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
## PATHS
|
|
84
|
+
|
|
85
|
+
NEVER HARD-CODE PATHS. /path/to/repo should NEVER APPEAR in the codebase.
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# Claude Code Prompt Sequence
|
|
2
|
+
## mcp-docling — Autonomous Build Session
|
|
3
|
+
|
|
4
|
+
Run these prompts in order in Claude Code. Each phase should complete fully before the next begins. If a phase fails its acceptance criteria, fix the failures before proceeding.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Prompt 0 — Bootstrap
|
|
9
|
+
|
|
10
|
+
```
|
|
11
|
+
Initialize a new Python project called mcp-docling.
|
|
12
|
+
|
|
13
|
+
Create this directory structure:
|
|
14
|
+
mcp_docling/__init__.py
|
|
15
|
+
mcp_docling/server.py
|
|
16
|
+
tests/__init__.py
|
|
17
|
+
tests/test_server.py
|
|
18
|
+
tests/fixtures/ (empty for now)
|
|
19
|
+
pyproject.toml
|
|
20
|
+
README.md
|
|
21
|
+
CLAUDE.md
|
|
22
|
+
LICENSE
|
|
23
|
+
|
|
24
|
+
pyproject.toml requirements:
|
|
25
|
+
- name: mcp-docling
|
|
26
|
+
- version: 0.1.0
|
|
27
|
+
- description: "MCP server for layout-aware PDF to Markdown conversion via Docling"
|
|
28
|
+
- requires-python: ">=3.11"
|
|
29
|
+
- dependencies: ["mcp[cli]", "docling"]
|
|
30
|
+
- optional-dependencies.dev: ["pytest"]
|
|
31
|
+
- [project.scripts] mcp-docling = "mcp_docling.server:main"
|
|
32
|
+
- [project.urls] homepage = "https://github.com/benmazzotta/mcp-docling"
|
|
33
|
+
|
|
34
|
+
In mcp_docling/__init__.py, just set __version__ = "0.1.0".
|
|
35
|
+
|
|
36
|
+
Do not write server.py or tests yet. Confirm the scaffold is in place.
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Prompt 1 — Server
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
Write mcp_docling/server.py.
|
|
45
|
+
|
|
46
|
+
Use FastMCP from the mcp package. Create a module-level DocumentConverter singleton
|
|
47
|
+
from docling.document_converter. Do not reinstantiate it on every call.
|
|
48
|
+
|
|
49
|
+
Expose one tool:
|
|
50
|
+
|
|
51
|
+
pdf_to_markdown(path: str, save_output: bool = False) -> str
|
|
52
|
+
|
|
53
|
+
Behavior:
|
|
54
|
+
- If path does not exist: return "Error: file not found: {path}"
|
|
55
|
+
- If path does not end in .pdf (case-insensitive): return "Error: not a PDF file: {path}"
|
|
56
|
+
- Convert using the module-level DocumentConverter singleton's convert(path), then result.document.export_to_markdown()
|
|
57
|
+
- If save_output is True: write markdown to same directory, same stem, .md extension.
|
|
58
|
+
Return "Saved to: {output_path}"
|
|
59
|
+
- If save_output is False: return the markdown string directly
|
|
60
|
+
- Wrap everything in try/except Exception as e and return f"Error: {e}" on failure
|
|
61
|
+
|
|
62
|
+
Add a main() function:
|
|
63
|
+
def main():
|
|
64
|
+
mcp.run()
|
|
65
|
+
|
|
66
|
+
Add if __name__ == "__main__": main() at the bottom.
|
|
67
|
+
|
|
68
|
+
Log "Loading Docling models..." to stderr before the first conversion (not at import time —
|
|
69
|
+
use a module-level flag).
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Prompt 2 — Fixtures and Tests
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
Generate test fixture PDFs in tests/fixtures/ using reportlab.
|
|
78
|
+
Add reportlab as a dev dependency if not present.
|
|
79
|
+
|
|
80
|
+
Generate three PDFs:
|
|
81
|
+
|
|
82
|
+
1. tests/fixtures/simple.pdf
|
|
83
|
+
- Single column
|
|
84
|
+
- Three paragraphs of Lorem Ipsum
|
|
85
|
+
- One H1 heading
|
|
86
|
+
|
|
87
|
+
2. tests/fixtures/two_column.pdf
|
|
88
|
+
- Two-column layout
|
|
89
|
+
- Different text in each column, clearly labeled "Column A" and "Column B"
|
|
90
|
+
|
|
91
|
+
3. tests/fixtures/table.pdf
|
|
92
|
+
- A table with 3 columns and 5 rows
|
|
93
|
+
- Headers: Name, Value, Notes
|
|
94
|
+
- Fill with sample data
|
|
95
|
+
|
|
96
|
+
Write a standalone script generate_fixtures.py at the project root that generates
|
|
97
|
+
all three. Run it. Confirm all three files exist and are valid PDFs.
|
|
98
|
+
|
|
99
|
+
Then write tests/test_server.py:
|
|
100
|
+
|
|
101
|
+
Import pdf_to_markdown directly from mcp_docling.server (not through MCP).
|
|
102
|
+
|
|
103
|
+
Tests:
|
|
104
|
+
- test_simple_pdf: result is str, len > 100, no "Error:" prefix
|
|
105
|
+
- test_two_column_pdf: result is str, len > 100, no "Error:" prefix
|
|
106
|
+
- test_table_pdf: result contains "|" character
|
|
107
|
+
- test_bad_path: result starts with "Error:"
|
|
108
|
+
- test_non_pdf: create a temp .txt file, result starts with "Error:"
|
|
109
|
+
- test_save_output: copy simple.pdf to tmp_path, call with save_output=True,
|
|
110
|
+
assert .md file exists and is non-empty
|
|
111
|
+
|
|
112
|
+
Run pytest. All tests must pass before proceeding.
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Prompt 3 — README and Packaging
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
Write README.md with these exact sections:
|
|
121
|
+
|
|
122
|
+
# mcp-docling
|
|
123
|
+
|
|
124
|
+
One sentence: what it does and why it's better than pypdf.
|
|
125
|
+
|
|
126
|
+
## Installation
|
|
127
|
+
|
|
128
|
+
### Use with Claude Desktop (recommended)
|
|
129
|
+
Add to claude_desktop_config.json:
|
|
130
|
+
|
|
131
|
+
{
|
|
132
|
+
"mcpServers": {
|
|
133
|
+
"pdf-tools": {
|
|
134
|
+
"command": "uvx",
|
|
135
|
+
"args": ["mcp-docling"]
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
Then restart Claude Desktop.
|
|
141
|
+
|
|
142
|
+
### Development
|
|
143
|
+
git clone https://github.com/benmazzotta/mcp-docling
|
|
144
|
+
cd mcp-docling
|
|
145
|
+
uv sync --dev
|
|
146
|
+
uv run pytest
|
|
147
|
+
|
|
148
|
+
## Tool Reference
|
|
149
|
+
|
|
150
|
+
### pdf_to_markdown
|
|
151
|
+
- path (str): absolute or relative path to a PDF file
|
|
152
|
+
- save_output (bool, default False): if True, writes .md file next to the PDF and returns the path
|
|
153
|
+
|
|
154
|
+
## Known Limitations
|
|
155
|
+
- First run downloads Docling model weights (~500MB). Subsequent runs are fast.
|
|
156
|
+
- Scanned PDFs: OCR mode is not enabled in v0.1. Text in image-only PDFs will be empty.
|
|
157
|
+
- Tested on macOS and Linux. Windows path handling not validated.
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
MIT. See LICENSE.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
Write LICENSE as MIT, author Ben Mazzotta, year 2025.
|
|
165
|
+
|
|
166
|
+
Then run:
|
|
167
|
+
uv build
|
|
168
|
+
|
|
169
|
+
Confirm dist/ contains a .whl and a .tar.gz. Report the filenames.
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Prompt 4 — Publish
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
Publish to PyPI.
|
|
178
|
+
|
|
179
|
+
First, check whether PYPI_TOKEN is set in the environment.
|
|
180
|
+
If not set: print the following and stop:
|
|
181
|
+
|
|
182
|
+
"To publish, set your PyPI API token:
|
|
183
|
+
export PYPI_TOKEN=pypi-...
|
|
184
|
+
Then re-run this prompt."
|
|
185
|
+
|
|
186
|
+
If set, run:
|
|
187
|
+
uv publish --token $PYPI_TOKEN
|
|
188
|
+
|
|
189
|
+
After publishing, wait 60 seconds, then verify:
|
|
190
|
+
curl -s https://pypi.org/pypi/mcp-docling/json | python -m json.tool | grep '"version"'
|
|
191
|
+
|
|
192
|
+
Then test the live package in a temporary directory:
|
|
193
|
+
cd /tmp
|
|
194
|
+
uvx mcp-docling &
|
|
195
|
+
sleep 5
|
|
196
|
+
kill %1
|
|
197
|
+
|
|
198
|
+
Confirm it starts without error. Report success or failure clearly.
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Final Checklist
|
|
204
|
+
|
|
205
|
+
After all four prompts complete, verify:
|
|
206
|
+
|
|
207
|
+
- [ ] `uv run pytest` — all tests green
|
|
208
|
+
- [ ] `uv build` — dist/ has wheel and sdist
|
|
209
|
+
- [ ] `uvx mcp-docling` — server starts
|
|
210
|
+
- [ ] Package visible at pypi.org/project/mcp-docling
|
|
211
|
+
- [ ] README on PyPI renders correctly
|
|
212
|
+
- [ ] claude_desktop_config.json snippet in README is accurate
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# mcp-docling: Concept Note & Implementation Plan
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
Claude's native PDF ingestion is shallow. It reads text streams, not document structure. Tables become noise. Two-column papers read in the wrong order. Scanned PDFs return nothing. The result is that users paste PDFs into Claude and get confused or degraded responses — not because Claude is weak, but because the input is scrambled before Claude ever sees it.
|
|
6
|
+
|
|
7
|
+
## Solution
|
|
8
|
+
|
|
9
|
+
A local MCP server that wraps Docling — a layout-aware document converter — and exposes a single tool to Claude Desktop: `pdf_to_markdown(path)`. Claude calls it, gets clean structured markdown back, and reasons over that instead.
|
|
10
|
+
|
|
11
|
+
No upload. No cloud dependency. No background service. The server process lives only during a Claude Desktop session.
|
|
12
|
+
|
|
13
|
+
## Scope
|
|
14
|
+
|
|
15
|
+
**In:** PDF to markdown. Local file paths. Optional output-to-file.
|
|
16
|
+
**Out:** Other document formats, cloud storage integration, batch processing, GUI. These are v2 concerns.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Phase 1: Build
|
|
21
|
+
|
|
22
|
+
### Goal
|
|
23
|
+
A working MCP server that converts a local PDF to markdown and returns it to Claude.
|
|
24
|
+
|
|
25
|
+
### Prompt for Claude Code
|
|
26
|
+
```
|
|
27
|
+
Create a Python MCP server using FastMCP (mcp[cli] package) in mcp_docling/server.py.
|
|
28
|
+
|
|
29
|
+
Expose one tool: pdf_to_markdown(path: str, save_output: bool = False) -> str
|
|
30
|
+
|
|
31
|
+
Behavior:
|
|
32
|
+
- Validate that path exists and is a .pdf file. Return a clear error string if not.
|
|
33
|
+
- Use DocumentConverter from docling to convert the PDF.
|
|
34
|
+
- Call result.document.export_to_markdown().
|
|
35
|
+
- If save_output is True, write the markdown to the same directory as the PDF,
|
|
36
|
+
with the same filename and .md extension. Return the output path as a string.
|
|
37
|
+
- If save_output is False, return the markdown content directly.
|
|
38
|
+
- Catch all exceptions and return them as readable error strings prefixed with "Error: ".
|
|
39
|
+
- Cache the DocumentConverter as a module-level singleton to avoid reloading weights on every call.
|
|
40
|
+
|
|
41
|
+
Set up pyproject.toml with:
|
|
42
|
+
- name: mcp-docling
|
|
43
|
+
- version: 0.1.0
|
|
44
|
+
- dependencies: mcp[cli], docling
|
|
45
|
+
- dev dependencies: pytest
|
|
46
|
+
- entry point: mcp_docling.server:mcp (as mcp script)
|
|
47
|
+
- Python >= 3.11
|
|
48
|
+
|
|
49
|
+
Do not add any other dependencies.
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Acceptance Criteria
|
|
53
|
+
- [ ] `uv run python -m mcp_docling` starts without error
|
|
54
|
+
- [ ] `uv run mcp dev mcp_docling/server.py` opens MCP inspector
|
|
55
|
+
- [ ] Tool appears in inspector with correct signature
|
|
56
|
+
- [ ] Calling with a valid PDF path returns non-empty markdown string
|
|
57
|
+
- [ ] Calling with a bad path returns a readable error string, not a traceback
|
|
58
|
+
- [ ] `save_output=True` writes a `.md` file next to the PDF
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Phase 2: Test
|
|
63
|
+
|
|
64
|
+
### Goal
|
|
65
|
+
Confidence across the real-world PDF types that matter. This is where pypdf fails and docling earns its keep.
|
|
66
|
+
|
|
67
|
+
### Prompt for Claude Code
|
|
68
|
+
```
|
|
69
|
+
Create tests/test_server.py using pytest.
|
|
70
|
+
|
|
71
|
+
Download or generate four fixture PDFs into tests/fixtures/:
|
|
72
|
+
1. simple.pdf — single column, clean text (generate with reportlab if needed)
|
|
73
|
+
2. two_column.pdf — two-column layout (generate with reportlab)
|
|
74
|
+
3. table.pdf — PDF containing a markdown-renderable table (generate with reportlab)
|
|
75
|
+
4. scanned.pdf — use any freely licensed scanned PDF sample from a public URL,
|
|
76
|
+
or skip if download is unreliable and note it
|
|
77
|
+
|
|
78
|
+
For each fixture, write a test that:
|
|
79
|
+
- Calls pdf_to_markdown(path) directly (import the function, don't go through MCP)
|
|
80
|
+
- Asserts the result is a non-empty string
|
|
81
|
+
- Asserts no "Error:" prefix in result
|
|
82
|
+
|
|
83
|
+
Additional tests:
|
|
84
|
+
- Bad path returns string starting with "Error:"
|
|
85
|
+
- Non-PDF extension returns string starting with "Error:"
|
|
86
|
+
- save_output=True creates the expected .md file (use tmp_path fixture, copy a fixture there first)
|
|
87
|
+
|
|
88
|
+
Do not mock DocumentConverter. These are integration tests. They should call docling for real.
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Acceptance Criteria
|
|
92
|
+
- [ ] `uv run pytest` passes all tests
|
|
93
|
+
- [ ] Two-column test returns markdown where paragraphs are not interleaved
|
|
94
|
+
- [ ] Table test returns output containing a markdown table (`|` characters)
|
|
95
|
+
- [ ] Error path tests pass
|
|
96
|
+
- [ ] save_output test confirms file exists and is non-empty
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Phase 3: Package
|
|
101
|
+
|
|
102
|
+
### Goal
|
|
103
|
+
Anyone can run this with a single `uvx` command. No cloning required.
|
|
104
|
+
|
|
105
|
+
### Prompt for Claude Code
|
|
106
|
+
```
|
|
107
|
+
Prepare mcp-docling for PyPI publication.
|
|
108
|
+
|
|
109
|
+
1. Verify pyproject.toml is complete:
|
|
110
|
+
- name, version, description, license (MIT), author, homepage, readme
|
|
111
|
+
- Correct entry point so `uvx mcp-docling` starts the server
|
|
112
|
+
|
|
113
|
+
2. Write README.md with these sections:
|
|
114
|
+
- One-sentence description
|
|
115
|
+
- Installation (two methods: uvx one-liner, and uv add for development)
|
|
116
|
+
- claude_desktop_config.json snippet showing exact configuration
|
|
117
|
+
- Tool reference: pdf_to_markdown parameters and return values
|
|
118
|
+
- Known limitations: large files, scanned PDFs need OCR mode, first-run weight download
|
|
119
|
+
- License
|
|
120
|
+
|
|
121
|
+
3. Add LICENSE (MIT, author: Ben Mazzotta, year 2025)
|
|
122
|
+
|
|
123
|
+
4. Build the package:
|
|
124
|
+
uv build
|
|
125
|
+
|
|
126
|
+
5. Confirm dist/ contains a .whl and .tar.gz
|
|
127
|
+
|
|
128
|
+
6. Dry-run install from the wheel locally to confirm the entry point works:
|
|
129
|
+
uv run --with dist/mcp_docling-*.whl mcp-docling --help
|
|
130
|
+
(adjust if FastMCP handles --help differently)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Acceptance Criteria
|
|
134
|
+
- [ ] `uv build` completes without error
|
|
135
|
+
- [ ] `dist/` contains both wheel and sdist
|
|
136
|
+
- [ ] README contains working config snippet (verified by reading it)
|
|
137
|
+
- [ ] LICENSE file present
|
|
138
|
+
- [ ] Entry point is callable from the wheel
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Phase 4: Publish
|
|
143
|
+
|
|
144
|
+
### Goal
|
|
145
|
+
Package is live on PyPI. Users can configure it in two lines.
|
|
146
|
+
|
|
147
|
+
### Prompt for Claude Code
|
|
148
|
+
```
|
|
149
|
+
Publish mcp-docling to PyPI using uv.
|
|
150
|
+
|
|
151
|
+
Steps:
|
|
152
|
+
1. Ensure PYPI_TOKEN is available as environment variable (do not hardcode it)
|
|
153
|
+
2. Run: uv publish --token $PYPI_TOKEN
|
|
154
|
+
3. Wait ~60 seconds, then verify publication:
|
|
155
|
+
curl https://pypi.org/pypi/mcp-docling/json | python -m json.tool | grep version
|
|
156
|
+
4. Test the live package:
|
|
157
|
+
uvx mcp-docling
|
|
158
|
+
(confirm it starts and prints MCP server startup message)
|
|
159
|
+
5. Update README if the PyPI badge URL or install command needs adjusting.
|
|
160
|
+
|
|
161
|
+
If PYPI_TOKEN is not set, stop and print instructions for the user to set it.
|
|
162
|
+
Do not attempt to publish without a token.
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Acceptance Criteria
|
|
166
|
+
- [ ] Package appears at pypi.org/project/mcp-docling
|
|
167
|
+
- [ ] `uvx mcp-docling` works in a clean environment (test in a new uv venv)
|
|
168
|
+
- [ ] Version on PyPI matches pyproject.toml
|
|
169
|
+
- [ ] README on PyPI renders correctly (check the PyPI project page)
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Risk Notes
|
|
174
|
+
|
|
175
|
+
**Docling weight download on first run.** Users will see a pause of 30–90 seconds the first time `pdf_to_markdown` is called. The README must warn them. Consider printing a log message from the server: `"Loading Docling models (first run only)..."`.
|
|
176
|
+
|
|
177
|
+
**Scanned PDFs.** Docling handles these but OCR mode must be enabled explicitly. v0.1 can document this as a known limitation and expose it as a future `ocr=True` parameter.
|
|
178
|
+
|
|
179
|
+
**Path handling on Windows.** Docling and MCP both support Windows but path separators can bite. Test on the platform you care about.
|
|
180
|
+
|
|
181
|
+
**PyPI name collision.** Check that `mcp-docling` is unclaimed before Phase 3. If taken, `mcp-pdf-docling` or `docling-mcp` are reasonable alternatives.
|
pdf_porter-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Benjamin Mazzotta
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|