pdf2mcp 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2mcp-0.2.2/.claude/settings.local.json +36 -0
- pdf2mcp-0.2.2/.env.example +19 -0
- pdf2mcp-0.2.2/.gitignore +36 -0
- pdf2mcp-0.2.2/.python-version +1 -0
- pdf2mcp-0.2.2/LICENSE +21 -0
- pdf2mcp-0.2.2/PKG-INFO +230 -0
- pdf2mcp-0.2.2/README.md +195 -0
- pdf2mcp-0.2.2/pyproject.toml +68 -0
- pdf2mcp-0.2.2/src/pdf2mcp/__init__.py +5 -0
- pdf2mcp-0.2.2/src/pdf2mcp/chunker.py +314 -0
- pdf2mcp-0.2.2/src/pdf2mcp/cli.py +356 -0
- pdf2mcp-0.2.2/src/pdf2mcp/config.py +137 -0
- pdf2mcp-0.2.2/src/pdf2mcp/embeddings.py +113 -0
- pdf2mcp-0.2.2/src/pdf2mcp/ingest.py +131 -0
- pdf2mcp-0.2.2/src/pdf2mcp/models.py +32 -0
- pdf2mcp-0.2.2/src/pdf2mcp/parser.py +48 -0
- pdf2mcp-0.2.2/src/pdf2mcp/py.typed +0 -0
- pdf2mcp-0.2.2/src/pdf2mcp/search.py +410 -0
- pdf2mcp-0.2.2/src/pdf2mcp/server.py +320 -0
- pdf2mcp-0.2.2/src/pdf2mcp/store.py +260 -0
- pdf2mcp-0.2.2/tests/__init__.py +1 -0
- pdf2mcp-0.2.2/tests/test_chunker.py +284 -0
- pdf2mcp-0.2.2/tests/test_cli.py +514 -0
- pdf2mcp-0.2.2/tests/test_config.py +201 -0
- pdf2mcp-0.2.2/tests/test_embeddings.py +148 -0
- pdf2mcp-0.2.2/tests/test_ingest.py +247 -0
- pdf2mcp-0.2.2/tests/test_models.py +90 -0
- pdf2mcp-0.2.2/tests/test_parser.py +138 -0
- pdf2mcp-0.2.2/tests/test_search.py +616 -0
- pdf2mcp-0.2.2/tests/test_server.py +430 -0
- pdf2mcp-0.2.2/tests/test_store.py +189 -0
- pdf2mcp-0.2.2/uv.lock +1626 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(python -m pytest tests/test_config.py -v 2>&1)",
|
|
5
|
+
"Bash(python -c \"from __future__ import annotations; from pydantic_settings import BaseSettings; print\\('ok'\\)\" 2>&1)",
|
|
6
|
+
"Bash(python -c \"\nimport os\nos.environ['OPENAI_API_KEY'] = 'sk-test'\nfrom pdf2mcp.config import Settings\ns = Settings\\(\\)\nprint\\(s.model_fields.keys\\(\\)\\)\nprint\\(hasattr\\(s, 'openai_base_url'\\)\\)\nprint\\(s.model_dump\\(\\)\\)\n\" 2>&1)",
|
|
7
|
+
"Bash(python -c \"\nimport os\nos.environ['OPENAI_API_KEY'] = 'sk-test'\nfrom pdf2mcp.config import Settings\ns = Settings\\(\\)\nprint\\('has attr:', hasattr\\(s, 'openai_base_url'\\)\\)\nprint\\('value:', s.openai_base_url\\)\n\" 2>&1)",
|
|
8
|
+
"Bash(python -c \"\nfrom pdf2mcp.config import Settings\nprint\\(Settings.model_fields.keys\\(\\)\\)\n\" 2>&1)",
|
|
9
|
+
"Bash(python -c \"import pdf2mcp.config; print\\(pdf2mcp.config.__file__\\)\" 2>&1 && find /Users/aissam/SynologyDrive/Work/Aisobotics/Dev/pdf2mcp -path '*/pdf2mcp/config.py' -not -path '*/.venv/*' 2>&1)",
|
|
10
|
+
"Bash(pip install:*)",
|
|
11
|
+
"Bash(python -c \"\nimport os\nos.environ['OPENAI_API_KEY'] = 'sk-test'\nfrom pdf2mcp.config import Settings\nprint\\(Settings.model_fields.keys\\(\\)\\)\ns = Settings\\(\\)\nprint\\('base_url:', s.openai_base_url\\)\n\" 2>&1)",
|
|
12
|
+
"Bash(python -c \"import pdf2mcp.config; print\\(pdf2mcp.config.__file__\\)\" 2>&1)",
|
|
13
|
+
"Bash(pip uninstall:*)",
|
|
14
|
+
"Bash(uv pip:*)",
|
|
15
|
+
"Bash(python -c \"\nimport os; os.environ['OPENAI_API_KEY'] = 'sk-test'\nfrom pdf2mcp.config import Settings\nprint\\(Settings.model_fields.keys\\(\\)\\)\ns = Settings\\(\\)\nprint\\('base_url:', s.openai_base_url\\)\n\" 2>&1)",
|
|
16
|
+
"Bash(python -m pytest 2>&1)",
|
|
17
|
+
"Bash(python -c \"from pdf2mcp.cli import _ENV_TEMPLATE; print\\(_ENV_TEMPLATE\\)\")",
|
|
18
|
+
"Bash(which pdf2mcp:*)",
|
|
19
|
+
"Bash(git add:*)",
|
|
20
|
+
"Bash(git commit:*)",
|
|
21
|
+
"Bash(git push:*)",
|
|
22
|
+
"mcp__plugin_serena_serena__list_dir",
|
|
23
|
+
"Bash(python -m pytest tests/test_search.py tests/test_server.py -v 2>&1)",
|
|
24
|
+
"Bash(python -m pytest -q 2>&1)",
|
|
25
|
+
"Bash(uv run:*)",
|
|
26
|
+
"Bash(python -m pytest tests/test_store.py tests/test_embeddings.py tests/test_search.py tests/test_ingest.py -v 2>&1 | head -120)",
|
|
27
|
+
"Bash(python -m pytest -v 2>&1 | tail -30)",
|
|
28
|
+
"Bash(python -c \"import lancedb; help\\(lancedb.table.Table.create_index\\)\" 2>&1 | head -60)",
|
|
29
|
+
"Bash(gh release:*)",
|
|
30
|
+
"Bash(gh repo:*)",
|
|
31
|
+
"Bash(pip index:*)",
|
|
32
|
+
"Bash(python -m build 2>&1)",
|
|
33
|
+
"Bash(twine check:*)"
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Required: OpenAI API key for embeddings
|
|
2
|
+
OPENAI_API_KEY=sk-your-api-key-here
|
|
3
|
+
|
|
4
|
+
# Optional: OpenAI base URL (for Azure, local proxies, or compatible providers)
|
|
5
|
+
# PDF2MCP_OPENAI_BASE_URL=https://api.openai.com/v1
|
|
6
|
+
|
|
7
|
+
# Optional: Override defaults
|
|
8
|
+
# PDF2MCP_DOCS_DIR=docs
|
|
9
|
+
# PDF2MCP_DATA_DIR=data
|
|
10
|
+
# PDF2MCP_EMBEDDING_MODEL=text-embedding-3-small
|
|
11
|
+
# PDF2MCP_CHUNK_SIZE=500
|
|
12
|
+
# PDF2MCP_CHUNK_OVERLAP=50
|
|
13
|
+
# PDF2MCP_DEFAULT_NUM_RESULTS=5
|
|
14
|
+
# PDF2MCP_SERVER_NAME=pdf-docs
|
|
15
|
+
|
|
16
|
+
# Server transport settings
|
|
17
|
+
# PDF2MCP_SERVER_TRANSPORT=streamable-http
|
|
18
|
+
# PDF2MCP_SERVER_HOST=127.0.0.1
|
|
19
|
+
# PDF2MCP_SERVER_PORT=8000
|
pdf2mcp-0.2.2/.gitignore
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
**/__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.egg
|
|
9
|
+
|
|
10
|
+
# Virtual environments
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
|
|
14
|
+
# Type checking / linting caches
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
|
|
18
|
+
# Testing
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
htmlcov/
|
|
22
|
+
|
|
23
|
+
# Environment
|
|
24
|
+
.env
|
|
25
|
+
|
|
26
|
+
# Project-specific
|
|
27
|
+
data/
|
|
28
|
+
docs/*.pdf
|
|
29
|
+
|
|
30
|
+
# IDE
|
|
31
|
+
.vscode/
|
|
32
|
+
.idea/
|
|
33
|
+
|
|
34
|
+
# OS
|
|
35
|
+
.DS_Store
|
|
36
|
+
Thumbs.db
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
pdf2mcp-0.2.2/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pdf2mcp-0.2.2/PKG-INFO
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf2mcp
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Turn any PDF folder into a searchable MCP server
|
|
5
|
+
Project-URL: Homepage, https://github.com/iSamBa/pdf2mcp
|
|
6
|
+
Project-URL: Repository, https://github.com/iSamBa/pdf2mcp
|
|
7
|
+
Project-URL: Issues, https://github.com/iSamBa/pdf2mcp/issues
|
|
8
|
+
Author-email: iSamBa <bahou.aissam@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: embeddings,lancedb,mcp,pdf,rag,search
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: lancedb>=0.6
|
|
24
|
+
Requires-Dist: mcp[cli]>=1.0
|
|
25
|
+
Requires-Dist: openai>=1.0
|
|
26
|
+
Requires-Dist: pydantic-settings>=2.0
|
|
27
|
+
Requires-Dist: pymupdf4llm>=0.0.17
|
|
28
|
+
Requires-Dist: python-dotenv>=1.0
|
|
29
|
+
Requires-Dist: tenacity>=8.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# pdf2mcp
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
██████╗ ██████╗ ███████╗██████╗ ███╗ ███╗ ██████╗██████╗
|
|
40
|
+
██╔══██╗██╔══██╗██╔════╝╚════██╗████╗ ████║██╔════╝██╔══██╗
|
|
41
|
+
██████╔╝██║ ██║█████╗ █████╔╝██╔████╔██║██║ ██████╔╝
|
|
42
|
+
██╔═══╝ ██║ ██║██╔══╝ ██╔═══╝ ██║╚██╔╝██║██║ ██╔═══╝
|
|
43
|
+
██║ ██████╔╝██║ ███████╗██║ ╚═╝ ██║╚██████╗██║
|
|
44
|
+
╚═╝ ╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝╚═╝
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Turn any PDF folder into a searchable MCP server.
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
Clone the repo, then install globally with `uv tool`:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
git clone https://github.com/iSamBa/pdf2mcp.git
|
|
55
|
+
uv tool install ./pdf2mcp
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
This makes `pdf2mcp` available as a command anywhere on your system.
|
|
59
|
+
|
|
60
|
+
To update after pulling new changes:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
uv tool install --force ./pdf2mcp
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
To run directly from source without installing:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
cd ./pdf2mcp
|
|
70
|
+
uv run pdf2mcp --help
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Verify
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pdf2mcp --version
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# 1. Scaffold a project (creates docs/ and .env)
|
|
83
|
+
pdf2mcp init ./my-project
|
|
84
|
+
cd my-project
|
|
85
|
+
|
|
86
|
+
# 2. Add your PDFs to docs/ and set OPENAI_API_KEY in .env
|
|
87
|
+
|
|
88
|
+
# 3. Ingest
|
|
89
|
+
pdf2mcp ingest
|
|
90
|
+
|
|
91
|
+
# 4. Start the server
|
|
92
|
+
pdf2mcp serve
|
|
93
|
+
|
|
94
|
+
# 5. Get config snippets for your MCP client
|
|
95
|
+
pdf2mcp config
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Architecture
|
|
99
|
+
|
|
100
|
+
pdf2mcp separates **server** and **client** concerns:
|
|
101
|
+
|
|
102
|
+
- **Server** (`pdf2mcp serve`) — runs independently, handles PDF ingestion, embedding, and search. Configured via `PDF2MCP_*` environment variables.
|
|
103
|
+
- **Client** (Claude Code, Cursor, VS Code, etc.) — connects to a running server over HTTP. Only needs the server URL.
|
|
104
|
+
|
|
105
|
+
The default transport is `streamable-http`. The server listens on `http://127.0.0.1:8000/mcp` and shuts down gracefully on SIGINT/SIGTERM.
|
|
106
|
+
|
|
107
|
+
## Commands
|
|
108
|
+
|
|
109
|
+
| Command | Description |
|
|
110
|
+
|---------|-------------|
|
|
111
|
+
| `pdf2mcp init [dir]` | Scaffold a working directory with `docs/` and `.env` |
|
|
112
|
+
| `pdf2mcp ingest` | Parse PDFs, chunk, embed, and store in vector DB |
|
|
113
|
+
| `pdf2mcp serve` | Start the MCP server (HTTP by default) |
|
|
114
|
+
| `pdf2mcp config` | Print ready-to-paste config for MCP clients |
|
|
115
|
+
|
|
116
|
+
### Common Flags
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Override docs directory
|
|
120
|
+
pdf2mcp ingest --docs-dir ./my-pdfs
|
|
121
|
+
pdf2mcp serve --docs-dir ./my-pdfs
|
|
122
|
+
|
|
123
|
+
# Use stdio transport (for clients that spawn the server)
|
|
124
|
+
pdf2mcp serve --transport stdio
|
|
125
|
+
|
|
126
|
+
# Custom host/port
|
|
127
|
+
pdf2mcp serve --host 0.0.0.0 --port 9000
|
|
128
|
+
|
|
129
|
+
# Custom server name
|
|
130
|
+
pdf2mcp serve --name my-docs
|
|
131
|
+
|
|
132
|
+
# Config for a specific client
|
|
133
|
+
pdf2mcp config --client cursor
|
|
134
|
+
pdf2mcp config --client claude-desktop --transport stdio
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Client Configuration
|
|
138
|
+
|
|
139
|
+
`pdf2mcp config` generates ready-to-paste JSON for all supported clients. The default is HTTP — clients just need the server URL:
|
|
140
|
+
|
|
141
|
+
```json
|
|
142
|
+
{
|
|
143
|
+
"mcpServers": {
|
|
144
|
+
"pdf-docs": {
|
|
145
|
+
"type": "http",
|
|
146
|
+
"url": "http://127.0.0.1:8000/mcp"
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
| Client | Config File | Top-level Key | HTTP Support |
|
|
153
|
+
|--------|------------|--------------|--------------|
|
|
154
|
+
| Claude Code | `.mcp.json` | `mcpServers` | Yes |
|
|
155
|
+
| Claude Desktop | `claude_desktop_config.json` | `mcpServers` | No (stdio only) |
|
|
156
|
+
| Cursor | `.cursor/mcp.json` | `mcpServers` | Yes |
|
|
157
|
+
| VS Code / Copilot | `.vscode/mcp.json` | `servers` | Yes |
|
|
158
|
+
|
|
159
|
+
Use `--transport stdio` for clients that need to spawn the server process (e.g., Claude Desktop):
|
|
160
|
+
|
|
161
|
+
```json
|
|
162
|
+
{
|
|
163
|
+
"mcpServers": {
|
|
164
|
+
"pdf-docs": {
|
|
165
|
+
"command": "uv",
|
|
166
|
+
"args": ["run", "pdf2mcp", "serve", "--transport", "stdio"]
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Environment Variables
|
|
173
|
+
|
|
174
|
+
### Server settings (`PDF2MCP_*`)
|
|
175
|
+
|
|
176
|
+
These configure the server process. All of them use the `PDF2MCP_` prefix except `OPENAI_API_KEY`, which is read without a prefix. MCP clients never need these.
|
|
177
|
+
|
|
178
|
+
| Variable | Default | Description |
|
|
179
|
+
|----------|---------|-------------|
|
|
180
|
+
| `OPENAI_API_KEY` | (required) | OpenAI API key for embeddings |
|
|
181
|
+
| `PDF2MCP_OPENAI_BASE_URL` | `https://api.openai.com/v1` | OpenAI API base URL (for Azure, local proxies, or compatible providers) |
|
|
182
|
+
| `PDF2MCP_DOCS_DIR` | `docs` | Directory containing PDF files |
|
|
183
|
+
| `PDF2MCP_DATA_DIR` | `data` | Directory for vector database |
|
|
184
|
+
| `PDF2MCP_EMBEDDING_MODEL` | `text-embedding-3-small` | OpenAI embedding model |
|
|
185
|
+
| `PDF2MCP_CHUNK_SIZE` | `500` | Target chunk size in tokens |
|
|
186
|
+
| `PDF2MCP_CHUNK_OVERLAP` | `50` | Overlap between chunks in tokens |
|
|
187
|
+
| `PDF2MCP_DEFAULT_NUM_RESULTS` | `5` | Default search results count |
|
|
188
|
+
| `PDF2MCP_SERVER_NAME` | `pdf-docs` | MCP server name |
|
|
189
|
+
| `PDF2MCP_SERVER_TRANSPORT` | `streamable-http` | Transport protocol |
|
|
190
|
+
| `PDF2MCP_SERVER_HOST` | `127.0.0.1` | Host to bind to |
|
|
191
|
+
| `PDF2MCP_SERVER_PORT` | `8000` | Port to bind to |
|
|
192
|
+
|
|
193
|
+
### Client settings (`PDF2MCP_CLIENT_*`)
|
|
194
|
+
|
|
195
|
+
These configure how a client connects to the server. No secrets needed.
|
|
196
|
+
|
|
197
|
+
| Variable | Default | Description |
|
|
198
|
+
|----------|---------|-------------|
|
|
199
|
+
| `PDF2MCP_CLIENT_SERVER_NAME` | `pdf-docs` | Server name in client config |
|
|
200
|
+
| `PDF2MCP_CLIENT_SERVER_URL` | `http://127.0.0.1:8000/mcp` | Server URL |
|
|
201
|
+
| `PDF2MCP_CLIENT_TRANSPORT` | `streamable-http` | Transport protocol |
|
|
202
|
+
|
|
203
|
+
## MCP Tools
|
|
204
|
+
|
|
205
|
+
The server exposes six tools:
|
|
206
|
+
|
|
207
|
+
| Tool | Description |
|
|
208
|
+
|------|-------------|
|
|
209
|
+
| `search_docs(query)` | Semantic search across **all** ingested PDFs |
|
|
210
|
+
| `search_in_doc(query, filename)` | Semantic search scoped to a **single** document |
|
|
211
|
+
| `list_docs()` | List all ingested documents with chunk counts |
|
|
212
|
+
| `get_sections(filename)` | Get section headings for a specific document |
|
|
213
|
+
| `read_page(filename, page)` | Read the full content of a specific page |
|
|
214
|
+
| `read_section(filename, section_title)` | Read the full content of a named section |
|
|
215
|
+
|
|
216
|
+
### Typical workflow
|
|
217
|
+
|
|
218
|
+
1. **`list_docs`** — discover available documents
|
|
219
|
+
2. **`get_sections`** — browse a document's structure
|
|
220
|
+
3. **`read_section`** or **`read_page`** — read specific content
|
|
221
|
+
4. **`search_docs`** or **`search_in_doc`** — find information by query
|
|
222
|
+
|
|
223
|
+
## Development
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
uv sync --all-extras
|
|
227
|
+
uv run pytest
|
|
228
|
+
uv run ruff check src/
|
|
229
|
+
uv run mypy src/
|
|
230
|
+
```
|
pdf2mcp-0.2.2/README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# pdf2mcp
|
|
2
|
+
|
|
3
|
+
```
|
|
4
|
+
██████╗ ██████╗ ███████╗██████╗ ███╗ ███╗ ██████╗██████╗
|
|
5
|
+
██╔══██╗██╔══██╗██╔════╝╚════██╗████╗ ████║██╔════╝██╔══██╗
|
|
6
|
+
██████╔╝██║ ██║█████╗ █████╔╝██╔████╔██║██║ ██████╔╝
|
|
7
|
+
██╔═══╝ ██║ ██║██╔══╝ ██╔═══╝ ██║╚██╔╝██║██║ ██╔═══╝
|
|
8
|
+
██║ ██████╔╝██║ ███████╗██║ ╚═╝ ██║╚██████╗██║
|
|
9
|
+
╚═╝ ╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚═════╝╚═╝
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Turn any PDF folder into a searchable MCP server.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
Clone the repo, then install globally with `uv tool`:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
git clone https://github.com/iSamBa/pdf2mcp.git
|
|
20
|
+
uv tool install ./pdf2mcp
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
This makes `pdf2mcp` available as a command anywhere on your system.
|
|
24
|
+
|
|
25
|
+
To update after pulling new changes:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv tool install --force ./pdf2mcp
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
To run directly from source without installing:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
cd ./pdf2mcp
|
|
35
|
+
uv run pdf2mcp --help
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Verify
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pdf2mcp --version
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# 1. Scaffold a project (creates docs/ and .env)
|
|
48
|
+
pdf2mcp init ./my-project
|
|
49
|
+
cd my-project
|
|
50
|
+
|
|
51
|
+
# 2. Add your PDFs to docs/ and set OPENAI_API_KEY in .env
|
|
52
|
+
|
|
53
|
+
# 3. Ingest
|
|
54
|
+
pdf2mcp ingest
|
|
55
|
+
|
|
56
|
+
# 4. Start the server
|
|
57
|
+
pdf2mcp serve
|
|
58
|
+
|
|
59
|
+
# 5. Get config snippets for your MCP client
|
|
60
|
+
pdf2mcp config
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Architecture
|
|
64
|
+
|
|
65
|
+
pdf2mcp separates **server** and **client** concerns:
|
|
66
|
+
|
|
67
|
+
- **Server** (`pdf2mcp serve`) — runs independently, handles PDF ingestion, embedding, and search. Configured via `PDF2MCP_*` environment variables.
|
|
68
|
+
- **Client** (Claude Code, Cursor, VS Code, etc.) — connects to a running server over HTTP. Only needs the server URL.
|
|
69
|
+
|
|
70
|
+
The default transport is `streamable-http`. The server listens on `http://127.0.0.1:8000/mcp` and shuts down gracefully on SIGINT/SIGTERM.
|
|
71
|
+
|
|
72
|
+
## Commands
|
|
73
|
+
|
|
74
|
+
| Command | Description |
|
|
75
|
+
|---------|-------------|
|
|
76
|
+
| `pdf2mcp init [dir]` | Scaffold a working directory with `docs/` and `.env` |
|
|
77
|
+
| `pdf2mcp ingest` | Parse PDFs, chunk, embed, and store in vector DB |
|
|
78
|
+
| `pdf2mcp serve` | Start the MCP server (HTTP by default) |
|
|
79
|
+
| `pdf2mcp config` | Print ready-to-paste config for MCP clients |
|
|
80
|
+
|
|
81
|
+
### Common Flags
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# Override docs directory
|
|
85
|
+
pdf2mcp ingest --docs-dir ./my-pdfs
|
|
86
|
+
pdf2mcp serve --docs-dir ./my-pdfs
|
|
87
|
+
|
|
88
|
+
# Use stdio transport (for clients that spawn the server)
|
|
89
|
+
pdf2mcp serve --transport stdio
|
|
90
|
+
|
|
91
|
+
# Custom host/port
|
|
92
|
+
pdf2mcp serve --host 0.0.0.0 --port 9000
|
|
93
|
+
|
|
94
|
+
# Custom server name
|
|
95
|
+
pdf2mcp serve --name my-docs
|
|
96
|
+
|
|
97
|
+
# Config for a specific client
|
|
98
|
+
pdf2mcp config --client cursor
|
|
99
|
+
pdf2mcp config --client claude-desktop --transport stdio
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Client Configuration
|
|
103
|
+
|
|
104
|
+
`pdf2mcp config` generates ready-to-paste JSON for all supported clients. The default is HTTP — clients just need the server URL:
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"mcpServers": {
|
|
109
|
+
"pdf-docs": {
|
|
110
|
+
"type": "http",
|
|
111
|
+
"url": "http://127.0.0.1:8000/mcp"
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
| Client | Config File | Top-level Key | HTTP Support |
|
|
118
|
+
|--------|------------|--------------|--------------|
|
|
119
|
+
| Claude Code | `.mcp.json` | `mcpServers` | Yes |
|
|
120
|
+
| Claude Desktop | `claude_desktop_config.json` | `mcpServers` | No (stdio only) |
|
|
121
|
+
| Cursor | `.cursor/mcp.json` | `mcpServers` | Yes |
|
|
122
|
+
| VS Code / Copilot | `.vscode/mcp.json` | `servers` | Yes |
|
|
123
|
+
|
|
124
|
+
Use `--transport stdio` for clients that need to spawn the server process (e.g., Claude Desktop):
|
|
125
|
+
|
|
126
|
+
```json
|
|
127
|
+
{
|
|
128
|
+
"mcpServers": {
|
|
129
|
+
"pdf-docs": {
|
|
130
|
+
"command": "uv",
|
|
131
|
+
"args": ["run", "pdf2mcp", "serve", "--transport", "stdio"]
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Environment Variables
|
|
138
|
+
|
|
139
|
+
### Server settings (`PDF2MCP_*`)
|
|
140
|
+
|
|
141
|
+
These configure the server process. All of them use the `PDF2MCP_` prefix except `OPENAI_API_KEY`, which is read without a prefix. MCP clients never need these.
|
|
142
|
+
|
|
143
|
+
| Variable | Default | Description |
|
|
144
|
+
|----------|---------|-------------|
|
|
145
|
+
| `OPENAI_API_KEY` | (required) | OpenAI API key for embeddings |
|
|
146
|
+
| `PDF2MCP_OPENAI_BASE_URL` | `https://api.openai.com/v1` | OpenAI API base URL (for Azure, local proxies, or compatible providers) |
|
|
147
|
+
| `PDF2MCP_DOCS_DIR` | `docs` | Directory containing PDF files |
|
|
148
|
+
| `PDF2MCP_DATA_DIR` | `data` | Directory for vector database |
|
|
149
|
+
| `PDF2MCP_EMBEDDING_MODEL` | `text-embedding-3-small` | OpenAI embedding model |
|
|
150
|
+
| `PDF2MCP_CHUNK_SIZE` | `500` | Target chunk size in tokens |
|
|
151
|
+
| `PDF2MCP_CHUNK_OVERLAP` | `50` | Overlap between chunks in tokens |
|
|
152
|
+
| `PDF2MCP_DEFAULT_NUM_RESULTS` | `5` | Default search results count |
|
|
153
|
+
| `PDF2MCP_SERVER_NAME` | `pdf-docs` | MCP server name |
|
|
154
|
+
| `PDF2MCP_SERVER_TRANSPORT` | `streamable-http` | Transport protocol |
|
|
155
|
+
| `PDF2MCP_SERVER_HOST` | `127.0.0.1` | Host to bind to |
|
|
156
|
+
| `PDF2MCP_SERVER_PORT` | `8000` | Port to bind to |
|
|
157
|
+
|
|
158
|
+
### Client settings (`PDF2MCP_CLIENT_*`)
|
|
159
|
+
|
|
160
|
+
These configure how a client connects to the server. No secrets needed.
|
|
161
|
+
|
|
162
|
+
| Variable | Default | Description |
|
|
163
|
+
|----------|---------|-------------|
|
|
164
|
+
| `PDF2MCP_CLIENT_SERVER_NAME` | `pdf-docs` | Server name in client config |
|
|
165
|
+
| `PDF2MCP_CLIENT_SERVER_URL` | `http://127.0.0.1:8000/mcp` | Server URL |
|
|
166
|
+
| `PDF2MCP_CLIENT_TRANSPORT` | `streamable-http` | Transport protocol |
|
|
167
|
+
|
|
168
|
+
## MCP Tools
|
|
169
|
+
|
|
170
|
+
The server exposes six tools:
|
|
171
|
+
|
|
172
|
+
| Tool | Description |
|
|
173
|
+
|------|-------------|
|
|
174
|
+
| `search_docs(query)` | Semantic search across **all** ingested PDFs |
|
|
175
|
+
| `search_in_doc(query, filename)` | Semantic search scoped to a **single** document |
|
|
176
|
+
| `list_docs()` | List all ingested documents with chunk counts |
|
|
177
|
+
| `get_sections(filename)` | Get section headings for a specific document |
|
|
178
|
+
| `read_page(filename, page)` | Read the full content of a specific page |
|
|
179
|
+
| `read_section(filename, section_title)` | Read the full content of a named section |
|
|
180
|
+
|
|
181
|
+
### Typical workflow
|
|
182
|
+
|
|
183
|
+
1. **`list_docs`** — discover available documents
|
|
184
|
+
2. **`get_sections`** — browse a document's structure
|
|
185
|
+
3. **`read_section`** or **`read_page`** — read specific content
|
|
186
|
+
4. **`search_docs`** or **`search_in_doc`** — find information by query
|
|
187
|
+
|
|
188
|
+
## Development
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
uv sync --all-extras
|
|
192
|
+
uv run pytest
|
|
193
|
+
uv run ruff check src/
|
|
194
|
+
uv run mypy src/
|
|
195
|
+
```
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pdf2mcp"
|
|
3
|
+
version = "0.2.2"
|
|
4
|
+
description = "Turn any PDF folder into a searchable MCP server"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "iSamBa", email = "bahou.aissam@gmail.com" }
|
|
9
|
+
]
|
|
10
|
+
keywords = ["pdf", "mcp", "search", "embeddings", "rag", "lancedb"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.10",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
"Topic :: Text Processing :: Indexing",
|
|
22
|
+
]
|
|
23
|
+
requires-python = ">=3.10"
|
|
24
|
+
dependencies = [
|
|
25
|
+
"mcp[cli]>=1.0",
|
|
26
|
+
"pymupdf4llm>=0.0.17",
|
|
27
|
+
"lancedb>=0.6",
|
|
28
|
+
"openai>=1.0",
|
|
29
|
+
"pydantic-settings>=2.0",
|
|
30
|
+
"python-dotenv>=1.0",
|
|
31
|
+
"tenacity>=8.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/iSamBa/pdf2mcp"
|
|
36
|
+
Repository = "https://github.com/iSamBa/pdf2mcp"
|
|
37
|
+
Issues = "https://github.com/iSamBa/pdf2mcp/issues"
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
pdf2mcp = "pdf2mcp.cli:main"
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["hatchling"]
|
|
44
|
+
build-backend = "hatchling.build"
|
|
45
|
+
|
|
46
|
+
[tool.hatch.build.targets.wheel]
|
|
47
|
+
packages = ["src/pdf2mcp"]
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
dev = [
|
|
51
|
+
"pytest>=8.0",
|
|
52
|
+
"ruff>=0.4",
|
|
53
|
+
"mypy>=1.10",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
line-length = 88
|
|
58
|
+
target-version = "py310"
|
|
59
|
+
|
|
60
|
+
[tool.ruff.lint]
|
|
61
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
62
|
+
|
|
63
|
+
[tool.mypy]
|
|
64
|
+
python_version = "3.10"
|
|
65
|
+
strict = true
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
testpaths = ["tests"]
|