thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdfbrain/mcp_server.py ADDED
@@ -0,0 +1,315 @@
1
+ """
2
+ thinkpdf MCP Server v2 - Production-ready MCP server for IDE integration.
3
+
4
+ Uses Docling (IBM) for maximum quality PDF to Markdown conversion.
5
+
6
+ Tools:
7
+ - convert_pdf: Convert PDF/DOCX/PPTX to Markdown
8
+ - read_pdf: Convert and return content directly (best for AIs)
9
+ - get_document_info: Get metadata about a document
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Any, Dict, Optional
18
+
19
+ from .engine import thinkpdfEngine, HAS_DOCLING, HAS_PDFMD
20
+
21
+
22
class thinkpdfMCPServer:
    """
    MCP Server for thinkpdf v2.

    Allows Cursor, Antigravity, and other MCP-compatible tools to
    convert and read PDFs directly. Requests are newline-delimited
    JSON-RPC 2.0 messages read from stdin; responses are written to stdout.
    """

    # Engine names advertised in the tool schemas and accepted by the tools.
    _ENGINE_CHOICES = ("auto", "docling", "pdfmd")

    def __init__(self):
        # Default engine instance; used by get_document_info.
        self.engine = thinkpdfEngine()

    def handle_request(self, request: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Handle one decoded JSON-RPC message.

        Returns the response envelope as a dict, or None when no response
        must be sent (the message is a notification, i.e. carries no id).
        Exceptions raised by a handler are reported as -32603 errors rather
        than propagated.
        """
        # A decoded message that is not a JSON object has no method/id to
        # read; answer with "Invalid Request" instead of failing on .get().
        if not isinstance(request, dict):
            return self._error_response(None, -32600, "Invalid Request")

        method = request.get("method", "")
        # An explicit `"params": null` is treated like a missing params member.
        params = request.get("params") or {}
        request_id = request.get("id")

        # Notifications (no id) don't get responses
        is_notification = request_id is None

        # Both spellings of the "initialized" notification are accepted and
        # never answered.
        if method in ("initialized", "notifications/initialized"):
            return None

        try:
            if method == "initialize":
                result = self._handle_initialize(params)
            elif method == "tools/list":
                result = self._handle_list_tools()
            elif method == "tools/call":
                result = self._handle_tool_call(params)
            elif method == "ping":
                result = {}
            elif method == "resources/list":
                result = {"resources": []}
            elif method == "prompts/list":
                result = {"prompts": []}
            else:
                if is_notification:
                    return None
                return self._error_response(request_id, -32601, f"Method not found: {method}")
        except Exception as e:
            if is_notification:
                return None
            return self._error_response(request_id, -32603, str(e))

        # Notifications don't get responses, even when a handler ran.
        if is_notification:
            return None

        return {
            "jsonrpc": "2.0",
            "id": request_id,
            "result": result,
        }

    def _handle_initialize(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Build the result of the `initialize` request (params are not inspected)."""
        engine_info = "Docling (IBM)" if HAS_DOCLING else "pdfmd"

        return {
            "protocolVersion": "2024-11-05",
            "capabilities": {
                "tools": {},
            },
            "serverInfo": {
                "name": "thinkpdf",
                "version": "2.0.0",
                "description": f"PDF to Markdown converter powered by {engine_info}",
            },
        }

    def _handle_list_tools(self) -> Dict[str, Any]:
        """Return the descriptors (name, description, input schema) of all tools."""

        def engine_property() -> Dict[str, Any]:
            # Fresh dict per tool so the two schemas never share mutable state.
            return {
                "type": "string",
                "description": "Conversion engine: 'auto' (default), 'docling' (high quality, slow), or 'pdfmd' (fast)",
                "enum": list(self._ENGINE_CHOICES),
            }

        return {
            "tools": [
                {
                    "name": "read_pdf",
                    "description": "Read a PDF file and return its content as Markdown. Best for reading documents in chat.",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "path": {
                                "type": "string",
                                "description": "Absolute path to the PDF file",
                            },
                            "engine": engine_property(),
                        },
                        "required": ["path"],
                    },
                },
                {
                    "name": "convert_pdf",
                    "description": "Convert a PDF file to Markdown and save to disk.",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "path": {
                                "type": "string",
                                "description": "Absolute path to the PDF file",
                            },
                            "output": {
                                "type": "string",
                                "description": "Output path for markdown file (optional)",
                            },
                            "engine": engine_property(),
                        },
                        "required": ["path"],
                    },
                },
                {
                    "name": "get_document_info",
                    "description": "Get information about a document (page count, size, etc.)",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "path": {
                                "type": "string",
                                "description": "Absolute path to the document",
                            },
                        },
                        "required": ["path"],
                    },
                },
            ],
        }

    def _handle_tool_call(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Dispatch a `tools/call` request to the matching tool.

        Raises ValueError for an unknown tool or non-object arguments (the
        caller turns that into a JSON-RPC error). A failure inside the tool
        itself is returned as a tool result with `isError: True`, so the
        client model can read the message.
        """
        tool_name = params.get("name", "")
        # An explicit `"arguments": null` is treated like a missing member.
        arguments = params.get("arguments") or {}

        handlers = {
            "read_pdf": self._tool_read_pdf,
            "convert_pdf": self._tool_convert_pdf,
            "get_document_info": self._tool_get_document_info,
        }
        handler = handlers.get(tool_name) if isinstance(tool_name, str) else None
        if handler is None:
            raise ValueError(f"Unknown tool: {tool_name}")
        if not isinstance(arguments, dict):
            raise ValueError("arguments must be an object")

        try:
            return handler(arguments)
        except Exception as exc:
            message = str(exc) or type(exc).__name__
            return {
                "content": [
                    {
                        "type": "text",
                        "text": f"Error: {message}",
                    },
                ],
                "isError": True,
            }

    @staticmethod
    def _require_input_file(args: Dict[str, Any]) -> Path:
        """Return args['path'] as a Path, raising ValueError if missing or not a file."""
        path = args.get("path")
        if not path:
            raise ValueError("path is required")

        file_path = Path(path)
        if not file_path.exists():
            raise ValueError(f"File not found: {path}")
        # exists() is also true for directories, which cannot be converted.
        if not file_path.is_file():
            raise ValueError(f"Not a file: {path}")
        return file_path

    def _require_engine(self, args: Dict[str, Any]) -> str:
        """Return the requested engine name (default 'auto'), validated against the schema enum."""
        engine_choice = args.get("engine") or "auto"
        if engine_choice not in self._ENGINE_CHOICES:
            allowed = ", ".join(self._ENGINE_CHOICES)
            raise ValueError(f"Unknown engine: {engine_choice} (expected one of: {allowed})")
        return engine_choice

    def _tool_read_pdf(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the document at args['path'] and return the Markdown inline."""
        pdf_path = self._require_input_file(args)
        engine_choice = self._require_engine(args)

        # The engine type is fixed when thinkpdfEngine is constructed and
        # convert() is not passed one, so a per-call instance is created for
        # the requested engine.
        temp_engine = thinkpdfEngine(engine=engine_choice)
        markdown = temp_engine.convert(pdf_path)

        return {
            "content": [
                {
                    "type": "text",
                    "text": f"# Content from: {pdf_path.name}\n\n{markdown}",
                },
            ],
            "isError": False,
        }

    def _tool_convert_pdf(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert the document at args['path'] and save the Markdown to disk.

        The output goes to args['output'] when given, otherwise next to the
        input with a `.md` suffix. Returns a short text summary.
        """
        pdf_path = self._require_input_file(args)
        engine_choice = self._require_engine(args)

        # Determine output path
        output = args.get("output")
        output_path = Path(output) if output else pdf_path.with_suffix(".md")

        # Refuse to write the result over the document being converted.
        if output_path.resolve() == pdf_path.resolve():
            raise ValueError("Output path must differ from the input file")

        # Per-call engine instance: the engine type is chosen at construction.
        temp_engine = thinkpdfEngine(engine=engine_choice)
        markdown = temp_engine.convert(pdf_path, output_path)
        word_count = len(markdown.split())

        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Converted {pdf_path.name} to {output_path.name}\n"
                            f"Words: {word_count}\n"
                            f"Output: {output_path}",
                },
            ],
            "isError": False,
        }

    def _tool_get_document_info(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Return the engine's document info for args['path'] as indented JSON text."""
        doc_path = self._require_input_file(args)

        info = self.engine.get_document_info(doc_path)

        return {
            "content": [
                {
                    "type": "text",
                    "text": json.dumps(info, indent=2),
                },
            ],
            "isError": False,
        }

    def _error_response(self, request_id: Any, code: int, message: str) -> Dict[str, Any]:
        """Create a JSON-RPC 2.0 error envelope for the given request id."""
        return {
            "jsonrpc": "2.0",
            "id": request_id,
            "error": {
                "code": code,
                "message": message,
            },
        }

    def _process_line(self, line: str) -> Optional[str]:
        """
        Turn one raw input line into the serialized response to send.

        Returns None for blank lines and for messages that need no response.
        Malformed JSON yields a -32700 "Parse error" response with a null id.
        """
        line = line.strip()
        if not line:
            return None

        try:
            request = json.loads(line)
        except json.JSONDecodeError:
            response = self._error_response(None, -32700, "Parse error")
        else:
            response = self.handle_request(request)

        # Only send response if there is one (notifications return None)
        if response is None:
            return None

        try:
            return json.dumps(response)
        except (TypeError, ValueError) as exc:
            # A handler returned something JSON cannot encode; still answer
            # the request so the client is not left waiting.
            return json.dumps(self._error_response(response.get("id"), -32603, str(exc)))

    def run_stdio(self):
        """Run the server on stdio until stdin reaches EOF or Ctrl-C is received."""
        import io
        import os

        # Discard stderr output. The null device is used instead of an
        # in-memory buffer, which would grow for the whole server lifetime.
        sys.stderr = open(os.devnull, "w", encoding="utf-8")

        # Emit UTF-8 with bare "\n" line endings on every platform.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', newline='\n')

        # Decode incoming messages as UTF-8 instead of the platform default;
        # undecodable bytes are replaced rather than raising mid-stream.
        if hasattr(sys.stdin, "reconfigure"):
            sys.stdin.reconfigure(encoding="utf-8", errors="replace")

        while True:
            try:
                line = sys.stdin.readline()
                if not line:
                    break

                payload = self._process_line(line)
                if payload is not None:
                    sys.stdout.write(payload + "\n")
                    sys.stdout.flush()

            except KeyboardInterrupt:
                break
            except Exception:
                # Best effort: skip this message rather than emit anything
                # that could corrupt the protocol stream.
                continue
306
+
307
+
308
def main():
    """Start the thinkpdf MCP server and serve requests over stdio."""
    thinkpdfMCPServer().run_stdio()


# Allow launching the server by executing this module as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ """Utility functions and helpers."""
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: thinkpdf
3
+ Version: 1.0.1
4
+ Summary: PDF to Markdown engine for LLMs. Smart table extraction, OCR, MCP server for AI IDEs.
5
+ Author-email: Augusto Cesar Perin <augustocesarperin@abstratuslabs.com>
6
+ License: AGPL-3.0
7
+ Keywords: pdf,markdown,converter,ocr,tables,math,latex,llm,ai,mcp,cursor,docling
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: pymupdf>=1.23.0
23
+ Provides-Extra: docling
24
+ Requires-Dist: docling>=2.0.0; extra == "docling"
25
+ Provides-Extra: gui
26
+ Requires-Dist: customtkinter>=5.2.0; extra == "gui"
27
+ Requires-Dist: Pillow>=10.0.0; extra == "gui"
28
+ Provides-Extra: ocr
29
+ Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
30
+ Provides-Extra: full
31
+ Requires-Dist: docling>=2.0.0; extra == "full"
32
+ Requires-Dist: customtkinter>=5.2.0; extra == "full"
33
+ Requires-Dist: Pillow>=10.0.0; extra == "full"
34
+ Requires-Dist: pytesseract>=0.3.10; extra == "full"
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
37
+ Requires-Dist: black>=23.0.0; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+
41
+ # thinkpdf
42
+
43
+
44
+ Convert PDFs to clean Markdown for LLMs. Includes MCP Server for AI coding assistants.
45
+
46
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
47
+ [![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
48
+ [![PyPI](https://img.shields.io/pypi/v/thinkpdf.svg)](https://pypi.org/project/thinkpdf/)
49
+
50
+ ## Features
51
+
52
+ | Feature | Description |
53
+ |---------|-------------|
54
+ | **Smart Detection** | Auto-chooses best engine for each PDF |
55
+ | **High Table Accuracy** | Uses IBM Docling's TableFormer |
56
+ | **Fast Mode** | pdfmd for simple documents |
57
+ | **Smart Cache** | Never reprocess the same PDF |
58
+ | **MCP Server** | For AI coding assistants |
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ pip install thinkpdf
64
+ ```
65
+
66
+ For maximum quality (requires GPU):
67
+ ```bash
68
+ pip install thinkpdf[docling]
69
+ ```
70
+
71
+ ## Usage
72
+
73
+ ### MCP Server
74
+
75
+ Add to `~/.cursor/mcp.json`:
76
+ ```json
77
+ {
78
+ "mcpServers": {
79
+ "thinkpdf": {
80
+ "command": "python",
81
+ "args": ["-m", "thinkpdf.mcp_server"]
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ Then just ask:
88
+ > "Read the PDF at D:\docs\manual.pdf and explain it"
89
+
90
+ ### Python API
91
+
92
+ ```python
93
+ from thinkpdf import convert
94
+
95
+ # Simple conversion
96
+ markdown = convert("document.pdf")
97
+ print(markdown)
98
+
99
+ # With output file
100
+ convert("document.pdf", "output.md")
101
+ ```
102
+
103
+ ### CLI
104
+
105
+ ```bash
106
+ thinkpdf document.pdf -o output.md
107
+ ```
108
+
109
+ ## How It Works
110
+
111
+ 1. Analyzes PDF complexity (tables, scans, simple text)
112
+ 2. Chooses best engine (Docling for complex, pdfmd for simple)
113
+ 3. Checks cache (instant if already converted)
114
+ 4. Converts to structured Markdown
115
+ 5. Caches result for next time
116
+
117
+
118
+ ## MCP Tools
119
+
120
+ When using as MCP server:
121
+
122
+ | Tool | Description |
123
+ |------|-------------|
124
+ | `read_pdf` | Convert and return content directly |
125
+ | `convert_pdf` | Convert and save to file |
126
+ | `get_document_info` | Get PDF metadata |
127
+
128
+
129
+ ## Requirements
130
+
131
+ - Python 3.10+
132
+ - PyMuPDF (included)
133
+ - Docling (optional, for best quality)
134
+
135
+ ## License
136
+
137
+ **AGPLv3** - Open source license. Commercial use requires sharing source code.
138
+
@@ -0,0 +1,25 @@
1
+ pdfbrain/__init__.py,sha256=xZOyvYF9TeX_Oi8-b-2cEfdxuZEv0h5bJcasjLgRS0g,492
2
+ pdfbrain/app_gui.py,sha256=19ZLzrFDNxLSrs4bkKrMK32srKIIsnu5W25AQna3Wr8,16952
3
+ pdfbrain/cli.py,sha256=oViuFZVuZf6JeO3J-0cfAymHvCQZfVuzPiXZ4GPeE7Q,6717
4
+ pdfbrain/engine.py,sha256=AJ4kqWQ6S0g0O8v5jL20apLEjlZ39NeXyHs-lJ2ZAmM,13973
5
+ pdfbrain/mcp_server.py,sha256=JfNTpZ0HDv3EqJGnxVE5E_hd3WQk13v_rnNaB8PtwUU,11449
6
+ pdfbrain/cache/__init__.py,sha256=e5MPcfYPnJs2ZTqL848oH6ILTUiDJecjgkfY1f2RgnY,105
7
+ pdfbrain/cache/cache_manager.py,sha256=ymlN0LqHB-MC2bciVQfamB9ugMSWKG9VcLV0ZQ_VZOs,8118
8
+ pdfbrain/core/__init__.py,sha256=78gHd8_RNR4IpR543p_wktBs11iN_EqMMLVBb0_8oJU,169
9
+ pdfbrain/core/converter.py,sha256=ansR1HXoTk4vf4TWtV_jBedZjRB3-pA-qRyepc6zEas,10667
10
+ pdfbrain/core/equations.py,sha256=6w4fGYhAQwirx2R9SM4dU3VWoZeex5BKloEzMGWu_EY,18328
11
+ pdfbrain/core/extract.py,sha256=h5NtSRiHievAumbIM5j3JURgY17Y-xipe5vLFSboWfE,17114
12
+ pdfbrain/core/extractor.py,sha256=U97Sfno5dAIgfGTAq2AJd8gD6gxHZZG3HcReg1g106s,8509
13
+ pdfbrain/core/models.py,sha256=4_ZNeMssrN0T24QGILOqhuj5w8TdWmj3ePk35dOpv8k,7099
14
+ pdfbrain/core/pipeline.py,sha256=9kqz-lZCU3_pzbFAt0O5KxtENRXvu9zpsBOc-pGS40M,9073
15
+ pdfbrain/core/render.py,sha256=VdZl6k2SnQtYh0MzjcCvo3ezaS5WtVWLL3D-33KGn4o,18605
16
+ pdfbrain/core/tables.py,sha256=sFFqVU8UAznII3lTzWLN4JEY9Wc_cdAcgrH7mr8RpdA,26046
17
+ pdfbrain/core/transform.py,sha256=g7lFXKh0rrkwccc2k1m36EHvIlbopXXjLGQOKWRMwmk,19561
18
+ pdfbrain/core/utils.py,sha256=XVObtq7Qxa7F6nKmK55BcIg2Q7nJV1o7QRQZ9Z4kcOs,6239
19
+ pdfbrain/utils/__init__.py,sha256=hW2RNzEUnUkEZKxK4InN_pc-BJCXuTjYkoH8RNgWoNU,38
20
+ thinkpdf-1.0.1.dist-info/licenses/LICENSE,sha256=aTrK5WpSq3jDMilkSJ-c-SI1lGWXAiKi0hQQ_rFEpSo,33703
21
+ thinkpdf-1.0.1.dist-info/METADATA,sha256=mL1Yhjzn2XKRVy3zhlmuYSOySPZ-6aEHjJAYXchSMo8,3831
22
+ thinkpdf-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
23
+ thinkpdf-1.0.1.dist-info/entry_points.txt,sha256=c6fgizzX3H0_0G5cwl2ApQiM3GWEuoijvOB-zxlE2Kw,118
24
+ thinkpdf-1.0.1.dist-info/top_level.txt,sha256=ihDXWpk0wmfwfgF0Zl_NHkhlwsKxOFB7lIDZCSm7tdE,9
25
+ thinkpdf-1.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,4 @@
1
+ [console_scripts]
2
+ pdfmind = pdfmind.cli:main
3
+ pdfmind-gui = pdfmind.app_gui:main
4
+ pdfmind-mcp = pdfmind.mcp_server:main