PyPI - thinkpdf - Versions diffs - 1.0.1__py3-none-any.whl - Mend

thinkpdf 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

pdfbrain/__init__.py +22 -0
pdfbrain/app_gui.py +530 -0
pdfbrain/cache/__init__.py +5 -0
pdfbrain/cache/cache_manager.py +252 -0
pdfbrain/cli.py +255 -0
pdfbrain/core/__init__.py +6 -0
pdfbrain/core/converter.py +332 -0
pdfbrain/core/equations.py +635 -0
pdfbrain/core/extract.py +469 -0
pdfbrain/core/extractor.py +272 -0
pdfbrain/core/models.py +196 -0
pdfbrain/core/pipeline.py +287 -0
pdfbrain/core/render.py +574 -0
pdfbrain/core/tables.py +871 -0
pdfbrain/core/transform.py +604 -0
pdfbrain/core/utils.py +229 -0
pdfbrain/engine.py +392 -0
pdfbrain/mcp_server.py +315 -0
pdfbrain/utils/__init__.py +1 -0
thinkpdf-1.0.1.dist-info/METADATA +138 -0
thinkpdf-1.0.1.dist-info/RECORD +25 -0
thinkpdf-1.0.1.dist-info/WHEEL +5 -0
thinkpdf-1.0.1.dist-info/entry_points.txt +4 -0
thinkpdf-1.0.1.dist-info/licenses/LICENSE +620 -0
thinkpdf-1.0.1.dist-info/top_level.txt +1 -0

pdfbrain/mcp_server.py ADDED Viewed

@@ -0,0 +1,315 @@
+"""
+thinkpdf MCP Server v2 - Production-ready MCP server for IDE integration.
+Uses Docling (IBM) for maximum quality PDF to Markdown conversion.
+Tools:
+- convert_pdf: Convert PDF/DOCX/PPTX to Markdown
+- read_pdf: Convert and return content directly (best for AIs)
+- get_document_info: Get metadata about a document
+"""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional
+from .engine import thinkpdfEngine, HAS_DOCLING, HAS_PDFMD
+class thinkpdfMCPServer:
+    """
+    MCP server for thinkpdf v2.
+    Speaks newline-delimited JSON-RPC 2.0 over stdio so that Cursor,
+    Antigravity, and other MCP-compatible tools can convert and read
+    PDFs directly.
+    """
+    def __init__(self):
+        # Default-configured engine. Only get_document_info uses it; the
+        # conversion tools build a per-call engine for the requested backend.
+        self.engine = thinkpdfEngine()
+    def handle_request(self, request: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Handle one decoded JSON-RPC message.
+        Args:
+            request: The decoded message. Anything other than a JSON object
+                is rejected with -32600 (Invalid Request).
+        Returns:
+            The JSON-RPC response dict, or None when the message is a
+            notification (no "id") and therefore must not be answered.
+            Unknown methods yield -32601; any exception raised by a handler
+            yields -32603 carrying the exception text.
+        """
+        if not isinstance(request, dict):
+            # A JSON array or scalar used to raise AttributeError below,
+            # outside the try block, so the client never got an answer.
+            return self._error_response(None, -32600, "Invalid Request")
+        method = request.get("method", "")
+        # "params" may be omitted or explicitly null; treat both as empty.
+        params = request.get("params") or {}
+        request_id = request.get("id")
+        # Notifications (no id) don't get responses
+        is_notification = request_id is None
+        try:
+            if method == "initialize":
+                result = self._handle_initialize(params)
+            elif method in ("initialized", "notifications/initialized"):
+                # Client notification, never answered (even if an id was sent)
+                return None
+            elif method == "tools/list":
+                result = self._handle_list_tools()
+            elif method == "tools/call":
+                result = self._handle_tool_call(params)
+            elif method == "ping":
+                result = {}
+            elif method == "resources/list":
+                result = {"resources": []}
+            elif method == "prompts/list":
+                result = {"prompts": []}
+            else:
+                if is_notification:
+                    return None
+                return self._error_response(request_id, -32601, f"Method not found: {method}")
+            # Notifications don't get responses
+            if is_notification:
+                return None
+            return {
+                "jsonrpc": "2.0",
+                "id": request_id,
+                "result": result,
+            }
+        except Exception as e:
+            if is_notification:
+                return None
+            return self._error_response(request_id, -32603, str(e))
+    def _handle_initialize(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Build the result of the "initialize" handshake.
+        ``params`` (the client's capabilities) is accepted but not inspected.
+        The description names Docling when HAS_DOCLING is true, else pdfmd.
+        """
+        engine_info = "Docling (IBM)" if HAS_DOCLING else "pdfmd"
+        return {
+            "protocolVersion": "2024-11-05",
+            "capabilities": {
+                "tools": {},
+            },
+            "serverInfo": {
+                "name": "thinkpdf",
+                "version": "2.0.0",
+                "description": f"PDF to Markdown converter powered by {engine_info}",
+            },
+        }
+    def _handle_list_tools(self) -> Dict[str, Any]:
+        """Return the "tools/list" result: name, description and JSON schema of each tool."""
+        # Shared by read_pdf and convert_pdf, which accept the same backends.
+        engine_property = {
+            "type": "string",
+            "description": "Conversion engine: 'auto' (default), 'docling' (high quality, slow), or 'pdfmd' (fast)",
+            "enum": ["auto", "docling", "pdfmd"],
+        }
+        return {
+            "tools": [
+                {
+                    "name": "read_pdf",
+                    "description": "Read a PDF file and return its content as Markdown. Best for reading documents in chat.",
+                    "inputSchema": {
+                        "type": "object",
+                        "properties": {
+                            "path": {
+                                "type": "string",
+                                "description": "Absolute path to the PDF file",
+                            },
+                            "engine": engine_property,
+                        },
+                        "required": ["path"],
+                    },
+                },
+                {
+                    "name": "convert_pdf",
+                    "description": "Convert a PDF file to Markdown and save to disk.",
+                    "inputSchema": {
+                        "type": "object",
+                        "properties": {
+                            "path": {
+                                "type": "string",
+                                "description": "Absolute path to the PDF file",
+                            },
+                            "output": {
+                                "type": "string",
+                                "description": "Output path for markdown file (optional)",
+                            },
+                            "engine": engine_property,
+                        },
+                        "required": ["path"],
+                    },
+                },
+                {
+                    "name": "get_document_info",
+                    "description": "Get information about a document (page count, size, etc.)",
+                    "inputSchema": {
+                        "type": "object",
+                        "properties": {
+                            "path": {
+                                "type": "string",
+                                "description": "Absolute path to the document",
+                            },
+                        },
+                        "required": ["path"],
+                    },
+                },
+            ],
+        }
+    def _handle_tool_call(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Dispatch a "tools/call" request to the named tool.
+        Args:
+            params: Request params holding "name" and optional "arguments".
+        Returns:
+            The tool result. A failure inside the tool is returned as a
+            result with ``isError`` set to True and the exception text as
+            its content.
+        Raises:
+            ValueError: If the tool name is not one this server provides.
+        """
+        tool_name = params.get("name", "")
+        # "arguments" may be omitted or explicitly null; treat both as empty.
+        arguments = params.get("arguments") or {}
+        handlers = {
+            "read_pdf": self._tool_read_pdf,
+            "convert_pdf": self._tool_convert_pdf,
+            "get_document_info": self._tool_get_document_info,
+        }
+        handler = handlers.get(tool_name)
+        if handler is None:
+            raise ValueError(f"Unknown tool: {tool_name}")
+        try:
+            return handler(arguments)
+        except Exception as exc:
+            # MCP reports tool execution failures inside the result
+            # (isError: true) rather than as JSON-RPC errors, so the
+            # calling model can read the message and react to it.
+            return self._text_result(str(exc), is_error=True)
+    @staticmethod
+    def _require_existing_path(args: Dict[str, Any]) -> Path:
+        """Return args["path"] as a Path; raise ValueError if it is absent or not on disk."""
+        path = args.get("path")
+        if not path:
+            raise ValueError("path is required")
+        resolved = Path(path)
+        if not resolved.exists():
+            raise ValueError(f"File not found: {path}")
+        return resolved
+    @staticmethod
+    def _text_result(text: str, is_error: bool = False) -> Dict[str, Any]:
+        """Wrap ``text`` in the single-text-item result envelope used by every tool."""
+        return {
+            "content": [
+                {
+                    "type": "text",
+                    "text": text,
+                },
+            ],
+            "isError": is_error,
+        }
+    def _tool_read_pdf(self, args: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Convert the document at args["path"] and return the Markdown inline.
+        Raises:
+            ValueError: If "path" is missing or does not exist.
+        """
+        pdf_path = self._require_existing_path(args)
+        engine_choice = args.get("engine", "auto")
+        # The backend is chosen when the engine is constructed, so a
+        # per-call engine is built for the requested backend.
+        temp_engine = thinkpdfEngine(engine=engine_choice)
+        markdown = temp_engine.convert(pdf_path)
+        return self._text_result(f"# Content from: {pdf_path.name}\n\n{markdown}")
+    def _tool_convert_pdf(self, args: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Convert the document at args["path"] and save the Markdown to disk.
+        The output goes to args["output"] when given, otherwise next to the
+        input with a ".md" suffix. The result text reports the word count
+        and the output path.
+        Raises:
+            ValueError: If "path" is missing or does not exist.
+        """
+        pdf_path = self._require_existing_path(args)
+        engine_choice = args.get("engine", "auto")
+        output = args.get("output")
+        output_path = Path(output) if output else pdf_path.with_suffix(".md")
+        temp_engine = thinkpdfEngine(engine=engine_choice)
+        markdown = temp_engine.convert(pdf_path, output_path)
+        word_count = len(markdown.split())
+        return self._text_result(
+            f"Converted {pdf_path.name} to {output_path.name}\n"
+            f"Words: {word_count}\n"
+            f"Output: {output_path}"
+        )
+    def _tool_get_document_info(self, args: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Return the shared engine's document info for args["path"] as indented JSON text.
+        Raises:
+            ValueError: If "path" is missing or does not exist.
+        """
+        doc_path = self._require_existing_path(args)
+        info = self.engine.get_document_info(doc_path)
+        return self._text_result(json.dumps(info, indent=2))
+    def _error_response(self, request_id: Any, code: int, message: str) -> Dict[str, Any]:
+        """Build a JSON-RPC 2.0 error response for ``request_id`` (None when the id is unknown)."""
+        return {
+            "jsonrpc": "2.0",
+            "id": request_id,
+            "error": {
+                "code": code,
+                "message": message,
+            },
+        }
+    def run_stdio(self):
+        """
+        Serve requests on stdio until stdin reaches EOF.
+        Reads one JSON message per line, skips blank lines, and writes one
+        JSON response per line. A line that is not valid JSON is answered
+        with a -32700 (Parse error) response whose id is null. The loop
+        ends on EOF, KeyboardInterrupt, or a closed stdout pipe.
+        """
+        import io
+        import os
+        # Silence stderr. os.devnull is used instead of an in-memory buffer
+        # so a long-running server does not accumulate discarded output.
+        sys.stderr = open(os.devnull, "w", encoding="utf-8")
+        # Re-wrap the protocol streams as UTF-8 regardless of the locale
+        # (matters on Windows for non-ASCII paths); stdout also gets
+        # LF-only line endings so each message ends in exactly "\n".
+        sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
+        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', newline='\n')
+        while True:
+            try:
+                line = sys.stdin.readline()
+                if not line:
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    request = json.loads(line)
+                except json.JSONDecodeError:
+                    # Answer instead of dropping the line, so the client
+                    # is not left waiting; the id is unknown, hence None.
+                    response = self._error_response(None, -32700, "Parse error")
+                else:
+                    response = self.handle_request(request)
+                # Only send response if there is one (notifications return None)
+                if response is not None:
+                    sys.stdout.write(json.dumps(response) + "\n")
+                    sys.stdout.flush()
+            except (KeyboardInterrupt, BrokenPipeError):
+                # Interrupted, or the client closed its end of the pipe.
+                break
+            except Exception:
+                # Last-resort guard: keep serving rather than crash, and
+                # never emit a partial or malformed message.
+                continue
+def main():
+    """Console entry point: build the MCP server and serve on stdio until it stops."""
+    thinkpdfMCPServer().run_stdio()
+# Start the stdio server when this module is executed as a script.
+if __name__ == "__main__":
+    main()

pdfbrain/utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Utility functions and helpers."""

thinkpdf-1.0.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,138 @@
+Metadata-Version: 2.4
+Name: thinkpdf
+Version: 1.0.1
+Summary: PDF to Markdown engine for LLMs. Smart table extraction, OCR, MCP server for AI IDEs.
+Author-email: Augusto Cesar Perin <augustocesarperin@abstratuslabs.com>
+License: AGPL-3.0
+Keywords: pdf,markdown,converter,ocr,tables,math,latex,llm,ai,mcp,cursor,docling
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pymupdf>=1.23.0
+Provides-Extra: docling
+Requires-Dist: docling>=2.0.0; extra == "docling"
+Provides-Extra: gui
+Requires-Dist: customtkinter>=5.2.0; extra == "gui"
+Requires-Dist: Pillow>=10.0.0; extra == "gui"
+Provides-Extra: ocr
+Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
+Provides-Extra: full
+Requires-Dist: docling>=2.0.0; extra == "full"
+Requires-Dist: customtkinter>=5.2.0; extra == "full"
+Requires-Dist: Pillow>=10.0.0; extra == "full"
+Requires-Dist: pytesseract>=0.3.10; extra == "full"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.4.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Dynamic: license-file
+# thinkpdf
+Convert PDFs to clean Markdown for LLMs. Includes MCP Server for AI coding assistants.
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
+[![PyPI](https://img.shields.io/pypi/v/thinkpdf.svg)](https://pypi.org/project/thinkpdf/)
+## Features
+| Feature | Description |
+|---------|-------------|
+| **Smart Detection** | Auto-chooses best engine for each PDF |
+| **High Table Accuracy** | Uses IBM Docling's TableFormer |
+| **Fast Mode** | pdfmd for simple documents |
+| **Smart Cache** | Never reprocess the same PDF |
+| **MCP Server** | For AI coding assistants |
+## Installation
+```bash
+pip install thinkpdf
+```
+For maximum quality (requires GPU):
+```bash
+pip install thinkpdf[docling]
+```
+## Usage
+### MCP Server
+Add to `~/.cursor/mcp.json`:
+```json
+{
+  "mcpServers": {
+    "thinkpdf": {
+      "command": "python",
+      "args": ["-m", "pdfbrain.mcp_server"]
+    }
+  }
+}
+```
+Then just ask:
+> "Read the PDF at D:\docs\manual.pdf and explain it"
+### Python API
+```python
+from pdfbrain import convert
+# Simple conversion
+markdown = convert("document.pdf")
+print(markdown)
+# With output file
+convert("document.pdf", "output.md")
+```
+### CLI
+```bash
+thinkpdf document.pdf -o output.md
+```
+## How It Works
+1. Analyzes PDF complexity (tables, scans, simple text)
+2. Chooses best engine (Docling for complex, pdfmd for simple)
+3. Checks cache (instant if already converted)
+4. Converts to structured Markdown
+5. Caches result for next time
+## MCP Tools
+When using as MCP server:
+| Tool | Description |
+|------|-------------|
+| `read_pdf` | Convert and return content directly |
+| `convert_pdf` | Convert and save to file |
+| `get_document_info` | Get PDF metadata |
+## Requirements
+- Python 3.10+
+- PyMuPDF (included)
+- Docling (optional, for best quality)
+## License
+**AGPLv3** - Open source license. Commercial use requires sharing source code.

thinkpdf-1.0.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,25 @@
+pdfbrain/__init__.py,sha256=xZOyvYF9TeX_Oi8-b-2cEfdxuZEv0h5bJcasjLgRS0g,492
+pdfbrain/app_gui.py,sha256=19ZLzrFDNxLSrs4bkKrMK32srKIIsnu5W25AQna3Wr8,16952
+pdfbrain/cli.py,sha256=oViuFZVuZf6JeO3J-0cfAymHvCQZfVuzPiXZ4GPeE7Q,6717
+pdfbrain/engine.py,sha256=AJ4kqWQ6S0g0O8v5jL20apLEjlZ39NeXyHs-lJ2ZAmM,13973
+pdfbrain/mcp_server.py,sha256=JfNTpZ0HDv3EqJGnxVE5E_hd3WQk13v_rnNaB8PtwUU,11449
+pdfbrain/cache/__init__.py,sha256=e5MPcfYPnJs2ZTqL848oH6ILTUiDJecjgkfY1f2RgnY,105
+pdfbrain/cache/cache_manager.py,sha256=ymlN0LqHB-MC2bciVQfamB9ugMSWKG9VcLV0ZQ_VZOs,8118
+pdfbrain/core/__init__.py,sha256=78gHd8_RNR4IpR543p_wktBs11iN_EqMMLVBb0_8oJU,169
+pdfbrain/core/converter.py,sha256=ansR1HXoTk4vf4TWtV_jBedZjRB3-pA-qRyepc6zEas,10667
+pdfbrain/core/equations.py,sha256=6w4fGYhAQwirx2R9SM4dU3VWoZeex5BKloEzMGWu_EY,18328
+pdfbrain/core/extract.py,sha256=h5NtSRiHievAumbIM5j3JURgY17Y-xipe5vLFSboWfE,17114
+pdfbrain/core/extractor.py,sha256=U97Sfno5dAIgfGTAq2AJd8gD6gxHZZG3HcReg1g106s,8509
+pdfbrain/core/models.py,sha256=4_ZNeMssrN0T24QGILOqhuj5w8TdWmj3ePk35dOpv8k,7099
+pdfbrain/core/pipeline.py,sha256=9kqz-lZCU3_pzbFAt0O5KxtENRXvu9zpsBOc-pGS40M,9073
+pdfbrain/core/render.py,sha256=VdZl6k2SnQtYh0MzjcCvo3ezaS5WtVWLL3D-33KGn4o,18605
+pdfbrain/core/tables.py,sha256=sFFqVU8UAznII3lTzWLN4JEY9Wc_cdAcgrH7mr8RpdA,26046
+pdfbrain/core/transform.py,sha256=g7lFXKh0rrkwccc2k1m36EHvIlbopXXjLGQOKWRMwmk,19561
+pdfbrain/core/utils.py,sha256=XVObtq7Qxa7F6nKmK55BcIg2Q7nJV1o7QRQZ9Z4kcOs,6239
+pdfbrain/utils/__init__.py,sha256=hW2RNzEUnUkEZKxK4InN_pc-BJCXuTjYkoH8RNgWoNU,38
+thinkpdf-1.0.1.dist-info/licenses/LICENSE,sha256=aTrK5WpSq3jDMilkSJ-c-SI1lGWXAiKi0hQQ_rFEpSo,33703
+thinkpdf-1.0.1.dist-info/METADATA,sha256=mL1Yhjzn2XKRVy3zhlmuYSOySPZ-6aEHjJAYXchSMo8,3831
+thinkpdf-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thinkpdf-1.0.1.dist-info/entry_points.txt,sha256=c6fgizzX3H0_0G5cwl2ApQiM3GWEuoijvOB-zxlE2Kw,118
+thinkpdf-1.0.1.dist-info/top_level.txt,sha256=ihDXWpk0wmfwfgF0Zl_NHkhlwsKxOFB7lIDZCSm7tdE,9
+thinkpdf-1.0.1.dist-info/RECORD,,

thinkpdf-1.0.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

thinkpdf-1.0.1.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,4 @@
+[console_scripts]
+pdfmind = pdfbrain.cli:main
+pdfmind-gui = pdfbrain.app_gui:main
+pdfmind-mcp = pdfbrain.mcp_server:main