kreuzberg 3.6.1__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ """MCP server for Kreuzberg text extraction."""
2
+
3
+ from .server import mcp
4
+
5
+ __all__ = ["mcp"]
@@ -0,0 +1,227 @@
1
+ """Kreuzberg MCP server implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ from typing import Any
7
+
8
+ from mcp.server import FastMCP
9
+ from mcp.types import TextContent
10
+
11
+ from kreuzberg._types import ExtractionConfig, OcrBackendType
12
+ from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
13
+
14
+ # Module-level FastMCP server instance; the tools, resources and prompts below register on it through its decorators
15
+ mcp = FastMCP("Kreuzberg Text Extraction")
16
+
17
+
@mcp.tool()
def extract_document(  # noqa: PLR0913
    file_path: str,
    mime_type: str | None = None,
    force_ocr: bool = False,
    chunk_content: bool = False,
    extract_tables: bool = False,
    extract_entities: bool = False,
    extract_keywords: bool = False,
    ocr_backend: OcrBackendType = "tesseract",
    max_chars: int = 1000,
    max_overlap: int = 200,
    keyword_count: int = 10,
    auto_detect_language: bool = False,
) -> dict[str, Any]:
    """Extract text and optional structured data from a document file.

    Args:
        file_path: Path to the document file
        mime_type: MIME type of the document (auto-detected if not provided)
        force_ocr: Force OCR even for text-based documents
        chunk_content: Split content into chunks
        extract_tables: Extract tables from the document
        extract_entities: Extract named entities
        extract_keywords: Extract keywords
        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
        max_chars: Maximum characters per chunk
        max_overlap: Character overlap between chunks
        keyword_count: Number of keywords to extract
        auto_detect_language: Auto-detect document language

    Returns:
        The extraction result as a dictionary (content with metadata, tables,
        chunks, entities, and keywords)
    """
    # Every option is forwarded unchanged to ExtractionConfig as a keyword argument.
    options: dict[str, Any] = {
        "force_ocr": force_ocr,
        "chunk_content": chunk_content,
        "extract_tables": extract_tables,
        "extract_entities": extract_entities,
        "extract_keywords": extract_keywords,
        "ocr_backend": ocr_backend,
        "max_chars": max_chars,
        "max_overlap": max_overlap,
        "keyword_count": keyword_count,
        "auto_detect_language": auto_detect_language,
    }
    extraction = extract_file_sync(file_path, mime_type, ExtractionConfig(**options))
    return extraction.to_dict()
67
+
68
+
@mcp.tool()
def extract_bytes(  # noqa: PLR0913
    content_base64: str,
    mime_type: str,
    force_ocr: bool = False,
    chunk_content: bool = False,
    extract_tables: bool = False,
    extract_entities: bool = False,
    extract_keywords: bool = False,
    ocr_backend: OcrBackendType = "tesseract",
    max_chars: int = 1000,
    max_overlap: int = 200,
    keyword_count: int = 10,
    auto_detect_language: bool = False,
) -> dict[str, Any]:
    """Extract text and optional structured data from base64-encoded document bytes.

    Args:
        content_base64: Base64-encoded document content
        mime_type: MIME type of the document
        force_ocr: Force OCR even for text-based documents
        chunk_content: Split content into chunks
        extract_tables: Extract tables from the document
        extract_entities: Extract named entities
        extract_keywords: Extract keywords
        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
        max_chars: Maximum characters per chunk
        max_overlap: Character overlap between chunks
        keyword_count: Number of keywords to extract
        auto_detect_language: Auto-detect document language

    Returns:
        The extraction result as a dictionary (content with metadata, tables,
        chunks, entities, and keywords)
    """
    # Decode first so malformed input fails before any configuration is built.
    raw_document = base64.b64decode(content_base64)

    options: dict[str, Any] = {
        "force_ocr": force_ocr,
        "chunk_content": chunk_content,
        "extract_tables": extract_tables,
        "extract_entities": extract_entities,
        "extract_keywords": extract_keywords,
        "ocr_backend": ocr_backend,
        "max_chars": max_chars,
        "max_overlap": max_overlap,
        "keyword_count": keyword_count,
        "auto_detect_language": auto_detect_language,
    }
    extraction = extract_bytes_sync(raw_document, mime_type, ExtractionConfig(**options))
    return extraction.to_dict()
120
+
121
+
@mcp.tool()
def extract_simple(
    file_path: str,
    mime_type: str | None = None,
) -> str:
    """Extract only the text of a document file, using the default configuration.

    Args:
        file_path: Path to the document file
        mime_type: MIME type of the document (auto-detected if not provided)

    Returns:
        Extracted text content as a string
    """
    return extract_file_sync(file_path, mime_type, ExtractionConfig()).content
139
+
140
+
@mcp.resource("config://default")
def get_default_config() -> str:
    """Return the attribute dictionary of a default ``ExtractionConfig`` as text."""
    # Formatting a dict with an empty format spec yields the same text as str().
    return f"{ExtractionConfig().__dict__}"
146
+
147
+
@mcp.resource("config://available-backends")
def get_available_backends() -> str:
    """Return the OCR backend names as a single comma-separated string."""
    backend_names = ("tesseract", "easyocr", "paddleocr")
    return ", ".join(backend_names)
152
+
153
+
154
+ @mcp.resource("extractors://supported-formats")
155
+ def get_supported_formats() -> str:
156
+ """Return a fixed, human-readable bullet list of the supported document formats."""
157
+ return """
158
+ Supported formats:
159
+ - PDF documents
160
+ - Images (PNG, JPG, JPEG, TIFF, BMP, WEBP)
161
+ - Office documents (DOCX, PPTX, XLSX)
162
+ - HTML files
163
+ - Text files (TXT, CSV, TSV)
164
+ - And more...
165
+ """
166
+
167
+
@mcp.prompt()
def extract_and_summarize(file_path: str) -> list[TextContent]:
    """Build a summarization prompt from the text extracted from a document.

    Args:
        file_path: Path to the document file

    Returns:
        A single text message holding the extracted content followed by a
        request for a concise summary
    """
    extraction = extract_file_sync(file_path, None, ExtractionConfig())
    prompt_text = (
        f"Document Content:\n{extraction.content}\n\nPlease provide a concise summary of this document."
    )
    return [TextContent(type="text", text=prompt_text)]
186
+
187
+
@mcp.prompt()
def extract_structured(file_path: str) -> list[TextContent]:
    """Build a structured-analysis prompt from a document's extracted data.

    Args:
        file_path: Path to the document file

    Returns:
        A single text message holding the extracted content, any entities,
        keywords and table count that were found, and an analysis request
    """
    extraction = extract_file_sync(
        file_path,
        None,
        ExtractionConfig(extract_entities=True, extract_keywords=True, extract_tables=True),
    )

    # Sections are collected in order and joined once at the end.
    sections = [f"Document Content:\n{extraction.content}\n\n"]

    if extraction.entities:
        entity_labels = [f"{entity.text} ({entity.type})" for entity in extraction.entities]
        sections.append(f"Entities: {entity_labels}\n\n")

    if extraction.keywords:
        keyword_labels = [f"{keyword[0]} ({keyword[1]:.2f})" for keyword in extraction.keywords]
        sections.append(f"Keywords: {keyword_labels}\n\n")

    if extraction.tables:
        sections.append(f"Tables found: {len(extraction.tables)}\n\n")

    sections.append("Please analyze this document and provide structured insights.")

    return [TextContent(type="text", text="".join(sections))]
219
+
220
+
def main() -> None:
    """Run the Kreuzberg MCP server by calling ``mcp.run()``."""
    mcp.run()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.6.1
3
+ Version: 3.7.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -25,7 +25,8 @@ Requires-Python: >=3.10
25
25
  Requires-Dist: anyio>=4.9.0
26
26
  Requires-Dist: charset-normalizer>=3.4.2
27
27
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
28
- Requires-Dist: html-to-markdown>=1.4.0
28
+ Requires-Dist: html-to-markdown[lxml]>=1.6.0
29
+ Requires-Dist: mcp>=1.11.0
29
30
  Requires-Dist: msgspec>=0.18.0
30
31
  Requires-Dist: playa-pdf>=0.6.1
31
32
  Requires-Dist: psutil>=7.0.0
@@ -83,14 +84,15 @@ Description-Content-Type: text/markdown
83
84
 
84
85
  ## Why Kreuzberg?
85
86
 
86
- - **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
87
- - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
87
+ - **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
88
+ - **💾 Memory Efficient**: Lowest memory usage (~530MB) and a 14x smaller install size than alternatives (71MB vs 1GB+)
88
89
  - **⚡ Dual APIs**: Only library with both sync and async support
89
90
  - **🔧 Zero Configuration**: Works out of the box with sane defaults
90
91
  - **🏠 Local Processing**: No cloud dependencies or external API calls
91
92
  - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
92
93
  - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
93
- - **🐳 Production Ready**: CLI, REST API, and Docker images included
94
+ - **🤖 AI Integration**: Native MCP server for Claude and other AI tools
95
+ - **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
94
96
 
95
97
  ## Quick Start
96
98
 
@@ -136,17 +138,66 @@ asyncio.run(main())
136
138
 
137
139
  ## Deployment Options
138
140
 
141
+ ### 🤖 MCP Server (AI Integration)
142
+
143
+ **Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
144
+
145
+ ```bash
146
+ # Install and run MCP server with all features (recommended)
147
+ pip install "kreuzberg[all]"
148
+ kreuzberg-mcp
149
+
150
+ # Or with uvx (recommended for Claude Desktop)
151
+ uvx --with "kreuzberg[all]" kreuzberg-mcp
152
+
153
+ # Basic installation (core features only)
154
+ pip install kreuzberg
155
+ kreuzberg-mcp
156
+ ```
157
+
158
+ **Configure in Claude Desktop (`claude_desktop_config.json`):**
159
+
160
+ ```json
161
+ {
162
+ "mcpServers": {
163
+ "kreuzberg": {
164
+ "command": "uvx",
165
+ "args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
166
+ }
167
+ }
168
+ }
169
+ ```
170
+
171
+ **Basic configuration (core features only):**
172
+
173
+ ```json
174
+ {
175
+ "mcpServers": {
176
+ "kreuzberg": {
177
+ "command": "uvx",
178
+ "args": ["kreuzberg-mcp"]
179
+ }
180
+ }
181
+ }
182
+ ```
183
+
184
+ **Available MCP capabilities:**
185
+
186
+ - **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
187
+ - **Resources**: Configuration, supported formats, OCR backends
188
+ - **Prompts**: Extract-and-summarize, structured analysis workflows
189
+
139
190
  ### 🐳 Docker (Recommended)
140
191
 
141
192
  ```bash
142
193
  # Run API server
143
- docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
194
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
144
195
 
145
196
  # Extract files
146
197
  curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
147
198
  ```
148
199
 
149
- Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
200
+ Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
150
201
 
151
202
  ### 🌐 REST API
152
203
 
@@ -191,15 +242,20 @@ kreuzberg extract *.pdf --output-dir ./extracted/
191
242
 
192
243
  ## Performance
193
244
 
194
- **Fastest extraction speeds** with minimal resource usage:
245
+ **[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
246
+
247
+ | Library | Speed | Memory | Install Size | Dependencies | Success Rate |
248
+ | ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
249
+ | **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
250
+ | Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
251
+ | MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
252
+ | Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
195
253
 
196
- | Library | Speed | Memory | Size | Success Rate |
197
- | ------------- | -------------- | ------------- | ----------- | ------------ |
198
- | **Kreuzberg** | **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
199
- | Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
200
- | MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
201
- | Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
254
+ \*_Can achieve 75% reliability with a 15% performance trade-off when configured_
255
+ †_Good on simple documents, struggles with large/complex files (>10MB)_
256
+ ‡_Frequently fails/times out on medium files (>1MB)_
202
257
 
258
+ > **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
203
259
  > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
204
260
 
205
261
  ## Documentation
@@ -216,6 +272,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
216
272
 
217
273
  ## Advanced Features
218
274
 
275
+ - **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
219
276
  - **📊 Table Extraction**: Extract tables from PDFs with GMFT
220
277
  - **🧩 Content Chunking**: Split documents for RAG applications
221
278
  - **🎯 Custom Extractors**: Extend with your own document handlers
@@ -233,7 +290,7 @@ ______________________________________________________________________
233
290
 
234
291
  <div align="center">
235
292
 
236
- **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
293
+ **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
237
294
 
238
295
  Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
239
296
 
@@ -24,6 +24,8 @@ kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6
24
24
  kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
25
25
  kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
26
26
  kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
27
+ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
28
+ kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
27
29
  kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
28
30
  kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
29
31
  kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
@@ -47,8 +49,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
47
49
  kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
48
50
  kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
49
51
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
- kreuzberg-3.6.1.dist-info/METADATA,sha256=JPTejc7zpahkvhZtUqTVPPVzQ-93aOPnx3l3EQXseok,9160
51
- kreuzberg-3.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
- kreuzberg-3.6.1.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
53
- kreuzberg-3.6.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
- kreuzberg-3.6.1.dist-info/RECORD,,
52
+ kreuzberg-3.7.0.dist-info/METADATA,sha256=0rBXhtDYCdZ2AGpQdXTTZUQUX8T01OsKzwrm2nl14QA,11137
53
+ kreuzberg-3.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
54
+ kreuzberg-3.7.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
55
+ kreuzberg-3.7.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
56
+ kreuzberg-3.7.0.dist-info/RECORD,,
@@ -1,2 +1,3 @@
1
1
  [console_scripts]
2
2
  kreuzberg = kreuzberg.cli:cli
3
+ kreuzberg-mcp = kreuzberg._mcp.server:main