kreuzberg 3.6.2__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.7.0.dist-info}/METADATA +54 -2
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.7.0.dist-info}/RECORD +7 -5
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.7.0.dist-info}/entry_points.txt +1 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.7.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.7.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py
ADDED
@@ -0,0 +1,227 @@
|
|
1
|
+
"""Kreuzberg MCP server implementation."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import base64
|
6
|
+
from typing import Any
|
7
|
+
|
8
|
+
from mcp.server import FastMCP
|
9
|
+
from mcp.types import TextContent
|
10
|
+
|
11
|
+
from kreuzberg._types import ExtractionConfig, OcrBackendType
|
12
|
+
from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
|
13
|
+
|
14
|
+
# Create the MCP server
|
15
|
+
# Single module-level FastMCP instance: the @mcp.tool(), @mcp.resource() and
# @mcp.prompt() decorators used in this module all register against it, and
# main() starts it via mcp.run().
mcp = FastMCP("Kreuzberg Text Extraction")
|
16
|
+
|
17
|
+
|
18
|
+
@mcp.tool()
def extract_document(  # noqa: PLR0913
    file_path: str,
    mime_type: str | None = None,
    force_ocr: bool = False,
    chunk_content: bool = False,
    extract_tables: bool = False,
    extract_entities: bool = False,
    extract_keywords: bool = False,
    ocr_backend: OcrBackendType = "tesseract",
    max_chars: int = 1000,
    max_overlap: int = 200,
    keyword_count: int = 10,
    auto_detect_language: bool = False,
) -> dict[str, Any]:
    """Extract text content from a document file.

    Args:
        file_path: Path to the document file
        mime_type: MIME type of the document (auto-detected if not provided)
        force_ocr: Force OCR even for text-based documents
        chunk_content: Split content into chunks
        extract_tables: Extract tables from the document
        extract_entities: Extract named entities
        extract_keywords: Extract keywords
        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
        max_chars: Maximum characters per chunk
        max_overlap: Character overlap between chunks
        keyword_count: Number of keywords to extract
        auto_detect_language: Auto-detect document language

    Returns:
        Extracted content with metadata, tables, chunks, entities, and keywords
    """
    # NOTE(review): the docstring above is presumably what FastMCP advertises to
    # clients as this tool's description - confirm before rewording it.

    # Every tool argument except the path and MIME type maps one-to-one onto an
    # ExtractionConfig keyword of the same name.
    options: dict[str, Any] = {
        "force_ocr": force_ocr,
        "chunk_content": chunk_content,
        "extract_tables": extract_tables,
        "extract_entities": extract_entities,
        "extract_keywords": extract_keywords,
        "ocr_backend": ocr_backend,
        "max_chars": max_chars,
        "max_overlap": max_overlap,
        "keyword_count": keyword_count,
        "auto_detect_language": auto_detect_language,
    }
    extraction = extract_file_sync(file_path, mime_type, ExtractionConfig(**options))

    # Hand the result back as a plain dict, matching the declared return type.
    return extraction.to_dict()
|
67
|
+
|
68
|
+
|
69
|
+
@mcp.tool()
def extract_bytes(  # noqa: PLR0913
    content_base64: str,
    mime_type: str,
    force_ocr: bool = False,
    chunk_content: bool = False,
    extract_tables: bool = False,
    extract_entities: bool = False,
    extract_keywords: bool = False,
    ocr_backend: OcrBackendType = "tesseract",
    max_chars: int = 1000,
    max_overlap: int = 200,
    keyword_count: int = 10,
    auto_detect_language: bool = False,
) -> dict[str, Any]:
    """Extract text content from document bytes.

    Args:
        content_base64: Base64-encoded document content
        mime_type: MIME type of the document
        force_ocr: Force OCR even for text-based documents
        chunk_content: Split content into chunks
        extract_tables: Extract tables from the document
        extract_entities: Extract named entities
        extract_keywords: Extract keywords
        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
        max_chars: Maximum characters per chunk
        max_overlap: Character overlap between chunks
        keyword_count: Number of keywords to extract
        auto_detect_language: Auto-detect document language

    Returns:
        Extracted content with metadata, tables, chunks, entities, and keywords
    """
    # NOTE(review): the docstring above is presumably what FastMCP advertises to
    # clients as this tool's description - confirm before rewording it.

    # Decode first so a malformed payload fails before any config is built.
    # b64decode runs in its default (non-validating) mode here.
    raw_document = base64.b64decode(content_base64)

    # The remaining tool arguments map one-to-one onto ExtractionConfig keywords.
    options: dict[str, Any] = {
        "force_ocr": force_ocr,
        "chunk_content": chunk_content,
        "extract_tables": extract_tables,
        "extract_entities": extract_entities,
        "extract_keywords": extract_keywords,
        "ocr_backend": ocr_backend,
        "max_chars": max_chars,
        "max_overlap": max_overlap,
        "keyword_count": keyword_count,
        "auto_detect_language": auto_detect_language,
    }
    extraction = extract_bytes_sync(raw_document, mime_type, ExtractionConfig(**options))

    # Hand the result back as a plain dict, matching the declared return type.
    return extraction.to_dict()
|
120
|
+
|
121
|
+
|
122
|
+
@mcp.tool()
def extract_simple(
    file_path: str,
    mime_type: str | None = None,
) -> str:
    """Simple text extraction from a document file.

    Args:
        file_path: Path to the document file
        mime_type: MIME type of the document (auto-detected if not provided)

    Returns:
        Extracted text content as a string
    """
    # NOTE(review): the docstring above is presumably what FastMCP advertises to
    # clients as this tool's description - confirm before rewording it.

    # Run with a default-constructed config and return only the text content.
    extraction = extract_file_sync(file_path, mime_type, ExtractionConfig())
    return extraction.content
|
139
|
+
|
140
|
+
|
141
|
+
@mcp.resource("config://default")
def get_default_config() -> str:
    """Get the default extraction configuration."""
    # The resource body is the string form of a default-constructed config's
    # attribute dict.
    # NOTE(review): this yields a Python dict repr, not JSON - confirm that is
    # what consumers of config://default expect.
    default_attributes = ExtractionConfig().__dict__
    return str(default_attributes)
|
146
|
+
|
147
|
+
|
148
|
+
@mcp.resource("config://available-backends")
def get_available_backends() -> str:
    """Get available OCR backends."""
    # Fixed list: nothing is probed at runtime, so a listed backend is not
    # necessarily installed.
    backend_names = ("tesseract", "easyocr", "paddleocr")
    return ", ".join(backend_names)
|
152
|
+
|
153
|
+
|
154
|
+
@mcp.resource("extractors://supported-formats")
def get_supported_formats() -> str:
    """Get supported document formats."""
    # Returns a fixed, human-readable overview; the list is hard-coded here and
    # ends with "And more...", so it is not exhaustive.
    # NOTE(review): this text was recovered from a diff view that dropped leading
    # whitespace - confirm the literal's per-line indentation against the wheel.
    return """
Supported formats:
- PDF documents
- Images (PNG, JPG, JPEG, TIFF, BMP, WEBP)
- Office documents (DOCX, PPTX, XLSX)
- HTML files
- Text files (TXT, CSV, TSV)
- And more...
"""
|
166
|
+
|
167
|
+
|
168
|
+
@mcp.prompt()
def extract_and_summarize(file_path: str) -> list[TextContent]:
    """Extract text from a document and provide a summary prompt.

    Args:
        file_path: Path to the document file

    Returns:
        Extracted content with summarization prompt
    """
    # NOTE(review): the docstring above is presumably what FastMCP advertises to
    # clients as this prompt's description - confirm before rewording it.

    # No MIME type is passed (None) and the config is default-constructed.
    extraction = extract_file_sync(file_path, None, ExtractionConfig())

    # One text message: the extracted content followed by the instruction.
    prompt_text = f"Document Content:\n{extraction.content}\n\nPlease provide a concise summary of this document."
    return [TextContent(type="text", text=prompt_text)]
|
186
|
+
|
187
|
+
|
188
|
+
@mcp.prompt()
def extract_structured(file_path: str) -> list[TextContent]:
    """Extract text with structured analysis prompt.

    Args:
        file_path: Path to the document file

    Returns:
        Extracted content with structured analysis prompt
    """
    # NOTE(review): the docstring above is presumably what FastMCP advertises to
    # clients as this prompt's description - confirm before rewording it.

    # Entities, keywords and tables are all switched on for this prompt; no
    # MIME type is passed (None).
    extraction = extract_file_sync(
        file_path,
        None,
        ExtractionConfig(
            extract_entities=True,
            extract_keywords=True,
            extract_tables=True,
        ),
    )

    # Build the prompt from ordered sections; optional sections are added only
    # when the corresponding result field is truthy.
    sections = [f"Document Content:\n{extraction.content}\n\n"]

    if extraction.entities:
        sections.append(f"Entities: {[f'{e.text} ({e.type})' for e in extraction.entities]}\n\n")

    if extraction.keywords:
        # Each keyword is indexed as a pair: [0] is shown as-is, [1] with two decimals.
        sections.append(f"Keywords: {[f'{kw[0]} ({kw[1]:.2f})' for kw in extraction.keywords]}\n\n")

    if extraction.tables:
        # Only the table count goes into the prompt, not the table contents.
        sections.append(f"Tables found: {len(extraction.tables)}\n\n")

    sections.append("Please analyze this document and provide structured insights.")

    return [TextContent(type="text", text="".join(sections))]
|
219
|
+
|
220
|
+
|
221
|
+
def main() -> None:
    """Start the module-level FastMCP server by calling ``mcp.run()``."""
    mcp.run()


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.6.2
|
3
|
+
Version: 3.7.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -26,6 +26,7 @@ Requires-Dist: anyio>=4.9.0
|
|
26
26
|
Requires-Dist: charset-normalizer>=3.4.2
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
28
|
Requires-Dist: html-to-markdown[lxml]>=1.6.0
|
29
|
+
Requires-Dist: mcp>=1.11.0
|
29
30
|
Requires-Dist: msgspec>=0.18.0
|
30
31
|
Requires-Dist: playa-pdf>=0.6.1
|
31
32
|
Requires-Dist: psutil>=7.0.0
|
@@ -90,7 +91,8 @@ Description-Content-Type: text/markdown
|
|
90
91
|
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
91
92
|
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
92
93
|
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
93
|
-
-
|
94
|
+
- **🤖 AI Integration**: Native MCP server for Claude and other AI tools
|
95
|
+
- **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
|
94
96
|
|
95
97
|
## Quick Start
|
96
98
|
|
@@ -136,6 +138,55 @@ asyncio.run(main())
|
|
136
138
|
|
137
139
|
## Deployment Options
|
138
140
|
|
141
|
+
### 🤖 MCP Server (AI Integration)
|
142
|
+
|
143
|
+
**Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
|
144
|
+
|
145
|
+
```bash
|
146
|
+
# Install and run MCP server with all features (recommended)
|
147
|
+
pip install "kreuzberg[all]"
|
148
|
+
kreuzberg-mcp
|
149
|
+
|
150
|
+
# Or with uvx (recommended for Claude Desktop)
|
151
|
+
uvx --with "kreuzberg[all]" kreuzberg-mcp
|
152
|
+
|
153
|
+
# Basic installation (core features only)
|
154
|
+
pip install kreuzberg
|
155
|
+
kreuzberg-mcp
|
156
|
+
```
|
157
|
+
|
158
|
+
**Configure in Claude Desktop (`claude_desktop_config.json`):**
|
159
|
+
|
160
|
+
```json
|
161
|
+
{
|
162
|
+
"mcpServers": {
|
163
|
+
"kreuzberg": {
|
164
|
+
"command": "uvx",
|
165
|
+
"args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
|
166
|
+
}
|
167
|
+
}
|
168
|
+
}
|
169
|
+
```
|
170
|
+
|
171
|
+
**Basic configuration (core features only):**
|
172
|
+
|
173
|
+
```json
|
174
|
+
{
|
175
|
+
"mcpServers": {
|
176
|
+
"kreuzberg": {
|
177
|
+
"command": "uvx",
|
178
|
+
"args": ["kreuzberg-mcp"]
|
179
|
+
}
|
180
|
+
}
|
181
|
+
}
|
182
|
+
```
|
183
|
+
|
184
|
+
**Available MCP capabilities:**
|
185
|
+
|
186
|
+
- **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
|
187
|
+
- **Resources**: Configuration, supported formats, OCR backends
|
188
|
+
- **Prompts**: Extract-and-summarize, structured analysis workflows
|
189
|
+
|
139
190
|
### 🐳 Docker (Recommended)
|
140
191
|
|
141
192
|
```bash
|
@@ -221,6 +272,7 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
221
272
|
|
222
273
|
## Advanced Features
|
223
274
|
|
275
|
+
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
224
276
|
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
225
277
|
- **🧩 Content Chunking**: Split documents for RAG applications
|
226
278
|
- **🎯 Custom Extractors**: Extend with your own document handlers
|
@@ -24,6 +24,8 @@ kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6
|
|
24
24
|
kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
|
25
25
|
kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
|
26
26
|
kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
|
27
|
+
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
28
|
+
kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
|
27
29
|
kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
|
28
30
|
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
|
29
31
|
kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
|
@@ -47,8 +49,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
|
|
47
49
|
kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
|
48
50
|
kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
|
49
51
|
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
50
|
-
kreuzberg-3.
|
51
|
-
kreuzberg-3.
|
52
|
-
kreuzberg-3.
|
53
|
-
kreuzberg-3.
|
54
|
-
kreuzberg-3.
|
52
|
+
kreuzberg-3.7.0.dist-info/METADATA,sha256=0rBXhtDYCdZ2AGpQdXTTZUQUX8T01OsKzwrm2nl14QA,11137
|
53
|
+
kreuzberg-3.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
54
|
+
kreuzberg-3.7.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
55
|
+
kreuzberg-3.7.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
56
|
+
kreuzberg-3.7.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|