kreuzberg-3.6.2-py3-none-any.whl → kreuzberg-3.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_base.py +40 -0
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +17 -18
- kreuzberg/_extractors/_pdf.py +68 -14
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_multiprocessing/__init__.py +2 -3
- kreuzberg/_ocr/__init__.py +30 -0
- kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
- kreuzberg/_ocr/_sync.py +566 -0
- kreuzberg/_ocr/_tesseract.py +6 -2
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +17 -2
- kreuzberg/_utils/_process_pool.py +178 -1
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
- kreuzberg-3.8.0.dist-info/RECORD +57 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg-3.6.2.dist-info/RECORD +0 -54
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -13,7 +13,7 @@ from python_calamine import CalamineWorkbook

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
@@ -45,9 +45,14 @@ class SpreadSheetExtractor(Extractor):
         try:
             results: list[str] = await run_taskgroup(*tasks)

-            return ExtractionResult(
-                content="\n\n".join(results),
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
             )
+
+            return self._apply_quality_processing(result)
         except ExceptionGroup as eg:
             raise ParsingError(
                 "Failed to extract file data",
@@ -87,7 +92,14 @@ class SpreadSheetExtractor(Extractor):
                 sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
                 results.append(sheet_text)

-
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
+            )
+
+            return self._apply_quality_processing(result)
         except Exception as e:
             raise ParsingError(
                 "Failed to extract file data",
@@ -181,3 +193,166 @@ class SpreadSheetExtractor(Extractor):
         result = "\n".join(markdown_lines)

         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Enhanced sheet processing with better table structure preservation."""
+        try:
+            # pandas is optional dependency
+            import pandas as pd
+
+            from kreuzberg._utils._table import enhance_table_markdown
+
+            sheet = workbook.get_sheet_by_name(sheet_name)
+            data = sheet.to_python()
+
+            if not data or not any(row for row in data):
+                return f"## {sheet_name}\n\n*Empty sheet*"
+
+            # Convert to DataFrame
+            df = pd.DataFrame(data)
+
+            # Clean up empty rows and columns
+            df = df.dropna(how="all").dropna(axis=1, how="all")
+
+            if df.empty:
+                return f"## {sheet_name}\n\n*No data*"
+
+            # Create a mock TableData for enhanced formatting
+            from PIL import Image
+
+            from kreuzberg._types import TableData
+
+            # Create a 1x1 transparent image as placeholder
+            placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
+            mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
+
+            enhanced_markdown = enhance_table_markdown(mock_table)
+            return f"## {sheet_name}\n\n{enhanced_markdown}"
+
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to original method if pandas/table enhancement fails
+            return self._convert_sheet_to_text_sync(workbook, sheet_name)
+
+    @staticmethod
+    def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
+        """Extract metadata from spreadsheet using python-calamine.
+
+        Args:
+            workbook: CalamineWorkbook instance
+
+        Returns:
+            Metadata dict using existing metadata keys where possible
+        """
+        metadata: Metadata = {}
+
+        # Extract basic document properties
+        SpreadSheetExtractor._extract_document_properties(workbook, metadata)
+
+        # Add structural information
+        SpreadSheetExtractor._add_structure_info(workbook, metadata)
+
+        # Analyze content complexity
+        SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
+
+        return metadata
+
+    @staticmethod
+    def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Extract basic document properties from workbook."""
+        with contextlib.suppress(AttributeError, Exception):
+            if not (hasattr(workbook, "metadata") and workbook.metadata):
+                return
+
+            props = workbook.metadata
+
+            # Basic properties mapping
+            property_mapping = {
+                "title": "title",
+                "author": "authors",  # Convert to list
+                "subject": "subject",
+                "comments": "comments",
+                "keywords": "keywords",  # Process separately
+                "category": "categories",  # Convert to list
+                "company": "organization",
+                "manager": "modified_by",
+            }
+
+            for prop_name, meta_key in property_mapping.items():
+                if hasattr(props, prop_name) and (value := getattr(props, prop_name)):
+                    if meta_key in ("authors", "categories"):
+                        metadata[meta_key] = [value]  # type: ignore[literal-required]
+                    elif meta_key == "keywords":
+                        keywords = [k.strip() for k in value.replace(";", ",").split(",") if k.strip()]
+                        if keywords:
+                            metadata[meta_key] = keywords  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = value  # type: ignore[literal-required]
+
+            # Handle dates separately
+            SpreadSheetExtractor._extract_date_properties(props, metadata)
+
+    @staticmethod
+    def _extract_date_properties(props: Any, metadata: Metadata) -> None:
+        """Extract and format date properties."""
+        date_mapping = {"created": "created_at", "modified": "modified_at"}
+
+        for prop_name, meta_key in date_mapping.items():
+            if hasattr(props, prop_name) and (date_value := getattr(props, prop_name)):
+                with contextlib.suppress(Exception):
+                    if hasattr(date_value, "isoformat"):
+                        metadata[meta_key] = date_value.isoformat()  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = str(date_value)  # type: ignore[literal-required]
+
+    @staticmethod
+    def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Add structural information about the spreadsheet."""
+        if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
+            return
+
+        sheet_count = len(workbook.sheet_names)
+        structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
+
+        # Don't list too many sheet names (magic number made constant)
+        max_sheet_names_to_list = 5
+        if sheet_count <= max_sheet_names_to_list:
+            structure_info += f": {', '.join(workbook.sheet_names)}"
+
+        metadata["description"] = structure_info
+
+    @staticmethod
+    def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Analyze spreadsheet content for complexity indicators."""
+        with contextlib.suppress(Exception):
+            has_formulas = False
+            total_cells = 0
+
+            # Check only first few sheets for performance
+            max_sheets_to_check = 3
+            max_rows_to_check = 50
+
+            for sheet_name in workbook.sheet_names[:max_sheets_to_check]:
+                with contextlib.suppress(Exception):
+                    sheet = workbook.get_sheet_by_name(sheet_name)
+                    data = sheet.to_python()
+
+                    for row in data[:max_rows_to_check]:
+                        if not row:  # Skip empty rows
+                            continue
+
+                        total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
+
+                        # Check for formulas (simple heuristic)
+                        if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
+                            has_formulas = True
+                            break
+
+            # Build summary
+            summary_parts = []
+            if total_cells > 0:
+                summary_parts.append(f"Contains {total_cells}+ data cells")
+            if has_formulas:
+                summary_parts.append("includes formulas")
+
+            if summary_parts and "summary" not in metadata:
+                metadata["summary"] = f"Spreadsheet that {', '.join(summary_parts)}."
kreuzberg/_extractors/_structured.py
ADDED
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._sync import run_sync
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class StructuredDataExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
+        JSON_MIME_TYPE,
+        "text/json",
+        YAML_MIME_TYPE,
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        TOML_MIME_TYPE,
+        "text/toml",
+    }
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        text_content = safe_decode(content)
+
+        try:
+            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
+                data = json.loads(text_content)
+            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                try:
+                    import tomllib  # type: ignore[import-not-found]
+                except ImportError:
+                    try:
+                        import tomli as tomllib  # type: ignore[import-not-found]
+                    except ImportError:
+                        return ExtractionResult(
+                            content=normalize_spaces(text_content),
+                            mime_type=PLAIN_TEXT_MIME_TYPE,
+                            metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                            chunks=[],
+                        )
+                data = tomllib.loads(text_content)
+            else:
+                try:
+                    import yaml
+
+                    data = yaml.safe_load(text_content)
+                except ImportError:
+                    return ExtractionResult(
+                        content=normalize_spaces(text_content),
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={"warning": "PyYAML not available, returning raw text"},
+                        chunks=[],
+                    )
+
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            if isinstance(data, dict):
+                text_parts.extend(self._extract_from_dict(data, metadata))
+            elif isinstance(data, list):
+                text_parts.extend(self._extract_from_list(data, metadata))
+            else:
+                text_parts.append(str(data))
+
+            combined_text = "\n".join(text_parts) if text_parts else text_content
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+            return ExtractionResult(
+                content=normalize_spaces(text_content),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={"parse_error": str(e)},
+                chunks=[],
+            )
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
+
+    def _extract_from_dict(self, data: dict[str, Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for key, value in data.items():
+            full_key = f"{prefix}.{key}" if prefix else key
+
+            if isinstance(value, str) and value.strip():
+                text_parts.append(f"{full_key}: {value}")
+
+                if any(
+                    text_field in key.lower()
+                    for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
+                ):
+                    metadata[full_key] = value
+
+            elif isinstance(value, (int, float, bool)):
+                text_parts.append(f"{full_key}: {value}")
+
+            elif isinstance(value, dict):
+                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+
+            elif isinstance(value, list):
+                text_parts.extend(self._extract_from_list(value, metadata, full_key))
+
+            elif value is not None:
+                text_parts.append(f"{full_key}: {value!s}")
+
+        return text_parts
+
+    def _extract_from_list(self, data: list[Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for i, item in enumerate(data):
+            item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
+
+            if isinstance(item, str) and item.strip():
+                text_parts.append(f"{item_key}: {item}")
+
+            elif isinstance(item, dict):
+                text_parts.extend(self._extract_from_dict(item, metadata, item_key))
+
+            elif isinstance(item, list):
+                text_parts.extend(self._extract_from_list(item, metadata, item_key))
+
+            elif item is not None:
+                text_parts.append(f"{item_key}: {item!s}")
+
+        return text_parts
kreuzberg/_gmft.py
CHANGED
@@ -196,7 +196,7 @@ async def extract_tables(  # noqa: PLR0915

    try:
        if use_isolated_process:
-            from kreuzberg._multiprocessing
+            from kreuzberg._multiprocessing import extract_tables_isolated_async

            result = await extract_tables_isolated_async(file_path, config)

@@ -314,7 +314,7 @@ def extract_tables_sync(
        return cached_result  # type: ignore[no-any-return]

    if use_isolated_process:
-        from kreuzberg._multiprocessing
+        from kreuzberg._multiprocessing import extract_tables_isolated

        result = extract_tables_isolated(file_path, config)

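Both hunks repoint the imports at the package root, matching the reworked kreuzberg/_multiprocessing/__init__.py at the end of this diff. A hedged sketch of the sync entry point; GMFTConfig is assumed to live in kreuzberg._gmft, and the path is a placeholder:

    from kreuzberg._gmft import GMFTConfig  # assumed home of the config type
    from kreuzberg._multiprocessing import extract_tables_isolated

    # Runs GMFT table extraction in an isolated process, as extract_tables_sync does.
    tables = extract_tables_isolated("paper.pdf", GMFTConfig())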
kreuzberg/_mcp/server.py
ADDED
@@ -0,0 +1,227 @@
+"""Kreuzberg MCP server implementation."""
+
+from __future__ import annotations
+
+import base64
+from typing import Any
+
+from mcp.server import FastMCP
+from mcp.types import TextContent
+
+from kreuzberg._types import ExtractionConfig, OcrBackendType
+from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
+
+# Create the MCP server
+mcp = FastMCP("Kreuzberg Text Extraction")
+
+
+@mcp.tool()
+def extract_document(  # noqa: PLR0913
+    file_path: str,
+    mime_type: str | None = None,
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+) -> dict[str, Any]:
+    """Extract text content from a document file.
+
+    Args:
+        file_path: Path to the document file
+        mime_type: MIME type of the document (auto-detected if not provided)
+        force_ocr: Force OCR even for text-based documents
+        chunk_content: Split content into chunks
+        extract_tables: Extract tables from the document
+        extract_entities: Extract named entities
+        extract_keywords: Extract keywords
+        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
+        max_chars: Maximum characters per chunk
+        max_overlap: Character overlap between chunks
+        keyword_count: Number of keywords to extract
+        auto_detect_language: Auto-detect document language
+
+    Returns:
+        Extracted content with metadata, tables, chunks, entities, and keywords
+    """
+    config = ExtractionConfig(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+    )
+
+    result = extract_file_sync(file_path, mime_type, config)
+    return result.to_dict()
+
+
+@mcp.tool()
+def extract_bytes(  # noqa: PLR0913
+    content_base64: str,
+    mime_type: str,
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+) -> dict[str, Any]:
+    """Extract text content from document bytes.
+
+    Args:
+        content_base64: Base64-encoded document content
+        mime_type: MIME type of the document
+        force_ocr: Force OCR even for text-based documents
+        chunk_content: Split content into chunks
+        extract_tables: Extract tables from the document
+        extract_entities: Extract named entities
+        extract_keywords: Extract keywords
+        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
+        max_chars: Maximum characters per chunk
+        max_overlap: Character overlap between chunks
+        keyword_count: Number of keywords to extract
+        auto_detect_language: Auto-detect document language
+
+    Returns:
+        Extracted content with metadata, tables, chunks, entities, and keywords
+    """
+    content_bytes = base64.b64decode(content_base64)
+
+    config = ExtractionConfig(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+    )
+
+    result = extract_bytes_sync(content_bytes, mime_type, config)
+    return result.to_dict()
+
+
+@mcp.tool()
+def extract_simple(
+    file_path: str,
+    mime_type: str | None = None,
+) -> str:
+    """Simple text extraction from a document file.
+
+    Args:
+        file_path: Path to the document file
+        mime_type: MIME type of the document (auto-detected if not provided)
+
+    Returns:
+        Extracted text content as a string
+    """
+    config = ExtractionConfig()
+    result = extract_file_sync(file_path, mime_type, config)
+    return result.content
+
+
+@mcp.resource("config://default")
+def get_default_config() -> str:
+    """Get the default extraction configuration."""
+    config = ExtractionConfig()
+    return str(config.__dict__)
+
+
+@mcp.resource("config://available-backends")
+def get_available_backends() -> str:
+    """Get available OCR backends."""
+    return "tesseract, easyocr, paddleocr"
+
+
+@mcp.resource("extractors://supported-formats")
+def get_supported_formats() -> str:
+    """Get supported document formats."""
+    return """
+Supported formats:
+- PDF documents
+- Images (PNG, JPG, JPEG, TIFF, BMP, WEBP)
+- Office documents (DOCX, PPTX, XLSX)
+- HTML files
+- Text files (TXT, CSV, TSV)
+- And more...
+"""
+
+
+@mcp.prompt()
+def extract_and_summarize(file_path: str) -> list[TextContent]:
+    """Extract text from a document and provide a summary prompt.
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        Extracted content with summarization prompt
+    """
+    result = extract_file_sync(file_path, None, ExtractionConfig())
+
+    return [
+        TextContent(
+            type="text",
+            text=f"Document Content:\n{result.content}\n\nPlease provide a concise summary of this document.",
+        )
+    ]
+
+
+@mcp.prompt()
+def extract_structured(file_path: str) -> list[TextContent]:
+    """Extract text with structured analysis prompt.
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        Extracted content with structured analysis prompt
+    """
+    config = ExtractionConfig(
+        extract_entities=True,
+        extract_keywords=True,
+        extract_tables=True,
+    )
+    result = extract_file_sync(file_path, None, config)
+
+    content = f"Document Content:\n{result.content}\n\n"
+
+    if result.entities:
+        content += f"Entities: {[f'{e.text} ({e.type})' for e in result.entities]}\n\n"
+
+    if result.keywords:
+        content += f"Keywords: {[f'{kw[0]} ({kw[1]:.2f})' for kw in result.keywords]}\n\n"
+
+    if result.tables:
+        content += f"Tables found: {len(result.tables)}\n\n"
+
+    content += "Please analyze this document and provide structured insights."
+
+    return [TextContent(type="text", text=content)]
+
+
+def main() -> None:
+    """Main entry point for the MCP server."""
+    mcp.run()
+
+
+if __name__ == "__main__":
+    main()
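FastMCP's decorators register the functions without replacing them, so the tools stay directly callable for a quick smoke test before wiring up an MCP client; the one-line addition to entry_points.txt presumably exposes main() as a console script. A sketch with a placeholder file name:

    from kreuzberg._mcp.server import extract_simple, main

    print(extract_simple("contract.pdf"))  # default config, returns plain text

    # Or serve the tools over stdio, FastMCP's default transport:
    main()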
kreuzberg/_mime_types.py
CHANGED
@@ -17,6 +17,12 @@ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

+EML_MIME_TYPE: Final = "message/rfc822"
+MSG_MIME_TYPE: Final = "application/vnd.ms-outlook"
+JSON_MIME_TYPE: Final = "application/json"
+YAML_MIME_TYPE: Final = "application/x-yaml"
+TOML_MIME_TYPE: Final = "application/toml"
+
 EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
 EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -127,6 +133,12 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
     ".org": "text/x-org",
     ".epub": "application/epub+zip",
     ".rtf": "application/rtf",
+    ".eml": EML_MIME_TYPE,
+    ".msg": MSG_MIME_TYPE,
+    ".json": JSON_MIME_TYPE,
+    ".yaml": YAML_MIME_TYPE,
+    ".yml": YAML_MIME_TYPE,
+    ".toml": TOML_MIME_TYPE,
     ".odt": "application/vnd.oasis.opendocument.text",
     ".docx": DOCX_MIME_TYPE,
     ".bib": "application/x-bibtex",
@@ -139,7 +151,21 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
     | SPREADSHEET_MIME_TYPES
-    | {
+    | {
+        PDF_MIME_TYPE,
+        POWER_POINT_MIME_TYPE,
+        HTML_MIME_TYPE,
+        EML_MIME_TYPE,
+        MSG_MIME_TYPE,
+        JSON_MIME_TYPE,
+        YAML_MIME_TYPE,
+        TOML_MIME_TYPE,
+        "text/json",
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        "text/toml",
+    }
 )

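The practical effect of the MIME-type additions, checkable directly against the new constants and tables:

    from kreuzberg._mime_types import EXT_TO_MIME_TYPE, SUPPORTED_MIME_TYPES

    assert EXT_TO_MIME_TYPE[".yml"] == "application/x-yaml"  # .yaml and .yml map alike
    assert EXT_TO_MIME_TYPE[".msg"] == "application/vnd.ms-outlook"
    assert "message/rfc822" in SUPPORTED_MIME_TYPES  # .eml documents are now supported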
kreuzberg/_multiprocessing/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 """Multiprocessing utilities for kreuzberg."""

-from .
-from .tesseract_pool import TesseractProcessPool
+from .gmft_isolated import extract_tables_isolated, extract_tables_isolated_async

-__all__ = ["
+__all__ = ["extract_tables_isolated", "extract_tables_isolated_async"]