kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +156 -30
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +4 -6
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +95 -27
- kreuzberg/_extractors/_html.py +85 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +209 -99
- kreuzberg/_extractors/_presentation.py +72 -8
- kreuzberg/_extractors/_spread_sheet.py +25 -30
- kreuzberg/_mcp/server.py +345 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +2 -2
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +74 -34
- kreuzberg/_types.py +182 -23
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_sync.py +36 -6
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +43 -27
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
- kreuzberg-3.15.0.dist-info/RECORD +60 -0
- kreuzberg-3.14.0.dist-info/RECORD +0 -58
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,13 +2,10 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import contextlib
|
4
4
|
import csv
|
5
|
-
import os
|
6
5
|
import sys
|
7
|
-
import tempfile
|
8
6
|
from datetime import date, datetime, time, timedelta
|
9
7
|
from io import StringIO
|
10
|
-
from
|
11
|
-
from typing import Any
|
8
|
+
from typing import TYPE_CHECKING, Any
|
12
9
|
|
13
10
|
import polars as pl
|
14
11
|
from anyio import Path as AsyncPath
|
@@ -21,9 +18,12 @@ from kreuzberg._types import ExtractionResult, Metadata, TableData
|
|
21
18
|
from kreuzberg._utils._string import normalize_spaces
|
22
19
|
from kreuzberg._utils._sync import run_sync, run_taskgroup
|
23
20
|
from kreuzberg._utils._table import enhance_table_markdown
|
24
|
-
from kreuzberg._utils._tmp import create_temp_file
|
21
|
+
from kreuzberg._utils._tmp import create_temp_file, temporary_file, temporary_file_sync
|
25
22
|
from kreuzberg.exceptions import ParsingError
|
26
23
|
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
from pathlib import Path
|
26
|
+
|
27
27
|
if sys.version_info < (3, 11): # pragma: no cover
|
28
28
|
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
29
29
|
|
@@ -48,12 +48,8 @@ class SpreadSheetExtractor(Extractor):
|
|
48
48
|
|
49
49
|
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
50
50
|
file_extension = self._get_file_extension()
|
51
|
-
|
52
|
-
await AsyncPath(xlsx_path).write_bytes(content)
|
53
|
-
try:
|
51
|
+
async with temporary_file(file_extension, content) as xlsx_path:
|
54
52
|
return await self.extract_path_async(xlsx_path)
|
55
|
-
finally:
|
56
|
-
await unlink()
|
57
53
|
|
58
54
|
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
59
55
|
try:
|
@@ -86,16 +82,8 @@ class SpreadSheetExtractor(Extractor):
|
|
86
82
|
|
87
83
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
88
84
|
file_extension = self._get_file_extension()
|
89
|
-
|
90
|
-
|
91
|
-
try:
|
92
|
-
with os.fdopen(fd, "wb") as f:
|
93
|
-
f.write(content)
|
94
|
-
|
95
|
-
return self.extract_path_sync(Path(temp_path))
|
96
|
-
finally:
|
97
|
-
with contextlib.suppress(OSError):
|
98
|
-
Path(temp_path).unlink()
|
85
|
+
with temporary_file_sync(file_extension, content) as temp_path:
|
86
|
+
return self.extract_path_sync(temp_path)
|
99
87
|
|
100
88
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
101
89
|
try:
|
@@ -122,15 +110,17 @@ class SpreadSheetExtractor(Extractor):
|
|
122
110
|
|
123
111
|
@staticmethod
|
124
112
|
def _convert_cell_to_str(value: Any) -> str:
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
113
|
+
match value:
|
114
|
+
case None:
|
115
|
+
return ""
|
116
|
+
case bool():
|
117
|
+
return str(value).lower()
|
118
|
+
case datetime() | date() | time():
|
119
|
+
return value.isoformat()
|
120
|
+
case timedelta():
|
121
|
+
return f"{value.total_seconds()} seconds"
|
122
|
+
case _:
|
123
|
+
return str(value)
|
134
124
|
|
135
125
|
async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
|
136
126
|
values = workbook.get_sheet_by_name(sheet_name).to_python()
|
@@ -207,7 +197,12 @@ class SpreadSheetExtractor(Extractor):
|
|
207
197
|
if not data or not any(row for row in data):
|
208
198
|
return f"## {sheet_name}\n\n*Empty sheet*"
|
209
199
|
|
210
|
-
|
200
|
+
# Normalize row lengths to avoid polars ShapeError
|
201
|
+
if data:
|
202
|
+
max_cols = max(len(row) if row else 0 for row in data)
|
203
|
+
data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data] # type: ignore[list-item]
|
204
|
+
|
205
|
+
df = pl.DataFrame(data, strict=False)
|
211
206
|
|
212
207
|
df = df.filter(~pl.all_horizontal(pl.all().is_null()))
|
213
208
|
df = df.select([col for col in df.columns if not df[col].is_null().all()])
|
kreuzberg/_mcp/server.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import base64
|
4
|
+
import binascii
|
4
5
|
import json
|
6
|
+
from pathlib import Path
|
5
7
|
from typing import Any
|
6
8
|
|
7
9
|
import msgspec
|
@@ -9,34 +11,178 @@ from mcp.server import FastMCP
|
|
9
11
|
from mcp.types import TextContent
|
10
12
|
|
11
13
|
from kreuzberg._config import discover_config
|
12
|
-
from kreuzberg._types import ExtractionConfig, OcrBackendType
|
13
|
-
from kreuzberg.
|
14
|
+
from kreuzberg._types import ExtractionConfig, OcrBackendType, PSMMode, TesseractConfig
|
15
|
+
from kreuzberg.exceptions import ValidationError
|
16
|
+
from kreuzberg.extraction import (
|
17
|
+
batch_extract_bytes_sync,
|
18
|
+
batch_extract_file_sync,
|
19
|
+
extract_bytes_sync,
|
20
|
+
extract_file_sync,
|
21
|
+
)
|
14
22
|
|
15
23
|
mcp = FastMCP("Kreuzberg Text Extraction")
|
16
24
|
|
25
|
+
# Security and performance limits
|
26
|
+
MAX_BATCH_SIZE = 100
|
27
|
+
|
28
|
+
|
29
|
+
def _validate_file_path(file_path: str) -> Path:
|
30
|
+
"""Validate file path to prevent path traversal attacks.
|
31
|
+
|
32
|
+
Args:
|
33
|
+
file_path: The file path to validate
|
34
|
+
|
35
|
+
Returns:
|
36
|
+
Path: The validated Path object
|
37
|
+
|
38
|
+
Raises:
|
39
|
+
ValidationError: If path traversal is detected or path is invalid
|
40
|
+
"""
|
41
|
+
try:
|
42
|
+
path = Path(file_path).resolve()
|
43
|
+
except (OSError, ValueError) as e:
|
44
|
+
raise ValidationError(
|
45
|
+
f"Invalid file path: {file_path}",
|
46
|
+
context={"file_path": file_path, "error": str(e)},
|
47
|
+
) from e
|
48
|
+
|
49
|
+
# Check for path traversal attempts
|
50
|
+
if ".." in file_path and not file_path.startswith("/"):
|
51
|
+
raise ValidationError(
|
52
|
+
"Path traversal detected in file path",
|
53
|
+
context={"file_path": file_path, "resolved_path": str(path)},
|
54
|
+
)
|
55
|
+
|
56
|
+
if not path.exists():
|
57
|
+
raise ValidationError(
|
58
|
+
f"File not found: {file_path}",
|
59
|
+
context={"file_path": file_path, "resolved_path": str(path)},
|
60
|
+
)
|
61
|
+
|
62
|
+
if not path.is_file():
|
63
|
+
raise ValidationError(
|
64
|
+
f"Path is not a file: {file_path}",
|
65
|
+
context={"file_path": file_path, "resolved_path": str(path)},
|
66
|
+
)
|
67
|
+
|
68
|
+
return path
|
69
|
+
|
70
|
+
|
71
|
+
def _validate_file_path_with_context(file_path: str, index: int, total: int) -> Path:
|
72
|
+
"""Validate file path and add context for batch operations."""
|
73
|
+
try:
|
74
|
+
return _validate_file_path(file_path)
|
75
|
+
except ValidationError as e:
|
76
|
+
# Add context about which file in the batch failed
|
77
|
+
e.context = e.context or {}
|
78
|
+
e.context["batch_index"] = index
|
79
|
+
e.context["total_files"] = total
|
80
|
+
raise
|
81
|
+
|
82
|
+
|
83
|
+
def _validate_base64_content(content_base64: str, context_info: str | None = None) -> bytes:
|
84
|
+
"""Validate and decode base64 content with proper error handling.
|
85
|
+
|
86
|
+
Args:
|
87
|
+
content_base64: The base64 string to validate and decode
|
88
|
+
context_info: Additional context information for error reporting
|
89
|
+
|
90
|
+
Returns:
|
91
|
+
bytes: The decoded content
|
92
|
+
|
93
|
+
Raises:
|
94
|
+
ValidationError: If the base64 content is invalid
|
95
|
+
"""
|
96
|
+
if not content_base64:
|
97
|
+
raise ValidationError(
|
98
|
+
"Base64 content cannot be empty",
|
99
|
+
context={"context": context_info},
|
100
|
+
)
|
101
|
+
|
102
|
+
# Check for whitespace-only content
|
103
|
+
if not content_base64.strip():
|
104
|
+
raise ValidationError(
|
105
|
+
"Base64 content cannot be whitespace only",
|
106
|
+
context={"content_preview": content_base64[:50], "context": context_info},
|
107
|
+
)
|
108
|
+
|
109
|
+
try:
|
110
|
+
content_bytes = base64.b64decode(content_base64, validate=True)
|
111
|
+
except (ValueError, binascii.Error) as e:
|
112
|
+
error_type = type(e).__name__
|
113
|
+
raise ValidationError(
|
114
|
+
f"Invalid base64 content: {error_type}: {e}",
|
115
|
+
context={
|
116
|
+
"error_type": error_type,
|
117
|
+
"error": str(e),
|
118
|
+
"content_preview": content_base64[:50] + "..." if len(content_base64) > 50 else content_base64,
|
119
|
+
"context": context_info,
|
120
|
+
},
|
121
|
+
) from e
|
122
|
+
|
123
|
+
return content_bytes
|
124
|
+
|
17
125
|
|
18
126
|
def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
|
19
127
|
base_config = discover_config()
|
20
128
|
|
129
|
+
# Extract Tesseract-specific parameters from kwargs first
|
130
|
+
tesseract_lang = kwargs.pop("tesseract_lang", None)
|
131
|
+
tesseract_psm = kwargs.pop("tesseract_psm", None)
|
132
|
+
tesseract_output_format = kwargs.pop("tesseract_output_format", None)
|
133
|
+
enable_table_detection = kwargs.pop("enable_table_detection", None)
|
134
|
+
|
21
135
|
if base_config is None:
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
136
|
+
config_dict = kwargs
|
137
|
+
else:
|
138
|
+
config_dict = {
|
139
|
+
"force_ocr": base_config.force_ocr,
|
140
|
+
"chunk_content": base_config.chunk_content,
|
141
|
+
"extract_tables": base_config.extract_tables,
|
142
|
+
"extract_entities": base_config.extract_entities,
|
143
|
+
"extract_keywords": base_config.extract_keywords,
|
144
|
+
"ocr_backend": base_config.ocr_backend,
|
145
|
+
"max_chars": base_config.max_chars,
|
146
|
+
"max_overlap": base_config.max_overlap,
|
147
|
+
"keyword_count": base_config.keyword_count,
|
148
|
+
"auto_detect_language": base_config.auto_detect_language,
|
149
|
+
"ocr_config": base_config.ocr_config,
|
150
|
+
"gmft_config": base_config.gmft_config,
|
151
|
+
}
|
152
|
+
config_dict = config_dict | kwargs
|
153
|
+
|
154
|
+
# Handle Tesseract OCR configuration
|
155
|
+
ocr_backend = config_dict.get("ocr_backend")
|
156
|
+
if ocr_backend == "tesseract" and (
|
157
|
+
tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
|
158
|
+
):
|
159
|
+
tesseract_config_dict = {}
|
160
|
+
|
161
|
+
if tesseract_lang:
|
162
|
+
tesseract_config_dict["language"] = tesseract_lang
|
163
|
+
if tesseract_psm is not None:
|
164
|
+
try:
|
165
|
+
tesseract_config_dict["psm"] = PSMMode(tesseract_psm)
|
166
|
+
except ValueError as e:
|
167
|
+
raise ValidationError(
|
168
|
+
f"Invalid PSM mode value: {tesseract_psm}",
|
169
|
+
context={"psm_value": tesseract_psm, "error": str(e)},
|
170
|
+
) from e
|
171
|
+
if tesseract_output_format:
|
172
|
+
tesseract_config_dict["output_format"] = tesseract_output_format
|
173
|
+
if enable_table_detection:
|
174
|
+
tesseract_config_dict["enable_table_detection"] = True
|
175
|
+
|
176
|
+
if tesseract_config_dict:
|
177
|
+
# Merge with existing tesseract config if present
|
178
|
+
existing_ocr_config = config_dict.get("ocr_config")
|
179
|
+
if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
|
180
|
+
# Convert existing config to dict, merge, and recreate
|
181
|
+
existing_dict = existing_ocr_config.to_dict()
|
182
|
+
merged_dict = existing_dict | tesseract_config_dict
|
183
|
+
config_dict["ocr_config"] = TesseractConfig(**merged_dict)
|
184
|
+
else:
|
185
|
+
config_dict["ocr_config"] = TesseractConfig(**tesseract_config_dict)
|
40
186
|
|
41
187
|
return ExtractionConfig(**config_dict)
|
42
188
|
|
@@ -55,7 +201,13 @@ def extract_document( # noqa: PLR0913
|
|
55
201
|
max_overlap: int = 200,
|
56
202
|
keyword_count: int = 10,
|
57
203
|
auto_detect_language: bool = False,
|
204
|
+
tesseract_lang: str | None = None,
|
205
|
+
tesseract_psm: int | None = None,
|
206
|
+
tesseract_output_format: str | None = None,
|
207
|
+
enable_table_detection: bool | None = None,
|
58
208
|
) -> dict[str, Any]:
|
209
|
+
# Validate file path for security
|
210
|
+
validated_path = _validate_file_path(file_path)
|
59
211
|
config = _create_config_with_overrides(
|
60
212
|
force_ocr=force_ocr,
|
61
213
|
chunk_content=chunk_content,
|
@@ -67,9 +219,13 @@ def extract_document( # noqa: PLR0913
|
|
67
219
|
max_overlap=max_overlap,
|
68
220
|
keyword_count=keyword_count,
|
69
221
|
auto_detect_language=auto_detect_language,
|
222
|
+
tesseract_lang=tesseract_lang,
|
223
|
+
tesseract_psm=tesseract_psm,
|
224
|
+
tesseract_output_format=tesseract_output_format,
|
225
|
+
enable_table_detection=enable_table_detection,
|
70
226
|
)
|
71
227
|
|
72
|
-
result = extract_file_sync(
|
228
|
+
result = extract_file_sync(str(validated_path), mime_type, config)
|
73
229
|
return result.to_dict(include_none=True)
|
74
230
|
|
75
231
|
|
@@ -87,8 +243,12 @@ def extract_bytes( # noqa: PLR0913
|
|
87
243
|
max_overlap: int = 200,
|
88
244
|
keyword_count: int = 10,
|
89
245
|
auto_detect_language: bool = False,
|
246
|
+
tesseract_lang: str | None = None,
|
247
|
+
tesseract_psm: int | None = None,
|
248
|
+
tesseract_output_format: str | None = None,
|
249
|
+
enable_table_detection: bool | None = None,
|
90
250
|
) -> dict[str, Any]:
|
91
|
-
content_bytes =
|
251
|
+
content_bytes = _validate_base64_content(content_base64, "extract_bytes")
|
92
252
|
|
93
253
|
config = _create_config_with_overrides(
|
94
254
|
force_ocr=force_ocr,
|
@@ -101,19 +261,175 @@ def extract_bytes( # noqa: PLR0913
|
|
101
261
|
max_overlap=max_overlap,
|
102
262
|
keyword_count=keyword_count,
|
103
263
|
auto_detect_language=auto_detect_language,
|
264
|
+
tesseract_lang=tesseract_lang,
|
265
|
+
tesseract_psm=tesseract_psm,
|
266
|
+
tesseract_output_format=tesseract_output_format,
|
267
|
+
enable_table_detection=enable_table_detection,
|
104
268
|
)
|
105
269
|
|
106
270
|
result = extract_bytes_sync(content_bytes, mime_type, config)
|
107
271
|
return result.to_dict(include_none=True)
|
108
272
|
|
109
273
|
|
274
|
+
@mcp.tool()
|
275
|
+
def batch_extract_document( # noqa: PLR0913
|
276
|
+
file_paths: list[str],
|
277
|
+
force_ocr: bool = False,
|
278
|
+
chunk_content: bool = False,
|
279
|
+
extract_tables: bool = False,
|
280
|
+
extract_entities: bool = False,
|
281
|
+
extract_keywords: bool = False,
|
282
|
+
ocr_backend: OcrBackendType = "tesseract",
|
283
|
+
max_chars: int = 1000,
|
284
|
+
max_overlap: int = 200,
|
285
|
+
keyword_count: int = 10,
|
286
|
+
auto_detect_language: bool = False,
|
287
|
+
tesseract_lang: str | None = None,
|
288
|
+
tesseract_psm: int | None = None,
|
289
|
+
tesseract_output_format: str | None = None,
|
290
|
+
enable_table_detection: bool | None = None,
|
291
|
+
) -> list[dict[str, Any]]:
|
292
|
+
# Validate batch size
|
293
|
+
if len(file_paths) > MAX_BATCH_SIZE:
|
294
|
+
raise ValidationError(
|
295
|
+
f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
|
296
|
+
context={"batch_size": len(file_paths), "max_batch_size": MAX_BATCH_SIZE},
|
297
|
+
)
|
298
|
+
|
299
|
+
if not file_paths:
|
300
|
+
raise ValidationError(
|
301
|
+
"File paths list cannot be empty",
|
302
|
+
context={"file_paths": file_paths},
|
303
|
+
)
|
304
|
+
|
305
|
+
# Validate all file paths for security
|
306
|
+
validated_paths = []
|
307
|
+
for i, file_path in enumerate(file_paths):
|
308
|
+
validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
|
309
|
+
validated_paths.append(str(validated_path))
|
310
|
+
config = _create_config_with_overrides(
|
311
|
+
force_ocr=force_ocr,
|
312
|
+
chunk_content=chunk_content,
|
313
|
+
extract_tables=extract_tables,
|
314
|
+
extract_entities=extract_entities,
|
315
|
+
extract_keywords=extract_keywords,
|
316
|
+
ocr_backend=ocr_backend,
|
317
|
+
max_chars=max_chars,
|
318
|
+
max_overlap=max_overlap,
|
319
|
+
keyword_count=keyword_count,
|
320
|
+
auto_detect_language=auto_detect_language,
|
321
|
+
tesseract_lang=tesseract_lang,
|
322
|
+
tesseract_psm=tesseract_psm,
|
323
|
+
tesseract_output_format=tesseract_output_format,
|
324
|
+
enable_table_detection=enable_table_detection,
|
325
|
+
)
|
326
|
+
|
327
|
+
results = batch_extract_file_sync(validated_paths, config)
|
328
|
+
return [result.to_dict(include_none=True) for result in results]
|
329
|
+
|
330
|
+
|
331
|
+
@mcp.tool()
|
332
|
+
def batch_extract_bytes( # noqa: PLR0913
|
333
|
+
content_items: list[dict[str, str]],
|
334
|
+
force_ocr: bool = False,
|
335
|
+
chunk_content: bool = False,
|
336
|
+
extract_tables: bool = False,
|
337
|
+
extract_entities: bool = False,
|
338
|
+
extract_keywords: bool = False,
|
339
|
+
ocr_backend: OcrBackendType = "tesseract",
|
340
|
+
max_chars: int = 1000,
|
341
|
+
max_overlap: int = 200,
|
342
|
+
keyword_count: int = 10,
|
343
|
+
auto_detect_language: bool = False,
|
344
|
+
tesseract_lang: str | None = None,
|
345
|
+
tesseract_psm: int | None = None,
|
346
|
+
tesseract_output_format: str | None = None,
|
347
|
+
enable_table_detection: bool | None = None,
|
348
|
+
) -> list[dict[str, Any]]:
|
349
|
+
# Validate input
|
350
|
+
if not content_items:
|
351
|
+
raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
|
352
|
+
|
353
|
+
if not isinstance(content_items, list):
|
354
|
+
raise ValidationError(
|
355
|
+
"content_items must be a list", context={"content_items_type": type(content_items).__name__}
|
356
|
+
)
|
357
|
+
|
358
|
+
# Validate batch size
|
359
|
+
if len(content_items) > MAX_BATCH_SIZE:
|
360
|
+
raise ValidationError(
|
361
|
+
f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
|
362
|
+
context={"batch_size": len(content_items), "max_batch_size": MAX_BATCH_SIZE},
|
363
|
+
)
|
364
|
+
|
365
|
+
config = _create_config_with_overrides(
|
366
|
+
force_ocr=force_ocr,
|
367
|
+
chunk_content=chunk_content,
|
368
|
+
extract_tables=extract_tables,
|
369
|
+
extract_entities=extract_entities,
|
370
|
+
extract_keywords=extract_keywords,
|
371
|
+
ocr_backend=ocr_backend,
|
372
|
+
max_chars=max_chars,
|
373
|
+
max_overlap=max_overlap,
|
374
|
+
keyword_count=keyword_count,
|
375
|
+
auto_detect_language=auto_detect_language,
|
376
|
+
tesseract_lang=tesseract_lang,
|
377
|
+
tesseract_psm=tesseract_psm,
|
378
|
+
tesseract_output_format=tesseract_output_format,
|
379
|
+
enable_table_detection=enable_table_detection,
|
380
|
+
)
|
381
|
+
|
382
|
+
# Convert list of dicts to list of tuples (bytes, mime_type)
|
383
|
+
contents = []
|
384
|
+
for i, item in enumerate(content_items):
|
385
|
+
# Validate item structure
|
386
|
+
if not isinstance(item, dict):
|
387
|
+
raise ValidationError(
|
388
|
+
f"Item at index {i} must be a dictionary",
|
389
|
+
context={"item_index": i, "item_type": type(item).__name__, "item": item},
|
390
|
+
)
|
391
|
+
|
392
|
+
# Check for required keys
|
393
|
+
if "content_base64" not in item:
|
394
|
+
raise ValidationError(
|
395
|
+
f"Item at index {i} is missing required key 'content_base64'",
|
396
|
+
context={"item_index": i, "item_keys": list(item.keys()), "item": item},
|
397
|
+
)
|
398
|
+
|
399
|
+
if "mime_type" not in item:
|
400
|
+
raise ValidationError(
|
401
|
+
f"Item at index {i} is missing required key 'mime_type'",
|
402
|
+
context={"item_index": i, "item_keys": list(item.keys()), "item": item},
|
403
|
+
)
|
404
|
+
|
405
|
+
content_base64 = item["content_base64"]
|
406
|
+
mime_type = item["mime_type"]
|
407
|
+
|
408
|
+
# Validate base64 content
|
409
|
+
try:
|
410
|
+
content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
|
411
|
+
except ValidationError as e:
|
412
|
+
# Add batch-specific context
|
413
|
+
e.context = e.context or {}
|
414
|
+
e.context["item_index"] = i
|
415
|
+
e.context["total_items"] = len(content_items)
|
416
|
+
raise
|
417
|
+
|
418
|
+
contents.append((content_bytes, mime_type))
|
419
|
+
|
420
|
+
results = batch_extract_bytes_sync(contents, config)
|
421
|
+
return [result.to_dict(include_none=True) for result in results]
|
422
|
+
|
423
|
+
|
110
424
|
@mcp.tool()
|
111
425
|
def extract_simple(
|
112
426
|
file_path: str,
|
113
427
|
mime_type: str | None = None,
|
114
428
|
) -> str:
|
429
|
+
# Validate file path for security
|
430
|
+
validated_path = _validate_file_path(file_path)
|
115
431
|
config = _create_config_with_overrides()
|
116
|
-
result = extract_file_sync(
|
432
|
+
result = extract_file_sync(str(validated_path), mime_type, config)
|
117
433
|
return result.content
|
118
434
|
|
119
435
|
|
@@ -151,7 +467,9 @@ def get_supported_formats() -> str:
|
|
151
467
|
|
152
468
|
@mcp.prompt()
|
153
469
|
def extract_and_summarize(file_path: str) -> list[TextContent]:
|
154
|
-
|
470
|
+
# Validate file path for security
|
471
|
+
validated_path = _validate_file_path(file_path)
|
472
|
+
result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
|
155
473
|
|
156
474
|
return [
|
157
475
|
TextContent(
|
@@ -163,12 +481,14 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
|
|
163
481
|
|
164
482
|
@mcp.prompt()
|
165
483
|
def extract_structured(file_path: str) -> list[TextContent]:
|
484
|
+
# Validate file path for security
|
485
|
+
validated_path = _validate_file_path(file_path)
|
166
486
|
config = _create_config_with_overrides(
|
167
487
|
extract_entities=True,
|
168
488
|
extract_keywords=True,
|
169
489
|
extract_tables=True,
|
170
490
|
)
|
171
|
-
result = extract_file_sync(
|
491
|
+
result = extract_file_sync(str(validated_path), None, config)
|
172
492
|
|
173
493
|
content = f"Document Content:\n{result.content}\n\n"
|
174
494
|
|
kreuzberg/_mime_types.py
CHANGED
@@ -56,6 +56,48 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
|
|
56
56
|
"image/x-tiff",
|
57
57
|
}
|
58
58
|
|
59
|
+
IMAGE_FORMATS: Final[frozenset[str]] = frozenset(
|
60
|
+
{
|
61
|
+
"jpg",
|
62
|
+
"jpeg",
|
63
|
+
"png",
|
64
|
+
"gif",
|
65
|
+
"bmp",
|
66
|
+
"tiff",
|
67
|
+
"tif",
|
68
|
+
"webp",
|
69
|
+
"jp2",
|
70
|
+
"jpx",
|
71
|
+
"jpm",
|
72
|
+
"mj2",
|
73
|
+
"pnm",
|
74
|
+
"pbm",
|
75
|
+
"pgm",
|
76
|
+
"ppm",
|
77
|
+
}
|
78
|
+
)
|
79
|
+
|
80
|
+
IMAGE_MIME_TO_EXT: Final[dict[str, str]] = {
|
81
|
+
"image/bmp": "bmp",
|
82
|
+
"image/x-bmp": "bmp",
|
83
|
+
"image/x-ms-bmp": "bmp",
|
84
|
+
"image/gif": "gif",
|
85
|
+
"image/jpeg": "jpg",
|
86
|
+
"image/pjpeg": "jpg",
|
87
|
+
"image/png": "png",
|
88
|
+
"image/tiff": "tiff",
|
89
|
+
"image/x-tiff": "tiff",
|
90
|
+
"image/jp2": "jp2",
|
91
|
+
"image/jpx": "jpx",
|
92
|
+
"image/jpm": "jpm",
|
93
|
+
"image/mj2": "mj2",
|
94
|
+
"image/webp": "webp",
|
95
|
+
"image/x-portable-anymap": "pnm",
|
96
|
+
"image/x-portable-bitmap": "pbm",
|
97
|
+
"image/x-portable-graymap": "pgm",
|
98
|
+
"image/x-portable-pixmap": "ppm",
|
99
|
+
}
|
100
|
+
|
59
101
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
60
102
|
"application/csl+json",
|
61
103
|
"application/docbook+xml",
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -239,7 +239,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
239
239
|
)
|
240
240
|
|
241
241
|
return ExtractionResult(
|
242
|
-
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
|
242
|
+
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
|
243
243
|
)
|
244
244
|
|
245
245
|
# Group text boxes by lines based on Y coordinate # ~keep
|
@@ -287,7 +287,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
287
287
|
)
|
288
288
|
|
289
289
|
return ExtractionResult(
|
290
|
-
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
|
290
|
+
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
|
291
291
|
)
|
292
292
|
|
293
293
|
@classmethod
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -192,7 +192,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
192
192
|
)
|
193
193
|
|
194
194
|
return ExtractionResult(
|
195
|
-
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
|
195
|
+
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
|
196
196
|
)
|
197
197
|
|
198
198
|
@classmethod
|