kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff reflects the changes between these publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 import base64
+import binascii
 import json
+from pathlib import Path
 from typing import Any
 
 import msgspec
@@ -9,34 +11,170 @@ from mcp.server import FastMCP
 from mcp.types import TextContent
 
 from kreuzberg._config import discover_config
-from kreuzberg._types import ExtractionConfig, OcrBackendType
-from kreuzberg.
+from kreuzberg._types import ExtractionConfig, OcrBackendType, PSMMode, TesseractConfig
+from kreuzberg.exceptions import ValidationError
+from kreuzberg.extraction import (
+    batch_extract_bytes_sync,
+    batch_extract_file_sync,
+    extract_bytes_sync,
+    extract_file_sync,
+)
 
 mcp = FastMCP("Kreuzberg Text Extraction")
 
+MAX_BATCH_SIZE = 100
+
+
+def _validate_file_path(file_path: str) -> Path:
+    """Validate file path to prevent path traversal attacks.
+
+    Args:
+        file_path: The file path to validate
+
+    Returns:
+        Path: The validated Path object
+
+    Raises:
+        ValidationError: If path traversal is detected or path is invalid
+    """
+    try:
+        path = Path(file_path).resolve()
+    except (OSError, ValueError) as e:
+        raise ValidationError(
+            f"Invalid file path: {file_path}",
+            context={"file_path": file_path, "error": str(e)},
+        ) from e
+
+    if ".." in file_path and not file_path.startswith("/"):
+        raise ValidationError(
+            "Path traversal detected in file path",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+
+    if not path.exists():
+        raise ValidationError(
+            f"File not found: {file_path}",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+
+    if not path.is_file():
+        raise ValidationError(
+            f"Path is not a file: {file_path}",
+            context={"file_path": file_path, "resolved_path": str(path)},
+        )
+
+    return path
+
+
+def _validate_file_path_with_context(file_path: str, index: int, total: int) -> Path:
+    """Validate file path and add context for batch operations."""
+    try:
+        return _validate_file_path(file_path)
+    except ValidationError as e:
+        e.context = e.context or {}
+        e.context["batch_index"] = index
+        e.context["total_files"] = total
+        raise
+
+
+def _validate_base64_content(content_base64: str, context_info: str | None = None) -> bytes:
+    """Validate and decode base64 content with proper error handling.
+
+    Args:
+        content_base64: The base64 string to validate and decode
+        context_info: Additional context information for error reporting
+
+    Returns:
+        bytes: The decoded content
+
+    Raises:
+        ValidationError: If the base64 content is invalid
+    """
+    if not content_base64:
+        raise ValidationError(
+            "Base64 content cannot be empty",
+            context={"context": context_info},
+        )
+
+    if not content_base64.strip():
+        raise ValidationError(
+            "Base64 content cannot be whitespace only",
+            context={"content_preview": content_base64[:50], "context": context_info},
+        )
+
+    try:
+        content_bytes = base64.b64decode(content_base64, validate=True)
+    except (ValueError, binascii.Error) as e:
+        error_type = type(e).__name__
+        raise ValidationError(
+            f"Invalid base64 content: {error_type}: {e}",
+            context={
+                "error_type": error_type,
+                "error": str(e),
+                "content_preview": content_base64[:50] + "..." if len(content_base64) > 50 else content_base64,
+                "context": context_info,
+            },
+        ) from e
+
+    return content_bytes
+
 
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()
 
+    tesseract_lang = kwargs.pop("tesseract_lang", None)
+    tesseract_psm = kwargs.pop("tesseract_psm", None)
+    tesseract_output_format = kwargs.pop("tesseract_output_format", None)
+    enable_table_detection = kwargs.pop("enable_table_detection", None)
+
     if base_config is None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        config_dict = kwargs
+    else:
+        config_dict = {
+            "force_ocr": base_config.force_ocr,
+            "chunk_content": base_config.chunk_content,
+            "extract_tables": base_config.extract_tables,
+            "extract_entities": base_config.extract_entities,
+            "extract_keywords": base_config.extract_keywords,
+            "ocr_backend": base_config.ocr_backend,
+            "max_chars": base_config.max_chars,
+            "max_overlap": base_config.max_overlap,
+            "keyword_count": base_config.keyword_count,
+            "auto_detect_language": base_config.auto_detect_language,
+            "ocr_config": base_config.ocr_config,
+            "gmft_config": base_config.gmft_config,
+        }
+        config_dict = config_dict | kwargs
+
+    ocr_backend = config_dict.get("ocr_backend")
+    if ocr_backend == "tesseract" and (
+        tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
+    ):
+        tesseract_config_dict = {}
+
+        if tesseract_lang:
+            tesseract_config_dict["language"] = tesseract_lang
+        if tesseract_psm is not None:
+            try:
+                tesseract_config_dict["psm"] = PSMMode(tesseract_psm)
+            except ValueError as e:
+                raise ValidationError(
+                    f"Invalid PSM mode value: {tesseract_psm}",
+                    context={"psm_value": tesseract_psm, "error": str(e)},
+                ) from e
+        if tesseract_output_format:
+            tesseract_config_dict["output_format"] = tesseract_output_format
+        if enable_table_detection:
+            tesseract_config_dict["enable_table_detection"] = True
+
+        if tesseract_config_dict:
+            existing_ocr_config = config_dict.get("ocr_config")
+            if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
+                existing_dict = existing_ocr_config.to_dict()
+                merged_dict = existing_dict | tesseract_config_dict
+                config_dict["ocr_config"] = TesseractConfig(**merged_dict)
+            else:
+                config_dict["ocr_config"] = TesseractConfig(**tesseract_config_dict)
 
     return ExtractionConfig(**config_dict)
 
@@ -55,7 +193,12 @@ def extract_document(  # noqa: PLR0913
     max_overlap: int = 200,
     keyword_count: int = 10,
     auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
         chunk_content=chunk_content,
@@ -67,9 +210,13 @@ def extract_document(  # noqa: PLR0913
         max_overlap=max_overlap,
         keyword_count=keyword_count,
         auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
     )
 
-    result = extract_file_sync(
+    result = extract_file_sync(str(validated_path), mime_type, config)
     return result.to_dict(include_none=True)
 
 
@@ -87,8 +234,12 @@ def extract_bytes(  # noqa: PLR0913
     max_overlap: int = 200,
     keyword_count: int = 10,
     auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    content_bytes =
+    content_bytes = _validate_base64_content(content_base64, "extract_bytes")
 
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -101,19 +252,165 @@ def extract_bytes(  # noqa: PLR0913
         max_overlap=max_overlap,
         keyword_count=keyword_count,
         auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
     )
 
     result = extract_bytes_sync(content_bytes, mime_type, config)
     return result.to_dict(include_none=True)
 
 
+@mcp.tool()
+def batch_extract_document(  # noqa: PLR0913
+    file_paths: list[str],
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
+) -> list[dict[str, Any]]:
+    if len(file_paths) > MAX_BATCH_SIZE:
+        raise ValidationError(
+            f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
+            context={"batch_size": len(file_paths), "max_batch_size": MAX_BATCH_SIZE},
+        )
+
+    if not file_paths:
+        raise ValidationError(
+            "File paths list cannot be empty",
+            context={"file_paths": file_paths},
+        )
+
+    validated_paths = []
+    for i, file_path in enumerate(file_paths):
+        validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
+        validated_paths.append(str(validated_path))
+    config = _create_config_with_overrides(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
+    )
+
+    results = batch_extract_file_sync(validated_paths, config)
+    return [result.to_dict(include_none=True) for result in results]
+
+
+@mcp.tool()
+def batch_extract_bytes(  # noqa: PLR0913
+    content_items: list[dict[str, str]],
+    force_ocr: bool = False,
+    chunk_content: bool = False,
+    extract_tables: bool = False,
+    extract_entities: bool = False,
+    extract_keywords: bool = False,
+    ocr_backend: OcrBackendType = "tesseract",
+    max_chars: int = 1000,
+    max_overlap: int = 200,
+    keyword_count: int = 10,
+    auto_detect_language: bool = False,
+    tesseract_lang: str | None = None,
+    tesseract_psm: int | None = None,
+    tesseract_output_format: str | None = None,
+    enable_table_detection: bool | None = None,
+) -> list[dict[str, Any]]:
+    if not content_items:
+        raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
+
+    if not isinstance(content_items, list):
+        raise ValidationError(
+            "content_items must be a list", context={"content_items_type": type(content_items).__name__}
+        )
+
+    if len(content_items) > MAX_BATCH_SIZE:
+        raise ValidationError(
+            f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
+            context={"batch_size": len(content_items), "max_batch_size": MAX_BATCH_SIZE},
+        )
+
+    config = _create_config_with_overrides(
+        force_ocr=force_ocr,
+        chunk_content=chunk_content,
+        extract_tables=extract_tables,
+        extract_entities=extract_entities,
+        extract_keywords=extract_keywords,
+        ocr_backend=ocr_backend,
+        max_chars=max_chars,
+        max_overlap=max_overlap,
+        keyword_count=keyword_count,
+        auto_detect_language=auto_detect_language,
+        tesseract_lang=tesseract_lang,
+        tesseract_psm=tesseract_psm,
+        tesseract_output_format=tesseract_output_format,
+        enable_table_detection=enable_table_detection,
+    )
+
+    contents = []
+    for i, item in enumerate(content_items):
+        if not isinstance(item, dict):
+            raise ValidationError(
+                f"Item at index {i} must be a dictionary",
+                context={"item_index": i, "item_type": type(item).__name__, "item": item},
+            )
+
+        if "content_base64" not in item:
+            raise ValidationError(
+                f"Item at index {i} is missing required key 'content_base64'",
+                context={"item_index": i, "item_keys": list(item.keys()), "item": item},
+            )
+
+        if "mime_type" not in item:
+            raise ValidationError(
+                f"Item at index {i} is missing required key 'mime_type'",
+                context={"item_index": i, "item_keys": list(item.keys()), "item": item},
+            )
+
+        content_base64 = item["content_base64"]
+        mime_type = item["mime_type"]
+
+        try:
+            content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
+        except ValidationError as e:
+            e.context = e.context or {}
+            e.context["item_index"] = i
+            e.context["total_items"] = len(content_items)
+            raise
+
+        contents.append((content_bytes, mime_type))
+
+    results = batch_extract_bytes_sync(contents, config)
+    return [result.to_dict(include_none=True) for result in results]
+
+
 @mcp.tool()
 def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
+    validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
-    result = extract_file_sync(
+    result = extract_file_sync(str(validated_path), mime_type, config)
     return result.content
 
 
@@ -151,7 +448,8 @@ def get_supported_formats() -> str:
 
 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-
+    validated_path = _validate_file_path(file_path)
+    result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
 
     return [
         TextContent(
@@ -163,12 +461,13 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 
 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
+    validated_path = _validate_file_path(file_path)
    config = _create_config_with_overrides(
         extract_entities=True,
         extract_keywords=True,
         extract_tables=True,
     )
-    result = extract_file_sync(
+    result = extract_file_sync(str(validated_path), None, config)
 
     content = f"Document Content:\n{result.content}\n\n"
 
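The new batch tools take base64-encoded payloads and validate them server-side (_validate_base64_content, _validate_file_path) before any extraction runs. Below is a minimal sketch, standard library only, of how a caller might assemble the content_items list that batch_extract_bytes expects; the file names, the suffix-to-MIME mapping, and the build_content_items helper are hypothetical, and the tool itself would be invoked through an MCP client rather than called directly.

import base64
from pathlib import Path


def build_content_items(paths: list[str]) -> list[dict[str, str]]:
    """Encode local files into the payload shape batch_extract_bytes expects."""
    # assumed suffix-to-MIME mapping, only for this sketch
    mime_by_suffix = {".pdf": "application/pdf", ".png": "image/png"}
    items: list[dict[str, str]] = []
    for raw in paths:
        path = Path(raw)
        items.append(
            {
                # each item needs both keys; the tool raises ValidationError otherwise
                "content_base64": base64.b64encode(path.read_bytes()).decode("ascii"),
                "mime_type": mime_by_suffix.get(path.suffix.lower(), "application/octet-stream"),
            }
        )
    return items


if __name__ == "__main__":
    # hypothetical local files; an MCP client would pass this list as the
    # content_items argument of the batch_extract_bytes tool
    payload = build_content_items(["invoice.pdf", "scan.png"])
    print(len(payload), "items prepared")

Note that batches are capped at MAX_BATCH_SIZE (100) items, so a caller handling larger workloads would need to chunk its input before invoking the tool.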
kreuzberg/_mime_types.py
CHANGED
@@ -56,6 +56,48 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
     "image/x-tiff",
 }
 
+IMAGE_FORMATS: Final[frozenset[str]] = frozenset(
+    {
+        "jpg",
+        "jpeg",
+        "png",
+        "gif",
+        "bmp",
+        "tiff",
+        "tif",
+        "webp",
+        "jp2",
+        "jpx",
+        "jpm",
+        "mj2",
+        "pnm",
+        "pbm",
+        "pgm",
+        "ppm",
+    }
+)
+
+IMAGE_MIME_TO_EXT: Final[dict[str, str]] = {
+    "image/bmp": "bmp",
+    "image/x-bmp": "bmp",
+    "image/x-ms-bmp": "bmp",
+    "image/gif": "gif",
+    "image/jpeg": "jpg",
+    "image/pjpeg": "jpg",
+    "image/png": "png",
+    "image/tiff": "tiff",
+    "image/x-tiff": "tiff",
+    "image/jp2": "jp2",
+    "image/jpx": "jpx",
+    "image/jpm": "jpm",
+    "image/mj2": "mj2",
+    "image/webp": "webp",
+    "image/x-portable-anymap": "pnm",
+    "image/x-portable-bitmap": "pbm",
+    "image/x-portable-graymap": "pgm",
+    "image/x-portable-pixmap": "ppm",
+}
+
 PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
     "application/csl+json",
     "application/docbook+xml",
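One plausible use of a MIME-to-extension table like the new IMAGE_MIME_TO_EXT is picking a file suffix when spilling image bytes to a temporary file before handing them to an OCR backend. The sketch below illustrates that under this assumption, with a trimmed local copy of the mapping rather than an import of the private kreuzberg._mime_types module.

import tempfile

# excerpt of the mapping added in 3.16.0, trimmed for the sketch
IMAGE_MIME_TO_EXT = {
    "image/jpeg": "jpg",
    "image/png": "png",
    "image/tiff": "tiff",
    "image/webp": "webp",
}


def write_image_to_temp(data: bytes, mime_type: str) -> str:
    """Persist image bytes with a suffix derived from the MIME type."""
    ext = IMAGE_MIME_TO_EXT.get(mime_type, "bin")  # fall back to a generic suffix
    with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp:
        tmp.write(data)
        return tmp.name


if __name__ == "__main__":
    print(write_image_to_temp(b"\x89PNG\r\n\x1a\n", "image/png"))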
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -33,22 +33,39 @@ except ImportError:  # pragma: no cover
 
 if TYPE_CHECKING:
     import easyocr
-    import numpy as np
     import torch
+else:
+    easyocr: Any = None
+    torch: Any = None
+
+HAS_EASYOCR: bool = False
+
+
+def _import_easyocr() -> tuple[Any, Any]:
+    global HAS_EASYOCR, easyocr, torch
+
+    # If easyocr is already set (either real module or mock), return it
+    if easyocr is not None:
+        return easyocr, torch
+
+    # If explicitly disabled for testing
+    if not HAS_EASYOCR and easyocr is None:
+        return None, None
 
-HAS_EASYOCR: bool
-if not TYPE_CHECKING:
     try:
-        import easyocr
-        import numpy as np
-        import torch
+        import easyocr as _easyocr  # noqa: PLC0415
 
+        try:
+            import torch as _torch  # noqa: PLC0415
+        except ImportError:
+            _torch = None  # type: ignore[assignment]
+
+        easyocr = _easyocr
+        torch = _torch
         HAS_EASYOCR = True
+        return easyocr, torch
     except ImportError:
-
-        easyocr: Any = None
-        np: Any = None
-        torch: Any = None
+        return None, None
 
 
 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +159,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None
 
     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -239,7 +261,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             )
 
             return ExtractionResult(
-                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
+                content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
             )
 
         # Group text boxes by lines based on Y coordinate  # ~keep
@@ -287,12 +309,13 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         )
 
         return ExtractionResult(
-            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
         )
 
     @classmethod
     def _is_gpu_available(cls) -> bool:
-
+        # Use the module-level torch variable directly to respect patches
+        if torch is None:
             return False
         return bool(torch.cuda.is_available())
 
@@ -301,13 +324,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -318,7 +343,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
         try:
             cls._reader = await run_sync(
-
+                easyocr_module.Reader,
                 languages,
                 gpu=use_gpu,
                 verbose=False,
@@ -382,6 +407,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages
 
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -453,13 +483,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
            )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -469,7 +501,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.setdefault("recog_network", "standard")
 
         try:
-            cls._reader =
+            cls._reader = easyocr_module.Reader(
                 languages,
                 gpu=use_gpu,
                 verbose=False,
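The EasyOCR changes replace the import-time dependency check with a lazy loader (_import_easyocr) plus per-call numpy imports, so importing the module no longer drags in the heavy optional packages. Below is a generic sketch of that lazy-import pattern, not the package's exact code; the recognize function and the RuntimeError stand in for the real backend API and its MissingDependencyError.

from __future__ import annotations

import importlib
from typing import Any

_backend: Any = None  # cached module once the first import succeeds


def _load_optional(name: str) -> Any:
    """Import an optional dependency on first use and cache it; return None if missing."""
    global _backend
    if _backend is not None:
        return _backend
    try:
        _backend = importlib.import_module(name)
    except ImportError:
        return None
    return _backend


def recognize(image_bytes: bytes) -> str:
    backend = _load_optional("easyocr")  # resolved only when OCR is actually requested
    if backend is None:
        # mirrors the MissingDependencyError raised by the real backend
        raise RuntimeError("EasyOCR is not installed: pip install 'kreuzberg[easyocr]'")
    return "..."  # the real call would build a Reader and run readtext()

The module-level cache also keeps the behaviour test-friendly: patching the module variable short-circuits the loader, which is the same reason the real _import_easyocr returns the already-set module first.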
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -192,7 +192,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         )
 
         return ExtractionResult(
-            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
         )
 
     @classmethod