kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py CHANGED
@@ -1,7 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import base64
4
+ import binascii
4
5
  import json
6
+ from pathlib import Path
5
7
  from typing import Any
6
8
 
7
9
  import msgspec
@@ -9,34 +11,170 @@ from mcp.server import FastMCP
9
11
  from mcp.types import TextContent
10
12
 
11
13
  from kreuzberg._config import discover_config
12
- from kreuzberg._types import ExtractionConfig, OcrBackendType
13
- from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
14
+ from kreuzberg._types import ExtractionConfig, OcrBackendType, PSMMode, TesseractConfig
15
+ from kreuzberg.exceptions import ValidationError
16
+ from kreuzberg.extraction import (
17
+ batch_extract_bytes_sync,
18
+ batch_extract_file_sync,
19
+ extract_bytes_sync,
20
+ extract_file_sync,
21
+ )
14
22
 
15
23
  mcp = FastMCP("Kreuzberg Text Extraction")
16
24
 
25
+ MAX_BATCH_SIZE = 100
26
+
27
+
28
+ def _validate_file_path(file_path: str) -> Path:
29
+ """Validate file path to prevent path traversal attacks.
30
+
31
+ Args:
32
+ file_path: The file path to validate
33
+
34
+ Returns:
35
+ Path: The validated Path object
36
+
37
+ Raises:
38
+ ValidationError: If path traversal is detected or path is invalid
39
+ """
40
+ try:
41
+ path = Path(file_path).resolve()
42
+ except (OSError, ValueError) as e:
43
+ raise ValidationError(
44
+ f"Invalid file path: {file_path}",
45
+ context={"file_path": file_path, "error": str(e)},
46
+ ) from e
47
+
48
+ if ".." in file_path and not file_path.startswith("/"):
49
+ raise ValidationError(
50
+ "Path traversal detected in file path",
51
+ context={"file_path": file_path, "resolved_path": str(path)},
52
+ )
53
+
54
+ if not path.exists():
55
+ raise ValidationError(
56
+ f"File not found: {file_path}",
57
+ context={"file_path": file_path, "resolved_path": str(path)},
58
+ )
59
+
60
+ if not path.is_file():
61
+ raise ValidationError(
62
+ f"Path is not a file: {file_path}",
63
+ context={"file_path": file_path, "resolved_path": str(path)},
64
+ )
65
+
66
+ return path
67
+
68
+
69
+ def _validate_file_path_with_context(file_path: str, index: int, total: int) -> Path:
70
+ """Validate file path and add context for batch operations."""
71
+ try:
72
+ return _validate_file_path(file_path)
73
+ except ValidationError as e:
74
+ e.context = e.context or {}
75
+ e.context["batch_index"] = index
76
+ e.context["total_files"] = total
77
+ raise
78
+
79
+
80
+ def _validate_base64_content(content_base64: str, context_info: str | None = None) -> bytes:
81
+ """Validate and decode base64 content with proper error handling.
82
+
83
+ Args:
84
+ content_base64: The base64 string to validate and decode
85
+ context_info: Additional context information for error reporting
86
+
87
+ Returns:
88
+ bytes: The decoded content
89
+
90
+ Raises:
91
+ ValidationError: If the base64 content is invalid
92
+ """
93
+ if not content_base64:
94
+ raise ValidationError(
95
+ "Base64 content cannot be empty",
96
+ context={"context": context_info},
97
+ )
98
+
99
+ if not content_base64.strip():
100
+ raise ValidationError(
101
+ "Base64 content cannot be whitespace only",
102
+ context={"content_preview": content_base64[:50], "context": context_info},
103
+ )
104
+
105
+ try:
106
+ content_bytes = base64.b64decode(content_base64, validate=True)
107
+ except (ValueError, binascii.Error) as e:
108
+ error_type = type(e).__name__
109
+ raise ValidationError(
110
+ f"Invalid base64 content: {error_type}: {e}",
111
+ context={
112
+ "error_type": error_type,
113
+ "error": str(e),
114
+ "content_preview": content_base64[:50] + "..." if len(content_base64) > 50 else content_base64,
115
+ "context": context_info,
116
+ },
117
+ ) from e
118
+
119
+ return content_bytes
120
+
17
121
 
18
122
  def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
19
123
  base_config = discover_config()
20
124
 
125
+ tesseract_lang = kwargs.pop("tesseract_lang", None)
126
+ tesseract_psm = kwargs.pop("tesseract_psm", None)
127
+ tesseract_output_format = kwargs.pop("tesseract_output_format", None)
128
+ enable_table_detection = kwargs.pop("enable_table_detection", None)
129
+
21
130
  if base_config is None:
22
- return ExtractionConfig(**kwargs)
23
-
24
- config_dict: dict[str, Any] = {
25
- "force_ocr": base_config.force_ocr,
26
- "chunk_content": base_config.chunk_content,
27
- "extract_tables": base_config.extract_tables,
28
- "extract_entities": base_config.extract_entities,
29
- "extract_keywords": base_config.extract_keywords,
30
- "ocr_backend": base_config.ocr_backend,
31
- "max_chars": base_config.max_chars,
32
- "max_overlap": base_config.max_overlap,
33
- "keyword_count": base_config.keyword_count,
34
- "auto_detect_language": base_config.auto_detect_language,
35
- "ocr_config": base_config.ocr_config,
36
- "gmft_config": base_config.gmft_config,
37
- }
38
-
39
- config_dict = config_dict | kwargs
131
+ config_dict = kwargs
132
+ else:
133
+ config_dict = {
134
+ "force_ocr": base_config.force_ocr,
135
+ "chunk_content": base_config.chunk_content,
136
+ "extract_tables": base_config.extract_tables,
137
+ "extract_entities": base_config.extract_entities,
138
+ "extract_keywords": base_config.extract_keywords,
139
+ "ocr_backend": base_config.ocr_backend,
140
+ "max_chars": base_config.max_chars,
141
+ "max_overlap": base_config.max_overlap,
142
+ "keyword_count": base_config.keyword_count,
143
+ "auto_detect_language": base_config.auto_detect_language,
144
+ "ocr_config": base_config.ocr_config,
145
+ "gmft_config": base_config.gmft_config,
146
+ }
147
+ config_dict = config_dict | kwargs
148
+
149
+ ocr_backend = config_dict.get("ocr_backend")
150
+ if ocr_backend == "tesseract" and (
151
+ tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
152
+ ):
153
+ tesseract_config_dict = {}
154
+
155
+ if tesseract_lang:
156
+ tesseract_config_dict["language"] = tesseract_lang
157
+ if tesseract_psm is not None:
158
+ try:
159
+ tesseract_config_dict["psm"] = PSMMode(tesseract_psm)
160
+ except ValueError as e:
161
+ raise ValidationError(
162
+ f"Invalid PSM mode value: {tesseract_psm}",
163
+ context={"psm_value": tesseract_psm, "error": str(e)},
164
+ ) from e
165
+ if tesseract_output_format:
166
+ tesseract_config_dict["output_format"] = tesseract_output_format
167
+ if enable_table_detection:
168
+ tesseract_config_dict["enable_table_detection"] = True
169
+
170
+ if tesseract_config_dict:
171
+ existing_ocr_config = config_dict.get("ocr_config")
172
+ if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
173
+ existing_dict = existing_ocr_config.to_dict()
174
+ merged_dict = existing_dict | tesseract_config_dict
175
+ config_dict["ocr_config"] = TesseractConfig(**merged_dict)
176
+ else:
177
+ config_dict["ocr_config"] = TesseractConfig(**tesseract_config_dict)
40
178
 
41
179
  return ExtractionConfig(**config_dict)
42
180
 
@@ -55,7 +193,12 @@ def extract_document( # noqa: PLR0913
55
193
  max_overlap: int = 200,
56
194
  keyword_count: int = 10,
57
195
  auto_detect_language: bool = False,
196
+ tesseract_lang: str | None = None,
197
+ tesseract_psm: int | None = None,
198
+ tesseract_output_format: str | None = None,
199
+ enable_table_detection: bool | None = None,
58
200
  ) -> dict[str, Any]:
201
+ validated_path = _validate_file_path(file_path)
59
202
  config = _create_config_with_overrides(
60
203
  force_ocr=force_ocr,
61
204
  chunk_content=chunk_content,
@@ -67,9 +210,13 @@ def extract_document( # noqa: PLR0913
67
210
  max_overlap=max_overlap,
68
211
  keyword_count=keyword_count,
69
212
  auto_detect_language=auto_detect_language,
213
+ tesseract_lang=tesseract_lang,
214
+ tesseract_psm=tesseract_psm,
215
+ tesseract_output_format=tesseract_output_format,
216
+ enable_table_detection=enable_table_detection,
70
217
  )
71
218
 
72
- result = extract_file_sync(file_path, mime_type, config)
219
+ result = extract_file_sync(str(validated_path), mime_type, config)
73
220
  return result.to_dict(include_none=True)
74
221
 
75
222
 
@@ -87,8 +234,12 @@ def extract_bytes( # noqa: PLR0913
87
234
  max_overlap: int = 200,
88
235
  keyword_count: int = 10,
89
236
  auto_detect_language: bool = False,
237
+ tesseract_lang: str | None = None,
238
+ tesseract_psm: int | None = None,
239
+ tesseract_output_format: str | None = None,
240
+ enable_table_detection: bool | None = None,
90
241
  ) -> dict[str, Any]:
91
- content_bytes = base64.b64decode(content_base64)
242
+ content_bytes = _validate_base64_content(content_base64, "extract_bytes")
92
243
 
93
244
  config = _create_config_with_overrides(
94
245
  force_ocr=force_ocr,
@@ -101,19 +252,165 @@ def extract_bytes( # noqa: PLR0913
101
252
  max_overlap=max_overlap,
102
253
  keyword_count=keyword_count,
103
254
  auto_detect_language=auto_detect_language,
255
+ tesseract_lang=tesseract_lang,
256
+ tesseract_psm=tesseract_psm,
257
+ tesseract_output_format=tesseract_output_format,
258
+ enable_table_detection=enable_table_detection,
104
259
  )
105
260
 
106
261
  result = extract_bytes_sync(content_bytes, mime_type, config)
107
262
  return result.to_dict(include_none=True)
108
263
 
109
264
 
265
+ @mcp.tool()
266
+ def batch_extract_document( # noqa: PLR0913
267
+ file_paths: list[str],
268
+ force_ocr: bool = False,
269
+ chunk_content: bool = False,
270
+ extract_tables: bool = False,
271
+ extract_entities: bool = False,
272
+ extract_keywords: bool = False,
273
+ ocr_backend: OcrBackendType = "tesseract",
274
+ max_chars: int = 1000,
275
+ max_overlap: int = 200,
276
+ keyword_count: int = 10,
277
+ auto_detect_language: bool = False,
278
+ tesseract_lang: str | None = None,
279
+ tesseract_psm: int | None = None,
280
+ tesseract_output_format: str | None = None,
281
+ enable_table_detection: bool | None = None,
282
+ ) -> list[dict[str, Any]]:
283
+ if len(file_paths) > MAX_BATCH_SIZE:
284
+ raise ValidationError(
285
+ f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
286
+ context={"batch_size": len(file_paths), "max_batch_size": MAX_BATCH_SIZE},
287
+ )
288
+
289
+ if not file_paths:
290
+ raise ValidationError(
291
+ "File paths list cannot be empty",
292
+ context={"file_paths": file_paths},
293
+ )
294
+
295
+ validated_paths = []
296
+ for i, file_path in enumerate(file_paths):
297
+ validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
298
+ validated_paths.append(str(validated_path))
299
+ config = _create_config_with_overrides(
300
+ force_ocr=force_ocr,
301
+ chunk_content=chunk_content,
302
+ extract_tables=extract_tables,
303
+ extract_entities=extract_entities,
304
+ extract_keywords=extract_keywords,
305
+ ocr_backend=ocr_backend,
306
+ max_chars=max_chars,
307
+ max_overlap=max_overlap,
308
+ keyword_count=keyword_count,
309
+ auto_detect_language=auto_detect_language,
310
+ tesseract_lang=tesseract_lang,
311
+ tesseract_psm=tesseract_psm,
312
+ tesseract_output_format=tesseract_output_format,
313
+ enable_table_detection=enable_table_detection,
314
+ )
315
+
316
+ results = batch_extract_file_sync(validated_paths, config)
317
+ return [result.to_dict(include_none=True) for result in results]
318
+
319
+
320
+ @mcp.tool()
321
+ def batch_extract_bytes( # noqa: PLR0913
322
+ content_items: list[dict[str, str]],
323
+ force_ocr: bool = False,
324
+ chunk_content: bool = False,
325
+ extract_tables: bool = False,
326
+ extract_entities: bool = False,
327
+ extract_keywords: bool = False,
328
+ ocr_backend: OcrBackendType = "tesseract",
329
+ max_chars: int = 1000,
330
+ max_overlap: int = 200,
331
+ keyword_count: int = 10,
332
+ auto_detect_language: bool = False,
333
+ tesseract_lang: str | None = None,
334
+ tesseract_psm: int | None = None,
335
+ tesseract_output_format: str | None = None,
336
+ enable_table_detection: bool | None = None,
337
+ ) -> list[dict[str, Any]]:
338
+ if not content_items:
339
+ raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
340
+
341
+ if not isinstance(content_items, list):
342
+ raise ValidationError(
343
+ "content_items must be a list", context={"content_items_type": type(content_items).__name__}
344
+ )
345
+
346
+ if len(content_items) > MAX_BATCH_SIZE:
347
+ raise ValidationError(
348
+ f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
349
+ context={"batch_size": len(content_items), "max_batch_size": MAX_BATCH_SIZE},
350
+ )
351
+
352
+ config = _create_config_with_overrides(
353
+ force_ocr=force_ocr,
354
+ chunk_content=chunk_content,
355
+ extract_tables=extract_tables,
356
+ extract_entities=extract_entities,
357
+ extract_keywords=extract_keywords,
358
+ ocr_backend=ocr_backend,
359
+ max_chars=max_chars,
360
+ max_overlap=max_overlap,
361
+ keyword_count=keyword_count,
362
+ auto_detect_language=auto_detect_language,
363
+ tesseract_lang=tesseract_lang,
364
+ tesseract_psm=tesseract_psm,
365
+ tesseract_output_format=tesseract_output_format,
366
+ enable_table_detection=enable_table_detection,
367
+ )
368
+
369
+ contents = []
370
+ for i, item in enumerate(content_items):
371
+ if not isinstance(item, dict):
372
+ raise ValidationError(
373
+ f"Item at index {i} must be a dictionary",
374
+ context={"item_index": i, "item_type": type(item).__name__, "item": item},
375
+ )
376
+
377
+ if "content_base64" not in item:
378
+ raise ValidationError(
379
+ f"Item at index {i} is missing required key 'content_base64'",
380
+ context={"item_index": i, "item_keys": list(item.keys()), "item": item},
381
+ )
382
+
383
+ if "mime_type" not in item:
384
+ raise ValidationError(
385
+ f"Item at index {i} is missing required key 'mime_type'",
386
+ context={"item_index": i, "item_keys": list(item.keys()), "item": item},
387
+ )
388
+
389
+ content_base64 = item["content_base64"]
390
+ mime_type = item["mime_type"]
391
+
392
+ try:
393
+ content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
394
+ except ValidationError as e:
395
+ e.context = e.context or {}
396
+ e.context["item_index"] = i
397
+ e.context["total_items"] = len(content_items)
398
+ raise
399
+
400
+ contents.append((content_bytes, mime_type))
401
+
402
+ results = batch_extract_bytes_sync(contents, config)
403
+ return [result.to_dict(include_none=True) for result in results]
404
+
405
+
110
406
  @mcp.tool()
111
407
  def extract_simple(
112
408
  file_path: str,
113
409
  mime_type: str | None = None,
114
410
  ) -> str:
411
+ validated_path = _validate_file_path(file_path)
115
412
  config = _create_config_with_overrides()
116
- result = extract_file_sync(file_path, mime_type, config)
413
+ result = extract_file_sync(str(validated_path), mime_type, config)
117
414
  return result.content
118
415
 
119
416
 
@@ -151,7 +448,8 @@ def get_supported_formats() -> str:
151
448
 
152
449
  @mcp.prompt()
153
450
  def extract_and_summarize(file_path: str) -> list[TextContent]:
154
- result = extract_file_sync(file_path, None, _create_config_with_overrides())
451
+ validated_path = _validate_file_path(file_path)
452
+ result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
155
453
 
156
454
  return [
157
455
  TextContent(
@@ -163,12 +461,13 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
163
461
 
164
462
  @mcp.prompt()
165
463
  def extract_structured(file_path: str) -> list[TextContent]:
464
+ validated_path = _validate_file_path(file_path)
166
465
  config = _create_config_with_overrides(
167
466
  extract_entities=True,
168
467
  extract_keywords=True,
169
468
  extract_tables=True,
170
469
  )
171
- result = extract_file_sync(file_path, None, config)
470
+ result = extract_file_sync(str(validated_path), None, config)
172
471
 
173
472
  content = f"Document Content:\n{result.content}\n\n"
174
473
 
kreuzberg/_mime_types.py CHANGED
@@ -56,6 +56,48 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
56
56
  "image/x-tiff",
57
57
  }
58
58
 
59
+ IMAGE_FORMATS: Final[frozenset[str]] = frozenset(
60
+ {
61
+ "jpg",
62
+ "jpeg",
63
+ "png",
64
+ "gif",
65
+ "bmp",
66
+ "tiff",
67
+ "tif",
68
+ "webp",
69
+ "jp2",
70
+ "jpx",
71
+ "jpm",
72
+ "mj2",
73
+ "pnm",
74
+ "pbm",
75
+ "pgm",
76
+ "ppm",
77
+ }
78
+ )
79
+
80
+ IMAGE_MIME_TO_EXT: Final[dict[str, str]] = {
81
+ "image/bmp": "bmp",
82
+ "image/x-bmp": "bmp",
83
+ "image/x-ms-bmp": "bmp",
84
+ "image/gif": "gif",
85
+ "image/jpeg": "jpg",
86
+ "image/pjpeg": "jpg",
87
+ "image/png": "png",
88
+ "image/tiff": "tiff",
89
+ "image/x-tiff": "tiff",
90
+ "image/jp2": "jp2",
91
+ "image/jpx": "jpx",
92
+ "image/jpm": "jpm",
93
+ "image/mj2": "mj2",
94
+ "image/webp": "webp",
95
+ "image/x-portable-anymap": "pnm",
96
+ "image/x-portable-bitmap": "pbm",
97
+ "image/x-portable-graymap": "pgm",
98
+ "image/x-portable-pixmap": "ppm",
99
+ }
100
+
59
101
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
60
102
  "application/csl+json",
61
103
  "application/docbook+xml",
kreuzberg/_ocr/_easyocr.py CHANGED
@@ -33,22 +33,39 @@ except ImportError: # pragma: no cover
33
33
 
34
34
  if TYPE_CHECKING:
35
35
  import easyocr
36
- import numpy as np
37
36
  import torch
37
+ else:
38
+ easyocr: Any = None
39
+ torch: Any = None
40
+
41
+ HAS_EASYOCR: bool = False
42
+
43
+
44
+ def _import_easyocr() -> tuple[Any, Any]:
45
+ global HAS_EASYOCR, easyocr, torch
46
+
47
+ # If easyocr is already set (either real module or mock), return it
48
+ if easyocr is not None:
49
+ return easyocr, torch
50
+
51
+ # If explicitly disabled for testing
52
+ if not HAS_EASYOCR and easyocr is None:
53
+ return None, None
38
54
 
39
- HAS_EASYOCR: bool
40
- if not TYPE_CHECKING:
41
55
  try:
42
- import easyocr
43
- import numpy as np
44
- import torch
56
+ import easyocr as _easyocr # noqa: PLC0415
45
57
 
58
+ try:
59
+ import torch as _torch # noqa: PLC0415
60
+ except ImportError:
61
+ _torch = None # type: ignore[assignment]
62
+
63
+ easyocr = _easyocr
64
+ torch = _torch
46
65
  HAS_EASYOCR = True
66
+ return easyocr, torch
47
67
  except ImportError:
48
- HAS_EASYOCR = False
49
- easyocr: Any = None
50
- np: Any = None
51
- torch: Any = None
68
+ return None, None
52
69
 
53
70
 
54
71
  EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +159,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
142
159
  _reader: ClassVar[Any] = None
143
160
 
144
161
  async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
162
+ try:
163
+ import numpy as np # noqa: PLC0415
164
+ except ImportError as e:
165
+ raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
166
+
145
167
  use_cache = kwargs.pop("use_cache", True)
146
168
 
147
169
  cache_kwargs = None
@@ -239,7 +261,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
239
261
  )
240
262
 
241
263
  return ExtractionResult(
242
- content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
264
+ content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
243
265
  )
244
266
 
245
267
  # Group text boxes by lines based on Y coordinate # ~keep
@@ -287,12 +309,13 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
287
309
  )
288
310
 
289
311
  return ExtractionResult(
290
- content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
312
+ content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
291
313
  )
292
314
 
293
315
  @classmethod
294
316
  def _is_gpu_available(cls) -> bool:
295
- if not HAS_EASYOCR or torch is None:
317
+ # Use the module-level torch variable directly to respect patches
318
+ if torch is None:
296
319
  return False
297
320
  return bool(torch.cuda.is_available())
298
321
 
@@ -301,13 +324,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
301
324
  if cls._reader is not None:
302
325
  return
303
326
 
304
- if not HAS_EASYOCR or easyocr is None:
327
+ # Validate language first before attempting import
328
+ languages = cls._validate_language_code(kwargs.pop("language", "en"))
329
+
330
+ easyocr_module, _ = _import_easyocr()
331
+ if easyocr_module is None:
305
332
  raise MissingDependencyError.create_for_package(
306
333
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
307
334
  )
308
335
 
309
- languages = cls._validate_language_code(kwargs.pop("language", "en"))
310
-
311
336
  device_info = cls._resolve_device_config(**kwargs)
312
337
  use_gpu = device_info.device_type in ("cuda", "mps")
313
338
 
@@ -318,7 +343,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
318
343
 
319
344
  try:
320
345
  cls._reader = await run_sync(
321
- easyocr.Reader,
346
+ easyocr_module.Reader,
322
347
  languages,
323
348
  gpu=use_gpu,
324
349
  verbose=False,
@@ -382,6 +407,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
382
407
  return languages
383
408
 
384
409
  def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
410
+ try:
411
+ import numpy as np # noqa: PLC0415
412
+ except ImportError as e:
413
+ raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
414
+
385
415
  use_cache = kwargs.pop("use_cache", True)
386
416
 
387
417
  cache_kwargs = None
@@ -453,13 +483,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
453
483
  if cls._reader is not None:
454
484
  return
455
485
 
456
- if not HAS_EASYOCR or easyocr is None:
486
+ # Validate language first before attempting import
487
+ languages = cls._validate_language_code(kwargs.pop("language", "en"))
488
+
489
+ easyocr_module, _ = _import_easyocr()
490
+ if easyocr_module is None:
457
491
  raise MissingDependencyError.create_for_package(
458
492
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
459
493
  )
460
494
 
461
- languages = cls._validate_language_code(kwargs.pop("language", "en"))
462
-
463
495
  device_info = cls._resolve_device_config(**kwargs)
464
496
  use_gpu = device_info.device_type in ("cuda", "mps")
465
497
 
@@ -469,7 +501,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
469
501
  kwargs.setdefault("recog_network", "standard")
470
502
 
471
503
  try:
472
- cls._reader = easyocr.Reader(
504
+ cls._reader = easyocr_module.Reader(
473
505
  languages,
474
506
  gpu=use_gpu,
475
507
  verbose=False,
kreuzberg/_ocr/_paddleocr.py CHANGED
@@ -192,7 +192,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
192
192
  )
193
193
 
194
194
  return ExtractionResult(
195
- content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
195
+ content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
196
196
  )
197
197
 
198
198
  @classmethod