kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,24 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import warnings
4
- from dataclasses import dataclass
5
- from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
4
+ from typing import TYPE_CHECKING, Any, ClassVar, Final
6
5
 
7
6
  from PIL import Image
8
7
 
9
8
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
10
9
  from kreuzberg._ocr._base import OCRBackend
11
- from kreuzberg._types import ExtractionResult, Metadata
12
- from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
10
+ from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
11
+ from kreuzberg._utils._device import DeviceInfo, validate_device_request
12
+ from kreuzberg._utils._ocr_cache import (
13
+ build_cache_kwargs,
14
+ cache_and_complete_async,
15
+ cache_and_complete_sync,
16
+ generate_image_hash,
17
+ get_file_info,
18
+ handle_cache_lookup_async,
19
+ handle_cache_lookup_sync,
20
+ mark_processing_complete,
21
+ )
13
22
  from kreuzberg._utils._string import normalize_spaces
14
23
  from kreuzberg._utils._sync import run_sync
15
24
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -22,6 +31,25 @@ try: # pragma: no cover
22
31
  except ImportError: # pragma: no cover
23
32
  from typing_extensions import Unpack
24
33
 
34
+ if TYPE_CHECKING:
35
+ import easyocr
36
+ import numpy as np
37
+ import torch
38
+
39
+ HAS_EASYOCR: bool
40
+ if not TYPE_CHECKING:
41
+ try:
42
+ import easyocr
43
+ import numpy as np
44
+ import torch
45
+
46
+ HAS_EASYOCR = True
47
+ except ImportError:
48
+ HAS_EASYOCR = False
49
+ easyocr: Any = None
50
+ np: Any = None
51
+ torch: Any = None
52
+
25
53
 
26
54
  EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
27
55
  "abq",
@@ -110,88 +138,32 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
110
138
  }
111
139
 
112
140
 
113
- @dataclass(unsafe_hash=True, frozen=True, slots=True)
114
- class EasyOCRConfig:
115
- """Configuration options for EasyOCR."""
116
-
117
- add_margin: float = 0.1
118
- """Extend bounding boxes in all directions."""
119
- adjust_contrast: float = 0.5
120
- """Target contrast level for low contrast text."""
121
- beam_width: int = 5
122
- """Beam width for beam search in recognition."""
123
- canvas_size: int = 2560
124
- """Maximum image dimension for detection."""
125
- contrast_ths: float = 0.1
126
- """Contrast threshold for preprocessing."""
127
- decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
128
- """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
129
- height_ths: float = 0.5
130
- """Maximum difference in box height for merging."""
131
- language: str | list[str] = "en"
132
- """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
133
- a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
134
- link_threshold: float = 0.4
135
- """Link confidence threshold."""
136
- low_text: float = 0.4
137
- """Text low-bound score."""
138
- mag_ratio: float = 1.0
139
- """Image magnification ratio."""
140
- min_size: int = 10
141
- """Minimum text box size in pixels."""
142
- rotation_info: list[int] | None = None
143
- """List of angles to try for detection."""
144
- slope_ths: float = 0.1
145
- """Maximum slope for merging text boxes."""
146
- text_threshold: float = 0.7
147
- """Text confidence threshold."""
148
- use_gpu: bool = False
149
- """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
150
- device: DeviceType = "auto"
151
- """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
152
- gpu_memory_limit: float | None = None
153
- """Maximum GPU memory to use in GB. None for no limit."""
154
- fallback_to_cpu: bool = True
155
- """Whether to fallback to CPU if requested device is unavailable."""
156
- width_ths: float = 0.5
157
- """Maximum horizontal distance for merging boxes."""
158
- x_ths: float = 1.0
159
- """Maximum horizontal distance for paragraph merging."""
160
- y_ths: float = 0.5
161
- """Maximum vertical distance for paragraph merging."""
162
- ycenter_ths: float = 0.5
163
- """Maximum shift in y direction for merging."""
164
-
165
-
166
141
  class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
167
142
  _reader: ClassVar[Any] = None
168
143
 
169
144
  async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
170
- """Asynchronously process an image and extract its text and metadata using EasyOCR.
145
+ use_cache = kwargs.pop("use_cache", True)
171
146
 
172
- Args:
173
- image: An instance of PIL.Image representing the input image.
174
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
147
+ cache_kwargs = None
148
+ if use_cache:
149
+ image_hash = generate_image_hash(image)
150
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
175
151
 
176
- Returns:
177
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
152
+ cached_result = await handle_cache_lookup_async(cache_kwargs)
153
+ if cached_result:
154
+ return cached_result
178
155
 
179
- Raises:
180
- OCRError: If OCR processing fails.
181
- """
182
- import numpy as np # noqa: PLC0415
183
-
184
- await self._init_easyocr(**kwargs)
156
+ try:
157
+ await self._init_easyocr(**kwargs)
185
158
 
186
- beam_width = kwargs.pop("beam_width")
159
+ beam_width = kwargs.pop("beam_width", 5)
187
160
 
188
- kwargs.pop("language", None)
189
- kwargs.pop("use_gpu", None)
190
- kwargs.pop("device", None)
191
- kwargs.pop("gpu_memory_limit", None)
192
- kwargs.pop("fallback_to_cpu", None)
161
+ kwargs.pop("language", None)
162
+ kwargs.pop("use_gpu", None)
163
+ kwargs.pop("device", None)
164
+ kwargs.pop("gpu_memory_limit", None)
165
+ kwargs.pop("fallback_to_cpu", None)
193
166
 
194
- try:
195
167
  result = await run_sync(
196
168
  self._reader.readtext,
197
169
  np.array(image),
@@ -199,41 +171,47 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
199
171
  **kwargs,
200
172
  )
201
173
 
202
- return self._process_easyocr_result(result, image)
174
+ extraction_result = self._process_easyocr_result(result, image)
175
+
176
+ if use_cache and cache_kwargs:
177
+ await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
178
+
179
+ return extraction_result
203
180
  except Exception as e:
181
+ if use_cache and cache_kwargs:
182
+ mark_processing_complete(cache_kwargs)
204
183
  raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
205
184
 
206
185
  async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
207
- """Asynchronously process a file and extract its text and metadata using EasyOCR.
186
+ use_cache = kwargs.pop("use_cache", True)
208
187
 
209
- Args:
210
- path: A Path object representing the file to be processed.
211
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
188
+ cache_kwargs = None
189
+ if use_cache:
190
+ file_info = get_file_info(path)
191
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
212
192
 
213
- Returns:
214
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
193
+ cached_result = await handle_cache_lookup_async(cache_kwargs)
194
+ if cached_result:
195
+ return cached_result
215
196
 
216
- Raises:
217
- OCRError: If file loading or OCR processing fails.
218
- """
219
- await self._init_easyocr(**kwargs)
220
197
  try:
198
+ await self._init_easyocr(**kwargs)
221
199
  image = await run_sync(Image.open, path)
222
- return await self.process_image(image, **kwargs)
200
+
201
+ kwargs["use_cache"] = False
202
+ extraction_result = await self.process_image(image, **kwargs)
203
+
204
+ if use_cache and cache_kwargs:
205
+ await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
206
+
207
+ return extraction_result
223
208
  except Exception as e:
209
+ if use_cache and cache_kwargs:
210
+ mark_processing_complete(cache_kwargs)
224
211
  raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
225
212
 
226
213
  @staticmethod
227
214
  def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
228
- """Process EasyOCR result into an ExtractionResult with metadata.
229
-
230
- Args:
231
- result: The raw result from EasyOCR.
232
- image: The original PIL image.
233
-
234
- Returns:
235
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
236
- """
237
215
  if not result:
238
216
  return ExtractionResult(
239
217
  content="",
@@ -314,38 +292,19 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
314
292
 
315
293
  @classmethod
316
294
  def _is_gpu_available(cls) -> bool:
317
- """Check if GPU is available for EasyOCR.
318
-
319
- Returns:
320
- bool: True if GPU support is available.
321
- """
322
- try:
323
- import torch # noqa: PLC0415
324
-
325
- return bool(torch.cuda.is_available())
326
- except ImportError: # pragma: no cover
295
+ if not HAS_EASYOCR or torch is None:
327
296
  return False
297
+ return bool(torch.cuda.is_available())
328
298
 
329
299
  @classmethod
330
300
  async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
331
- """Initialize EasyOCR with the provided configuration.
332
-
333
- Args:
334
- **kwargs: Configuration parameters for EasyOCR including language, etc.
335
-
336
- Raises:
337
- MissingDependencyError: If EasyOCR is not installed.
338
- OCRError: If initialization fails.
339
- """
340
301
  if cls._reader is not None:
341
302
  return
342
303
 
343
- try:
344
- import easyocr # noqa: PLC0415
345
- except ImportError as e: # pragma: no cover
304
+ if not HAS_EASYOCR or easyocr is None:
346
305
  raise MissingDependencyError.create_for_package(
347
306
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
348
- ) from e
307
+ )
349
308
 
350
309
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
351
310
 
@@ -369,17 +328,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
369
328
 
370
329
  @classmethod
371
330
  def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
372
- """Resolve device configuration with backward compatibility.
373
-
374
- Args:
375
- **kwargs: Configuration parameters including device settings.
376
-
377
- Returns:
378
- DeviceInfo object for the selected device.
379
-
380
- Raises:
381
- ValidationError: If requested device is not available and fallback is disabled.
382
- """
383
331
  use_gpu = kwargs.get("use_gpu", False)
384
332
  device = kwargs.get("device", "auto")
385
333
  memory_limit = kwargs.get("gpu_memory_limit")
@@ -416,17 +364,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
416
364
 
417
365
  @staticmethod
418
366
  def _validate_language_code(language_codes: str | list[str]) -> list[str]:
419
- """Validate and normalize provided language codes.
420
-
421
- Args:
422
- language_codes: The language code(s), either as a string (single or comma-separated) or a list.
423
-
424
- Raises:
425
- ValidationError: If any of the languages are not supported by EasyOCR
426
-
427
- Returns:
428
- A list with the normalized language codes.
429
- """
430
367
  if isinstance(language_codes, str):
431
368
  languages = [lang.strip().lower() for lang in language_codes.split(",")]
432
369
  else:
@@ -445,80 +382,81 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
445
382
  return languages
446
383
 
447
384
  def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
448
- """Synchronously process an image and extract its text and metadata using EasyOCR.
385
+ use_cache = kwargs.pop("use_cache", True)
449
386
 
450
- Args:
451
- image: An instance of PIL.Image representing the input image.
452
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
387
+ cache_kwargs = None
388
+ if use_cache:
389
+ image_hash = generate_image_hash(image)
390
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
453
391
 
454
- Returns:
455
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
392
+ cached_result = handle_cache_lookup_sync(cache_kwargs)
393
+ if cached_result:
394
+ return cached_result
456
395
 
457
- Raises:
458
- OCRError: If OCR processing fails.
459
- """
460
- import numpy as np # noqa: PLC0415
461
-
462
- self._init_easyocr_sync(**kwargs)
396
+ try:
397
+ self._init_easyocr_sync(**kwargs)
463
398
 
464
- beam_width = kwargs.pop("beam_width")
465
- kwargs.pop("language", None)
466
- kwargs.pop("use_gpu", None)
467
- kwargs.pop("device", None)
468
- kwargs.pop("gpu_memory_limit", None)
469
- kwargs.pop("fallback_to_cpu", None)
399
+ beam_width = kwargs.pop("beam_width", 5)
400
+ kwargs.pop("language", None)
401
+ kwargs.pop("use_gpu", None)
402
+ kwargs.pop("device", None)
403
+ kwargs.pop("gpu_memory_limit", None)
404
+ kwargs.pop("fallback_to_cpu", None)
470
405
 
471
- try:
472
406
  result = self._reader.readtext(
473
407
  np.array(image),
474
408
  beamWidth=beam_width,
475
409
  **kwargs,
476
410
  )
477
411
 
478
- return self._process_easyocr_result(result, image)
412
+ extraction_result = self._process_easyocr_result(result, image)
413
+
414
+ if use_cache and cache_kwargs:
415
+ cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
416
+
417
+ return extraction_result
479
418
  except Exception as e:
419
+ if use_cache and cache_kwargs:
420
+ mark_processing_complete(cache_kwargs)
480
421
  raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
481
422
 
482
423
  def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
483
- """Synchronously process a file and extract its text and metadata using EasyOCR.
424
+ use_cache = kwargs.pop("use_cache", True)
484
425
 
485
- Args:
486
- path: A Path object representing the file to be processed.
487
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
426
+ cache_kwargs = None
427
+ if use_cache:
428
+ file_info = get_file_info(path)
429
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
488
430
 
489
- Returns:
490
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
431
+ cached_result = handle_cache_lookup_sync(cache_kwargs)
432
+ if cached_result:
433
+ return cached_result
491
434
 
492
- Raises:
493
- OCRError: If file loading or OCR processing fails.
494
- """
495
- self._init_easyocr_sync(**kwargs)
496
435
  try:
436
+ self._init_easyocr_sync(**kwargs)
497
437
  image = Image.open(path)
498
- return self.process_image_sync(image, **kwargs)
438
+
439
+ kwargs["use_cache"] = False
440
+ extraction_result = self.process_image_sync(image, **kwargs)
441
+
442
+ if use_cache and cache_kwargs:
443
+ cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
444
+
445
+ return extraction_result
499
446
  except Exception as e:
447
+ if use_cache and cache_kwargs:
448
+ mark_processing_complete(cache_kwargs)
500
449
  raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
501
450
 
502
451
  @classmethod
503
452
  def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
504
- """Synchronously initialize EasyOCR with the provided configuration.
505
-
506
- Args:
507
- **kwargs: Configuration parameters for EasyOCR including language, etc.
508
-
509
- Raises:
510
- MissingDependencyError: If EasyOCR is not installed.
511
- OCRError: If initialization fails.
512
- """
513
453
  if cls._reader is not None:
514
454
  return
515
455
 
516
- try:
517
- import easyocr # noqa: PLC0415
518
- except ImportError as e: # pragma: no cover
456
+ if not HAS_EASYOCR or easyocr is None:
519
457
  raise MissingDependencyError.create_for_package(
520
458
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
521
- ) from e
459
+ )
522
460
 
523
461
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
524
462