kreuzberg-3.13.0-py3-none-any.whl → kreuzberg-3.13.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. kreuzberg/_chunker.py +0 -15
  2. kreuzberg/_config.py +0 -124
  3. kreuzberg/_document_classification.py +20 -39
  4. kreuzberg/_entity_extraction.py +0 -29
  5. kreuzberg/_extractors/_base.py +4 -66
  6. kreuzberg/_extractors/_email.py +0 -4
  7. kreuzberg/_extractors/_image.py +0 -2
  8. kreuzberg/_extractors/_pandoc.py +0 -58
  9. kreuzberg/_extractors/_pdf.py +0 -3
  10. kreuzberg/_extractors/_presentation.py +0 -82
  11. kreuzberg/_extractors/_spread_sheet.py +0 -2
  12. kreuzberg/_gmft.py +0 -61
  13. kreuzberg/_language_detection.py +0 -14
  14. kreuzberg/_mime_types.py +0 -17
  15. kreuzberg/_ocr/_base.py +4 -76
  16. kreuzberg/_ocr/_easyocr.py +110 -85
  17. kreuzberg/_ocr/_paddleocr.py +146 -138
  18. kreuzberg/_ocr/_table_extractor.py +0 -76
  19. kreuzberg/_ocr/_tesseract.py +0 -206
  20. kreuzberg/_playa.py +0 -27
  21. kreuzberg/_registry.py +0 -36
  22. kreuzberg/_types.py +16 -119
  23. kreuzberg/_utils/_cache.py +0 -52
  24. kreuzberg/_utils/_device.py +0 -56
  25. kreuzberg/_utils/_document_cache.py +0 -73
  26. kreuzberg/_utils/_errors.py +0 -47
  27. kreuzberg/_utils/_ocr_cache.py +136 -0
  28. kreuzberg/_utils/_pdf_lock.py +0 -14
  29. kreuzberg/_utils/_process_pool.py +0 -47
  30. kreuzberg/_utils/_quality.py +0 -17
  31. kreuzberg/_utils/_ref.py +0 -16
  32. kreuzberg/_utils/_serialization.py +0 -25
  33. kreuzberg/_utils/_string.py +0 -20
  34. kreuzberg/_utils/_sync.py +0 -76
  35. kreuzberg/_utils/_table.py +0 -45
  36. kreuzberg/_utils/_tmp.py +0 -9
  37. kreuzberg/cli.py +2 -2
  38. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
  39. kreuzberg-3.13.2.dist-info/RECORD +57 -0
  40. kreuzberg-3.13.0.dist-info/RECORD +0 -56
  41. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0
@@ -9,6 +9,16 @@ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
9
9
  from kreuzberg._ocr._base import OCRBackend
10
10
  from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
11
11
  from kreuzberg._utils._device import DeviceInfo, validate_device_request
12
+ from kreuzberg._utils._ocr_cache import (
13
+ build_cache_kwargs,
14
+ cache_and_complete_async,
15
+ cache_and_complete_sync,
16
+ generate_image_hash,
17
+ get_file_info,
18
+ handle_cache_lookup_async,
19
+ handle_cache_lookup_sync,
20
+ mark_processing_complete,
21
+ )
12
22
  from kreuzberg._utils._string import normalize_spaces
13
23
  from kreuzberg._utils._sync import run_sync
14
24
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -21,17 +31,24 @@ try: # pragma: no cover
21
31
  except ImportError: # pragma: no cover
22
32
  from typing_extensions import Unpack
23
33
 
24
- try:
34
+ if TYPE_CHECKING:
25
35
  import easyocr
26
36
  import numpy as np
27
37
  import torch
28
38
 
29
- HAS_EASYOCR = True
30
- except ImportError:
31
- HAS_EASYOCR = False
32
- easyocr = None
33
- np = None # type: ignore[assignment]
34
- torch = None # type: ignore[assignment]
39
+ HAS_EASYOCR: bool
40
+ if not TYPE_CHECKING:
41
+ try:
42
+ import easyocr
43
+ import numpy as np
44
+ import torch
45
+
46
+ HAS_EASYOCR = True
47
+ except ImportError:
48
+ HAS_EASYOCR = False
49
+ easyocr: Any = None
50
+ np: Any = None
51
+ torch: Any = None
35
52
 
36
53
 
37
54
  EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -125,29 +142,28 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
125
142
  _reader: ClassVar[Any] = None
126
143
 
127
144
  async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
128
- """Asynchronously process an image and extract its text and metadata using EasyOCR.
145
+ use_cache = kwargs.pop("use_cache", True)
129
146
 
130
- Args:
131
- image: An instance of PIL.Image representing the input image.
132
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
147
+ cache_kwargs = None
148
+ if use_cache:
149
+ image_hash = generate_image_hash(image)
150
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
133
151
 
134
- Returns:
135
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
152
+ cached_result = await handle_cache_lookup_async(cache_kwargs)
153
+ if cached_result:
154
+ return cached_result
136
155
 
137
- Raises:
138
- OCRError: If OCR processing fails.
139
- """
140
- await self._init_easyocr(**kwargs)
156
+ try:
157
+ await self._init_easyocr(**kwargs)
141
158
 
142
- beam_width = kwargs.pop("beam_width")
159
+ beam_width = kwargs.pop("beam_width", 5)
143
160
 
144
- kwargs.pop("language", None)
145
- kwargs.pop("use_gpu", None)
146
- kwargs.pop("device", None)
147
- kwargs.pop("gpu_memory_limit", None)
148
- kwargs.pop("fallback_to_cpu", None)
161
+ kwargs.pop("language", None)
162
+ kwargs.pop("use_gpu", None)
163
+ kwargs.pop("device", None)
164
+ kwargs.pop("gpu_memory_limit", None)
165
+ kwargs.pop("fallback_to_cpu", None)
149
166
 
150
- try:
151
167
  result = await run_sync(
152
168
  self._reader.readtext,
153
169
  np.array(image),
@@ -155,28 +171,43 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
155
171
  **kwargs,
156
172
  )
157
173
 
158
- return self._process_easyocr_result(result, image)
174
+ extraction_result = self._process_easyocr_result(result, image)
175
+
176
+ if use_cache and cache_kwargs:
177
+ await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
178
+
179
+ return extraction_result
159
180
  except Exception as e:
181
+ if use_cache and cache_kwargs:
182
+ mark_processing_complete(cache_kwargs)
160
183
  raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
161
184
 
162
185
  async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
163
- """Asynchronously process a file and extract its text and metadata using EasyOCR.
186
+ use_cache = kwargs.pop("use_cache", True)
164
187
 
165
- Args:
166
- path: A Path object representing the file to be processed.
167
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
188
+ cache_kwargs = None
189
+ if use_cache:
190
+ file_info = get_file_info(path)
191
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
168
192
 
169
- Returns:
170
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
193
+ cached_result = await handle_cache_lookup_async(cache_kwargs)
194
+ if cached_result:
195
+ return cached_result
171
196
 
172
- Raises:
173
- OCRError: If file loading or OCR processing fails.
174
- """
175
- await self._init_easyocr(**kwargs)
176
197
  try:
198
+ await self._init_easyocr(**kwargs)
177
199
  image = await run_sync(Image.open, path)
178
- return await self.process_image(image, **kwargs)
200
+
201
+ kwargs["use_cache"] = False
202
+ extraction_result = await self.process_image(image, **kwargs)
203
+
204
+ if use_cache and cache_kwargs:
205
+ await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
206
+
207
+ return extraction_result
179
208
  except Exception as e:
209
+ if use_cache and cache_kwargs:
210
+ mark_processing_complete(cache_kwargs)
180
211
  raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
181
212
 
182
213
  @staticmethod
@@ -333,17 +364,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
333
364
 
334
365
  @staticmethod
335
366
  def _validate_language_code(language_codes: str | list[str]) -> list[str]:
336
- """Validate and normalize provided language codes.
337
-
338
- Args:
339
- language_codes: The language code(s), either as a string (single or comma-separated) or a list.
340
-
341
- Raises:
342
- ValidationError: If any of the languages are not supported by EasyOCR
343
-
344
- Returns:
345
- A list with the normalized language codes.
346
- """
347
367
  if isinstance(language_codes, str):
348
368
  languages = [lang.strip().lower() for lang in language_codes.split(",")]
349
369
  else:
@@ -362,69 +382,74 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
362
382
  return languages
363
383
 
364
384
  def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
365
- """Synchronously process an image and extract its text and metadata using EasyOCR.
385
+ use_cache = kwargs.pop("use_cache", True)
366
386
 
367
- Args:
368
- image: An instance of PIL.Image representing the input image.
369
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
387
+ cache_kwargs = None
388
+ if use_cache:
389
+ image_hash = generate_image_hash(image)
390
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
370
391
 
371
- Returns:
372
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
392
+ cached_result = handle_cache_lookup_sync(cache_kwargs)
393
+ if cached_result:
394
+ return cached_result
373
395
 
374
- Raises:
375
- OCRError: If OCR processing fails.
376
- """
377
- self._init_easyocr_sync(**kwargs)
396
+ try:
397
+ self._init_easyocr_sync(**kwargs)
378
398
 
379
- beam_width = kwargs.pop("beam_width")
380
- kwargs.pop("language", None)
381
- kwargs.pop("use_gpu", None)
382
- kwargs.pop("device", None)
383
- kwargs.pop("gpu_memory_limit", None)
384
- kwargs.pop("fallback_to_cpu", None)
399
+ beam_width = kwargs.pop("beam_width", 5)
400
+ kwargs.pop("language", None)
401
+ kwargs.pop("use_gpu", None)
402
+ kwargs.pop("device", None)
403
+ kwargs.pop("gpu_memory_limit", None)
404
+ kwargs.pop("fallback_to_cpu", None)
385
405
 
386
- try:
387
406
  result = self._reader.readtext(
388
407
  np.array(image),
389
408
  beamWidth=beam_width,
390
409
  **kwargs,
391
410
  )
392
411
 
393
- return self._process_easyocr_result(result, image)
412
+ extraction_result = self._process_easyocr_result(result, image)
413
+
414
+ if use_cache and cache_kwargs:
415
+ cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
416
+
417
+ return extraction_result
394
418
  except Exception as e:
419
+ if use_cache and cache_kwargs:
420
+ mark_processing_complete(cache_kwargs)
395
421
  raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
396
422
 
397
423
  def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
398
- """Synchronously process a file and extract its text and metadata using EasyOCR.
424
+ use_cache = kwargs.pop("use_cache", True)
399
425
 
400
- Args:
401
- path: A Path object representing the file to be processed.
402
- **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
426
+ cache_kwargs = None
427
+ if use_cache:
428
+ file_info = get_file_info(path)
429
+ cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
403
430
 
404
- Returns:
405
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
431
+ cached_result = handle_cache_lookup_sync(cache_kwargs)
432
+ if cached_result:
433
+ return cached_result
406
434
 
407
- Raises:
408
- OCRError: If file loading or OCR processing fails.
409
- """
410
- self._init_easyocr_sync(**kwargs)
411
435
  try:
436
+ self._init_easyocr_sync(**kwargs)
412
437
  image = Image.open(path)
413
- return self.process_image_sync(image, **kwargs)
438
+
439
+ kwargs["use_cache"] = False
440
+ extraction_result = self.process_image_sync(image, **kwargs)
441
+
442
+ if use_cache and cache_kwargs:
443
+ cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
444
+
445
+ return extraction_result
414
446
  except Exception as e:
447
+ if use_cache and cache_kwargs:
448
+ mark_processing_complete(cache_kwargs)
415
449
  raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
416
450
 
417
451
  @classmethod
418
452
  def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
419
- """Synchronously initialize EasyOCR with the provided configuration.
420
-
421
- Args:
422
- **kwargs: Configuration parameters for EasyOCR including language, etc.
423
-
424
- Raises:
425
- MissingDependencyError: If EasyOCR is not installed.
426
- OCRError: If initialization fails.
427
- """
428
453
  if cls._reader is not None:
429
454
  return
430
455