kreuzberg 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +0 -124
- kreuzberg/_document_classification.py +20 -39
- kreuzberg/_entity_extraction.py +0 -29
- kreuzberg/_extractors/_base.py +4 -66
- kreuzberg/_extractors/_email.py +0 -4
- kreuzberg/_extractors/_image.py +0 -2
- kreuzberg/_extractors/_pandoc.py +0 -58
- kreuzberg/_extractors/_pdf.py +0 -3
- kreuzberg/_extractors/_presentation.py +0 -82
- kreuzberg/_extractors/_spread_sheet.py +0 -2
- kreuzberg/_gmft.py +0 -61
- kreuzberg/_language_detection.py +0 -14
- kreuzberg/_mime_types.py +0 -17
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +110 -85
- kreuzberg/_ocr/_paddleocr.py +146 -138
- kreuzberg/_ocr/_table_extractor.py +0 -76
- kreuzberg/_ocr/_tesseract.py +0 -206
- kreuzberg/_playa.py +0 -27
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +16 -119
- kreuzberg/_utils/_cache.py +0 -52
- kreuzberg/_utils/_device.py +0 -56
- kreuzberg/_utils/_document_cache.py +0 -73
- kreuzberg/_utils/_errors.py +0 -47
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -14
- kreuzberg/_utils/_process_pool.py +0 -47
- kreuzberg/_utils/_quality.py +0 -17
- kreuzberg/_utils/_ref.py +0 -16
- kreuzberg/_utils/_serialization.py +0 -25
- kreuzberg/_utils/_string.py +0 -20
- kreuzberg/_utils/_sync.py +0 -76
- kreuzberg/_utils/_table.py +0 -45
- kreuzberg/_utils/_tmp.py +0 -9
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.13.0.dist-info/RECORD +0 -56
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -9,6 +9,16 @@ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
|
9
9
|
from kreuzberg._ocr._base import OCRBackend
|
10
10
|
from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
|
11
11
|
from kreuzberg._utils._device import DeviceInfo, validate_device_request
|
12
|
+
from kreuzberg._utils._ocr_cache import (
|
13
|
+
build_cache_kwargs,
|
14
|
+
cache_and_complete_async,
|
15
|
+
cache_and_complete_sync,
|
16
|
+
generate_image_hash,
|
17
|
+
get_file_info,
|
18
|
+
handle_cache_lookup_async,
|
19
|
+
handle_cache_lookup_sync,
|
20
|
+
mark_processing_complete,
|
21
|
+
)
|
12
22
|
from kreuzberg._utils._string import normalize_spaces
|
13
23
|
from kreuzberg._utils._sync import run_sync
|
14
24
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -21,17 +31,24 @@ try: # pragma: no cover
|
|
21
31
|
except ImportError: # pragma: no cover
|
22
32
|
from typing_extensions import Unpack
|
23
33
|
|
24
|
-
|
34
|
+
if TYPE_CHECKING:
|
25
35
|
import easyocr
|
26
36
|
import numpy as np
|
27
37
|
import torch
|
28
38
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
39
|
+
HAS_EASYOCR: bool
|
40
|
+
if not TYPE_CHECKING:
|
41
|
+
try:
|
42
|
+
import easyocr
|
43
|
+
import numpy as np
|
44
|
+
import torch
|
45
|
+
|
46
|
+
HAS_EASYOCR = True
|
47
|
+
except ImportError:
|
48
|
+
HAS_EASYOCR = False
|
49
|
+
easyocr: Any = None
|
50
|
+
np: Any = None
|
51
|
+
torch: Any = None
|
35
52
|
|
36
53
|
|
37
54
|
EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
@@ -125,29 +142,28 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
125
142
|
_reader: ClassVar[Any] = None
|
126
143
|
|
127
144
|
async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
128
|
-
""
|
145
|
+
use_cache = kwargs.pop("use_cache", True)
|
129
146
|
|
130
|
-
|
131
|
-
|
132
|
-
|
147
|
+
cache_kwargs = None
|
148
|
+
if use_cache:
|
149
|
+
image_hash = generate_image_hash(image)
|
150
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
|
133
151
|
|
134
|
-
|
135
|
-
|
152
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
153
|
+
if cached_result:
|
154
|
+
return cached_result
|
136
155
|
|
137
|
-
|
138
|
-
|
139
|
-
"""
|
140
|
-
await self._init_easyocr(**kwargs)
|
156
|
+
try:
|
157
|
+
await self._init_easyocr(**kwargs)
|
141
158
|
|
142
|
-
|
159
|
+
beam_width = kwargs.pop("beam_width", 5)
|
143
160
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
161
|
+
kwargs.pop("language", None)
|
162
|
+
kwargs.pop("use_gpu", None)
|
163
|
+
kwargs.pop("device", None)
|
164
|
+
kwargs.pop("gpu_memory_limit", None)
|
165
|
+
kwargs.pop("fallback_to_cpu", None)
|
149
166
|
|
150
|
-
try:
|
151
167
|
result = await run_sync(
|
152
168
|
self._reader.readtext,
|
153
169
|
np.array(image),
|
@@ -155,28 +171,43 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
155
171
|
**kwargs,
|
156
172
|
)
|
157
173
|
|
158
|
-
|
174
|
+
extraction_result = self._process_easyocr_result(result, image)
|
175
|
+
|
176
|
+
if use_cache and cache_kwargs:
|
177
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
178
|
+
|
179
|
+
return extraction_result
|
159
180
|
except Exception as e:
|
181
|
+
if use_cache and cache_kwargs:
|
182
|
+
mark_processing_complete(cache_kwargs)
|
160
183
|
raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
|
161
184
|
|
162
185
|
async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
163
|
-
""
|
186
|
+
use_cache = kwargs.pop("use_cache", True)
|
164
187
|
|
165
|
-
|
166
|
-
|
167
|
-
|
188
|
+
cache_kwargs = None
|
189
|
+
if use_cache:
|
190
|
+
file_info = get_file_info(path)
|
191
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
|
168
192
|
|
169
|
-
|
170
|
-
|
193
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
194
|
+
if cached_result:
|
195
|
+
return cached_result
|
171
196
|
|
172
|
-
Raises:
|
173
|
-
OCRError: If file loading or OCR processing fails.
|
174
|
-
"""
|
175
|
-
await self._init_easyocr(**kwargs)
|
176
197
|
try:
|
198
|
+
await self._init_easyocr(**kwargs)
|
177
199
|
image = await run_sync(Image.open, path)
|
178
|
-
|
200
|
+
|
201
|
+
kwargs["use_cache"] = False
|
202
|
+
extraction_result = await self.process_image(image, **kwargs)
|
203
|
+
|
204
|
+
if use_cache and cache_kwargs:
|
205
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
206
|
+
|
207
|
+
return extraction_result
|
179
208
|
except Exception as e:
|
209
|
+
if use_cache and cache_kwargs:
|
210
|
+
mark_processing_complete(cache_kwargs)
|
180
211
|
raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
|
181
212
|
|
182
213
|
@staticmethod
|
@@ -333,17 +364,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
333
364
|
|
334
365
|
@staticmethod
|
335
366
|
def _validate_language_code(language_codes: str | list[str]) -> list[str]:
|
336
|
-
"""Validate and normalize provided language codes.
|
337
|
-
|
338
|
-
Args:
|
339
|
-
language_codes: The language code(s), either as a string (single or comma-separated) or a list.
|
340
|
-
|
341
|
-
Raises:
|
342
|
-
ValidationError: If any of the languages are not supported by EasyOCR
|
343
|
-
|
344
|
-
Returns:
|
345
|
-
A list with the normalized language codes.
|
346
|
-
"""
|
347
367
|
if isinstance(language_codes, str):
|
348
368
|
languages = [lang.strip().lower() for lang in language_codes.split(",")]
|
349
369
|
else:
|
@@ -362,69 +382,74 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
362
382
|
return languages
|
363
383
|
|
364
384
|
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
365
|
-
""
|
385
|
+
use_cache = kwargs.pop("use_cache", True)
|
366
386
|
|
367
|
-
|
368
|
-
|
369
|
-
|
387
|
+
cache_kwargs = None
|
388
|
+
if use_cache:
|
389
|
+
image_hash = generate_image_hash(image)
|
390
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
|
370
391
|
|
371
|
-
|
372
|
-
|
392
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
393
|
+
if cached_result:
|
394
|
+
return cached_result
|
373
395
|
|
374
|
-
|
375
|
-
|
376
|
-
"""
|
377
|
-
self._init_easyocr_sync(**kwargs)
|
396
|
+
try:
|
397
|
+
self._init_easyocr_sync(**kwargs)
|
378
398
|
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
399
|
+
beam_width = kwargs.pop("beam_width", 5)
|
400
|
+
kwargs.pop("language", None)
|
401
|
+
kwargs.pop("use_gpu", None)
|
402
|
+
kwargs.pop("device", None)
|
403
|
+
kwargs.pop("gpu_memory_limit", None)
|
404
|
+
kwargs.pop("fallback_to_cpu", None)
|
385
405
|
|
386
|
-
try:
|
387
406
|
result = self._reader.readtext(
|
388
407
|
np.array(image),
|
389
408
|
beamWidth=beam_width,
|
390
409
|
**kwargs,
|
391
410
|
)
|
392
411
|
|
393
|
-
|
412
|
+
extraction_result = self._process_easyocr_result(result, image)
|
413
|
+
|
414
|
+
if use_cache and cache_kwargs:
|
415
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
416
|
+
|
417
|
+
return extraction_result
|
394
418
|
except Exception as e:
|
419
|
+
if use_cache and cache_kwargs:
|
420
|
+
mark_processing_complete(cache_kwargs)
|
395
421
|
raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
|
396
422
|
|
397
423
|
def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
398
|
-
""
|
424
|
+
use_cache = kwargs.pop("use_cache", True)
|
399
425
|
|
400
|
-
|
401
|
-
|
402
|
-
|
426
|
+
cache_kwargs = None
|
427
|
+
if use_cache:
|
428
|
+
file_info = get_file_info(path)
|
429
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
|
403
430
|
|
404
|
-
|
405
|
-
|
431
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
432
|
+
if cached_result:
|
433
|
+
return cached_result
|
406
434
|
|
407
|
-
Raises:
|
408
|
-
OCRError: If file loading or OCR processing fails.
|
409
|
-
"""
|
410
|
-
self._init_easyocr_sync(**kwargs)
|
411
435
|
try:
|
436
|
+
self._init_easyocr_sync(**kwargs)
|
412
437
|
image = Image.open(path)
|
413
|
-
|
438
|
+
|
439
|
+
kwargs["use_cache"] = False
|
440
|
+
extraction_result = self.process_image_sync(image, **kwargs)
|
441
|
+
|
442
|
+
if use_cache and cache_kwargs:
|
443
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
444
|
+
|
445
|
+
return extraction_result
|
414
446
|
except Exception as e:
|
447
|
+
if use_cache and cache_kwargs:
|
448
|
+
mark_processing_complete(cache_kwargs)
|
415
449
|
raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
|
416
450
|
|
417
451
|
@classmethod
|
418
452
|
def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
|
419
|
-
"""Synchronously initialize EasyOCR with the provided configuration.
|
420
|
-
|
421
|
-
Args:
|
422
|
-
**kwargs: Configuration parameters for EasyOCR including language, etc.
|
423
|
-
|
424
|
-
Raises:
|
425
|
-
MissingDependencyError: If EasyOCR is not installed.
|
426
|
-
OCRError: If initialization fails.
|
427
|
-
"""
|
428
453
|
if cls._reader is not None:
|
429
454
|
return
|
430
455
|
|