kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -1,15 +1,24 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import warnings
|
4
|
-
from
|
5
|
-
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
4
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
6
5
|
|
7
6
|
from PIL import Image
|
8
7
|
|
9
8
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
10
9
|
from kreuzberg._ocr._base import OCRBackend
|
11
|
-
from kreuzberg._types import ExtractionResult, Metadata
|
12
|
-
from kreuzberg._utils._device import DeviceInfo,
|
10
|
+
from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
|
11
|
+
from kreuzberg._utils._device import DeviceInfo, validate_device_request
|
12
|
+
from kreuzberg._utils._ocr_cache import (
|
13
|
+
build_cache_kwargs,
|
14
|
+
cache_and_complete_async,
|
15
|
+
cache_and_complete_sync,
|
16
|
+
generate_image_hash,
|
17
|
+
get_file_info,
|
18
|
+
handle_cache_lookup_async,
|
19
|
+
handle_cache_lookup_sync,
|
20
|
+
mark_processing_complete,
|
21
|
+
)
|
13
22
|
from kreuzberg._utils._string import normalize_spaces
|
14
23
|
from kreuzberg._utils._sync import run_sync
|
15
24
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -22,6 +31,25 @@ try: # pragma: no cover
|
|
22
31
|
except ImportError: # pragma: no cover
|
23
32
|
from typing_extensions import Unpack
|
24
33
|
|
34
|
+
if TYPE_CHECKING:
|
35
|
+
import easyocr
|
36
|
+
import numpy as np
|
37
|
+
import torch
|
38
|
+
|
39
|
+
HAS_EASYOCR: bool
|
40
|
+
if not TYPE_CHECKING:
|
41
|
+
try:
|
42
|
+
import easyocr
|
43
|
+
import numpy as np
|
44
|
+
import torch
|
45
|
+
|
46
|
+
HAS_EASYOCR = True
|
47
|
+
except ImportError:
|
48
|
+
HAS_EASYOCR = False
|
49
|
+
easyocr: Any = None
|
50
|
+
np: Any = None
|
51
|
+
torch: Any = None
|
52
|
+
|
25
53
|
|
26
54
|
EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
27
55
|
"abq",
|
@@ -110,88 +138,32 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
110
138
|
}
|
111
139
|
|
112
140
|
|
113
|
-
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
114
|
-
class EasyOCRConfig:
|
115
|
-
"""Configuration options for EasyOCR."""
|
116
|
-
|
117
|
-
add_margin: float = 0.1
|
118
|
-
"""Extend bounding boxes in all directions."""
|
119
|
-
adjust_contrast: float = 0.5
|
120
|
-
"""Target contrast level for low contrast text."""
|
121
|
-
beam_width: int = 5
|
122
|
-
"""Beam width for beam search in recognition."""
|
123
|
-
canvas_size: int = 2560
|
124
|
-
"""Maximum image dimension for detection."""
|
125
|
-
contrast_ths: float = 0.1
|
126
|
-
"""Contrast threshold for preprocessing."""
|
127
|
-
decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
|
128
|
-
"""Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
|
129
|
-
height_ths: float = 0.5
|
130
|
-
"""Maximum difference in box height for merging."""
|
131
|
-
language: str | list[str] = "en"
|
132
|
-
"""Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
|
133
|
-
a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
|
134
|
-
link_threshold: float = 0.4
|
135
|
-
"""Link confidence threshold."""
|
136
|
-
low_text: float = 0.4
|
137
|
-
"""Text low-bound score."""
|
138
|
-
mag_ratio: float = 1.0
|
139
|
-
"""Image magnification ratio."""
|
140
|
-
min_size: int = 10
|
141
|
-
"""Minimum text box size in pixels."""
|
142
|
-
rotation_info: list[int] | None = None
|
143
|
-
"""List of angles to try for detection."""
|
144
|
-
slope_ths: float = 0.1
|
145
|
-
"""Maximum slope for merging text boxes."""
|
146
|
-
text_threshold: float = 0.7
|
147
|
-
"""Text confidence threshold."""
|
148
|
-
use_gpu: bool = False
|
149
|
-
"""Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
|
150
|
-
device: DeviceType = "auto"
|
151
|
-
"""Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
|
152
|
-
gpu_memory_limit: float | None = None
|
153
|
-
"""Maximum GPU memory to use in GB. None for no limit."""
|
154
|
-
fallback_to_cpu: bool = True
|
155
|
-
"""Whether to fallback to CPU if requested device is unavailable."""
|
156
|
-
width_ths: float = 0.5
|
157
|
-
"""Maximum horizontal distance for merging boxes."""
|
158
|
-
x_ths: float = 1.0
|
159
|
-
"""Maximum horizontal distance for paragraph merging."""
|
160
|
-
y_ths: float = 0.5
|
161
|
-
"""Maximum vertical distance for paragraph merging."""
|
162
|
-
ycenter_ths: float = 0.5
|
163
|
-
"""Maximum shift in y direction for merging."""
|
164
|
-
|
165
|
-
|
166
141
|
class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
167
142
|
_reader: ClassVar[Any] = None
|
168
143
|
|
169
144
|
async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
170
|
-
""
|
145
|
+
use_cache = kwargs.pop("use_cache", True)
|
171
146
|
|
172
|
-
|
173
|
-
|
174
|
-
|
147
|
+
cache_kwargs = None
|
148
|
+
if use_cache:
|
149
|
+
image_hash = generate_image_hash(image)
|
150
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
|
175
151
|
|
176
|
-
|
177
|
-
|
152
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
153
|
+
if cached_result:
|
154
|
+
return cached_result
|
178
155
|
|
179
|
-
|
180
|
-
|
181
|
-
"""
|
182
|
-
import numpy as np # noqa: PLC0415
|
183
|
-
|
184
|
-
await self._init_easyocr(**kwargs)
|
156
|
+
try:
|
157
|
+
await self._init_easyocr(**kwargs)
|
185
158
|
|
186
|
-
|
159
|
+
beam_width = kwargs.pop("beam_width", 5)
|
187
160
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
161
|
+
kwargs.pop("language", None)
|
162
|
+
kwargs.pop("use_gpu", None)
|
163
|
+
kwargs.pop("device", None)
|
164
|
+
kwargs.pop("gpu_memory_limit", None)
|
165
|
+
kwargs.pop("fallback_to_cpu", None)
|
193
166
|
|
194
|
-
try:
|
195
167
|
result = await run_sync(
|
196
168
|
self._reader.readtext,
|
197
169
|
np.array(image),
|
@@ -199,41 +171,47 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
199
171
|
**kwargs,
|
200
172
|
)
|
201
173
|
|
202
|
-
|
174
|
+
extraction_result = self._process_easyocr_result(result, image)
|
175
|
+
|
176
|
+
if use_cache and cache_kwargs:
|
177
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
178
|
+
|
179
|
+
return extraction_result
|
203
180
|
except Exception as e:
|
181
|
+
if use_cache and cache_kwargs:
|
182
|
+
mark_processing_complete(cache_kwargs)
|
204
183
|
raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
|
205
184
|
|
206
185
|
async def process_file(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
207
|
-
""
|
186
|
+
use_cache = kwargs.pop("use_cache", True)
|
208
187
|
|
209
|
-
|
210
|
-
|
211
|
-
|
188
|
+
cache_kwargs = None
|
189
|
+
if use_cache:
|
190
|
+
file_info = get_file_info(path)
|
191
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
|
212
192
|
|
213
|
-
|
214
|
-
|
193
|
+
cached_result = await handle_cache_lookup_async(cache_kwargs)
|
194
|
+
if cached_result:
|
195
|
+
return cached_result
|
215
196
|
|
216
|
-
Raises:
|
217
|
-
OCRError: If file loading or OCR processing fails.
|
218
|
-
"""
|
219
|
-
await self._init_easyocr(**kwargs)
|
220
197
|
try:
|
198
|
+
await self._init_easyocr(**kwargs)
|
221
199
|
image = await run_sync(Image.open, path)
|
222
|
-
|
200
|
+
|
201
|
+
kwargs["use_cache"] = False
|
202
|
+
extraction_result = await self.process_image(image, **kwargs)
|
203
|
+
|
204
|
+
if use_cache and cache_kwargs:
|
205
|
+
await cache_and_complete_async(extraction_result, cache_kwargs, use_cache)
|
206
|
+
|
207
|
+
return extraction_result
|
223
208
|
except Exception as e:
|
209
|
+
if use_cache and cache_kwargs:
|
210
|
+
mark_processing_complete(cache_kwargs)
|
224
211
|
raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
|
225
212
|
|
226
213
|
@staticmethod
|
227
214
|
def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
|
228
|
-
"""Process EasyOCR result into an ExtractionResult with metadata.
|
229
|
-
|
230
|
-
Args:
|
231
|
-
result: The raw result from EasyOCR.
|
232
|
-
image: The original PIL image.
|
233
|
-
|
234
|
-
Returns:
|
235
|
-
ExtractionResult: The extraction result containing text content, mime type, and metadata.
|
236
|
-
"""
|
237
215
|
if not result:
|
238
216
|
return ExtractionResult(
|
239
217
|
content="",
|
@@ -314,38 +292,19 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
314
292
|
|
315
293
|
@classmethod
|
316
294
|
def _is_gpu_available(cls) -> bool:
|
317
|
-
|
318
|
-
|
319
|
-
Returns:
|
320
|
-
bool: True if GPU support is available.
|
321
|
-
"""
|
322
|
-
try:
|
323
|
-
import torch # noqa: PLC0415
|
324
|
-
|
325
|
-
return bool(torch.cuda.is_available())
|
326
|
-
except ImportError: # pragma: no cover
|
295
|
+
if not HAS_EASYOCR or torch is None:
|
327
296
|
return False
|
297
|
+
return bool(torch.cuda.is_available())
|
328
298
|
|
329
299
|
@classmethod
|
330
300
|
async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
|
331
|
-
"""Initialize EasyOCR with the provided configuration.
|
332
|
-
|
333
|
-
Args:
|
334
|
-
**kwargs: Configuration parameters for EasyOCR including language, etc.
|
335
|
-
|
336
|
-
Raises:
|
337
|
-
MissingDependencyError: If EasyOCR is not installed.
|
338
|
-
OCRError: If initialization fails.
|
339
|
-
"""
|
340
301
|
if cls._reader is not None:
|
341
302
|
return
|
342
303
|
|
343
|
-
|
344
|
-
import easyocr # noqa: PLC0415
|
345
|
-
except ImportError as e: # pragma: no cover
|
304
|
+
if not HAS_EASYOCR or easyocr is None:
|
346
305
|
raise MissingDependencyError.create_for_package(
|
347
306
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
348
|
-
)
|
307
|
+
)
|
349
308
|
|
350
309
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
351
310
|
|
@@ -369,17 +328,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
369
328
|
|
370
329
|
@classmethod
|
371
330
|
def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
|
372
|
-
"""Resolve device configuration with backward compatibility.
|
373
|
-
|
374
|
-
Args:
|
375
|
-
**kwargs: Configuration parameters including device settings.
|
376
|
-
|
377
|
-
Returns:
|
378
|
-
DeviceInfo object for the selected device.
|
379
|
-
|
380
|
-
Raises:
|
381
|
-
ValidationError: If requested device is not available and fallback is disabled.
|
382
|
-
"""
|
383
331
|
use_gpu = kwargs.get("use_gpu", False)
|
384
332
|
device = kwargs.get("device", "auto")
|
385
333
|
memory_limit = kwargs.get("gpu_memory_limit")
|
@@ -416,17 +364,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
416
364
|
|
417
365
|
@staticmethod
|
418
366
|
def _validate_language_code(language_codes: str | list[str]) -> list[str]:
|
419
|
-
"""Validate and normalize provided language codes.
|
420
|
-
|
421
|
-
Args:
|
422
|
-
language_codes: The language code(s), either as a string (single or comma-separated) or a list.
|
423
|
-
|
424
|
-
Raises:
|
425
|
-
ValidationError: If any of the languages are not supported by EasyOCR
|
426
|
-
|
427
|
-
Returns:
|
428
|
-
A list with the normalized language codes.
|
429
|
-
"""
|
430
367
|
if isinstance(language_codes, str):
|
431
368
|
languages = [lang.strip().lower() for lang in language_codes.split(",")]
|
432
369
|
else:
|
@@ -445,80 +382,81 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
445
382
|
return languages
|
446
383
|
|
447
384
|
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
448
|
-
""
|
385
|
+
use_cache = kwargs.pop("use_cache", True)
|
449
386
|
|
450
|
-
|
451
|
-
|
452
|
-
|
387
|
+
cache_kwargs = None
|
388
|
+
if use_cache:
|
389
|
+
image_hash = generate_image_hash(image)
|
390
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, image_hash=image_hash)
|
453
391
|
|
454
|
-
|
455
|
-
|
392
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
393
|
+
if cached_result:
|
394
|
+
return cached_result
|
456
395
|
|
457
|
-
|
458
|
-
|
459
|
-
"""
|
460
|
-
import numpy as np # noqa: PLC0415
|
461
|
-
|
462
|
-
self._init_easyocr_sync(**kwargs)
|
396
|
+
try:
|
397
|
+
self._init_easyocr_sync(**kwargs)
|
463
398
|
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
399
|
+
beam_width = kwargs.pop("beam_width", 5)
|
400
|
+
kwargs.pop("language", None)
|
401
|
+
kwargs.pop("use_gpu", None)
|
402
|
+
kwargs.pop("device", None)
|
403
|
+
kwargs.pop("gpu_memory_limit", None)
|
404
|
+
kwargs.pop("fallback_to_cpu", None)
|
470
405
|
|
471
|
-
try:
|
472
406
|
result = self._reader.readtext(
|
473
407
|
np.array(image),
|
474
408
|
beamWidth=beam_width,
|
475
409
|
**kwargs,
|
476
410
|
)
|
477
411
|
|
478
|
-
|
412
|
+
extraction_result = self._process_easyocr_result(result, image)
|
413
|
+
|
414
|
+
if use_cache and cache_kwargs:
|
415
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
416
|
+
|
417
|
+
return extraction_result
|
479
418
|
except Exception as e:
|
419
|
+
if use_cache and cache_kwargs:
|
420
|
+
mark_processing_complete(cache_kwargs)
|
480
421
|
raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
|
481
422
|
|
482
423
|
def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
483
|
-
""
|
424
|
+
use_cache = kwargs.pop("use_cache", True)
|
484
425
|
|
485
|
-
|
486
|
-
|
487
|
-
|
426
|
+
cache_kwargs = None
|
427
|
+
if use_cache:
|
428
|
+
file_info = get_file_info(path)
|
429
|
+
cache_kwargs = build_cache_kwargs("easyocr", kwargs, file_info=file_info)
|
488
430
|
|
489
|
-
|
490
|
-
|
431
|
+
cached_result = handle_cache_lookup_sync(cache_kwargs)
|
432
|
+
if cached_result:
|
433
|
+
return cached_result
|
491
434
|
|
492
|
-
Raises:
|
493
|
-
OCRError: If file loading or OCR processing fails.
|
494
|
-
"""
|
495
|
-
self._init_easyocr_sync(**kwargs)
|
496
435
|
try:
|
436
|
+
self._init_easyocr_sync(**kwargs)
|
497
437
|
image = Image.open(path)
|
498
|
-
|
438
|
+
|
439
|
+
kwargs["use_cache"] = False
|
440
|
+
extraction_result = self.process_image_sync(image, **kwargs)
|
441
|
+
|
442
|
+
if use_cache and cache_kwargs:
|
443
|
+
cache_and_complete_sync(extraction_result, cache_kwargs, use_cache)
|
444
|
+
|
445
|
+
return extraction_result
|
499
446
|
except Exception as e:
|
447
|
+
if use_cache and cache_kwargs:
|
448
|
+
mark_processing_complete(cache_kwargs)
|
500
449
|
raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
|
501
450
|
|
502
451
|
@classmethod
|
503
452
|
def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
|
504
|
-
"""Synchronously initialize EasyOCR with the provided configuration.
|
505
|
-
|
506
|
-
Args:
|
507
|
-
**kwargs: Configuration parameters for EasyOCR including language, etc.
|
508
|
-
|
509
|
-
Raises:
|
510
|
-
MissingDependencyError: If EasyOCR is not installed.
|
511
|
-
OCRError: If initialization fails.
|
512
|
-
"""
|
513
453
|
if cls._reader is not None:
|
514
454
|
return
|
515
455
|
|
516
|
-
|
517
|
-
import easyocr # noqa: PLC0415
|
518
|
-
except ImportError as e: # pragma: no cover
|
456
|
+
if not HAS_EASYOCR or easyocr is None:
|
519
457
|
raise MissingDependencyError.create_for_package(
|
520
458
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
521
|
-
)
|
459
|
+
)
|
522
460
|
|
523
461
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
524
462
|
|