kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +66 -10
- kreuzberg/_ocr/_paddleocr.py +86 -7
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +356 -0
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
- kreuzberg-3.3.0.dist-info/RECORD +48 -0
- {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.1.7.dist-info/RECORD +0 -33
- kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
- {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import warnings
|
3
4
|
from dataclasses import dataclass
|
4
5
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
5
6
|
|
@@ -8,6 +9,7 @@ from PIL import Image
|
|
8
9
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
9
10
|
from kreuzberg._ocr._base import OCRBackend
|
10
11
|
from kreuzberg._types import ExtractionResult, Metadata
|
12
|
+
from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
|
11
13
|
from kreuzberg._utils._string import normalize_spaces
|
12
14
|
from kreuzberg._utils._sync import run_sync
|
13
15
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -55,7 +57,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
55
57
|
"hr",
|
56
58
|
"hu",
|
57
59
|
"id",
|
58
|
-
"inh",
|
60
|
+
"inh",
|
59
61
|
"is",
|
60
62
|
"it",
|
61
63
|
"ja",
|
@@ -95,7 +97,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
95
97
|
"sw",
|
96
98
|
"ta",
|
97
99
|
"tab",
|
98
|
-
"te",
|
100
|
+
"te",
|
99
101
|
"th",
|
100
102
|
"tjk",
|
101
103
|
"tl",
|
@@ -144,7 +146,13 @@ class EasyOCRConfig:
|
|
144
146
|
text_threshold: float = 0.7
|
145
147
|
"""Text confidence threshold."""
|
146
148
|
use_gpu: bool = False
|
147
|
-
"""Whether to use GPU for inference."""
|
149
|
+
"""Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
|
150
|
+
device: DeviceType = "auto"
|
151
|
+
"""Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
|
152
|
+
gpu_memory_limit: float | None = None
|
153
|
+
"""Maximum GPU memory to use in GB. None for no limit."""
|
154
|
+
fallback_to_cpu: bool = True
|
155
|
+
"""Whether to fallback to CPU if requested device is unavailable."""
|
148
156
|
width_ths: float = 0.5
|
149
157
|
"""Maximum horizontal distance for merging boxes."""
|
150
158
|
x_ths: float = 1.0
|
@@ -253,11 +261,12 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
253
261
|
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
|
254
262
|
)
|
255
263
|
|
264
|
+
# Group text boxes by lines based on Y coordinate # ~keep
|
256
265
|
sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
|
257
266
|
line_groups: list[list[Any]] = []
|
258
267
|
current_line: list[Any] = []
|
259
268
|
prev_y_center: float | None = None
|
260
|
-
line_height_threshold = 20
|
269
|
+
line_height_threshold = 20 # Minimum distance to consider as new line # ~keep
|
261
270
|
|
262
271
|
for item in sorted_results:
|
263
272
|
box, text, confidence = item
|
@@ -280,7 +289,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
280
289
|
confidence_count = 0
|
281
290
|
|
282
291
|
for line in line_groups:
|
283
|
-
line_sorted = sorted(line, key=lambda x: x[0][0][0])
|
292
|
+
line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
|
284
293
|
|
285
294
|
for item in line_sorted:
|
286
295
|
_, text, confidence = item
|
@@ -336,8 +345,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
336
345
|
) from e
|
337
346
|
|
338
347
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
339
|
-
|
340
|
-
|
348
|
+
|
349
|
+
device_info = cls._resolve_device_config(**kwargs)
|
350
|
+
use_gpu = device_info.device_type in ("cuda", "mps")
|
351
|
+
|
341
352
|
kwargs.setdefault("detector", True)
|
342
353
|
kwargs.setdefault("recognizer", True)
|
343
354
|
kwargs.setdefault("download_enabled", True)
|
@@ -347,12 +358,59 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
347
358
|
cls._reader = await run_sync(
|
348
359
|
easyocr.Reader,
|
349
360
|
languages,
|
350
|
-
gpu=
|
361
|
+
gpu=use_gpu,
|
351
362
|
verbose=False,
|
352
363
|
)
|
353
364
|
except Exception as e:
|
354
365
|
raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
|
355
366
|
|
367
|
+
@classmethod
def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
    """Resolve device configuration with backward compatibility.

    Reads ``use_gpu``, ``device``, ``gpu_memory_limit`` and ``fallback_to_cpu``
    from ``kwargs`` (defaults: ``False``, ``"auto"``, ``None``, ``True``) and
    delegates the actual device selection to ``validate_device_request``.
    Passing the deprecated ``use_gpu`` flag emits a ``DeprecationWarning``; the
    ``device`` value is always the one that is forwarded.

    Args:
        **kwargs: Configuration parameters including device settings.

    Returns:
        DeviceInfo object for the selected device.

    Raises:
        ValidationError: If requested device is not available and fallback is disabled.
    """
    use_gpu = kwargs.get("use_gpu", False)
    device = kwargs.get("device", "auto")
    memory_limit = kwargs.get("gpu_memory_limit")
    fallback_to_cpu = kwargs.get("fallback_to_cpu", True)

    if use_gpu:
        # `use_gpu` never changes the requested device: with device == "auto" the
        # request stays "auto", otherwise the explicit `device` wins. Only the
        # wording of the deprecation notice differs between the two cases.
        if device == "auto":
            message = (
                "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
                "Use 'device=\"cuda\"' or 'device=\"auto\"' instead."
            )
        else:
            message = (
                "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
                "Using 'device' parameter value."
            )
        # NOTE(review): stacklevel=4 presumably targets the public caller's frame - confirm.
        warnings.warn(message, DeprecationWarning, stacklevel=4)

    try:
        return validate_device_request(
            device,
            "EasyOCR",
            memory_limit=memory_limit,
            fallback_to_cpu=fallback_to_cpu,
        )
    except ValidationError:
        # A plain CPU request must always succeed, even if validation rejects it.
        if not use_gpu and device == "cpu":
            return DeviceInfo(device_type="cpu", name="CPU")
        raise
|
413
|
+
|
356
414
|
@staticmethod
|
357
415
|
def _validate_language_code(language_codes: str | list[str]) -> list[str]:
|
358
416
|
"""Validate and normalize provided language codes.
|
@@ -367,10 +425,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
367
425
|
A list with the normalized language codes.
|
368
426
|
"""
|
369
427
|
if isinstance(language_codes, str):
|
370
|
-
# Handle comma-separated language codes
|
371
428
|
languages = [lang.strip().lower() for lang in language_codes.split(",")]
|
372
429
|
else:
|
373
|
-
# Handle list of language codes
|
374
430
|
languages = [lang.lower() for lang in language_codes]
|
375
431
|
|
376
432
|
unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import platform
|
4
|
+
import warnings
|
4
5
|
from dataclasses import dataclass
|
5
6
|
from importlib.util import find_spec
|
6
7
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
@@ -10,6 +11,7 @@ from PIL import Image
|
|
10
11
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
11
12
|
from kreuzberg._ocr._base import OCRBackend
|
12
13
|
from kreuzberg._types import ExtractionResult, Metadata
|
14
|
+
from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
|
13
15
|
from kreuzberg._utils._string import normalize_spaces
|
14
16
|
from kreuzberg._utils._sync import run_sync
|
15
17
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -91,7 +93,13 @@ class PaddleOCRConfig:
|
|
91
93
|
use_angle_cls: bool = True
|
92
94
|
"""Whether to use text orientation classification model."""
|
93
95
|
use_gpu: bool = False
|
94
|
-
"""Whether to use GPU for inference.
|
96
|
+
"""Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
|
97
|
+
device: DeviceType = "auto"
|
98
|
+
"""Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
|
99
|
+
gpu_memory_limit: float | None = None
|
100
|
+
"""Maximum GPU memory to use in GB. None for no limit."""
|
101
|
+
fallback_to_cpu: bool = True
|
102
|
+
"""Whether to fallback to CPU if requested device is unavailable."""
|
95
103
|
use_space_char: bool = True
|
96
104
|
"""Whether to recognize spaces."""
|
97
105
|
use_zero_copy_run: bool = False
|
@@ -117,6 +125,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
117
125
|
import numpy as np
|
118
126
|
|
119
127
|
await self._init_paddle_ocr(**kwargs)
|
128
|
+
|
129
|
+
if image.mode != "RGB":
|
130
|
+
image = image.convert("RGB")
|
131
|
+
|
120
132
|
image_np = np.array(image)
|
121
133
|
try:
|
122
134
|
result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
|
@@ -145,7 +157,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
145
157
|
raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
|
146
158
|
|
147
159
|
@staticmethod
|
148
|
-
def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
|
160
|
+
def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
|
149
161
|
"""Process PaddleOCR result into an ExtractionResult with metadata.
|
150
162
|
|
151
163
|
Args:
|
@@ -163,6 +175,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
163
175
|
if not page_result:
|
164
176
|
continue
|
165
177
|
|
178
|
+
# Group text boxes by lines based on Y coordinate # ~keep
|
166
179
|
sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
|
167
180
|
line_groups: list[list[Any]] = []
|
168
181
|
current_line: list[Any] = []
|
@@ -171,7 +184,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
171
184
|
for box in sorted_boxes:
|
172
185
|
box_points, (_, _) = box
|
173
186
|
current_y = sum(point[1] for point in box_points) / 4
|
174
|
-
min_box_distance = 20
|
187
|
+
min_box_distance = 20 # Minimum distance to consider as new line # ~keep
|
175
188
|
|
176
189
|
if prev_y is None or abs(current_y - prev_y) > min_box_distance:
|
177
190
|
if current_line:
|
@@ -186,7 +199,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
186
199
|
line_groups.append(current_line)
|
187
200
|
|
188
201
|
for line in line_groups:
|
189
|
-
line_sorted = sorted(line, key=lambda x: x[0][0][0])
|
202
|
+
line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
|
190
203
|
|
191
204
|
for box in line_sorted:
|
192
205
|
_, (text, confidence) = box
|
@@ -197,7 +210,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
197
210
|
|
198
211
|
text_content += "\n"
|
199
212
|
|
200
|
-
|
213
|
+
if hasattr(image, "width") and hasattr(image, "height"):
|
214
|
+
width = image.width
|
215
|
+
height = image.height
|
216
|
+
else:
|
217
|
+
width, height = image.size
|
201
218
|
metadata = Metadata(
|
202
219
|
width=width,
|
203
220
|
height=height,
|
@@ -248,19 +265,81 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
248
265
|
) from e
|
249
266
|
|
250
267
|
language = cls._validate_language_code(kwargs.pop("language", "en"))
|
268
|
+
|
269
|
+
device_info = cls._resolve_device_config(**kwargs)
|
270
|
+
use_gpu = device_info.device_type == "cuda"
|
271
|
+
|
251
272
|
has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
|
252
273
|
kwargs.setdefault("use_angle_cls", True)
|
253
|
-
kwargs
|
254
|
-
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
|
274
|
+
kwargs["use_gpu"] = use_gpu and has_gpu_package
|
275
|
+
kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
|
255
276
|
kwargs.setdefault("det_db_thresh", 0.3)
|
256
277
|
kwargs.setdefault("det_db_box_thresh", 0.5)
|
257
278
|
kwargs.setdefault("det_db_unclip_ratio", 1.6)
|
258
279
|
|
280
|
+
if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
|
281
|
+
kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
|
282
|
+
|
259
283
|
try:
|
260
284
|
cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
|
261
285
|
except Exception as e:
|
262
286
|
raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
|
263
287
|
|
288
|
+
@classmethod
def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
    """Resolve device configuration with backward compatibility.

    Reads ``use_gpu``, ``device``, ``gpu_memory_limit`` and ``fallback_to_cpu``
    from ``kwargs`` (defaults: ``False``, ``"auto"``, ``None``, ``True``) and
    delegates the actual device selection to ``validate_device_request``.
    Passing the deprecated ``use_gpu`` flag emits a ``DeprecationWarning``. An
    explicit ``"mps"`` request is replaced by ``"cpu"`` with a ``UserWarning``.

    Args:
        **kwargs: Configuration parameters including device settings.

    Returns:
        DeviceInfo object for the selected device.

    Raises:
        ValidationError: If requested device is not available and fallback is disabled.
    """
    use_gpu = kwargs.get("use_gpu", False)
    device = kwargs.get("device", "auto")
    memory_limit = kwargs.get("gpu_memory_limit")
    fallback_to_cpu = kwargs.get("fallback_to_cpu", True)

    if use_gpu:
        # `use_gpu` never changes the requested device: with device == "auto" the
        # request stays "auto", otherwise the explicit `device` wins. Only the
        # wording of the deprecation notice differs between the two cases.
        if device == "auto":
            message = (
                "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
                "Use 'device=\"cuda\"' or 'device=\"auto\"' instead."
            )
        else:
            message = (
                "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
                "Using 'device' parameter value."
            )
        # NOTE(review): stacklevel=4 presumably targets the public caller's frame - confirm.
        warnings.warn(message, DeprecationWarning, stacklevel=4)

    if device == "mps":
        warnings.warn(
            "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
            UserWarning,
            stacklevel=4,
        )
        device = "cpu"

    try:
        return validate_device_request(
            device,
            "PaddleOCR",
            memory_limit=memory_limit,
            fallback_to_cpu=fallback_to_cpu,
        )
    except ValidationError:
        # A plain CPU request must always succeed, even if validation rejects it.
        if not use_gpu and device == "cpu":
            return DeviceInfo(device_type="cpu", name="CPU")
        raise
|
342
|
+
|
264
343
|
@staticmethod
|
265
344
|
def _validate_language_code(lang_code: str) -> str:
|
266
345
|
"""Convert a language code to PaddleOCR format.
|
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import hashlib
|
3
4
|
import re
|
4
5
|
import sys
|
5
6
|
from dataclasses import dataclass
|
@@ -144,7 +145,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
144
145
|
"tel",
|
145
146
|
"tgk",
|
146
147
|
"tgl",
|
147
|
-
"tha",
|
148
|
+
"tha",
|
148
149
|
"tir",
|
149
150
|
"ton",
|
150
151
|
"tur",
|
@@ -153,7 +154,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
153
154
|
"urd",
|
154
155
|
"uzb",
|
155
156
|
"uzb_cyrl",
|
156
|
-
"vie",
|
157
|
+
"vie",
|
157
158
|
"yid",
|
158
159
|
"yor",
|
159
160
|
}
|
@@ -227,62 +228,151 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
227
228
|
image: Image,
|
228
229
|
**kwargs: Unpack[TesseractConfig],
|
229
230
|
) -> ExtractionResult:
|
230
|
-
|
231
|
-
|
232
|
-
|
231
|
+
import io
|
232
|
+
|
233
|
+
from kreuzberg._utils._cache import get_ocr_cache
|
234
|
+
|
235
|
+
image_buffer = io.BytesIO()
|
236
|
+
await run_sync(image.save, image_buffer, format="PNG")
|
237
|
+
image_content = image_buffer.getvalue()
|
238
|
+
|
239
|
+
cache_kwargs = {
|
240
|
+
"image_hash": hashlib.sha256(image_content).hexdigest()[:16],
|
241
|
+
"ocr_backend": "tesseract",
|
242
|
+
"ocr_config": str(sorted(kwargs.items())),
|
243
|
+
}
|
244
|
+
|
245
|
+
ocr_cache = get_ocr_cache()
|
246
|
+
cached_result = await ocr_cache.aget(**cache_kwargs)
|
247
|
+
if cached_result is not None:
|
248
|
+
return cached_result
|
249
|
+
|
250
|
+
if ocr_cache.is_processing(**cache_kwargs):
|
251
|
+
import anyio
|
252
|
+
|
253
|
+
event = ocr_cache.mark_processing(**cache_kwargs)
|
254
|
+
await anyio.to_thread.run_sync(event.wait)
|
255
|
+
|
256
|
+
# Try cache again after waiting for other process to complete # ~keep
|
257
|
+
cached_result = await ocr_cache.aget(**cache_kwargs)
|
258
|
+
if cached_result is not None:
|
259
|
+
return cached_result
|
260
|
+
|
261
|
+
ocr_cache.mark_processing(**cache_kwargs)
|
262
|
+
|
233
263
|
try:
|
234
|
-
|
264
|
+
await self._validate_tesseract_version()
|
265
|
+
image_path, unlink = await create_temp_file(".png")
|
266
|
+
await run_sync(image.save, str(image_path), format="PNG")
|
267
|
+
try:
|
268
|
+
result = await self.process_file(image_path, **kwargs)
|
269
|
+
|
270
|
+
await ocr_cache.aset(result, **cache_kwargs)
|
271
|
+
|
272
|
+
return result
|
273
|
+
finally:
|
274
|
+
await unlink()
|
235
275
|
finally:
|
236
|
-
|
276
|
+
ocr_cache.mark_complete(**cache_kwargs)
|
237
277
|
|
238
278
|
async def process_file(
    self,
    path: Path,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Run the ``tesseract`` CLI on a file, reusing a cached result when one exists.

    The cache key combines the file's resolved path, size and mtime (zeros if
    the file cannot be stat-ed), the backend name and the OCR options. The
    ``language`` and ``psm`` options are normalised *before* the key is built so
    that lookups, in-progress marks and the final store all use the same key.

    Args:
        path: Path of the image file to OCR.
        **kwargs: Tesseract options. ``language`` (default ``"eng"``) and ``psm``
            (default ``PSMMode.AUTO``) are consumed here; every remaining option
            is forwarded as ``-c name=1`` or ``-c name=0`` according to its
            truthiness.

    Returns:
        The extraction result with whitespace-normalised plain-text content.

    Raises:
        OCRError: If tesseract exits with a non-zero return code, or if a
            ``RuntimeError``/``OSError`` occurs while running it or reading
            its output.
    """
    from kreuzberg._utils._cache import get_ocr_cache

    # Resolve these first: validating before any temp file or processing mark
    # exists means an invalid language cannot leak either of them.
    language = self._validate_language_code(kwargs.pop("language", "eng"))
    psm = kwargs.pop("psm", PSMMode.AUTO)

    try:
        stat = path.stat()
        file_info = {
            "path": str(path.resolve()),
            "size": stat.st_size,
            "mtime": stat.st_mtime,
        }
    except OSError:
        file_info = {
            "path": str(path),
            "size": 0,
            "mtime": 0,
        }

    cache_kwargs = {
        "file_info": str(sorted(file_info.items())),
        "ocr_backend": "tesseract",
        # Same shape the result is stored under, so omitted/un-normalised
        # language or psm values no longer cause permanent cache misses.
        "ocr_config": str(sorted({**kwargs, "language": language, "psm": psm}.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = await ocr_cache.aget(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        import anyio

        # NOTE(review): mark_processing presumably returns the event of the run
        # already in flight - confirm against the cache implementation.
        event = ocr_cache.mark_processing(**cache_kwargs)
        await anyio.to_thread.run_sync(event.wait)

        # Try cache again after waiting for other process to complete  # ~keep
        cached_result = await ocr_cache.aget(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        await self._validate_tesseract_version()
        output_path, unlink = await create_temp_file(".txt")
        try:
            # The CLI takes an output *base* name; strip only the trailing
            # ".txt", never a ".txt" that happens to appear inside a directory.
            output_file = str(output_path)
            output_base = output_file[: -len(".txt")] if output_file.endswith(".txt") else output_file
            command = [
                "tesseract",
                str(path),
                output_base,
                "-l",
                language,
                "--psm",
                str(psm.value),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]
            for kwarg, value in kwargs.items():
                command.extend(["-c", f"{kwarg}={1 if value else 0}"])

            env: dict[str, Any] | None = None
            if sys.platform.startswith("linux"):
                # NOTE(review): presumably limits Tesseract's OpenMP threads to
                # avoid oversubscription; verify how run_process treats `env`.
                env = {"OMP_THREAD_LIMIT": "1"}

            result = await run_process(command, env=env)

            if result.returncode != 0:
                raise OCRError(
                    "OCR failed with a non-0 return code.",
                    context={
                        "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
                    },
                )

            output = await AsyncPath(output_path).read_text("utf-8")
            extraction_result = ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )

            await ocr_cache.aset(extraction_result, **cache_kwargs)

            return extraction_result
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            await unlink()
    finally:
        ocr_cache.mark_complete(**cache_kwargs)
|
286
376
|
|
287
377
|
@classmethod
|
288
378
|
async def _validate_tesseract_version(cls) -> None:
|
kreuzberg/_playa.py
CHANGED
@@ -274,3 +274,46 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
|
|
274
274
|
|
275
275
|
if subtitle and "title" in result and subtitle != result["title"]:
|
276
276
|
result["subtitle"] = subtitle
|
277
|
+
|
278
|
+
|
279
|
+
def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
    """Extract metadata from a PDF document synchronously.

    Blocking counterpart of ``extract_pdf_metadata``: the document is parsed
    with a single worker and no async/await is involved.

    Args:
        pdf_content: The bytes of the PDF document.

    Raises:
        ParsingError: If the PDF metadata could not be extracted.

    Returns:
        A dictionary of metadata extracted from the PDF.
    """
    try:
        # Applied, in this order, to every info dictionary of the document.
        info_extractors = (
            _extract_basic_metadata,
            _extract_author_metadata,
            _extract_keyword_metadata,
            _extract_category_metadata,
            _extract_date_metadata,
            _extract_creator_metadata,
        )

        document = parse(pdf_content, max_workers=1)
        metadata: Metadata = {}

        for raw_info in document.info:
            # Lower-case the keys so the extractors can look them up uniformly.
            pdf_info = {key.lower(): value for key, value in asobj(raw_info).items()}
            for extract in info_extractors:
                extract(pdf_info, metadata)

        if document.pages:
            _extract_document_dimensions(document, metadata)

        # Generated values only fill gaps; fields set from the info dictionaries win.
        if document.outline:
            if "description" not in metadata:
                metadata["description"] = _generate_outline_description(document)

        if "summary" not in metadata:
            metadata["summary"] = _generate_document_summary(document)

        _extract_structure_information(document, metadata)
    except Exception as e:
        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
    else:
        return metadata
|