kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_cli_config.py +175 -0
  4. kreuzberg/_extractors/_image.py +39 -4
  5. kreuzberg/_extractors/_pandoc.py +158 -18
  6. kreuzberg/_extractors/_pdf.py +199 -19
  7. kreuzberg/_extractors/_presentation.py +1 -1
  8. kreuzberg/_extractors/_spread_sheet.py +65 -7
  9. kreuzberg/_gmft.py +222 -16
  10. kreuzberg/_mime_types.py +62 -16
  11. kreuzberg/_multiprocessing/__init__.py +6 -0
  12. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  13. kreuzberg/_multiprocessing/process_manager.py +188 -0
  14. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  15. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  16. kreuzberg/_ocr/_easyocr.py +66 -10
  17. kreuzberg/_ocr/_paddleocr.py +86 -7
  18. kreuzberg/_ocr/_tesseract.py +136 -46
  19. kreuzberg/_playa.py +43 -0
  20. kreuzberg/_utils/_cache.py +372 -0
  21. kreuzberg/_utils/_device.py +356 -0
  22. kreuzberg/_utils/_document_cache.py +220 -0
  23. kreuzberg/_utils/_errors.py +232 -0
  24. kreuzberg/_utils/_pdf_lock.py +72 -0
  25. kreuzberg/_utils/_process_pool.py +100 -0
  26. kreuzberg/_utils/_serialization.py +82 -0
  27. kreuzberg/_utils/_string.py +1 -1
  28. kreuzberg/_utils/_sync.py +21 -0
  29. kreuzberg/cli.py +338 -0
  30. kreuzberg/extraction.py +247 -36
  31. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
  32. kreuzberg-3.3.0.dist-info/RECORD +48 -0
  33. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
  34. kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
  35. kreuzberg-3.1.7.dist-info/RECORD +0 -33
  36. kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
  37. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import warnings
3
4
  from dataclasses import dataclass
4
5
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
5
6
 
@@ -8,6 +9,7 @@ from PIL import Image
8
9
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
9
10
  from kreuzberg._ocr._base import OCRBackend
10
11
  from kreuzberg._types import ExtractionResult, Metadata
12
+ from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
11
13
  from kreuzberg._utils._string import normalize_spaces
12
14
  from kreuzberg._utils._sync import run_sync
13
15
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -55,7 +57,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
55
57
  "hr",
56
58
  "hu",
57
59
  "id",
58
- "inh", # codespell:ignore
60
+ "inh",
59
61
  "is",
60
62
  "it",
61
63
  "ja",
@@ -95,7 +97,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
95
97
  "sw",
96
98
  "ta",
97
99
  "tab",
98
- "te", # codespell:ignore
100
+ "te",
99
101
  "th",
100
102
  "tjk",
101
103
  "tl",
@@ -144,7 +146,13 @@ class EasyOCRConfig:
144
146
  text_threshold: float = 0.7
145
147
  """Text confidence threshold."""
146
148
  use_gpu: bool = False
147
- """Whether to use GPU for inference."""
149
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
150
+ device: DeviceType = "auto"
151
+ """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
152
+ gpu_memory_limit: float | None = None
153
+ """Maximum GPU memory to use in GB. None for no limit."""
154
+ fallback_to_cpu: bool = True
155
+ """Whether to fallback to CPU if requested device is unavailable."""
148
156
  width_ths: float = 0.5
149
157
  """Maximum horizontal distance for merging boxes."""
150
158
  x_ths: float = 1.0
@@ -253,11 +261,12 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
253
261
  content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
254
262
  )
255
263
 
264
+ # Group text boxes by lines based on Y coordinate # ~keep
256
265
  sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
257
266
  line_groups: list[list[Any]] = []
258
267
  current_line: list[Any] = []
259
268
  prev_y_center: float | None = None
260
- line_height_threshold = 20
269
+ line_height_threshold = 20 # Minimum distance to consider as new line # ~keep
261
270
 
262
271
  for item in sorted_results:
263
272
  box, text, confidence = item
@@ -280,7 +289,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
280
289
  confidence_count = 0
281
290
 
282
291
  for line in line_groups:
283
- line_sorted = sorted(line, key=lambda x: x[0][0][0])
292
+ line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
284
293
 
285
294
  for item in line_sorted:
286
295
  _, text, confidence = item
@@ -336,8 +345,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
336
345
  ) from e
337
346
 
338
347
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
339
- has_gpu = cls._is_gpu_available()
340
- kwargs.setdefault("gpu", has_gpu)
348
+
349
+ device_info = cls._resolve_device_config(**kwargs)
350
+ use_gpu = device_info.device_type in ("cuda", "mps")
351
+
341
352
  kwargs.setdefault("detector", True)
342
353
  kwargs.setdefault("recognizer", True)
343
354
  kwargs.setdefault("download_enabled", True)
@@ -347,12 +358,59 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
347
358
  cls._reader = await run_sync(
348
359
  easyocr.Reader,
349
360
  languages,
350
- gpu=kwargs.get("use_gpu"),
361
+ gpu=use_gpu,
351
362
  verbose=False,
352
363
  )
353
364
  except Exception as e:
354
365
  raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
355
366
 
367
+ @classmethod
368
+ def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
369
+ """Resolve device configuration with backward compatibility.
370
+
371
+ Args:
372
+ **kwargs: Configuration parameters including device settings.
373
+
374
+ Returns:
375
+ DeviceInfo object for the selected device.
376
+
377
+ Raises:
378
+ ValidationError: If requested device is not available and fallback is disabled.
379
+ """
380
+ use_gpu = kwargs.get("use_gpu", False)
381
+ device = kwargs.get("device", "auto")
382
+ memory_limit = kwargs.get("gpu_memory_limit")
383
+ fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
384
+
385
+ if use_gpu and device == "auto":
386
+ warnings.warn(
387
+ "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
388
+ "Use 'device=\"cuda\"' or 'device=\"auto\"' instead.",
389
+ DeprecationWarning,
390
+ stacklevel=4,
391
+ )
392
+
393
+ device = "auto" if use_gpu else "cpu"
394
+ elif use_gpu and device != "auto":
395
+ warnings.warn(
396
+ "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
397
+ "Using 'device' parameter value.",
398
+ DeprecationWarning,
399
+ stacklevel=4,
400
+ )
401
+
402
+ try:
403
+ return validate_device_request(
404
+ device,
405
+ "EasyOCR",
406
+ memory_limit=memory_limit,
407
+ fallback_to_cpu=fallback_to_cpu,
408
+ )
409
+ except ValidationError:
410
+ if not use_gpu and device == "cpu":
411
+ return DeviceInfo(device_type="cpu", name="CPU")
412
+ raise
413
+
356
414
  @staticmethod
357
415
  def _validate_language_code(language_codes: str | list[str]) -> list[str]:
358
416
  """Validate and normalize provided language codes.
@@ -367,10 +425,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
367
425
  A list with the normalized language codes.
368
426
  """
369
427
  if isinstance(language_codes, str):
370
- # Handle comma-separated language codes
371
428
  languages = [lang.strip().lower() for lang in language_codes.split(",")]
372
429
  else:
373
- # Handle list of language codes
374
430
  languages = [lang.lower() for lang in language_codes]
375
431
 
376
432
  unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import platform
4
+ import warnings
4
5
  from dataclasses import dataclass
5
6
  from importlib.util import find_spec
6
7
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -10,6 +11,7 @@ from PIL import Image
10
11
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
11
12
  from kreuzberg._ocr._base import OCRBackend
12
13
  from kreuzberg._types import ExtractionResult, Metadata
14
+ from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
13
15
  from kreuzberg._utils._string import normalize_spaces
14
16
  from kreuzberg._utils._sync import run_sync
15
17
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -91,7 +93,13 @@ class PaddleOCRConfig:
91
93
  use_angle_cls: bool = True
92
94
  """Whether to use text orientation classification model."""
93
95
  use_gpu: bool = False
94
- """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package"""
96
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
97
+ device: DeviceType = "auto"
98
+ """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
99
+ gpu_memory_limit: float | None = None
100
+ """Maximum GPU memory to use in GB. None for no limit."""
101
+ fallback_to_cpu: bool = True
102
+ """Whether to fallback to CPU if requested device is unavailable."""
95
103
  use_space_char: bool = True
96
104
  """Whether to recognize spaces."""
97
105
  use_zero_copy_run: bool = False
@@ -117,6 +125,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
117
125
  import numpy as np
118
126
 
119
127
  await self._init_paddle_ocr(**kwargs)
128
+
129
+ if image.mode != "RGB":
130
+ image = image.convert("RGB")
131
+
120
132
  image_np = np.array(image)
121
133
  try:
122
134
  result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
@@ -145,7 +157,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
145
157
  raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
146
158
 
147
159
  @staticmethod
148
- def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
160
+ def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
149
161
  """Process PaddleOCR result into an ExtractionResult with metadata.
150
162
 
151
163
  Args:
@@ -163,6 +175,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
163
175
  if not page_result:
164
176
  continue
165
177
 
178
+ # Group text boxes by lines based on Y coordinate # ~keep
166
179
  sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
167
180
  line_groups: list[list[Any]] = []
168
181
  current_line: list[Any] = []
@@ -171,7 +184,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
171
184
  for box in sorted_boxes:
172
185
  box_points, (_, _) = box
173
186
  current_y = sum(point[1] for point in box_points) / 4
174
- min_box_distance = 20
187
+ min_box_distance = 20 # Minimum distance to consider as new line # ~keep
175
188
 
176
189
  if prev_y is None or abs(current_y - prev_y) > min_box_distance:
177
190
  if current_line:
@@ -186,7 +199,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
186
199
  line_groups.append(current_line)
187
200
 
188
201
  for line in line_groups:
189
- line_sorted = sorted(line, key=lambda x: x[0][0][0])
202
+ line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
190
203
 
191
204
  for box in line_sorted:
192
205
  _, (text, confidence) = box
@@ -197,7 +210,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
197
210
 
198
211
  text_content += "\n"
199
212
 
200
- width, height = image.size
213
+ if hasattr(image, "width") and hasattr(image, "height"):
214
+ width = image.width
215
+ height = image.height
216
+ else:
217
+ width, height = image.size
201
218
  metadata = Metadata(
202
219
  width=width,
203
220
  height=height,
@@ -248,19 +265,81 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
248
265
  ) from e
249
266
 
250
267
  language = cls._validate_language_code(kwargs.pop("language", "en"))
268
+
269
+ device_info = cls._resolve_device_config(**kwargs)
270
+ use_gpu = device_info.device_type == "cuda"
271
+
251
272
  has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
252
273
  kwargs.setdefault("use_angle_cls", True)
253
- kwargs.setdefault("use_gpu", has_gpu_package)
254
- kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
274
+ kwargs["use_gpu"] = use_gpu and has_gpu_package
275
+ kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
255
276
  kwargs.setdefault("det_db_thresh", 0.3)
256
277
  kwargs.setdefault("det_db_box_thresh", 0.5)
257
278
  kwargs.setdefault("det_db_unclip_ratio", 1.6)
258
279
 
280
+ if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
281
+ kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
282
+
259
283
  try:
260
284
  cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
261
285
  except Exception as e:
262
286
  raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
263
287
 
288
+ @classmethod
289
+ def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
290
+ """Resolve device configuration with backward compatibility.
291
+
292
+ Args:
293
+ **kwargs: Configuration parameters including device settings.
294
+
295
+ Returns:
296
+ DeviceInfo object for the selected device.
297
+
298
+ Raises:
299
+ ValidationError: If requested device is not available and fallback is disabled.
300
+ """
301
+ use_gpu = kwargs.get("use_gpu", False)
302
+ device = kwargs.get("device", "auto")
303
+ memory_limit = kwargs.get("gpu_memory_limit")
304
+ fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
305
+
306
+ if use_gpu and device == "auto":
307
+ warnings.warn(
308
+ "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
309
+ "Use 'device=\"cuda\"' or 'device=\"auto\"' instead.",
310
+ DeprecationWarning,
311
+ stacklevel=4,
312
+ )
313
+
314
+ device = "auto" if use_gpu else "cpu"
315
+ elif use_gpu and device != "auto":
316
+ warnings.warn(
317
+ "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
318
+ "Using 'device' parameter value.",
319
+ DeprecationWarning,
320
+ stacklevel=4,
321
+ )
322
+
323
+ if device == "mps":
324
+ warnings.warn(
325
+ "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
326
+ UserWarning,
327
+ stacklevel=4,
328
+ )
329
+ device = "cpu"
330
+
331
+ try:
332
+ return validate_device_request(
333
+ device,
334
+ "PaddleOCR",
335
+ memory_limit=memory_limit,
336
+ fallback_to_cpu=fallback_to_cpu,
337
+ )
338
+ except ValidationError:
339
+ if not use_gpu and device == "cpu":
340
+ return DeviceInfo(device_type="cpu", name="CPU")
341
+ raise
342
+
264
343
  @staticmethod
265
344
  def _validate_language_code(lang_code: str) -> str:
266
345
  """Convert a language code to PaddleOCR format.
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import hashlib
3
4
  import re
4
5
  import sys
5
6
  from dataclasses import dataclass
@@ -144,7 +145,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
144
145
  "tel",
145
146
  "tgk",
146
147
  "tgl",
147
- "tha", # codespell:ignore
148
+ "tha",
148
149
  "tir",
149
150
  "ton",
150
151
  "tur",
@@ -153,7 +154,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
153
154
  "urd",
154
155
  "uzb",
155
156
  "uzb_cyrl",
156
- "vie", # codespell:ignore
157
+ "vie",
157
158
  "yid",
158
159
  "yor",
159
160
  }
@@ -227,62 +228,151 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
227
228
  image: Image,
228
229
  **kwargs: Unpack[TesseractConfig],
229
230
  ) -> ExtractionResult:
230
- await self._validate_tesseract_version()
231
- image_path, unlink = await create_temp_file(".png")
232
- await run_sync(image.save, str(image_path), format="PNG")
231
+ import io
232
+
233
+ from kreuzberg._utils._cache import get_ocr_cache
234
+
235
+ image_buffer = io.BytesIO()
236
+ await run_sync(image.save, image_buffer, format="PNG")
237
+ image_content = image_buffer.getvalue()
238
+
239
+ cache_kwargs = {
240
+ "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
241
+ "ocr_backend": "tesseract",
242
+ "ocr_config": str(sorted(kwargs.items())),
243
+ }
244
+
245
+ ocr_cache = get_ocr_cache()
246
+ cached_result = await ocr_cache.aget(**cache_kwargs)
247
+ if cached_result is not None:
248
+ return cached_result
249
+
250
+ if ocr_cache.is_processing(**cache_kwargs):
251
+ import anyio
252
+
253
+ event = ocr_cache.mark_processing(**cache_kwargs)
254
+ await anyio.to_thread.run_sync(event.wait)
255
+
256
+ # Try cache again after waiting for other process to complete # ~keep
257
+ cached_result = await ocr_cache.aget(**cache_kwargs)
258
+ if cached_result is not None:
259
+ return cached_result
260
+
261
+ ocr_cache.mark_processing(**cache_kwargs)
262
+
233
263
  try:
234
- return await self.process_file(image_path, **kwargs)
264
+ await self._validate_tesseract_version()
265
+ image_path, unlink = await create_temp_file(".png")
266
+ await run_sync(image.save, str(image_path), format="PNG")
267
+ try:
268
+ result = await self.process_file(image_path, **kwargs)
269
+
270
+ await ocr_cache.aset(result, **cache_kwargs)
271
+
272
+ return result
273
+ finally:
274
+ await unlink()
235
275
  finally:
236
- await unlink()
276
+ ocr_cache.mark_complete(**cache_kwargs)
237
277
 
238
278
  async def process_file(
239
279
  self,
240
280
  path: Path,
241
281
  **kwargs: Unpack[TesseractConfig],
242
282
  ) -> ExtractionResult:
243
- await self._validate_tesseract_version()
244
- output_path, unlink = await create_temp_file(".txt")
245
- language = self._validate_language_code(kwargs.pop("language", "eng"))
246
- psm = kwargs.pop("psm", PSMMode.AUTO)
283
+ from kreuzberg._utils._cache import get_ocr_cache
284
+
247
285
  try:
248
- output_base = str(output_path).replace(".txt", "")
249
- command = [
250
- "tesseract",
251
- str(path),
252
- output_base,
253
- "-l",
254
- language,
255
- "--psm",
256
- str(psm.value),
257
- "--oem",
258
- "1",
259
- "--loglevel",
260
- "OFF",
261
- ]
262
- for kwarg, value in kwargs.items():
263
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
264
-
265
- env: dict[str, Any] | None = None
266
- if sys.platform.startswith("linux"):
267
- # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
268
- env = {"OMP_THREAD_LIMIT": "1"}
269
-
270
- result = await run_process(command, env=env)
271
-
272
- if not result.returncode == 0:
273
- raise OCRError(
274
- "OCR failed with a non-0 return code.",
275
- context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
286
+ stat = path.stat()
287
+ file_info = {
288
+ "path": str(path.resolve()),
289
+ "size": stat.st_size,
290
+ "mtime": stat.st_mtime,
291
+ }
292
+ except OSError:
293
+ file_info = {
294
+ "path": str(path),
295
+ "size": 0,
296
+ "mtime": 0,
297
+ }
298
+
299
+ cache_kwargs = {
300
+ "file_info": str(sorted(file_info.items())),
301
+ "ocr_backend": "tesseract",
302
+ "ocr_config": str(sorted(kwargs.items())),
303
+ }
304
+
305
+ ocr_cache = get_ocr_cache()
306
+ cached_result = await ocr_cache.aget(**cache_kwargs)
307
+ if cached_result is not None:
308
+ return cached_result
309
+
310
+ if ocr_cache.is_processing(**cache_kwargs):
311
+ import anyio
312
+
313
+ event = ocr_cache.mark_processing(**cache_kwargs)
314
+ await anyio.to_thread.run_sync(event.wait)
315
+
316
+ # Try cache again after waiting for other process to complete # ~keep
317
+ cached_result = await ocr_cache.aget(**cache_kwargs)
318
+ if cached_result is not None:
319
+ return cached_result
320
+
321
+ ocr_cache.mark_processing(**cache_kwargs)
322
+
323
+ try:
324
+ await self._validate_tesseract_version()
325
+ output_path, unlink = await create_temp_file(".txt")
326
+ language = self._validate_language_code(kwargs.pop("language", "eng"))
327
+ psm = kwargs.pop("psm", PSMMode.AUTO)
328
+ try:
329
+ output_base = str(output_path).replace(".txt", "")
330
+ command = [
331
+ "tesseract",
332
+ str(path),
333
+ output_base,
334
+ "-l",
335
+ language,
336
+ "--psm",
337
+ str(psm.value),
338
+ "--oem",
339
+ "1",
340
+ "--loglevel",
341
+ "OFF",
342
+ ]
343
+ for kwarg, value in kwargs.items():
344
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
345
+
346
+ env: dict[str, Any] | None = None
347
+ if sys.platform.startswith("linux"):
348
+ env = {"OMP_THREAD_LIMIT": "1"}
349
+
350
+ result = await run_process(command, env=env)
351
+
352
+ if not result.returncode == 0:
353
+ raise OCRError(
354
+ "OCR failed with a non-0 return code.",
355
+ context={
356
+ "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
357
+ },
358
+ )
359
+
360
+ output = await AsyncPath(output_path).read_text("utf-8")
361
+ extraction_result = ExtractionResult(
362
+ content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
276
363
  )
277
364
 
278
- output = await AsyncPath(output_path).read_text("utf-8")
279
- return ExtractionResult(
280
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
281
- )
282
- except (RuntimeError, OSError) as e:
283
- raise OCRError(f"Failed to OCR using tesseract: {e}") from e
365
+ final_cache_kwargs = cache_kwargs.copy()
366
+ final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
367
+ await ocr_cache.aset(extraction_result, **final_cache_kwargs)
368
+
369
+ return extraction_result
370
+ except (RuntimeError, OSError) as e:
371
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
372
+ finally:
373
+ await unlink()
284
374
  finally:
285
- await unlink()
375
+ ocr_cache.mark_complete(**cache_kwargs)
286
376
 
287
377
  @classmethod
288
378
  async def _validate_tesseract_version(cls) -> None:
kreuzberg/_playa.py CHANGED
@@ -274,3 +274,46 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
274
274
 
275
275
  if subtitle and "title" in result and subtitle != result["title"]:
276
276
  result["subtitle"] = subtitle
277
+
278
+
279
+ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
280
+ """Synchronous version of extract_pdf_metadata.
281
+
282
+ Extract metadata from a PDF document without using async/await.
283
+
284
+ Args:
285
+ pdf_content: The bytes of the PDF document.
286
+
287
+ Raises:
288
+ ParsingError: If the PDF metadata could not be extracted.
289
+
290
+ Returns:
291
+ A dictionary of metadata extracted from the PDF.
292
+ """
293
+ try:
294
+ document = parse(pdf_content, max_workers=1)
295
+ metadata: Metadata = {}
296
+
297
+ for raw_info in document.info:
298
+ pdf_info = {k.lower(): v for k, v in asobj(raw_info).items()}
299
+ _extract_basic_metadata(pdf_info, metadata)
300
+ _extract_author_metadata(pdf_info, metadata)
301
+ _extract_keyword_metadata(pdf_info, metadata)
302
+ _extract_category_metadata(pdf_info, metadata)
303
+ _extract_date_metadata(pdf_info, metadata)
304
+ _extract_creator_metadata(pdf_info, metadata)
305
+
306
+ if document.pages:
307
+ _extract_document_dimensions(document, metadata)
308
+
309
+ if document.outline and "description" not in metadata:
310
+ metadata["description"] = _generate_outline_description(document)
311
+
312
+ if "summary" not in metadata:
313
+ metadata["summary"] = _generate_document_summary(document)
314
+
315
+ _extract_structure_information(document, metadata)
316
+
317
+ return metadata
318
+ except Exception as e:
319
+ raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e