kreuzberg 3.1.7__py3-none-any.whl → 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -202,7 +202,7 @@ class PresentationExtractor(Extractor):
202
202
  ("keywords", "keywords"),
203
203
  ("modified_by", "last_modified_by"),
204
204
  ("modified_at", "modified"),
205
- ("version", "revision"), # if version and revision are given, version overwrites ~keep
205
+ ("version", "revision"), # if version and revision are given, version overwrites
206
206
  ("subject", "subject"),
207
207
  ("title", "title"),
208
208
  ("version", "version"),
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import warnings
3
4
  from dataclasses import dataclass
4
5
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
5
6
 
@@ -8,6 +9,7 @@ from PIL import Image
8
9
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
9
10
  from kreuzberg._ocr._base import OCRBackend
10
11
  from kreuzberg._types import ExtractionResult, Metadata
12
+ from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
11
13
  from kreuzberg._utils._string import normalize_spaces
12
14
  from kreuzberg._utils._sync import run_sync
13
15
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -144,7 +146,13 @@ class EasyOCRConfig:
144
146
  text_threshold: float = 0.7
145
147
  """Text confidence threshold."""
146
148
  use_gpu: bool = False
147
- """Whether to use GPU for inference."""
149
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
150
+ device: DeviceType = "auto"
151
+ """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
152
+ gpu_memory_limit: float | None = None
153
+ """Maximum GPU memory to use in GB. None for no limit."""
154
+ fallback_to_cpu: bool = True
155
+ """Whether to fallback to CPU if requested device is unavailable."""
148
156
  width_ths: float = 0.5
149
157
  """Maximum horizontal distance for merging boxes."""
150
158
  x_ths: float = 1.0
@@ -336,8 +344,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
336
344
  ) from e
337
345
 
338
346
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
339
- has_gpu = cls._is_gpu_available()
340
- kwargs.setdefault("gpu", has_gpu)
347
+
348
+ # Handle device selection with backward compatibility
349
+ device_info = cls._resolve_device_config(**kwargs)
350
+ use_gpu = device_info.device_type in ("cuda", "mps")
351
+
341
352
  kwargs.setdefault("detector", True)
342
353
  kwargs.setdefault("recognizer", True)
343
354
  kwargs.setdefault("download_enabled", True)
@@ -347,12 +358,63 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
347
358
  cls._reader = await run_sync(
348
359
  easyocr.Reader,
349
360
  languages,
350
- gpu=kwargs.get("use_gpu"),
361
+ gpu=use_gpu,
351
362
  verbose=False,
352
363
  )
353
364
  except Exception as e:
354
365
  raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
355
366
 
367
@classmethod
def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
    """Pick the compute device for EasyOCR from the supplied configuration.

    The deprecated ``use_gpu`` flag is still accepted: when it is truthy a
    ``DeprecationWarning`` is emitted, but the value of ``device`` (default
    ``"auto"``) is what is actually forwarded to device validation.

    Args:
        **kwargs: EasyOCR configuration values. The keys read here are
            ``use_gpu``, ``device``, ``gpu_memory_limit`` and
            ``fallback_to_cpu``.

    Returns:
        The DeviceInfo returned by ``validate_device_request``.

    Raises:
        ValidationError: Propagated from ``validate_device_request`` unless
            the request was a plain CPU request without ``use_gpu``.
    """
    legacy_gpu_flag = kwargs.get("use_gpu", False)
    requested = kwargs.get("device", "auto")

    if legacy_gpu_flag:
        # Two wordings: one when only the legacy flag was given, one when it
        # was combined with an explicit device. In both cases ``requested``
        # is left untouched ("auto" stays "auto").
        if requested == "auto":
            message = (
                "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
                "Use 'device=\"cuda\"' or 'device=\"auto\"' instead."
            )
        else:
            message = (
                "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
                "Using 'device' parameter value."
            )
        # NOTE(review): stacklevel=4 presumably targets the caller of the public OCR API - confirm.
        warnings.warn(message, DeprecationWarning, stacklevel=4)

    try:
        resolved = validate_device_request(
            requested,
            "EasyOCR",
            memory_limit=kwargs.get("gpu_memory_limit"),
            fallback_to_cpu=kwargs.get("fallback_to_cpu", True),
        )
    except ValidationError:
        # Only a plain CPU request (no legacy GPU flag) is rescued with a
        # synthetic CPU device; every other failure is re-raised unchanged.
        if legacy_gpu_flag or requested != "cpu":
            raise
        return DeviceInfo(device_type="cpu", name="CPU")

    return resolved
417
+
356
418
  @staticmethod
357
419
  def _validate_language_code(language_codes: str | list[str]) -> list[str]:
358
420
  """Validate and normalize provided language codes.
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import platform
4
+ import warnings
4
5
  from dataclasses import dataclass
5
6
  from importlib.util import find_spec
6
7
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -10,6 +11,7 @@ from PIL import Image
10
11
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
11
12
  from kreuzberg._ocr._base import OCRBackend
12
13
  from kreuzberg._types import ExtractionResult, Metadata
14
+ from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
13
15
  from kreuzberg._utils._string import normalize_spaces
14
16
  from kreuzberg._utils._sync import run_sync
15
17
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -91,7 +93,13 @@ class PaddleOCRConfig:
91
93
  use_angle_cls: bool = True
92
94
  """Whether to use text orientation classification model."""
93
95
  use_gpu: bool = False
94
- """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package"""
96
+ """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
97
+ device: DeviceType = "auto"
98
+ """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
99
+ gpu_memory_limit: float | None = None
100
+ """Maximum GPU memory to use in GB. None for no limit."""
101
+ fallback_to_cpu: bool = True
102
+ """Whether to fallback to CPU if requested device is unavailable."""
95
103
  use_space_char: bool = True
96
104
  """Whether to recognize spaces."""
97
105
  use_zero_copy_run: bool = False
@@ -248,19 +256,88 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
248
256
  ) from e
249
257
 
250
258
  language = cls._validate_language_code(kwargs.pop("language", "en"))
259
+
260
+ # Handle device selection with backward compatibility
261
+ device_info = cls._resolve_device_config(**kwargs)
262
+ use_gpu = device_info.device_type == "cuda"
263
+
251
264
  has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
252
265
  kwargs.setdefault("use_angle_cls", True)
253
- kwargs.setdefault("use_gpu", has_gpu_package)
254
- kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
266
+ kwargs["use_gpu"] = use_gpu and has_gpu_package
267
+ kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
255
268
  kwargs.setdefault("det_db_thresh", 0.3)
256
269
  kwargs.setdefault("det_db_box_thresh", 0.5)
257
270
  kwargs.setdefault("det_db_unclip_ratio", 1.6)
258
271
 
272
+ # Set GPU memory limit if specified
273
+ if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
274
+ kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024) # Convert GB to MB
275
+
259
276
  try:
260
277
  cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
261
278
  except Exception as e:
262
279
  raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
263
280
 
281
@classmethod
def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
    """Resolve the compute device for PaddleOCR with backward compatibility.

    The deprecated ``use_gpu`` flag is still accepted: when it is truthy a
    ``DeprecationWarning`` is emitted, but the value of ``device`` (default
    ``"auto"``) is what is forwarded to device validation. An ``"mps"``
    request is downgraded to CPU because PaddlePaddle cannot use it; when
    ``fallback_to_cpu`` is disabled that downgrade is refused and a
    ``ValidationError`` is raised instead.

    Args:
        **kwargs: PaddleOCR configuration values. The keys read here are
            ``use_gpu``, ``device``, ``gpu_memory_limit`` and
            ``fallback_to_cpu``.

    Returns:
        DeviceInfo object for the selected device.

    Raises:
        ValidationError: If the requested device is not available (or is
            ``"mps"``) and fallback is disabled.
    """
    use_gpu = kwargs.get("use_gpu", False)
    device = kwargs.get("device", "auto")
    memory_limit = kwargs.get("gpu_memory_limit")
    fallback_to_cpu = kwargs.get("fallback_to_cpu", True)

    if use_gpu:
        # The legacy flag never changes ``device``; it only triggers a
        # deprecation notice whose wording depends on whether an explicit
        # device accompanied it.
        if device == "auto":
            message = (
                "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
                "Use 'device=\"cuda\"' or 'device=\"auto\"' instead."
            )
        else:
            message = (
                "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
                "Using 'device' parameter value."
            )
        warnings.warn(message, DeprecationWarning, stacklevel=4)

    # PaddlePaddle doesn't support MPS.
    if device == "mps":
        if not fallback_to_cpu:
            # Honour the caller's explicit opt-out of CPU fallback instead of
            # silently running on a device they did not ask for.
            raise ValidationError(
                "PaddlePaddle does not support MPS (Apple Silicon) acceleration and CPU fallback is disabled",
                context={"requested_device": device, "backend": "PaddleOCR"},
            )
        warnings.warn(
            "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
            UserWarning,
            stacklevel=4,
        )
        device = "cpu"

    try:
        return validate_device_request(
            device,
            "PaddleOCR",
            memory_limit=memory_limit,
            fallback_to_cpu=fallback_to_cpu,
        )
    except ValidationError:
        # A plain CPU request without the legacy GPU flag is rescued with a
        # synthetic CPU device; anything else is re-raised unchanged.
        if not use_gpu and device == "cpu":
            return DeviceInfo(device_type="cpu", name="CPU")
        raise
340
+
264
341
  @staticmethod
265
342
  def _validate_language_code(lang_code: str) -> str:
266
343
  """Convert a language code to PaddleOCR format.
@@ -264,7 +264,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
264
264
 
265
265
  env: dict[str, Any] | None = None
266
266
  if sys.platform.startswith("linux"):
267
- # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
267
+ # we have to prevent multithreading this way otherwise we will get deadlocks
268
268
  env = {"OMP_THREAD_LIMIT": "1"}
269
269
 
270
270
  result = await run_process(command, env=env)
@@ -0,0 +1,373 @@
1
+ """Device detection and management utilities for GPU acceleration."""
2
+ # ruff: noqa: BLE001
3
+
4
+ from __future__ import annotations
5
+
6
+ import warnings
7
+ from dataclasses import dataclass
8
+ from typing import Literal
9
+
10
+ from kreuzberg.exceptions import ValidationError
11
+
12
+ DeviceType = Literal["cpu", "cuda", "mps", "auto"]
13
+
14
+
15
@dataclass(frozen=True)
class DeviceInfo:
    """Immutable record describing one compute device.

    Only ``device_type`` is required; every other field defaults to ``None``,
    meaning "not applicable" or "unknown".
    """

    device_type: Literal["cpu", "cuda", "mps"]
    """Kind of device: ``"cpu"``, ``"cuda"`` or ``"mps"``."""
    device_id: int | None = None
    """Index of the device on multi-GPU systems; ``None`` when not applicable (e.g. CPU)."""
    memory_total: float | None = None
    """Total device memory in GB, or ``None`` if unknown."""
    memory_available: float | None = None
    """Currently available device memory in GB, or ``None`` if unknown."""
    name: str | None = None
    """Human-readable device name, if one is known."""
29
+
30
+
31
def detect_available_devices() -> list[DeviceInfo]:
    """Enumerate the compute devices usable on this machine.

    Returns:
        CUDA devices first (if any), then the MPS device (if any), and
        finally a CPU entry, which is always present as the last element.
    """
    accelerators: list[DeviceInfo] = []

    # NVIDIA GPUs
    if _is_cuda_available():
        accelerators.extend(_get_cuda_devices())

    # Apple Silicon GPU
    if _is_mps_available():
        apple_gpu = _get_mps_device()
        if apple_gpu:
            accelerators.append(apple_gpu)

    # The CPU is the universal fallback and therefore goes last.
    return [*accelerators, DeviceInfo(device_type="cpu", name="CPU")]
63
+
64
+
65
def get_optimal_device() -> DeviceInfo:
    """Return the most preferred device for OCR processing.

    Returns:
        The first entry of ``detect_available_devices()``, or a plain CPU
        device if that list is empty.
    """
    ranked = detect_available_devices()
    if not ranked:
        return DeviceInfo(device_type="cpu", name="CPU")
    return ranked[0]
73
+
74
+
75
def validate_device_request(
    requested: DeviceType,
    backend: str,
    *,
    memory_limit: float | None = None,
    fallback_to_cpu: bool = True,
) -> DeviceInfo:
    """Validate and resolve a device request.

    Args:
        requested: The requested device type.
        backend: Name of the OCR backend requesting the device (used in
            warning and error messages only).
        memory_limit: Optional memory limit in GB, checked against the
            resolved device.
        fallback_to_cpu: Whether to fall back to CPU if the requested device
            is unavailable.

    Returns:
        A validated DeviceInfo object.

    Raises:
        ValidationError: If the requested device is not available and
            fallback is disabled, or if the memory limit check fails.
    """
    # Probe the hardware exactly once; every branch below works off this list.
    available_devices = detect_available_devices()

    if requested == "auto":
        # The detected list is ordered most-preferred first, so its head is
        # the optimal device. Reusing it avoids a second hardware scan.
        device = available_devices[0] if available_devices else DeviceInfo(device_type="cpu", name="CPU")
    else:
        # Use the first matching device (typically the best one).
        device = next((d for d in available_devices if d.device_type == requested), None)

        if device is None:
            cpu_device = None
            if fallback_to_cpu and requested != "cpu":
                cpu_device = next((d for d in available_devices if d.device_type == "cpu"), None)

            if cpu_device:
                # Warn only once we know the fallback will actually happen.
                warnings.warn(
                    f"Requested device '{requested}' not available for {backend}. Falling back to CPU.",
                    UserWarning,
                    stacklevel=2,
                )
                return cpu_device

            raise ValidationError(
                f"Requested device '{requested}' is not available for {backend}",
                context={
                    "requested_device": requested,
                    "backend": backend,
                    "available_devices": [d.device_type for d in available_devices],
                },
            )

    # Validate memory limit if specified.
    if memory_limit is not None:
        _validate_memory_limit(device, memory_limit)

    return device
136
+
137
+
138
def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | None]:
    """Look up memory figures for ``device``.

    Args:
        device: The device to query.

    Returns:
        ``(total_memory_gb, available_memory_gb)``. Both values are ``None``
        for CPU devices and for any device type not handled here.
    """
    kind = device.device_type

    if kind == "cuda":
        # A missing device id is treated as device 0.
        return _get_cuda_memory_info(device.device_id or 0)

    if kind == "mps":
        return _get_mps_memory_info()

    # CPU (and anything unrecognised): no memory information.
    return None, None
157
+
158
+
159
def _is_cuda_available() -> bool:
    """Report whether torch can be imported and says CUDA is available."""
    try:
        import torch as _torch

        cuda_ready = _torch.cuda.is_available()
    except ImportError:
        # No torch installed means no CUDA support through this module.
        cuda_ready = False
    return cuda_ready
167
+
168
+
169
def _is_mps_available() -> bool:
    """Check if MPS (Apple Silicon) is available.

    Returns:
        ``True`` only when torch can be imported and
        ``torch.backends.mps.is_available()`` says so. ``False`` when torch
        is missing or does not expose an MPS backend at all.
    """
    try:
        import torch

        return torch.backends.mps.is_available()
    except (ImportError, AttributeError):
        # AttributeError: this torch build has no ``torch.backends.mps``.
        # Treat that as "no MPS" rather than letting detection crash.
        return False
177
+
178
+
179
def _get_cuda_devices() -> list[DeviceInfo]:
    """Get information about available CUDA devices.

    Detection is read-only: it queries each device by index and does not
    change the process's current CUDA device.

    Returns:
        One DeviceInfo per CUDA device reported by torch, in index order.
        Empty when torch is not installed or CUDA is unavailable.
    """
    devices: list[DeviceInfo] = []

    try:
        import torch

        if not torch.cuda.is_available():
            return devices

        bytes_per_gb = 1024**3

        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            total_memory = props.total_memory / bytes_per_gb

            try:
                # Estimate: capacity minus what torch reports as allocated on device i.
                allocated = torch.cuda.memory_allocated(i) / bytes_per_gb
                available_memory = total_memory - allocated
            except Exception:
                # Best effort: fall back to total memory if usage can't be read.
                available_memory = total_memory

            devices.append(
                DeviceInfo(
                    device_type="cuda",
                    device_id=i,
                    memory_total=total_memory,
                    memory_available=available_memory,
                    name=props.name,
                )
            )

    except ImportError:
        # torch is an optional dependency; without it there are no CUDA devices to report.
        pass

    return devices
218
+
219
+
220
def _get_mps_device() -> DeviceInfo | None:
    """Get information about the MPS device.

    Returns:
        A DeviceInfo for the Apple Silicon GPU, or ``None`` when torch is
        missing, has no MPS backend, or reports MPS as unavailable.
    """
    try:
        import torch

        if not torch.backends.mps.is_available():
            return None

        # No memory figures are filled in for MPS.
        return DeviceInfo(
            device_type="mps",
            name="Apple Silicon GPU (MPS)",
        )

    except (ImportError, AttributeError):
        # AttributeError: this torch build has no ``torch.backends.mps``.
        return None
236
+
237
+
238
def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
    """Return ``(total_gb, available_gb)`` for the CUDA device ``device_id``.

    Both values are ``None`` when torch is not installed or CUDA is
    unavailable. "Available" is total capacity minus what torch reports as
    allocated; if that figure cannot be read, total capacity is used.
    """
    unknown = (None, None)

    try:
        import torch

        if not torch.cuda.is_available():
            return unknown

        gib = 1024**3
        capacity = torch.cuda.get_device_properties(device_id).total_memory / gib

        try:
            free = capacity - torch.cuda.memory_allocated(device_id) / gib
        except Exception:
            # Best effort: report the whole capacity as available.
            free = capacity

        return capacity, free

    except ImportError:
        return unknown
259
+
260
+
261
def _get_mps_memory_info() -> tuple[float | None, float | None]:
    """Return ``(total_gb, available_gb)`` for MPS; always ``(None, None)``.

    No memory figures are looked up for MPS here; platform-specific system
    calls would be needed and are deliberately not used.
    """
    total_gb = available_gb = None
    return total_gb, available_gb
266
+
267
+
268
def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
    """Check a requested memory limit against what ``device`` offers.

    Args:
        device: The device to validate.
        memory_limit: Required memory in GB.

    Raises:
        ValidationError: If the limit exceeds the device's known total memory.

    A limit that fits the total but exceeds the known available memory only
    produces a ``UserWarning``. CPU devices are never checked.
    """
    # CPU memory validation is OS-dependent and intentionally skipped.
    if device.device_type == "cpu":
        return

    total_memory, available_memory = get_device_memory_info(device)

    over_capacity = total_memory is not None and memory_limit > total_memory
    if over_capacity:
        raise ValidationError(
            f"Requested memory limit ({memory_limit:.1f}GB) exceeds device capacity ({total_memory:.1f}GB)",
            context={
                "device": device.device_type,
                "device_name": device.name,
                "requested_memory": memory_limit,
                "total_memory": total_memory,
                "available_memory": available_memory,
            },
        )

    # Nothing more to report when availability is unknown or sufficient.
    if available_memory is None or not memory_limit > available_memory:
        return

    warnings.warn(
        f"Requested memory limit ({memory_limit:.1f}GB) exceeds available memory "
        f"({available_memory:.1f}GB) on {device.name or device.device_type}",
        UserWarning,
        stacklevel=3,
    )
303
+
304
+
305
def is_backend_gpu_compatible(backend: str) -> bool:
    """Tell whether the named OCR backend can use GPU acceleration.

    Args:
        backend: Name of the OCR backend; matched case-insensitively.

    Returns:
        True for ``"easyocr"`` and ``"paddleocr"``, False for anything else.
    """
    gpu_capable = {"easyocr", "paddleocr"}
    normalized = backend.lower()
    return normalized in gpu_capable
316
+
317
+
318
def get_recommended_batch_size(device: DeviceInfo, input_size_mb: float = 10.0) -> int:
    """Get recommended batch size for OCR processing.

    Args:
        device: The device to optimize for.
        input_size_mb: Estimated input size per item in MB.

    Returns:
        Recommended batch size: 1 for CPU; 4 when no estimate can be made
        (unknown device memory, or a non-positive ``input_size_mb``);
        otherwise a memory-based estimate between 1 and 32.
    """
    if device.device_type == "cpu":
        # Conservative batch size for CPU.
        return 1

    if input_size_mb <= 0:
        # A zero or negative item size makes the division below meaningless
        # (zero would raise ZeroDivisionError); use the conservative default.
        return 4

    # For GPU devices, estimate based on available memory.
    _, available_memory = get_device_memory_info(device)

    if available_memory is None:
        # Conservative default for unknown memory.
        return 4

    # Keep half of the available memory free for the model and intermediates.
    usable_memory_gb = available_memory * 0.5
    usable_memory_mb = usable_memory_gb * 1024

    # Budget four times the raw input size per item, and never go below 1.
    estimated_batch_size = max(1, int(usable_memory_mb / (input_size_mb * 4)))

    # Cap at a reasonable upper limit.
    return min(estimated_batch_size, 32)
349
+
350
+
351
def cleanup_device_memory(device: DeviceInfo) -> None:
    """Ask torch to empty its cache for the given device, if applicable.

    Args:
        device: The device to clean up. CPU devices are a no-op.

    The call is best effort: a missing torch installation (and, for MPS, a
    missing attribute on the torch module) is silently ignored.
    """
    kind = device.device_type

    if kind == "cuda":
        try:
            import torch

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except ImportError:
            pass
        return

    if kind == "mps":
        try:
            import torch

            if torch.backends.mps.is_available():
                torch.mps.empty_cache()
        except (ImportError, AttributeError):
            # AttributeError: this torch build lacks the MPS attributes used above.
            pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.1.7
3
+ Version: 3.2.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -27,8 +27,8 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.2
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.3.3
31
- Requires-Dist: playa-pdf>=0.5.1
30
+ Requires-Dist: html-to-markdown>=1.4.0
31
+ Requires-Dist: playa-pdf>=0.6.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
33
  Requires-Dist: python-calamine>=0.3.2
34
34
  Requires-Dist: python-pptx>=1.0.2
@@ -36,7 +36,7 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
38
  Requires-Dist: gmft>=0.4.1; extra == "all"
39
- Requires-Dist: paddleocr>=3.0.1; extra == "all"
39
+ Requires-Dist: paddleocr>=3.0.2; extra == "all"
40
40
  Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
41
41
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == "all"
42
42
  Requires-Dist: setuptools>=80.9.0; extra == "all"
@@ -47,7 +47,7 @@ Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
47
47
  Provides-Extra: gmft
48
48
  Requires-Dist: gmft>=0.4.1; extra == "gmft"
49
49
  Provides-Extra: paddleocr
50
- Requires-Dist: paddleocr>=3.0.1; extra == "paddleocr"
50
+ Requires-Dist: paddleocr>=3.0.2; extra == "paddleocr"
51
51
  Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
52
52
  Requires-Dist: setuptools>=80.9.0; extra == "paddleocr"
53
53
  Dynamic: license-file
@@ -157,17 +157,9 @@ Kreuzberg supports multiple OCR engines:
157
157
 
158
158
  For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
159
159
 
160
- ## Contribution
160
+ ## Contributing
161
161
 
162
- This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
163
-
164
- ### Local Development
165
-
166
- - Clone the repo
167
- - Install the system dependencies
168
- - Install the full dependencies with `uv sync`
169
- - Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
170
- - Make your changes and submit a PR
162
+ We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
171
163
 
172
164
  ## License
173
165
 
@@ -15,19 +15,20 @@ kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_w
15
15
  kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
16
16
  kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
17
17
  kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
18
- kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
18
+ kreuzberg/_extractors/_presentation.py,sha256=7W6RHTk-zksuHoSk0i6UaSBf5NatnPo17MxepQoI6XI,8758
19
19
  kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
20
20
  kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
21
21
  kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
22
- kreuzberg/_ocr/_easyocr.py,sha256=J8IP2Fg55dG2MH9-lpyZFounvgIgWgCrw694UkaUa9E,11491
23
- kreuzberg/_ocr/_paddleocr.py,sha256=FyALVb3AQFcej9NFOLy-0dkA-3uxE_ie9Mzd6ho3t68,10656
24
- kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
22
+ kreuzberg/_ocr/_easyocr.py,sha256=1OG2IbLdg4cXouV0FVzMnCkYYh6GN1pvXqXWw40PUz8,14054
23
+ kreuzberg/_ocr/_paddleocr.py,sha256=K6D3B2cn-JIhipI5UHMa0Kn2M-GKtyUFCahs8wJQZcA,13855
24
+ kreuzberg/_ocr/_tesseract.py,sha256=KcJMK4o__2H2ftibk1lC7HVqEfpaE_jVZgLhUXkxTvk,9773
25
25
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ kreuzberg/_utils/_device.py,sha256=Ja28S2psgEwWzjdO05ZI11RFb3MSlUZDT19sC4SAyVE,10955
26
27
  kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
27
28
  kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
28
29
  kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
29
- kreuzberg-3.1.7.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
30
- kreuzberg-3.1.7.dist-info/METADATA,sha256=3GqQckA40eybyTHQkNcTxSLFBvXGMwoL7mP-mYnZyig,6751
31
- kreuzberg-3.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
32
- kreuzberg-3.1.7.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
33
- kreuzberg-3.1.7.dist-info/RECORD,,
30
+ kreuzberg-3.2.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
31
+ kreuzberg-3.2.0.dist-info/METADATA,sha256=xffQAGQur7sCgUT9RDqZpfkYTdthsuYIhCvbUDKFnmA,6504
32
+ kreuzberg-3.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
+ kreuzberg-3.2.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
34
+ kreuzberg-3.2.0.dist-info/RECORD,,