kreuzberg 3.4.1__py3-none-any.whl → 3.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from importlib.metadata import version
2
2
 
3
3
  from kreuzberg._gmft import GMFTConfig
4
+ from kreuzberg._language_detection import LanguageDetectionConfig
4
5
  from kreuzberg._ocr._easyocr import EasyOCRConfig
5
6
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
6
7
  from kreuzberg._ocr._tesseract import TesseractConfig
@@ -29,6 +30,7 @@ __all__ = [
29
30
  "ExtractorRegistry",
30
31
  "GMFTConfig",
31
32
  "KreuzbergError",
33
+ "LanguageDetectionConfig",
32
34
  "Metadata",
33
35
  "MissingDependencyError",
34
36
  "OCRError",
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
80
80
  if self.config.ocr_backend is None:
81
81
  raise ValidationError("ocr_backend is None, cannot perform OCR")
82
82
 
83
- from kreuzberg._ocr._tesseract import TesseractConfig
84
83
  from kreuzberg._types import ExtractionResult
85
84
 
86
85
  if self.config.ocr_backend == "tesseract":
87
86
  from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
87
+ from kreuzberg._ocr._tesseract import TesseractConfig
88
88
 
89
89
  if isinstance(self.config.ocr_config, TesseractConfig):
90
90
  config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
96
96
  return results[0]
97
97
  return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
98
98
 
99
+ if self.config.ocr_backend == "paddleocr":
100
+ from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
101
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
102
+
103
+ paddle_config = (
104
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
105
+ )
106
+
107
+ return paddle_process(path, paddle_config)
108
+
109
+ if self.config.ocr_backend == "easyocr":
110
+ from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
111
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
112
+
113
+ easy_config = (
114
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
115
+ )
116
+
117
+ return easy_process(path, easy_config)
118
+
99
119
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
100
120
 
101
121
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
299
299
  """Extract text from PDF using OCR (sync version)."""
300
300
  pdf = None
301
301
  try:
302
- from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
303
-
304
302
  images = []
305
303
  with pypdfium_file_lock(path):
306
304
  pdf = pypdfium2.PdfDocument(str(path))
@@ -325,18 +323,7 @@ class PDFExtractor(Extractor):
325
323
  os.close(fd)
326
324
  image_paths.append(temp_path)
327
325
 
328
- if self.config.ocr_backend == "tesseract":
329
- from kreuzberg._ocr._tesseract import TesseractConfig
330
-
331
- if isinstance(self.config.ocr_config, TesseractConfig):
332
- config = self.config.ocr_config
333
- else:
334
- config = TesseractConfig()
335
- results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
336
- text_parts = [r.content for r in results]
337
- return "\n\n".join(text_parts)
338
-
339
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
326
+ return self._process_pdf_images_with_ocr(image_paths)
340
327
 
341
328
  finally:
342
329
  for _, temp_path in temp_files:
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
349
336
  if pdf:
350
337
  with pypdfium_file_lock(path), contextlib.suppress(Exception):
351
338
  pdf.close()
339
+
340
+ def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
341
+ """Process PDF images with the configured OCR backend."""
342
+ if self.config.ocr_backend == "tesseract":
343
+ from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
344
+ from kreuzberg._ocr._tesseract import TesseractConfig
345
+
346
+ tesseract_config = (
347
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
348
+ )
349
+ results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
350
+ text_parts = [r.content for r in results]
351
+ return "\n\n".join(text_parts)
352
+
353
+ if self.config.ocr_backend == "paddleocr":
354
+ from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
355
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
356
+
357
+ paddle_config = (
358
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
359
+ )
360
+
361
+ text_parts = []
362
+ for image_path in image_paths:
363
+ result = paddle_process(Path(image_path), paddle_config)
364
+ text_parts.append(result.content)
365
+ return "\n\n".join(text_parts)
366
+
367
+ if self.config.ocr_backend == "easyocr":
368
+ from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
369
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
370
+
371
+ easy_config = (
372
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
373
+ )
374
+
375
+ text_parts = []
376
+ for image_path in image_paths:
377
+ result = easy_process(Path(image_path), easy_config)
378
+ text_parts.append(result.content)
379
+ return "\n\n".join(text_parts)
380
+
381
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
@@ -6,7 +6,7 @@ import sys
6
6
  from datetime import date, datetime, time, timedelta
7
7
  from io import StringIO
8
8
  from pathlib import Path
9
- from typing import Any, Union
9
+ from typing import Any
10
10
 
11
11
  from anyio import Path as AsyncPath
12
12
  from python_calamine import CalamineWorkbook
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11): # pragma: no cover
23
23
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
24
24
 
25
25
 
26
- CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
26
+ CellValue = int | float | str | bool | time | date | datetime | timedelta
27
27
 
28
28
 
29
29
  class SpreadSheetExtractor(Extractor):
kreuzberg/_gmft.py CHANGED
@@ -210,7 +210,7 @@ async def extract_tables( # noqa: PLR0915
210
210
  from gmft.formatters.tatr import TATRFormatConfig
211
211
  from gmft.pdf_bindings.pdfium import PyPDFium2Document
212
212
 
213
- formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
213
+ formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
214
214
  config=TATRFormatConfig(
215
215
  verbosity=config.verbosity,
216
216
  formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +226,7 @@ async def extract_tables( # noqa: PLR0915
226
226
  force_large_table_assumption=config.force_large_table_assumption,
227
227
  )
228
228
  )
229
- detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
229
+ detector: Any = AutoTableDetector( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
230
230
  config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
231
231
  )
232
232
  doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +247,7 @@ async def extract_tables( # noqa: PLR0915
247
247
  text=data_frame.to_markdown(),
248
248
  df=data_frame,
249
249
  )
250
- for data_frame, cropped_table in zip(dataframes, cropped_tables)
250
+ for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
251
251
  ]
252
252
 
253
253
  await table_cache.aset(result, **cache_kwargs)
@@ -365,7 +365,7 @@ def extract_tables_sync(
365
365
  text=data_frame.to_markdown(),
366
366
  df=data_frame,
367
367
  )
368
- for data_frame, cropped_table in zip(dataframes, cropped_tables)
368
+ for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
369
369
  ]
370
370
 
371
371
  table_cache.set(result, **cache_kwargs)
@@ -0,0 +1,95 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from functools import lru_cache
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from kreuzberg.exceptions import MissingDependencyError
8
+
9
+ if TYPE_CHECKING:
10
+ from fast_langdetect import LangDetectConfig as FastLangDetectConfig
11
+
12
+ try:
13
+ from fast_langdetect import LangDetectConfig as FastLangDetectConfig
14
+ from fast_langdetect import detect, detect_multilingual
15
+
16
+ HAS_FAST_LANGDETECT = True
17
+ except ImportError:
18
+ HAS_FAST_LANGDETECT = False
19
+ detect = None
20
+ detect_multilingual = None
21
+ FastLangDetectConfig = None
22
+
23
+ _CACHE_SIZE = 128
24
+
25
+
26
@dataclass(frozen=True)
class LanguageDetectionConfig:
    """Immutable (and therefore hashable) settings for language detection.

    Attributes:
        low_memory: Use the smaller model (~200MB) when True; use the larger,
            more accurate model when False. Defaults to True.
        top_k: Upper bound on the number of languages returned by multilingual
            detection. Defaults to 3.
        multilingual: Use multilingual detection (for mixed-language text) when
            True; single-language detection when False. Defaults to False.
        cache_dir: Custom directory for the model cache, or None for the
            system default.
        allow_fallback: Fall back to the small model if the large model fails.
            Defaults to True.
    """

    # Model size selection: True -> small model, False -> large model.
    low_memory: bool = True
    # Maximum number of languages reported in multilingual mode.
    top_k: int = 3
    # Switches between multilingual and single-language detection.
    multilingual: bool = False
    # Optional model cache location; None keeps the default.
    cache_dir: str | None = None
    # Whether a failing large model may be replaced by the small one.
    allow_fallback: bool = True
45
+
46
+
47
def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
    """Build the fast-langdetect config object that mirrors our own config.

    Args:
        config: The kreuzberg language detection settings.

    Returns:
        A ``FastLangDetectConfig`` carrying ``allow_fallback`` and, when one is
        set, ``cache_dir``; ``None`` if fast-langdetect could not be imported.
    """
    if FastLangDetectConfig is None or not HAS_FAST_LANGDETECT:
        return None

    # cache_dir is only forwarded when explicitly set, so the library default applies otherwise.
    cache_options = {} if config.cache_dir is None else {"cache_dir": config.cache_dir}
    return FastLangDetectConfig(allow_fallback=config.allow_fallback, **cache_options)
59
+
60
+
61
@lru_cache(maxsize=_CACHE_SIZE)
def _detect_languages_cached(text: str, config: LanguageDetectionConfig) -> tuple[str, ...] | None:
    """Run fast-langdetect and return the detected codes as an immutable tuple.

    Exceptions are deliberately not handled here: ``lru_cache`` does not store
    calls that raise, so a failed detection is retried on the next call instead
    of being cached as a permanent ``None`` for that text.
    """
    if config.multilingual:
        results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
        return tuple(result["lang"].lower() for result in results if result.get("lang"))

    result = detect(text, low_memory=config.low_memory)
    if result and result.get("lang"):
        return (result["lang"].lower(),)
    return None


def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
    """Detect the most probable languages in the given text using fast-langdetect.

    Successful detections are memoized per ``(text, config)`` pair. Each call
    returns a fresh list, so callers may mutate the result without affecting
    later calls.

    Args:
        text: The text to analyze.
        config: Configuration for language detection. If None, uses defaults.

    Returns:
        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
        or None if detection fails.

    Raises:
        MissingDependencyError: If fast-langdetect is not installed.
    """
    if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
        raise MissingDependencyError.create_for_package(
            dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
        )

    # Normalizing None to the default config lets both spellings share one cache entry.
    effective_config = config if config is not None else LanguageDetectionConfig()

    try:
        cached = _detect_languages_cached(text, effective_config)
    except Exception:  # noqa: BLE001
        # Best-effort API: any detection failure is reported as "no result".
        return None

    # Copy out of the cache so the stored value can never be mutated by callers.
    return None if cached is None else list(cached)


# Preserve the cache-management attributes the previously decorated function exposed.
detect_languages.cache_clear = _detect_languages_cached.cache_clear  # type: ignore[attr-defined]
detect_languages.cache_info = _detect_languages_cached.cache_info  # type: ignore[attr-defined]
@@ -56,9 +56,7 @@ def _extract_tables_in_process(
56
56
  force_large_table_assumption=config.force_large_table_assumption,
57
57
  )
58
58
  )
59
- detector = AutoTableDetector( # type: ignore[no-untyped-call]
60
- config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
61
- )
59
+ detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)) # type: ignore[no-untyped-call]
62
60
 
63
61
  doc = PyPDFium2Document(str(file_path))
64
62
  cropped_tables = []
@@ -73,7 +71,7 @@ def _extract_tables_in_process(
73
71
  dataframes.append(formatted_table.df())
74
72
 
75
73
  results = []
76
- for data_frame, cropped_table in zip(dataframes, cropped_tables):
74
+ for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
77
75
  import io
78
76
 
79
77
  img_bytes = io.BytesIO()
@@ -4,7 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import multiprocessing as mp
6
6
  from concurrent.futures import ProcessPoolExecutor
7
- from typing import TYPE_CHECKING, Any, Callable, TypeVar
7
+ from typing import TYPE_CHECKING, Any, TypeVar
8
8
 
9
9
  import anyio
10
10
  import psutil
@@ -12,6 +12,7 @@ from typing_extensions import Self
12
12
 
13
13
  if TYPE_CHECKING:
14
14
  import types
15
+ from collections.abc import Callable
15
16
 
16
17
  T = TypeVar("T")
17
18
 
@@ -0,0 +1,235 @@
1
+ """Pure synchronous EasyOCR without any async overhead."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from PIL import Image
10
+
11
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
13
+ from kreuzberg._types import ExtractionResult
14
+ from kreuzberg._utils._string import normalize_spaces
15
+ from kreuzberg.exceptions import MissingDependencyError, OCRError
16
+
17
+
18
def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
    """Create an ``easyocr.Reader`` configured from ``config``.

    Args:
        config: EasyOCR settings; optional attributes are read defensively and
            left out of the Reader call when absent or None.

    Returns:
        A newly constructed ``easyocr.Reader``.

    Raises:
        MissingDependencyError: If the ``easyocr`` package cannot be imported.
    """
    try:
        import easyocr
    except ImportError as e:
        raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e

    # A `device` attribute takes precedence over `use_gpu`: any non-empty value other than "cpu" enables the GPU.
    if hasattr(config, "device"):
        use_gpu = bool(config.device and config.device.lower() != "cpu")
    elif hasattr(config, "use_gpu"):
        use_gpu = config.use_gpu
    else:
        use_gpu = False

    # Languages may be given as a comma-separated string or as an iterable of codes.
    language = getattr(config, "language", "en")
    lang_list = (
        [code.strip().lower() for code in language.split(",")]
        if isinstance(language, str)
        else [code.lower() for code in language]
    )

    reader_options: dict[str, Any] = {"lang_list": lang_list, "gpu": use_gpu, "verbose": False}
    for option_name in (
        "model_storage_directory",
        "user_network_directory",
        "recog_network",
        "detector",
        "recognizer",
        "quantize",
        "cudnn_benchmark",
    ):
        reader_options[option_name] = getattr(config, option_name, None)

    # Options that resolved to None are dropped so the Reader's own defaults apply.
    return easyocr.Reader(**{name: value for name, value in reader_options.items() if value is not None})
54
+
55
+
56
def process_image_sync_pure(
    image_path: str | Path,
    config: EasyOCRConfig | None = None,
) -> ExtractionResult:
    """Run EasyOCR on a single image file, synchronously.

    This bypasses all async overhead and calls EasyOCR directly.

    Args:
        image_path: Path to the image file.
        config: EasyOCR configuration; a default ``EasyOCRConfig`` is used when omitted.

    Returns:
        The extraction result. ``metadata`` carries the mean confidence under
        ``"confidence"`` when at least one text fragment was collected.

    Raises:
        OCRError: If anything fails while creating the reader or running OCR.
    """
    cfg = config or EasyOCRConfig()

    try:
        reader = _get_easyocr_instance(cfg)

        # None-valued options are filtered out so EasyOCR falls back to its own defaults.
        readtext_kwargs = {
            name: value
            for name, value in {
                "decoder": cfg.decoder,
                "beamWidth": cfg.beam_width,
                "batch_size": getattr(cfg, "batch_size", 1),
                "workers": getattr(cfg, "workers", 0),
                "allowlist": getattr(cfg, "allowlist", None),
                "blocklist": getattr(cfg, "blocklist", None),
                "detail": getattr(cfg, "detail", 1),
                "rotation_info": cfg.rotation_info,
                "paragraph": getattr(cfg, "paragraph", False),
                "min_size": cfg.min_size,
                "text_threshold": cfg.text_threshold,
                "low_text": cfg.low_text,
                "link_threshold": cfg.link_threshold,
                "canvas_size": cfg.canvas_size,
                "mag_ratio": cfg.mag_ratio,
                "slope_ths": cfg.slope_ths,
                "ycenter_ths": cfg.ycenter_ths,
                "height_ths": cfg.height_ths,
                "width_ths": cfg.width_ths,
                "add_margin": cfg.add_margin,
                "x_ths": cfg.x_ths,
                "y_ths": cfg.y_ths,
            }.items()
            if value is not None
        }

        results = reader.readtext(str(image_path), **readtext_kwargs)

        if not results:
            return ExtractionResult(
                content="",
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={},
                chunks=[],
            )

        if getattr(cfg, "detail", 1):
            # Detailed entries are sequences of (bbox, text[, confidence]); shorter ones are ignored
            # and a missing confidence counts as 1.0.
            usable = [entry for entry in results if len(entry) >= 2]
            texts = [entry[1] for entry in usable]
            confidences = [entry[2] if len(entry) > 2 else 1.0 for entry in usable]
        else:
            # Without detail the results are used directly as the text fragments.
            texts = results
            confidences = [1.0] * len(results)

        content = normalize_spaces("\n".join(texts))
        metadata = {"confidence": sum(confidences) / len(confidences)} if confidences else {}

        return ExtractionResult(
            content=content,
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata=metadata,  # type: ignore[arg-type]
            chunks=[],
        )

    except Exception as e:
        # Every failure inside the try block, reader construction included, surfaces as OCRError.
        raise OCRError(f"EasyOCR processing failed: {e}") from e
146
+
147
+
148
def process_image_bytes_sync_pure(
    image_bytes: bytes,
    config: EasyOCRConfig | None = None,
) -> ExtractionResult:
    """Process image bytes with EasyOCR using pure sync implementation.

    The bytes are decoded with Pillow, written to a temporary PNG file and
    handed to ``process_image_sync_pure``. The temporary file is removed on
    every exit path, including when decoding or saving the image fails.

    Args:
        image_bytes: Image data as bytes.
        config: EasyOCR configuration.

    Returns:
        Extraction result.
    """
    import io

    # Reserve a named file that outlives the handle; the name is what gets passed to OCR.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
        image_file = Path(tmp_image.name)

    try:
        # Decoding and saving happen inside the try so a bad payload cannot leak the temp file.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(str(image_file), format="PNG")
        return process_image_sync_pure(str(image_file), config)
    finally:
        image_file.unlink(missing_ok=True)
174
+
175
+
176
def process_batch_images_sync_pure(
    image_paths: list[str | Path],
    config: EasyOCRConfig | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images sequentially with pure sync implementation.

    Args:
        image_paths: List of image file paths.
        config: EasyOCR configuration.

    Returns:
        List of extraction results, one per input path and in the same order.
        An empty input yields an empty list.
    """
    return [process_image_sync_pure(image_path, config) for image_path in image_paths]
194
+
195
+
196
def process_batch_images_threaded(
    image_paths: list[str | Path],
    config: EasyOCRConfig | None = None,
    max_workers: int | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images using threading.

    A failure on one image does not abort the batch: that image's slot holds an
    ``ExtractionResult`` whose content starts with ``"Error: "`` and whose
    metadata has an ``"error"`` entry.

    Args:
        image_paths: List of image file paths.
        config: EasyOCR configuration.
        max_workers: Maximum number of threads. Defaults to the smaller of the
            batch size and the CPU count.

    Returns:
        List of extraction results in same order as input. An empty input
        yields an empty list.
    """
    import multiprocessing as mp
    from concurrent.futures import ThreadPoolExecutor

    if not image_paths:
        # An empty batch would compute max_workers=0, which ThreadPoolExecutor rejects with ValueError.
        return []

    if max_workers is None:
        max_workers = min(len(image_paths), mp.cpu_count())

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Futures are kept in submission order, so collecting them in order preserves input order.
        futures = [executor.submit(process_image_sync_pure, path, config) for path in image_paths]

        results: list[ExtractionResult] = []
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:  # noqa: BLE001
                results.append(
                    ExtractionResult(
                        content=f"Error: {e}",
                        mime_type=PLAIN_TEXT_MIME_TYPE,
                        metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
                        chunks=[],
                    )
                )

    return results
@@ -0,0 +1,199 @@
1
+ """Pure synchronous PaddleOCR without any async overhead."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from PIL import Image
10
+
11
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
13
+ from kreuzberg._types import ExtractionResult
14
+ from kreuzberg._utils._string import normalize_spaces
15
+ from kreuzberg.exceptions import MissingDependencyError, OCRError
16
+
17
+
18
def _get_paddleocr_instance(config: PaddleOCRConfig) -> Any:
    """Create a ``paddleocr.PaddleOCR`` instance configured from ``config``.

    Args:
        config: PaddleOCR settings. ``language`` and ``use_angle_cls`` are always
            forwarded; the detection/recognition thresholds are forwarded only
            when the config object defines them.

    Returns:
        A newly constructed ``paddleocr.PaddleOCR`` object.

    Raises:
        MissingDependencyError: If the ``paddleocr`` package cannot be imported.
    """
    try:
        import paddleocr
    except ImportError as e:
        raise MissingDependencyError("PaddleOCR is not installed. Install it with: pip install paddleocr") from e

    # NOTE(review): `device` / `use_gpu` on the config are not forwarded to PaddleOCR here.
    # The previous code inspected them but took no action; confirm whether device selection
    # should be passed through.
    kwargs: dict[str, Any] = {
        "lang": config.language,
        "use_textline_orientation": config.use_angle_cls,
    }

    # Config attribute name -> PaddleOCR keyword argument name.
    optional_arguments = {
        "det_db_thresh": "text_det_thresh",
        "det_db_box_thresh": "text_det_box_thresh",
        "det_db_unclip_ratio": "text_det_unclip_ratio",
        "det_max_side_len": "text_det_limit_side_len",
        "drop_score": "text_rec_score_thresh",
    }
    for attribute, argument in optional_arguments.items():
        if hasattr(config, attribute):
            kwargs[argument] = getattr(config, attribute)

    return paddleocr.PaddleOCR(**kwargs)
48
+
49
+
50
def process_image_sync_pure(
    image_path: str | Path,
    config: PaddleOCRConfig | None = None,
) -> ExtractionResult:
    """Run PaddleOCR on a single image file, synchronously.

    This bypasses all async overhead and calls PaddleOCR directly.

    Args:
        image_path: Path to the image file.
        config: PaddleOCR configuration; a default ``PaddleOCRConfig`` is used when omitted.

    Returns:
        The extraction result. ``metadata`` carries the mean recognition score
        under ``"confidence"`` when scores are present.

    Raises:
        OCRError: If anything fails while creating the OCR instance or running it.
    """
    cfg = config or PaddleOCRConfig()

    try:
        pages = _get_paddleocr_instance(cfg).ocr(str(image_path))

        # Only the first entry of the OCR output is used; a missing or empty one means "no text".
        first_page = pages[0] if pages else None
        recognized = first_page.json["res"] if first_page else {}

        texts = recognized.get("rec_texts", [])
        if not texts:
            return ExtractionResult(
                content="",
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={},
                chunks=[],
            )

        scores = recognized.get("rec_scores", [])
        metadata = {"confidence": sum(scores) / len(scores)} if scores else {}

        return ExtractionResult(
            content=normalize_spaces("\n".join(texts)),
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata=metadata,  # type: ignore[arg-type]
            chunks=[],
        )

    except Exception as e:
        # Every failure inside the try block, instance creation included, surfaces as OCRError.
        raise OCRError(f"PaddleOCR processing failed: {e}") from e
110
+
111
+
112
def process_image_bytes_sync_pure(
    image_bytes: bytes,
    config: PaddleOCRConfig | None = None,
) -> ExtractionResult:
    """Process image bytes with PaddleOCR using pure sync implementation.

    The bytes are decoded with Pillow, written to a temporary PNG file and
    handed to ``process_image_sync_pure``. The temporary file is removed on
    every exit path, including when decoding or saving the image fails.

    Args:
        image_bytes: Image data as bytes.
        config: PaddleOCR configuration.

    Returns:
        Extraction result.
    """
    import io

    # Reserve a named file that outlives the handle; the name is what gets passed to OCR.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
        image_file = Path(tmp_image.name)

    try:
        # Decoding and saving happen inside the try so a bad payload cannot leak the temp file.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(str(image_file), format="PNG")
        return process_image_sync_pure(str(image_file), config)
    finally:
        image_file.unlink(missing_ok=True)
138
+
139
+
140
def process_batch_images_sync_pure(
    image_paths: list[str | Path],
    config: PaddleOCRConfig | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images sequentially with pure sync implementation.

    Args:
        image_paths: List of image file paths.
        config: PaddleOCR configuration.

    Returns:
        List of extraction results, one per input path and in the same order.
        An empty input yields an empty list.
    """
    return [process_image_sync_pure(image_path, config) for image_path in image_paths]
158
+
159
+
160
def process_batch_images_threaded(
    image_paths: list[str | Path],
    config: PaddleOCRConfig | None = None,
    max_workers: int | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images using threading.

    A failure on one image does not abort the batch: that image's slot holds an
    ``ExtractionResult`` whose content starts with ``"Error: "`` and whose
    metadata has an ``"error"`` entry.

    Args:
        image_paths: List of image file paths.
        config: PaddleOCR configuration.
        max_workers: Maximum number of threads. Defaults to the smaller of the
            batch size and the CPU count.

    Returns:
        List of extraction results in same order as input. An empty input
        yields an empty list.
    """
    import multiprocessing as mp
    from concurrent.futures import ThreadPoolExecutor

    if not image_paths:
        # An empty batch would compute max_workers=0, which ThreadPoolExecutor rejects with ValueError.
        return []

    if max_workers is None:
        max_workers = min(len(image_paths), mp.cpu_count())

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Futures are kept in submission order, so collecting them in order preserves input order.
        futures = [executor.submit(process_image_sync_pure, path, config) for path in image_paths]

        results: list[ExtractionResult] = []
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:  # noqa: BLE001
                results.append(
                    ExtractionResult(
                        content=f"Error: {e}",
                        mime_type=PLAIN_TEXT_MIME_TYPE,
                        metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
                        chunks=[],
                    )
                )

    return results
@@ -319,7 +319,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
319
319
  try:
320
320
  import torch
321
321
 
322
- return torch.cuda.is_available()
322
+ return bool(torch.cuda.is_available())
323
323
  except ImportError:
324
324
  return False
325
325
 
@@ -202,9 +202,11 @@ class TesseractConfig:
202
202
  - 'deu' for German
203
203
  - multiple languages combined with '+', e.g. 'eng+deu')
204
204
  """
205
- language_model_ngram_on: bool = True
206
- """Enable or disable the use of n-gram-based language models for improved text recognition."""
207
- psm: PSMMode = PSMMode.AUTO
205
+ language_model_ngram_on: bool = False
206
+ """Enable or disable the use of n-gram-based language models for improved text recognition.
207
+
208
+ Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
209
+ psm: PSMMode = PSMMode.AUTO_ONLY
208
210
  """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
209
211
  tessedit_dont_blkrej_good_wds: bool = True
210
212
  """If True, prevents block rejection of words identified as good, improving text output quality."""
@@ -212,6 +214,8 @@ class TesseractConfig:
212
214
  """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
213
215
  tessedit_enable_dict_correction: bool = True
214
216
  """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
217
+ tessedit_char_whitelist: str = ""
218
+ """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
215
219
  tessedit_use_primary_params_model: bool = True
216
220
  """If True, forces the use of the primary parameters model for text recognition."""
217
221
  textord_space_size_is_variable: bool = True
kreuzberg/_types.py CHANGED
@@ -1,9 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from collections.abc import Awaitable
4
+ from collections.abc import Awaitable, Callable
5
5
  from dataclasses import asdict, dataclass, field
6
- from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
6
+ from typing import TYPE_CHECKING, Any, Literal, TypedDict
7
7
 
8
8
  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
9
9
  from kreuzberg.exceptions import ValidationError
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
18
18
  from PIL.Image import Image
19
19
 
20
20
  from kreuzberg._gmft import GMFTConfig
21
+ from kreuzberg._language_detection import LanguageDetectionConfig
21
22
  from kreuzberg._ocr._easyocr import EasyOCRConfig
22
23
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
23
24
  from kreuzberg._ocr._tesseract import TesseractConfig
@@ -113,14 +114,16 @@ class ExtractionResult:
113
114
  """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
114
115
  chunks: list[str] = field(default_factory=list)
115
116
  """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
117
+ detected_languages: list[str] | None = None
118
+ """Languages detected in the extracted content, if language detection is enabled."""
116
119
 
117
120
  def to_dict(self) -> dict[str, Any]:
118
121
  """Converts the ExtractionResult to a dictionary."""
119
122
  return asdict(self)
120
123
 
121
124
 
122
- PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
123
- ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
125
+ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
126
+ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
124
127
 
125
128
 
126
129
  @dataclass(unsafe_hash=True)
@@ -157,6 +160,10 @@ class ExtractionConfig:
157
160
  """Post processing hooks to call after processing is done and before the final result is returned."""
158
161
  validators: list[ValidationHook] | None = None
159
162
  """Validation hooks to call after processing is done and before post-processing and result return."""
163
+ auto_detect_language: bool = False
164
+ """Whether to automatically detect language and configure OCR accordingly."""
165
+ language_detection_config: LanguageDetectionConfig | None = None
166
+ """Configuration for language detection. If None, uses default settings."""
160
167
 
161
168
  def __post_init__(self) -> None:
162
169
  from kreuzberg._ocr._easyocr import EasyOCRConfig
@@ -153,7 +153,7 @@ def _is_cuda_available() -> bool:
153
153
  try:
154
154
  import torch # type: ignore[import-not-found,unused-ignore]
155
155
 
156
- return torch.cuda.is_available()
156
+ return bool(torch.cuda.is_available())
157
157
  except ImportError:
158
158
  return False
159
159
 
@@ -163,7 +163,7 @@ def _is_mps_available() -> bool:
163
163
  try:
164
164
  import torch # type: ignore[import-not-found,unused-ignore]
165
165
 
166
- return torch.backends.mps.is_available()
166
+ return bool(torch.backends.mps.is_available())
167
167
  except ImportError:
168
168
  return False
169
169
 
@@ -5,10 +5,10 @@ from __future__ import annotations
5
5
  import multiprocessing as mp
6
6
  from concurrent.futures import ProcessPoolExecutor
7
7
  from contextlib import contextmanager
8
- from typing import TYPE_CHECKING, Any, Callable, TypeVar
8
+ from typing import TYPE_CHECKING, Any, TypeVar
9
9
 
10
10
  if TYPE_CHECKING:
11
- from collections.abc import Generator
11
+ from collections.abc import Callable, Generator
12
12
 
13
13
  T = TypeVar("T")
14
14
 
kreuzberg/_utils/_sync.py CHANGED
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- import sys
4
3
  from functools import partial
5
4
  from inspect import isawaitable, iscoroutinefunction
6
5
  from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -12,10 +11,7 @@ from anyio.to_thread import run_sync as any_io_run_sync
12
11
  if TYPE_CHECKING: # pragma: no cover
13
12
  from collections.abc import Awaitable, Callable
14
13
 
15
- if sys.version_info >= (3, 10):
16
- from typing import ParamSpec
17
- else: # pragma: no cover
18
- from typing_extensions import ParamSpec
14
+ from typing import ParamSpec
19
15
 
20
16
  T = TypeVar("T")
21
17
  P = ParamSpec("P")
kreuzberg/_utils/_tmp.py CHANGED
@@ -3,14 +3,14 @@ from __future__ import annotations
3
3
  from contextlib import suppress
4
4
  from pathlib import Path
5
5
  from tempfile import NamedTemporaryFile
6
- from typing import TYPE_CHECKING, Callable
6
+ from typing import TYPE_CHECKING
7
7
 
8
8
  from anyio import Path as AsyncPath
9
9
 
10
10
  from kreuzberg._utils._sync import run_sync
11
11
 
12
12
  if TYPE_CHECKING: # pragma: no cover
13
- from collections.abc import Coroutine
13
+ from collections.abc import Callable, Coroutine
14
14
 
15
15
 
16
16
  async def create_temp_file(
kreuzberg/extraction.py CHANGED
@@ -28,6 +28,11 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
28
28
  for validator in config.validators or []:
29
29
  await run_maybe_sync(validator, result)
30
30
 
31
+ if config.auto_detect_language and result.content:
32
+ from kreuzberg._language_detection import detect_languages
33
+
34
+ result.detected_languages = detect_languages(result.content, config.language_detection_config)
35
+
31
36
  if config.chunk_content:
32
37
  result.chunks = _handle_chunk_content(
33
38
  mime_type=result.mime_type,
@@ -45,6 +50,11 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
45
50
  for validator in config.validators or []:
46
51
  run_sync_only(validator, result)
47
52
 
53
+ if config.auto_detect_language and result.content:
54
+ from kreuzberg._language_detection import detect_languages
55
+
56
+ result.detected_languages = detect_languages(result.content, config.language_detection_config)
57
+
48
58
  if config.chunk_content:
49
59
  result.chunks = _handle_chunk_content(
50
60
  mime_type=result.mime_type,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.4.1
3
+ Version: 3.5.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -12,7 +12,6 @@ Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: Programming Language :: Python :: 3 :: Only
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
@@ -22,7 +21,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
21
  Classifier: Topic :: Text Processing :: General
23
22
  Classifier: Topic :: Utilities
24
23
  Classifier: Typing :: Typed
25
- Requires-Python: >=3.9
24
+ Requires-Python: >=3.10
26
25
  Requires-Dist: anyio>=4.9.0
27
26
  Requires-Dist: charset-normalizer>=3.4.2
28
27
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
@@ -57,6 +56,8 @@ Provides-Extra: easyocr
57
56
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
58
57
  Provides-Extra: gmft
59
58
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
59
+ Provides-Extra: langdetect
60
+ Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
60
61
  Provides-Extra: paddleocr
61
62
  Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
62
63
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
@@ -1,50 +1,53 @@
1
- kreuzberg/__init__.py,sha256=5GP2j8PI3P_ZNSEhLpm8iqseY3i4nye6iUmVGUnfzno,1311
1
+ kreuzberg/__init__.py,sha256=zZ_puArNdw0pQk93BV99fXCxzkHFKXB9kINn8-6-y24,1408
2
2
  kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
3
  kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
4
  kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
- kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
6
+ kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
7
+ kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
7
8
  kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
8
9
  kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
9
10
  kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
10
- kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
11
+ kreuzberg/_types.py,sha256=Tnl9yP56dn8ziBZk1sorNk1ZHZbJYMjSoqh7xxImFHs,8092
11
12
  kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
12
13
  kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
13
- kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
14
+ kreuzberg/extraction.py,sha256=Jz0f31Mm90mBkWwn0L3vn3z7-irdwNIzMHWByIj5d_I,17005
14
15
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
16
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
17
  kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
17
18
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
19
  kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
19
20
  kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
20
- kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
21
+ kreuzberg/_extractors/_image.py,sha256=pYfh3x9CkiIxOLvp0jkkZcmLbB_FpdfDo01klSc6OzQ,4819
21
22
  kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
22
- kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
23
+ kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
23
24
  kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
24
- kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
25
+ kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
25
26
  kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
26
- kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
27
- kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
27
+ kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
28
+ kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
29
+ kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
30
+ kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
28
31
  kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
29
32
  kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
30
33
  kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
31
34
  kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
32
- kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
35
+ kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
33
36
  kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
34
- kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
37
+ kreuzberg/_ocr/_tesseract.py,sha256=3s3MkZN9xA_Uedx4s2p5m4IEIMhGjs9gYHxan9Iz-2g,13044
35
38
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
39
  kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
37
- kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
40
+ kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
38
41
  kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
39
42
  kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
40
43
  kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
41
- kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
44
+ kreuzberg/_utils/_process_pool.py,sha256=-0SNP01Qz21D7hgJmN0eHoqKusSygwPbi1U7IzJlPio,2895
42
45
  kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
43
46
  kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
44
- kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
45
- kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
46
- kreuzberg-3.4.1.dist-info/METADATA,sha256=g3DwLXNiDzvPDBApPnDp3BeZ4SbVN0NTrEzN9cyKy34,8751
47
- kreuzberg-3.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
48
- kreuzberg-3.4.1.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
49
- kreuzberg-3.4.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
50
- kreuzberg-3.4.1.dist-info/RECORD,,
47
+ kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
48
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
49
+ kreuzberg-3.5.0.dist-info/METADATA,sha256=jJXbwUuTXevmry2VVg1H8d6rEzebILJyN7q7kJ0M9mQ,8790
50
+ kreuzberg-3.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
51
+ kreuzberg-3.5.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
52
+ kreuzberg-3.5.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
53
+ kreuzberg-3.5.0.dist-info/RECORD,,