kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
1
+ """Pure synchronous Tesseract OCR without any async overhead."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import subprocess
7
+ import tempfile
8
+ from pathlib import Path
9
+
10
+ from PIL import Image
11
+
12
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
13
+ from kreuzberg._ocr._tesseract import TesseractConfig
14
+ from kreuzberg._types import ExtractionResult
15
+ from kreuzberg._utils._string import normalize_spaces
16
+ from kreuzberg.exceptions import OCRError
17
+
18
+
19
def process_image_sync_pure(
    image_path: str | Path,
    config: TesseractConfig | None = None,
) -> ExtractionResult:
    """Process an image with Tesseract using a pure synchronous implementation.

    Runs the ``tesseract`` command line tool through ``subprocess.run`` (no async
    machinery), reads the text file it produces and returns the normalized text.

    Args:
        image_path: Path to the image file.
        config: Tesseract configuration. A default ``TesseractConfig()`` is used when omitted.

    Returns:
        Extraction result holding the recognized text as plain-text content.

    Raises:
        OCRError: If the tesseract process exits with a non-zero return code.
        subprocess.TimeoutExpired: If tesseract does not finish within 30 seconds.
    """
    cfg = config or TesseractConfig()

    text_suffix = ".txt"
    # Reserve a unique file name. Tesseract receives the name without the suffix and
    # appends ".txt" itself, so its output lands exactly on the reserved file.
    with tempfile.NamedTemporaryFile(suffix=text_suffix, delete=False) as tmp_file:
        output_file = Path(tmp_file.name)
    # Strip only the trailing suffix. str.replace() would also remove a ".txt" that
    # occurs earlier in the path (e.g. in the temp directory name), which would send
    # tesseract to the wrong location and leave the reserved file behind.
    output_base = tmp_file.name[: -len(text_suffix)]

    try:
        command = [
            "tesseract",
            str(image_path),
            output_base,
            "-l",
            cfg.language,
            "--psm",
            # psm may be an enum member (use its value) or a plain value.
            str(getattr(cfg.psm, "value", cfg.psm)),
            "--oem",
            "1",
            "--loglevel",
            "OFF",
        ]

        boolean_fields = (
            "classify_use_pre_adapted_templates",
            "language_model_ngram_on",
            "tessedit_dont_blkrej_good_wds",
            "tessedit_dont_rowrej_good_wds",
            "tessedit_enable_dict_correction",
            "tessedit_use_primary_params_model",
            "textord_space_size_is_variable",
            "thresholding_method",
        )

        # Forward each flag present on the config as a "-c name=0|1" option.
        for field in boolean_fields:
            if hasattr(cfg, field):
                flag = 1 if getattr(cfg, field) else 0
                command.extend(["-c", f"{field}={flag}"])

        env = os.environ.copy()
        env["OMP_THREAD_LIMIT"] = "1"

        result = subprocess.run(
            command,
            check=False,
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        if result.returncode != 0:
            raise OCRError(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

        text = normalize_spaces(output_file.read_text(encoding="utf-8"))

        return ExtractionResult(
            content=text,
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata={},
            chunks=[],
        )

    finally:
        # Always remove the temporary output file, whether OCR succeeded or not.
        output_file.unlink(missing_ok=True)
104
+
105
+
106
def process_image_bytes_sync_pure(
    image_bytes: bytes,
    config: TesseractConfig | None = None,
) -> ExtractionResult:
    """Process image bytes with Tesseract using a pure synchronous implementation.

    The bytes are decoded with PIL, written to a temporary PNG file and handed to
    ``process_image_sync_pure``. The temporary file is removed afterwards, including
    when decoding or OCR fails.

    Args:
        image_bytes: Image data as bytes.
        config: Tesseract configuration.

    Returns:
        Extraction result.
    """
    import io

    # Only reserve the file name here; the image is written after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
        image_path = tmp_image.name

    try:
        # Decoding happens inside the try block so a corrupt image cannot leak the
        # temporary file created above (delete=False disables automatic removal).
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(image_path, format="PNG")
        return process_image_sync_pure(image_path, config)
    finally:
        Path(image_path).unlink(missing_ok=True)
132
+
133
+
134
def process_batch_images_sync_pure(
    image_paths: list[str | Path],
    config: TesseractConfig | None = None,
) -> list[ExtractionResult]:
    """Run OCR over several images one after another in the calling thread.

    Args:
        image_paths: List of image file paths.
        config: Tesseract configuration shared by every image.

    Returns:
        One extraction result per input path, in input order.
    """
    return [process_image_sync_pure(path, config) for path in image_paths]
152
+
153
+
154
def process_batch_images_threaded(
    image_paths: list[str | Path],
    config: TesseractConfig | None = None,
    max_workers: int | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images using a thread pool.

    A failure on one image does not abort the batch: that image's slot receives an
    ``ExtractionResult`` whose content is ``"Error: <message>"`` and whose metadata
    carries the message under ``"error"``.

    Args:
        image_paths: List of image file paths.
        config: Tesseract configuration.
        max_workers: Maximum number of threads. Defaults to the smaller of the number
            of images and the CPU count.

    Returns:
        List of extraction results in same order as input.
    """
    import multiprocessing as mp
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # An empty batch would yield max_workers == 0, which ThreadPoolExecutor rejects
    # with ValueError; there is nothing to do anyway.
    if not image_paths:
        return []

    if max_workers is None:
        max_workers = min(len(image_paths), mp.cpu_count())

    # Pre-sized because futures complete out of order and are stored by input index.
    results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
        }

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:  # noqa: BLE001
                results[index] = ExtractionResult(
                    content=f"Error: {e}",
                    mime_type=PLAIN_TEXT_MIME_TYPE,
                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
                    chunks=[],
                )

    return results
194
+
195
+
196
def process_batch_images_process_pool(
    image_paths: list[str | Path],
    config: TesseractConfig | None = None,
    max_workers: int | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images using a process pool.

    The configuration is flattened to a plain dictionary (enum members replaced by
    their ``value``) before being sent to the worker processes. A failure on one
    image does not abort the batch: that image's slot receives an ``ExtractionResult``
    whose content is ``"Error: <message>"`` and whose metadata carries the message
    under ``"error"``.

    Args:
        image_paths: List of image file paths.
        config: Tesseract configuration.
        max_workers: Maximum number of processes. Defaults to the smaller of the number
            of images and the CPU count.

    Returns:
        List of extraction results in same order as input.
    """
    import multiprocessing as mp
    from concurrent.futures import ProcessPoolExecutor, as_completed

    # An empty batch would yield max_workers == 0, which ProcessPoolExecutor rejects
    # with ValueError; there is nothing to do anyway.
    if not image_paths:
        return []

    if max_workers is None:
        max_workers = min(len(image_paths), mp.cpu_count())

    cfg = config or TesseractConfig()
    # Workers receive a plain dict; enum-like values are reduced to their raw value.
    config_dict = {}
    for field_name in cfg.__dataclass_fields__:
        field_value = getattr(cfg, field_name)
        config_dict[field_name] = getattr(field_value, "value", field_value)

    def _failure(message: object) -> ExtractionResult:
        """Build the placeholder result stored for an image that could not be processed."""
        return ExtractionResult(
            content=f"Error: {message}",
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata={"error": message},  # type: ignore[typeddict-unknown-key]
            chunks=[],
        )

    # Pre-sized because futures complete out of order and are stored by input index.
    results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        from kreuzberg._multiprocessing.tesseract_pool import _process_image_with_tesseract

        future_to_index = {
            executor.submit(_process_image_with_tesseract, str(path), config_dict): i
            for i, path in enumerate(image_paths)
        }

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                result_dict = future.result()
                if result_dict["success"]:
                    results[index] = ExtractionResult(
                        content=result_dict["text"],
                        mime_type=PLAIN_TEXT_MIME_TYPE,
                        metadata={},
                        chunks=[],
                    )
                else:
                    results[index] = _failure(result_dict["error"])
            except Exception as e:  # noqa: BLE001
                results[index] = _failure(str(e))

    return results
@@ -0,0 +1,359 @@
1
+ """Tesseract process pool for parallel OCR processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from PIL import Image
8
+ from typing_extensions import Self
9
+
10
+ from kreuzberg._ocr._tesseract import TesseractConfig
11
+ from kreuzberg._types import ExtractionResult
12
+
13
+ from .process_manager import ProcessPoolManager
14
+
15
+ if TYPE_CHECKING:
16
+ import types
17
+
18
+ from pathlib import Path
19
+
20
+
21
def _process_image_with_tesseract(
    image_path: str,
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Process a single image with Tesseract in a separate process.

    This function is designed to be pickled and executed in a subprocess.
    It uses direct tesseract command execution to avoid async complications.
    It never raises: every failure is reported through the returned dictionary.

    Args:
        image_path: Path to the image file.
        config_dict: Tesseract configuration as dictionary. ``language`` defaults to
            ``"eng"`` and ``psm`` to ``3`` when absent.

    Returns:
        OCR result as dictionary with the keys ``success``, ``text``, ``confidence``
        (always ``None`` here) and ``error`` (``None`` on success, otherwise the
        stringified exception).
    """
    try:
        import os
        import subprocess
        import tempfile

        # Path must be imported at runtime here: the module-level import is guarded by
        # TYPE_CHECKING, so the name does not exist when this function actually runs.
        from pathlib import Path

        text_suffix = ".txt"
        # Reserve a unique file name. Tesseract receives the name without the suffix
        # and appends ".txt" itself, so its output lands exactly on the reserved file.
        with tempfile.NamedTemporaryFile(suffix=text_suffix, delete=False) as tmp_file:
            output_file = Path(tmp_file.name)
        # Strip only the trailing suffix. str.replace() would also remove a ".txt"
        # that occurs earlier in the path (e.g. in the temp directory name).
        output_base = tmp_file.name[: -len(text_suffix)]

        try:
            language = config_dict.get("language", "eng")
            psm = config_dict.get("psm", 3)

            command = [
                "tesseract",
                image_path,
                output_base,
                "-l",
                language,
                "--psm",
                str(psm),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]

            boolean_options = (
                "classify_use_pre_adapted_templates",
                "language_model_ngram_on",
                "tessedit_dont_blkrej_good_wds",
                "tessedit_dont_rowrej_good_wds",
                "tessedit_enable_dict_correction",
                "tessedit_use_primary_params_model",
                "textord_space_size_is_variable",
                "thresholding_method",
            )

            # Forward each flag present in the config as a "-c name=0|1" option.
            for option in boolean_options:
                if option in config_dict:
                    flag = 1 if config_dict[option] else 0
                    command.extend(["-c", f"{option}={flag}"])

            env = os.environ.copy()
            env["OMP_THREAD_LIMIT"] = "1"

            result = subprocess.run(
                command,
                check=False,
                env=env,
                capture_output=True,
                text=True,
                timeout=30,
            )

            if result.returncode != 0:
                # Caught by the outer handler and reported through the "error" key.
                raise RuntimeError(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

            text = output_file.read_text(encoding="utf-8")

            from kreuzberg._utils._string import normalize_spaces

            text = normalize_spaces(text)

            return {
                "success": True,
                "text": text,
                "confidence": None,
                "error": None,
            }

        finally:
            # Always remove the temporary output file, whether OCR succeeded or not.
            output_file.unlink(missing_ok=True)

    except Exception as e:  # noqa: BLE001
        return {
            "success": False,
            "text": "",
            "confidence": None,
            "error": str(e),
        }
123
+
124
+
125
def _process_image_bytes_with_tesseract(
    image_bytes: bytes,
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Process image bytes with Tesseract in a separate process.

    The bytes are decoded with PIL, written to a temporary PNG file and passed to
    ``_process_image_with_tesseract``. The temporary file is removed afterwards,
    including when decoding fails. This function never raises: failures are reported
    through the returned dictionary.

    Args:
        image_bytes: Image data as bytes.
        config_dict: Tesseract configuration as dictionary.

    Returns:
        OCR result as dictionary with the keys ``success``, ``text``, ``confidence``
        and ``error``.
    """
    try:
        import io
        import tempfile

        # Path must be imported at runtime here: the module-level import is guarded by
        # TYPE_CHECKING, so the name does not exist when this function actually runs.
        from pathlib import Path

        # Only reserve the file name here; the image is written after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
            image_path = tmp_image.name

        try:
            # Decoding happens inside the try block so a corrupt image cannot leak the
            # temporary file created above (delete=False disables automatic removal).
            with Image.open(io.BytesIO(image_bytes)) as image:
                image.save(image_path, format="PNG")
            return _process_image_with_tesseract(image_path, config_dict)
        finally:
            Path(image_path).unlink(missing_ok=True)

    except Exception as e:  # noqa: BLE001
        return {
            "success": False,
            "text": "",
            "confidence": None,
            "error": str(e),
        }
161
+
162
+
163
class TesseractProcessPool:
    """Process pool for parallel Tesseract OCR processing.

    Work is delegated to a ``ProcessPoolManager``. Configurations are flattened to
    plain dictionaries before being sent to workers, and worker result dictionaries
    are converted back into ``ExtractionResult`` objects.
    """

    def __init__(
        self,
        config: TesseractConfig | None = None,
        max_processes: int | None = None,
        memory_limit_gb: float | None = None,
    ) -> None:
        """Initialize the Tesseract process pool.

        Args:
            config: Default Tesseract configuration.
            max_processes: Maximum number of processes.
            memory_limit_gb: Memory limit in GB.
        """
        self.config = config or TesseractConfig()
        self.process_manager = ProcessPoolManager(
            max_processes=max_processes,
            memory_limit_gb=memory_limit_gb,
        )

    def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
        """Convert TesseractConfig to dictionary for pickling.

        Args:
            config: Configuration to convert; the pool default is used when omitted.

        Returns:
            Mapping of dataclass field name to value, with enum-like values (anything
            exposing a ``value`` attribute) replaced by that ``value``.
        """
        cfg = config or self.config

        config_dict: dict[str, Any] = {}
        for field_name in cfg.__dataclass_fields__:
            field_value = getattr(cfg, field_name)
            config_dict[field_name] = getattr(field_value, "value", field_value)

        return config_dict

    def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
        """Convert a worker result dictionary back to an ExtractionResult.

        Args:
            result_dict: Dictionary with the keys ``success``, ``text``, ``confidence``
                and ``error`` as produced by the worker functions.

        Returns:
            Plain-text extraction result; metadata carries ``confidence`` when the
            worker reported one.

        Raises:
            OCRError: If the worker reported a failure.
        """
        if not result_dict["success"]:
            from kreuzberg.exceptions import OCRError

            raise OCRError(f"Tesseract processing failed: {result_dict['error']}")

        from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE

        confidence = result_dict["confidence"]

        return ExtractionResult(
            content=result_dict["text"],
            mime_type=PLAIN_TEXT_MIME_TYPE,
            # Compare against None so that a legitimate confidence of 0 is not dropped.
            metadata={"confidence": confidence} if confidence is not None else {},  # type: ignore[typeddict-unknown-key]
            chunks=[],
        )

    async def process_image(
        self,
        image_path: str | Path,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process a single image file with Tesseract.

        Args:
            image_path: Path to the image file.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.

        Raises:
            OCRError: If the worker reported a failure.
        """
        config_dict = self._config_to_dict(config)

        # Fixed per-task memory estimate passed to the process manager.
        task_memory_mb = 80

        result_dict = await self.process_manager.submit_task(
            _process_image_with_tesseract,
            str(image_path),
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_image_bytes(
        self,
        image_bytes: bytes,
        config: TesseractConfig | None = None,
    ) -> ExtractionResult:
        """Process image bytes with Tesseract.

        Args:
            image_bytes: Image data as bytes.
            config: Tesseract configuration (uses default if None).

        Returns:
            OCR result.

        Raises:
            OCRError: If the worker reported a failure.
        """
        config_dict = self._config_to_dict(config)

        # Memory estimate grows with the payload size but never drops below 80 MB.
        image_size_mb = len(image_bytes) / 1024 / 1024
        task_memory_mb = max(80, image_size_mb * 2 + 50)

        result_dict = await self.process_manager.submit_task(
            _process_image_bytes_with_tesseract,
            image_bytes,
            config_dict,
            task_memory_mb=task_memory_mb,
        )

        return self._result_from_dict(result_dict)

    async def process_batch_images(
        self,
        image_paths: list[str | Path],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of images in parallel.

        Args:
            image_paths: List of image file paths.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            List of OCR results in the same order as input.

        Raises:
            OCRError: If any worker reported a failure.
        """
        if not image_paths:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(str(path), config_dict) for path in image_paths]

        # Fixed per-task memory estimate passed to the process manager.
        task_memory_mb = 80

        result_dicts = await self.process_manager.submit_batch(
            _process_image_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    async def process_batch_bytes(
        self,
        image_bytes_list: list[bytes],
        config: TesseractConfig | None = None,
        max_concurrent: int | None = None,
    ) -> list[ExtractionResult]:
        """Process a batch of image bytes in parallel.

        Args:
            image_bytes_list: List of image data as bytes.
            config: Tesseract configuration (uses default if None).
            max_concurrent: Maximum concurrent processes.

        Returns:
            List of OCR results in the same order as input.

        Raises:
            OCRError: If any worker reported a failure.
        """
        if not image_bytes_list:
            return []

        config_dict = self._config_to_dict(config)

        arg_batches = [(image_bytes, config_dict) for image_bytes in image_bytes_list]

        # Memory estimate is derived from the average payload size, with an 80 MB floor.
        avg_image_size_mb = sum(len(img) for img in image_bytes_list) / len(image_bytes_list) / 1024 / 1024
        task_memory_mb = max(80, avg_image_size_mb * 2 + 50)

        result_dicts = await self.process_manager.submit_batch(
            _process_image_bytes_with_tesseract,
            arg_batches,
            task_memory_mb=task_memory_mb,
            max_concurrent=max_concurrent,
        )

        return [self._result_from_dict(result_dict) for result_dict in result_dicts]

    def get_system_info(self) -> dict[str, Any]:
        """Get system information from the process manager."""
        return self.process_manager.get_system_info()

    def shutdown(self, wait: bool = True) -> None:
        """Shutdown the process pool.

        Args:
            wait: Forwarded to the process manager's ``shutdown``.
        """
        self.process_manager.shutdown(wait=wait)

    async def __aenter__(self) -> Self:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        """Async context manager exit; shuts the pool down."""
        self.shutdown()