kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_entity_extraction.py +1 -2
  2. kreuzberg/_extractors/_base.py +39 -1
  3. kreuzberg/_extractors/_email.py +149 -0
  4. kreuzberg/_extractors/_html.py +15 -3
  5. kreuzberg/_extractors/_image.py +21 -36
  6. kreuzberg/_extractors/_pandoc.py +3 -14
  7. kreuzberg/_extractors/_pdf.py +81 -48
  8. kreuzberg/_extractors/_presentation.py +62 -10
  9. kreuzberg/_extractors/_spread_sheet.py +179 -4
  10. kreuzberg/_extractors/_structured.py +148 -0
  11. kreuzberg/_gmft.py +314 -7
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_ocr/__init__.py +10 -1
  14. kreuzberg/_ocr/_base.py +59 -0
  15. kreuzberg/_ocr/_easyocr.py +91 -0
  16. kreuzberg/_ocr/_paddleocr.py +89 -0
  17. kreuzberg/_ocr/_tesseract.py +564 -4
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +52 -4
  21. kreuzberg/_utils/_errors.py +3 -7
  22. kreuzberg/_utils/_process_pool.py +180 -7
  23. kreuzberg/_utils/_quality.py +237 -0
  24. kreuzberg/_utils/_serialization.py +4 -2
  25. kreuzberg/_utils/_string.py +153 -10
  26. kreuzberg/_utils/_sync.py +5 -2
  27. kreuzberg/_utils/_table.py +261 -0
  28. kreuzberg/cli.py +1 -2
  29. kreuzberg/extraction.py +4 -22
  30. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
  31. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  32. kreuzberg/_multiprocessing/__init__.py +0 -6
  33. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  34. kreuzberg/_multiprocessing/process_manager.py +0 -189
  35. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  36. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  37. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  38. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  39. kreuzberg-3.7.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_multiprocessing/sync_easyocr.py (deleted)
@@ -1,235 +0,0 @@
- """Pure synchronous EasyOCR without any async overhead."""
-
- from __future__ import annotations
-
- import tempfile
- from pathlib import Path
- from typing import Any
-
- from PIL import Image
-
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
- from kreuzberg._ocr._easyocr import EasyOCRConfig
- from kreuzberg._types import ExtractionResult
- from kreuzberg._utils._string import normalize_spaces
- from kreuzberg.exceptions import MissingDependencyError, OCRError
-
-
- def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
-     """Get an EasyOCR Reader instance with the given configuration."""
-     try:
-         import easyocr
-     except ImportError as e:
-         raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e
-
-     gpu = False
-     if hasattr(config, "device"):
-         if config.device and config.device.lower() != "cpu":
-             gpu = True
-     elif hasattr(config, "use_gpu"):
-         gpu = config.use_gpu
-
-     language = config.language if hasattr(config, "language") else "en"
-     if isinstance(language, str):
-         lang_list = [lang.strip().lower() for lang in language.split(",")]
-     else:
-         lang_list = [lang.lower() for lang in language]
-
-     kwargs = {
-         "lang_list": lang_list,
-         "gpu": gpu,
-         "model_storage_directory": getattr(config, "model_storage_directory", None),
-         "user_network_directory": getattr(config, "user_network_directory", None),
-         "recog_network": getattr(config, "recog_network", None),
-         "detector": getattr(config, "detector", None),
-         "recognizer": getattr(config, "recognizer", None),
-         "verbose": False,
-         "quantize": getattr(config, "quantize", None),
-         "cudnn_benchmark": getattr(config, "cudnn_benchmark", None),
-     }
-
-     kwargs = {k: v for k, v in kwargs.items() if v is not None}
-
-     return easyocr.Reader(**kwargs)
-
-
- def process_image_sync_pure(
-     image_path: str | Path,
-     config: EasyOCRConfig | None = None,
- ) -> ExtractionResult:
-     """Process an image with EasyOCR using pure sync implementation.
-
-     This bypasses all async overhead and calls EasyOCR directly.
-
-     Args:
-         image_path: Path to the image file.
-         config: EasyOCR configuration.
-
-     Returns:
-         Extraction result.
-     """
-     cfg = config or EasyOCRConfig()
-
-     try:
-         reader = _get_easyocr_instance(cfg)
-
-         readtext_kwargs = {
-             "decoder": cfg.decoder,
-             "beamWidth": cfg.beam_width,
-             "batch_size": getattr(cfg, "batch_size", 1),
-             "workers": getattr(cfg, "workers", 0),
-             "allowlist": getattr(cfg, "allowlist", None),
-             "blocklist": getattr(cfg, "blocklist", None),
-             "detail": getattr(cfg, "detail", 1),
-             "rotation_info": cfg.rotation_info,
-             "paragraph": getattr(cfg, "paragraph", False),
-             "min_size": cfg.min_size,
-             "text_threshold": cfg.text_threshold,
-             "low_text": cfg.low_text,
-             "link_threshold": cfg.link_threshold,
-             "canvas_size": cfg.canvas_size,
-             "mag_ratio": cfg.mag_ratio,
-             "slope_ths": cfg.slope_ths,
-             "ycenter_ths": cfg.ycenter_ths,
-             "height_ths": cfg.height_ths,
-             "width_ths": cfg.width_ths,
-             "add_margin": cfg.add_margin,
-             "x_ths": cfg.x_ths,
-             "y_ths": cfg.y_ths,
-         }
-
-         readtext_kwargs = {k: v for k, v in readtext_kwargs.items() if v is not None}
-
-         results = reader.readtext(str(image_path), **readtext_kwargs)
-
-         if not results:
-             return ExtractionResult(
-                 content="",
-                 mime_type=PLAIN_TEXT_MIME_TYPE,
-                 metadata={},
-                 chunks=[],
-             )
-
-         texts = []
-         confidences = []
-
-         detail_value = getattr(cfg, "detail", 1)
-         if detail_value:
-             for result in results:
-                 min_result_length = 2
-                 max_confidence_index = 2
-                 if len(result) >= min_result_length:
-                     _bbox, text = result[0], result[1]
-                     confidence = result[max_confidence_index] if len(result) > max_confidence_index else 1.0
-                     texts.append(text)
-                     confidences.append(confidence)
-         else:
-             texts = results
-             confidences = [1.0] * len(texts)
-
-         content = "\n".join(texts)
-         content = normalize_spaces(content)
-
-         avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
-
-         metadata = {"confidence": avg_confidence} if confidences else {}
-
-         return ExtractionResult(
-             content=content,
-             mime_type=PLAIN_TEXT_MIME_TYPE,
-             metadata=metadata,  # type: ignore[arg-type]
-             chunks=[],
-         )
-
-     except Exception as e:
-         raise OCRError(f"EasyOCR processing failed: {e}") from e
-
-
- def process_image_bytes_sync_pure(
-     image_bytes: bytes,
-     config: EasyOCRConfig | None = None,
- ) -> ExtractionResult:
-     """Process image bytes with EasyOCR using pure sync implementation.
-
-     Args:
-         image_bytes: Image data as bytes.
-         config: EasyOCR configuration.
-
-     Returns:
-         Extraction result.
-     """
-     import io
-
-     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
-         with Image.open(io.BytesIO(image_bytes)) as image:
-             image.save(tmp_image.name, format="PNG")
-         image_path = tmp_image.name
-
-     try:
-         return process_image_sync_pure(image_path, config)
-     finally:
-         image_file = Path(image_path)
-         if image_file.exists():
-             image_file.unlink()
-
-
- def process_batch_images_sync_pure(
-     image_paths: list[str | Path],
-     config: EasyOCRConfig | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images sequentially with pure sync implementation.
-
-     Args:
-         image_paths: List of image file paths.
-         config: EasyOCR configuration.
-
-     Returns:
-         List of extraction results.
-     """
-     results = []
-     for image_path in image_paths:
-         result = process_image_sync_pure(image_path, config)
-         results.append(result)
-     return results
-
-
- def process_batch_images_threaded(
-     image_paths: list[str | Path],
-     config: EasyOCRConfig | None = None,
-     max_workers: int | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images using threading.
-
-     Args:
-         image_paths: List of image file paths.
-         config: EasyOCR configuration.
-         max_workers: Maximum number of threads.
-
-     Returns:
-         List of extraction results in same order as input.
-     """
-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
-     if max_workers is None:
-         max_workers = min(len(image_paths), mp.cpu_count())
-
-     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-         future_to_index = {
-             executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
-         }
-
-         results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
-         for future in as_completed(future_to_index):
-             index = future_to_index[future]
-             try:
-                 results[index] = future.result()
-             except Exception as e:  # noqa: BLE001
-                 results[index] = ExtractionResult(
-                     content=f"Error: {e}",
-                     mime_type=PLAIN_TEXT_MIME_TYPE,
-                     metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
-                     chunks=[],
-                 )
-
-     return results
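
For orientation, the deleted module was driven through plain function calls rather than the async extractor path. A minimal usage sketch of the removed 3.7.0 helpers follows; the image path is a placeholder and the config is left at its defaults, since field names vary between kreuzberg versions.

# Hypothetical usage sketch of the removed sync_easyocr helpers; not part of the diff.
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure
from kreuzberg._ocr._easyocr import EasyOCRConfig

config = EasyOCRConfig()  # defaults; CPU-only unless the config selects a device
result = process_image_sync_pure("page.png", config)  # "page.png" is a placeholder path
print(result.content)
print(result.metadata.get("confidence"))  # average EasyOCR confidence when text was found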
kreuzberg/_multiprocessing/sync_paddleocr.py (deleted)
@@ -1,199 +0,0 @@
- """Pure synchronous PaddleOCR without any async overhead."""
-
- from __future__ import annotations
-
- import tempfile
- from pathlib import Path
- from typing import Any
-
- from PIL import Image
-
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
- from kreuzberg._types import ExtractionResult
- from kreuzberg._utils._string import normalize_spaces
- from kreuzberg.exceptions import MissingDependencyError, OCRError
-
-
- def _get_paddleocr_instance(config: PaddleOCRConfig) -> Any:
-     """Get a PaddleOCR instance with the given configuration."""
-     try:
-         import paddleocr
-     except ImportError as e:
-         raise MissingDependencyError("PaddleOCR is not installed. Install it with: pip install paddleocr") from e
-
-     if hasattr(config, "device"):
-         if config.device and config.device.lower() != "cpu":
-             pass
-     elif hasattr(config, "use_gpu"):
-         pass
-
-     kwargs = {
-         "lang": config.language,
-         "use_textline_orientation": config.use_angle_cls,
-     }
-
-     if hasattr(config, "det_db_thresh"):
-         kwargs["text_det_thresh"] = config.det_db_thresh
-     if hasattr(config, "det_db_box_thresh"):
-         kwargs["text_det_box_thresh"] = config.det_db_box_thresh
-     if hasattr(config, "det_db_unclip_ratio"):
-         kwargs["text_det_unclip_ratio"] = config.det_db_unclip_ratio
-     if hasattr(config, "det_max_side_len"):
-         kwargs["text_det_limit_side_len"] = config.det_max_side_len
-     if hasattr(config, "drop_score"):
-         kwargs["text_rec_score_thresh"] = config.drop_score
-
-     return paddleocr.PaddleOCR(**kwargs)
-
-
- def process_image_sync_pure(
-     image_path: str | Path,
-     config: PaddleOCRConfig | None = None,
- ) -> ExtractionResult:
-     """Process an image with PaddleOCR using pure sync implementation.
-
-     This bypasses all async overhead and calls PaddleOCR directly.
-
-     Args:
-         image_path: Path to the image file.
-         config: PaddleOCR configuration.
-
-     Returns:
-         Extraction result.
-     """
-     cfg = config or PaddleOCRConfig()
-
-     try:
-         ocr_instance = _get_paddleocr_instance(cfg)
-
-         results = ocr_instance.ocr(str(image_path))
-
-         if not results or not results[0]:
-             return ExtractionResult(
-                 content="",
-                 mime_type=PLAIN_TEXT_MIME_TYPE,
-                 metadata={},
-                 chunks=[],
-             )
-
-         ocr_result = results[0]
-         result_data = ocr_result.json["res"]
-
-         texts = result_data.get("rec_texts", [])
-         scores = result_data.get("rec_scores", [])
-
-         if not texts:
-             return ExtractionResult(
-                 content="",
-                 mime_type=PLAIN_TEXT_MIME_TYPE,
-                 metadata={},
-                 chunks=[],
-             )
-
-         content = "\n".join(texts)
-         content = normalize_spaces(content)
-
-         avg_confidence = sum(scores) / len(scores) if scores else 0.0
-
-         metadata = {"confidence": avg_confidence} if scores else {}
-
-         return ExtractionResult(
-             content=content,
-             mime_type=PLAIN_TEXT_MIME_TYPE,
-             metadata=metadata,  # type: ignore[arg-type]
-             chunks=[],
-         )
-
-     except Exception as e:
-         raise OCRError(f"PaddleOCR processing failed: {e}") from e
-
-
- def process_image_bytes_sync_pure(
-     image_bytes: bytes,
-     config: PaddleOCRConfig | None = None,
- ) -> ExtractionResult:
-     """Process image bytes with PaddleOCR using pure sync implementation.
-
-     Args:
-         image_bytes: Image data as bytes.
-         config: PaddleOCR configuration.
-
-     Returns:
-         Extraction result.
-     """
-     import io
-
-     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
-         with Image.open(io.BytesIO(image_bytes)) as image:
-             image.save(tmp_image.name, format="PNG")
-         image_path = tmp_image.name
-
-     try:
-         return process_image_sync_pure(image_path, config)
-     finally:
-         image_file = Path(image_path)
-         if image_file.exists():
-             image_file.unlink()
-
-
- def process_batch_images_sync_pure(
-     image_paths: list[str | Path],
-     config: PaddleOCRConfig | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images sequentially with pure sync implementation.
-
-     Args:
-         image_paths: List of image file paths.
-         config: PaddleOCR configuration.
-
-     Returns:
-         List of extraction results.
-     """
-     results = []
-     for image_path in image_paths:
-         result = process_image_sync_pure(image_path, config)
-         results.append(result)
-     return results
-
-
- def process_batch_images_threaded(
-     image_paths: list[str | Path],
-     config: PaddleOCRConfig | None = None,
-     max_workers: int | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images using threading.
-
-     Args:
-         image_paths: List of image file paths.
-         config: PaddleOCR configuration.
-         max_workers: Maximum number of threads.
-
-     Returns:
-         List of extraction results in same order as input.
-     """
-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
-     if max_workers is None:
-         max_workers = min(len(image_paths), mp.cpu_count())
-
-     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-         future_to_index = {
-             executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
-         }
-
-         results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
-         for future in as_completed(future_to_index):
-             index = future_to_index[future]
-             try:
-                 results[index] = future.result()
-             except Exception as e:  # noqa: BLE001
-                 results[index] = ExtractionResult(
-                     content=f"Error: {e}",
-                     mime_type=PLAIN_TEXT_MIME_TYPE,
-                     metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
-                     chunks=[],
-                 )
-
-     return results
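
The PaddleOCR module mirrored the EasyOCR one, adding the same bytes and threaded-batch entry points. A hedged sketch of how those helpers were invoked; file names are placeholders.

# Hypothetical usage sketch of the removed sync_paddleocr helpers; not part of the diff.
from kreuzberg._multiprocessing.sync_paddleocr import (
    process_batch_images_threaded,
    process_image_bytes_sync_pure,
)

paths = ["scan_001.png", "scan_002.png"]  # placeholder file names
results = process_batch_images_threaded(paths, config=None, max_workers=2)
for res in results:
    print(res.mime_type, len(res.content))

# Bytes input was written to a temporary PNG on disk before being handed to PaddleOCR.
with open("scan_001.png", "rb") as fh:
    single = process_image_bytes_sync_pure(fh.read())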
kreuzberg/_multiprocessing/sync_tesseract.py (deleted)
@@ -1,261 +0,0 @@
- """Pure synchronous Tesseract OCR without any async overhead."""
-
- from __future__ import annotations
-
- import os
- import subprocess
- import tempfile
- from pathlib import Path
-
- from PIL import Image
-
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
- from kreuzberg._ocr._tesseract import TesseractConfig
- from kreuzberg._types import ExtractionResult
- from kreuzberg._utils._string import normalize_spaces
- from kreuzberg.exceptions import OCRError
-
-
- def process_image_sync_pure(
-     image_path: str | Path,
-     config: TesseractConfig | None = None,
- ) -> ExtractionResult:
-     """Process an image with Tesseract using pure sync implementation.
-
-     This bypasses all async overhead and calls Tesseract directly.
-
-     Args:
-         image_path: Path to the image file.
-         config: Tesseract configuration.
-
-     Returns:
-         Extraction result.
-     """
-     cfg = config or TesseractConfig()
-
-     with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
-         output_base = tmp_file.name.replace(".txt", "")
-
-     try:
-         command = [
-             "tesseract",
-             str(image_path),
-             output_base,
-             "-l",
-             cfg.language,
-             "--psm",
-             str(cfg.psm.value if hasattr(cfg.psm, "value") else cfg.psm),
-             "--oem",
-             "1",
-             "--loglevel",
-             "OFF",
-         ]
-
-         boolean_fields = [
-             "classify_use_pre_adapted_templates",
-             "language_model_ngram_on",
-             "tessedit_dont_blkrej_good_wds",
-             "tessedit_dont_rowrej_good_wds",
-             "tessedit_enable_dict_correction",
-             "tessedit_use_primary_params_model",
-             "textord_space_size_is_variable",
-             "thresholding_method",
-         ]
-
-         for field in boolean_fields:
-             if hasattr(cfg, field):
-                 value = 1 if getattr(cfg, field) else 0
-                 command.extend(["-c", f"{field}={value}"])
-
-         env = os.environ.copy()
-         env["OMP_THREAD_LIMIT"] = "1"
-
-         result = subprocess.run(
-             command,
-             check=False,
-             env=env,
-             capture_output=True,
-             text=True,
-             timeout=30,
-         )
-
-         if result.returncode != 0:
-             raise OCRError(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
-
-         output_file = output_base + ".txt"
-         with Path(output_file).open(encoding="utf-8") as f:
-             text = f.read()
-
-         text = normalize_spaces(text)
-
-         return ExtractionResult(
-             content=text,
-             mime_type=PLAIN_TEXT_MIME_TYPE,
-             metadata={},
-             chunks=[],
-         )
-
-     finally:
-         for ext in [".txt"]:
-             temp_file = output_base + ext
-             temp_path = Path(temp_file)
-             if temp_path.exists():
-                 temp_path.unlink()
-
-
- def process_image_bytes_sync_pure(
-     image_bytes: bytes,
-     config: TesseractConfig | None = None,
- ) -> ExtractionResult:
-     """Process image bytes with Tesseract using pure sync implementation.
-
-     Args:
-         image_bytes: Image data as bytes.
-         config: Tesseract configuration.
-
-     Returns:
-         Extraction result.
-     """
-     import io
-
-     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
-         with Image.open(io.BytesIO(image_bytes)) as image:
-             image.save(tmp_image.name, format="PNG")
-         image_path = tmp_image.name
-
-     try:
-         return process_image_sync_pure(image_path, config)
-     finally:
-         image_file = Path(image_path)
-         if image_file.exists():
-             image_file.unlink()
-
-
- def process_batch_images_sync_pure(
-     image_paths: list[str | Path],
-     config: TesseractConfig | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images sequentially with pure sync implementation.
-
-     Args:
-         image_paths: List of image file paths.
-         config: Tesseract configuration.
-
-     Returns:
-         List of extraction results.
-     """
-     results = []
-     for image_path in image_paths:
-         result = process_image_sync_pure(image_path, config)
-         results.append(result)
-     return results
-
-
- def process_batch_images_threaded(
-     image_paths: list[str | Path],
-     config: TesseractConfig | None = None,
-     max_workers: int | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images using threading.
-
-     Args:
-         image_paths: List of image file paths.
-         config: Tesseract configuration.
-         max_workers: Maximum number of threads.
-
-     Returns:
-         List of extraction results in same order as input.
-     """
-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
-     if max_workers is None:
-         max_workers = min(len(image_paths), mp.cpu_count())
-
-     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-         future_to_index = {
-             executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
-         }
-
-         results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
-         for future in as_completed(future_to_index):
-             index = future_to_index[future]
-             try:
-                 results[index] = future.result()
-             except Exception as e:  # noqa: BLE001
-                 results[index] = ExtractionResult(
-                     content=f"Error: {e}",
-                     mime_type=PLAIN_TEXT_MIME_TYPE,
-                     metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
-                     chunks=[],
-                 )
-
-     return results
-
-
- def process_batch_images_process_pool(
-     image_paths: list[str | Path],
-     config: TesseractConfig | None = None,
-     max_workers: int | None = None,
- ) -> list[ExtractionResult]:
-     """Process a batch of images using process pool.
-
-     Args:
-         image_paths: List of image file paths.
-         config: Tesseract configuration.
-         max_workers: Maximum number of processes.
-
-     Returns:
-         List of extraction results in same order as input.
-     """
-     import multiprocessing as mp
-     from concurrent.futures import ProcessPoolExecutor, as_completed
-
-     if max_workers is None:
-         max_workers = min(len(image_paths), mp.cpu_count())
-
-     cfg = config or TesseractConfig()
-     config_dict = {}
-     for field_name in cfg.__dataclass_fields__:
-         value = getattr(cfg, field_name)
-         if hasattr(value, "value"):
-             config_dict[field_name] = value.value
-         else:
-             config_dict[field_name] = value
-
-     with ProcessPoolExecutor(max_workers=max_workers) as executor:
-         from kreuzberg._multiprocessing.tesseract_pool import _process_image_with_tesseract
-
-         future_to_index = {
-             executor.submit(_process_image_with_tesseract, str(path), config_dict): i
-             for i, path in enumerate(image_paths)
-         }
-
-         results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]
-         for future in as_completed(future_to_index):
-             index = future_to_index[future]
-             try:
-                 result_dict = future.result()
-                 if result_dict["success"]:
-                     results[index] = ExtractionResult(
-                         content=result_dict["text"],
-                         mime_type=PLAIN_TEXT_MIME_TYPE,
-                         metadata={},
-                         chunks=[],
-                     )
-                 else:
-                     results[index] = ExtractionResult(
-                         content=f"Error: {result_dict['error']}",
-                         mime_type=PLAIN_TEXT_MIME_TYPE,
-                         metadata={"error": result_dict["error"]},  # type: ignore[typeddict-unknown-key]
-                         chunks=[],
-                     )
-             except Exception as e:  # noqa: BLE001
-                 results[index] = ExtractionResult(
-                     content=f"Error: {e}",
-                     mime_type=PLAIN_TEXT_MIME_TYPE,
-                     metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
-                     chunks=[],
-                 )
-
-     return results
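
The Tesseract module shelled out to the tesseract CLI and offered both threaded and process-pool batch helpers. A hedged sketch of how they were called; paths are placeholders and the config is left at its defaults.

# Hypothetical usage sketch of the removed sync_tesseract helpers; not part of the diff.
from kreuzberg._multiprocessing.sync_tesseract import (
    process_batch_images_process_pool,
    process_image_sync_pure,
)
from kreuzberg._ocr._tesseract import TesseractConfig

config = TesseractConfig()  # defaults; each call spawns one `tesseract` subprocess
single = process_image_sync_pure("invoice.png", config)  # placeholder path
batch = process_batch_images_process_pool(
    ["invoice.png", "receipt.png"],  # placeholder paths
    config,
    max_workers=2,
)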