kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kreuzberg might be problematic. Click here for more details.

@@ -0,0 +1,371 @@
1
+ """EasyOCR backend for document OCR processing.
2
+
3
+ This module provides integration with EasyOCR for optical character recognition.
4
+ EasyOCR supports 80+ languages and can run on CPU or GPU (CUDA).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Any
11
+
12
+ from kreuzberg.exceptions import OCRError, ValidationError
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Language codes accepted by the EasyOCR backend, grouped by leading letter.
SUPPORTED_LANGUAGES = {
    "abq", "ady", "af", "ang", "ar", "as", "ava", "az",
    "be", "bg", "bh", "bho", "bn", "bs",
    "ch_sim", "ch_tra", "che", "cs", "cy",
    "da", "dar", "de",
    "en", "es", "et",
    "fa", "fr",
    "ga", "gom",
    "hi", "hr", "hu",
    "id", "inh", "is", "it",
    "ja",
    "kbd", "kn", "ko", "ku",
    "la", "lbe", "lez", "lt", "lv",
    "mah", "mai", "mi", "mn", "mr", "ms", "mt",
    "ne", "new", "nl", "no",
    "oc",
    "pi", "pl", "pt",
    "ro", "rs_cyrillic", "rs_latin", "ru",
    "sck", "sk", "sl", "sq", "sv", "sw",
    "ta", "tab", "te", "th", "tjk", "tl", "tr",
    "ug", "uk", "ur", "uz",
    "vi",
}
101
+
102
+
103
+ class EasyOCRBackend:
104
+ """EasyOCR backend for OCR processing.
105
+
106
+ This backend uses EasyOCR for text extraction from images. It supports
107
+ 80+ languages and can run on both CPU and GPU (CUDA).
108
+
109
+ Args:
110
+ languages: Language codes to enable (default: ``["en"]``).
111
+ use_gpu: Whether to force GPU usage. If ``None``, CUDA availability is auto-detected.
112
+ model_storage_directory: Directory used for EasyOCR model cache.
113
+ beam_width: Beam width for recognition (higher values are slower but more accurate).
114
+
115
+ Raises:
116
+ ImportError: If the easyocr package is not installed.
117
+ ValidationError: If any supplied language code is not supported.
118
+
119
+ Note:
120
+ All parameters are keyword-only. Python will raise TypeError if invalid
121
+ parameters are passed, providing automatic validation.
122
+
123
+ Installation:
124
+ pip install "kreuzberg[easyocr]"
125
+
126
+ Example:
127
+ >>> from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
128
+ >>> # Register backend with custom options via extraction API
129
+ >>> config = ExtractionConfig(ocr=OcrConfig(backend="easyocr", language="en"))
130
+ >>> result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True, "beam_width": 10})
131
+
132
+ """
133
+
134
+ def __init__(
135
+ self,
136
+ *,
137
+ languages: list[str] | None = None,
138
+ use_gpu: bool | None = None,
139
+ model_storage_directory: str | None = None,
140
+ beam_width: int = 5,
141
+ ) -> None:
142
+ try:
143
+ import easyocr as easyocr_module # noqa: PLC0415
144
+ except ImportError as e:
145
+ msg = "EasyOCR support requires the 'easyocr' package. Install with: pip install \"kreuzberg[easyocr]\""
146
+ raise ImportError(msg) from e
147
+
148
+ self._easyocr_module = easyocr_module
149
+
150
+ self.languages = languages or ["en"]
151
+ self.beam_width = beam_width
152
+ self.model_storage_directory = model_storage_directory
153
+
154
+ unsupported = [lang for lang in self.languages if lang not in SUPPORTED_LANGUAGES]
155
+ if unsupported:
156
+ msg = f"Unsupported EasyOCR language codes: {', '.join(unsupported)}"
157
+ raise ValidationError(
158
+ msg,
159
+ context={
160
+ "unsupported_languages": unsupported,
161
+ "supported_languages": sorted(SUPPORTED_LANGUAGES),
162
+ },
163
+ )
164
+
165
+ if use_gpu is None:
166
+ self.use_gpu = self._is_cuda_available()
167
+ else:
168
+ self.use_gpu = use_gpu
169
+
170
+ self._reader: Any | None = None
171
+
172
+ def name(self) -> str:
173
+ """Return backend name."""
174
+ return "easyocr"
175
+
176
+ def supported_languages(self) -> list[str]:
177
+ """Return list of all supported language codes."""
178
+ return sorted(SUPPORTED_LANGUAGES)
179
+
180
+ def initialize(self) -> None:
181
+ """Initialize EasyOCR reader (loads models)."""
182
+ if self._reader is not None:
183
+ return
184
+
185
+ try:
186
+ logger.info(
187
+ "Initializing EasyOCR reader with languages=%s, GPU=%s",
188
+ self.languages,
189
+ self.use_gpu,
190
+ )
191
+
192
+ self._reader = self._easyocr_module.Reader(
193
+ self.languages,
194
+ gpu=self.use_gpu,
195
+ verbose=False,
196
+ model_storage_directory=self.model_storage_directory,
197
+ )
198
+
199
+ logger.info("EasyOCR reader initialized successfully")
200
+ except Exception as e:
201
+ msg = f"Failed to initialize EasyOCR: {e}"
202
+ raise OCRError(msg) from e
203
+
204
+ def shutdown(self) -> None:
205
+ """Shutdown backend and cleanup resources."""
206
+ self._reader = None
207
+ logger.info("EasyOCR backend shutdown")
208
+
209
+ def process_image(self, image_bytes: bytes, language: str) -> dict[str, Any]:
210
+ """Process image bytes and extract text using EasyOCR.
211
+
212
+ Args:
213
+ image_bytes: Raw image data.
214
+ language: Language code (must be in ``supported_languages()``).
215
+
216
+ Returns:
217
+ Dictionary with the format:
218
+ {
219
+ "content": "extracted text",
220
+ "metadata": {
221
+ "width": 800,
222
+ "height": 600,
223
+ "confidence": 0.95,
224
+ "text_regions": 42
225
+ }
226
+ }
227
+
228
+ Raises:
229
+ ValidationError: If the supplied language is not supported.
230
+ RuntimeError: If EasyOCR fails to initialize.
231
+ OCRError: If OCR processing fails.
232
+
233
+ """
234
+ if self._reader is None:
235
+ self.initialize()
236
+
237
+ if self._reader is None:
238
+ msg = "EasyOCR reader failed to initialize"
239
+ raise RuntimeError(msg)
240
+
241
+ if language not in SUPPORTED_LANGUAGES:
242
+ msg = f"Language '{language}' not supported by EasyOCR"
243
+ raise ValidationError(
244
+ msg,
245
+ context={"language": language, "supported_languages": sorted(SUPPORTED_LANGUAGES)},
246
+ )
247
+
248
+ try:
249
+ import io # noqa: PLC0415
250
+
251
+ import numpy as np # noqa: PLC0415 # type: ignore[import-not-found]
252
+ from PIL import Image # noqa: PLC0415
253
+
254
+ image = Image.open(io.BytesIO(image_bytes))
255
+ width, height = image.size
256
+
257
+ image_array = np.array(image)
258
+
259
+ result = self._reader.readtext(
260
+ image_array,
261
+ beamWidth=self.beam_width,
262
+ )
263
+
264
+ content, confidence, text_regions = self._process_easyocr_result(result)
265
+
266
+ return {
267
+ "content": content,
268
+ "metadata": {
269
+ "width": width,
270
+ "height": height,
271
+ "confidence": confidence,
272
+ "text_regions": text_regions,
273
+ },
274
+ }
275
+
276
+ except Exception as e:
277
+ msg = f"EasyOCR processing failed: {e}"
278
+ raise OCRError(msg) from e
279
+
280
+ def process_file(self, path: str, language: str) -> dict[str, Any]:
281
+ """Process image file using EasyOCR.
282
+
283
+ Args:
284
+ path: Path to the image file.
285
+ language: Language code (must be in ``supported_languages()``).
286
+
287
+ Returns:
288
+ Dictionary in the same format as ``process_image()``.
289
+
290
+ Note:
291
+ Exceptions from :meth:`process_image` propagate unchanged.
292
+
293
+ """
294
+ from pathlib import Path # noqa: PLC0415
295
+
296
+ with Path(path).open("rb") as f:
297
+ image_bytes = f.read()
298
+
299
+ return self.process_image(image_bytes, language)
300
+
301
+ @staticmethod
302
+ def _process_easyocr_result(result: list[Any]) -> tuple[str, float, int]:
303
+ if not result:
304
+ return "", 0.0, 0
305
+
306
+ if all(len(item) == 2 for item in result):
307
+ text_parts = []
308
+ total_confidence = 0.0
309
+ for text, confidence in result:
310
+ if text:
311
+ text_parts.append(text)
312
+ total_confidence += confidence
313
+
314
+ content = "\n".join(text_parts)
315
+ avg_confidence = total_confidence / len(result) if result else 0.0
316
+ return content, avg_confidence, len(result)
317
+
318
+ sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
319
+
320
+ line_groups: list[list[Any]] = []
321
+ current_line: list[Any] = []
322
+ prev_y_center: float | None = None
323
+ line_height_threshold = 20
324
+
325
+ for item in sorted_results:
326
+ box, text, confidence = item
327
+ y_center = sum(point[1] for point in box) / 4
328
+
329
+ if prev_y_center is None or abs(y_center - prev_y_center) > line_height_threshold:
330
+ if current_line:
331
+ line_groups.append(current_line)
332
+ current_line = [item]
333
+ else:
334
+ current_line.append(item)
335
+
336
+ prev_y_center = y_center
337
+
338
+ if current_line:
339
+ line_groups.append(current_line)
340
+
341
+ text_parts = []
342
+ total_confidence = 0.0
343
+ text_count = 0
344
+
345
+ for line in line_groups:
346
+ line_sorted = sorted(line, key=lambda x: x[0][0][0])
347
+
348
+ line_text = []
349
+ for item in line_sorted:
350
+ _, text, confidence = item
351
+ if text:
352
+ line_text.append(text)
353
+ total_confidence += confidence
354
+ text_count += 1
355
+
356
+ if line_text:
357
+ text_parts.append(" ".join(line_text))
358
+
359
+ content = "\n".join(text_parts)
360
+ avg_confidence = total_confidence / text_count if text_count > 0 else 0.0
361
+
362
+ return content, avg_confidence, text_count
363
+
364
+ @staticmethod
365
+ def _is_cuda_available() -> bool:
366
+ try:
367
+ import torch # noqa: PLC0415
368
+
369
+ return bool(torch.cuda.is_available())
370
+ except ImportError:
371
+ return False
@@ -0,0 +1,284 @@
1
+ """PaddleOCR backend for document OCR processing.
2
+
3
+ This module provides integration with PaddleOCR for optical character recognition.
4
+ PaddleOCR supports 80+ languages and is optimized for production deployments.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Any
11
+
12
+ from kreuzberg.exceptions import OCRError, ValidationError
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Language codes accepted by the PaddleOCR backend.
SUPPORTED_LANGUAGES = {
    # individual languages
    "ch", "chinese_cht", "en", "french", "german", "japan", "korean",
    "ka", "ta", "te",
    # script families
    "arabic", "cyrillic", "devanagari", "latin",
}
32
+
33
+
34
+ class PaddleOCRBackend:
35
+ """PaddleOCR backend for OCR processing.
36
+
37
+ This backend uses PaddleOCR for text extraction from images. It supports
38
+ 80+ languages and can run on CPU or GPU (CUDA).
39
+
40
+ Args:
41
+ lang: Language code (default: "en").
42
+ use_gpu: Whether to force GPU usage. If ``None``, CUDA availability is auto-detected.
43
+ use_textline_orientation: Whether to enable orientation classification for rotated text.
44
+
45
+ Raises:
46
+ ImportError: If the paddleocr package is not installed.
47
+ ValidationError: If an unsupported language code is provided.
48
+
49
+ Note:
50
+ All parameters are keyword-only. Python will raise TypeError if invalid
51
+ parameters are passed, providing automatic validation.
52
+
53
+ Installation:
54
+ pip install "kreuzberg[paddleocr]"
55
+
56
+ Example:
57
+ >>> from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
58
+ >>> # Register backend with custom options via extraction API
59
+ >>> config = ExtractionConfig(ocr=OcrConfig(backend="paddleocr", language="ch"))
60
+ >>> result = extract_file_sync("scanned.pdf", config=config, paddleocr_kwargs={"use_gpu": True})
61
+
62
+ """
63
+
64
+ def __init__(
65
+ self,
66
+ *,
67
+ lang: str = "en",
68
+ use_gpu: bool | None = None,
69
+ use_textline_orientation: bool = True,
70
+ ) -> None:
71
+ if lang not in SUPPORTED_LANGUAGES:
72
+ msg = f"Unsupported PaddleOCR language code: {lang}"
73
+ raise ValidationError(
74
+ msg,
75
+ context={
76
+ "language": lang,
77
+ "supported_languages": sorted(SUPPORTED_LANGUAGES),
78
+ },
79
+ )
80
+
81
+ try:
82
+ from paddleocr import PaddleOCR as PaddleOCRClass # noqa: PLC0415
83
+ except ImportError as e:
84
+ msg = (
85
+ "PaddleOCR support requires the 'paddleocr' package. Install with: pip install \"kreuzberg[paddleocr]\""
86
+ )
87
+ raise ImportError(msg) from e
88
+
89
+ self._paddleocr_cls = PaddleOCRClass
90
+
91
+ self.lang = lang
92
+ self.use_textline_orientation = use_textline_orientation
93
+
94
+ if use_gpu is None:
95
+ self.device = "gpu" if self._is_cuda_available() else "cpu"
96
+ else:
97
+ self.device = "gpu" if use_gpu else "cpu"
98
+
99
+ self._ocr: Any | None = None
100
+
101
+ def name(self) -> str:
102
+ """Return backend name."""
103
+ return "paddleocr"
104
+
105
+ def supported_languages(self) -> list[str]:
106
+ """Return list of all supported language codes."""
107
+ return sorted(SUPPORTED_LANGUAGES)
108
+
109
+ def initialize(self) -> None:
110
+ """Initialize PaddleOCR (loads models)."""
111
+ if self._ocr is not None:
112
+ return
113
+
114
+ try:
115
+ logger.info(
116
+ "Initializing PaddleOCR with lang=%s, device=%s",
117
+ self.lang,
118
+ self.device,
119
+ )
120
+
121
+ self._ocr = self._paddleocr_cls(
122
+ lang=self.lang,
123
+ device=self.device,
124
+ use_textline_orientation=self.use_textline_orientation,
125
+ )
126
+
127
+ logger.info("PaddleOCR initialized successfully")
128
+ except Exception as e:
129
+ msg = f"Failed to initialize PaddleOCR: {e}"
130
+ raise OCRError(msg) from e
131
+
132
+ def shutdown(self) -> None:
133
+ """Shutdown backend and cleanup resources."""
134
+ self._ocr = None
135
+ logger.info("PaddleOCR backend shutdown")
136
+
137
+ def process_image(self, image_bytes: bytes, language: str) -> dict[str, Any]:
138
+ """Process image bytes and extract text using PaddleOCR.
139
+
140
+ Args:
141
+ image_bytes: Raw image data.
142
+ language: Language code (must be in ``supported_languages()``).
143
+
144
+ Returns:
145
+ Dictionary with the structure:
146
+ {
147
+ "content": "extracted text", # Concatenated text content.
148
+ "metadata": {
149
+ "width": 800,
150
+ "height": 600,
151
+ "confidence": 0.95,
152
+ "text_regions": 42
153
+ }
154
+ }
155
+
156
+ Raises:
157
+ ValidationError: If the supplied language is not supported.
158
+ RuntimeError: If PaddleOCR fails to initialize.
159
+ OCRError: If OCR processing fails.
160
+
161
+ """
162
+ if self._ocr is None:
163
+ self.initialize()
164
+
165
+ if self._ocr is None:
166
+ msg = "PaddleOCR failed to initialize"
167
+ raise RuntimeError(msg)
168
+
169
+ if language not in SUPPORTED_LANGUAGES:
170
+ msg = f"Language '{language}' not supported by PaddleOCR"
171
+ raise ValidationError(
172
+ msg,
173
+ context={"language": language, "supported_languages": sorted(SUPPORTED_LANGUAGES)},
174
+ )
175
+
176
+ try:
177
+ import io # noqa: PLC0415
178
+
179
+ import numpy as np # noqa: PLC0415 # type: ignore[import-not-found]
180
+ from PIL import Image # noqa: PLC0415
181
+
182
+ image = Image.open(io.BytesIO(image_bytes))
183
+ width, height = image.size
184
+
185
+ image_array = np.array(image)
186
+
187
+ result = self._ocr.predict(image_array)
188
+
189
+ content, confidence, text_regions = self._process_paddleocr_result(result)
190
+
191
+ return {
192
+ "content": content,
193
+ "metadata": {
194
+ "width": width,
195
+ "height": height,
196
+ "confidence": confidence,
197
+ "text_regions": text_regions,
198
+ },
199
+ }
200
+
201
+ except Exception as e:
202
+ msg = f"PaddleOCR processing failed: {e}"
203
+ raise OCRError(msg) from e
204
+
205
+ def process_file(self, path: str, _language: str) -> dict[str, Any]:
206
+ """Process image file using PaddleOCR.
207
+
208
+ Args:
209
+ path: Path to the image file.
210
+ _language: Language code (unused - PaddleOCR uses language from initialization).
211
+
212
+ Returns:
213
+ Dictionary in the same format as ``process_image()``.
214
+
215
+ Raises:
216
+ RuntimeError: If PaddleOCR fails to initialize.
217
+ OCRError: If OCR processing fails.
218
+
219
+ """
220
+ if self._ocr is None:
221
+ self.initialize()
222
+
223
+ if self._ocr is None:
224
+ msg = "PaddleOCR failed to initialize"
225
+ raise RuntimeError(msg)
226
+
227
+ try:
228
+ from PIL import Image # noqa: PLC0415
229
+
230
+ image = Image.open(path)
231
+ width, height = image.size
232
+
233
+ result = self._ocr.predict(path)
234
+
235
+ content, confidence, text_regions = self._process_paddleocr_result(result)
236
+
237
+ return {
238
+ "content": content,
239
+ "metadata": {
240
+ "width": width,
241
+ "height": height,
242
+ "confidence": confidence,
243
+ "text_regions": text_regions,
244
+ },
245
+ }
246
+
247
+ except Exception as e:
248
+ msg = f"PaddleOCR file processing failed: {e}"
249
+ raise OCRError(msg) from e
250
+
251
+ @staticmethod
252
+ def _process_paddleocr_result(result: list[Any] | None) -> tuple[str, float, int]:
253
+ if not result or result[0] is None:
254
+ return "", 0.0, 0
255
+
256
+ page_result = result[0]
257
+
258
+ text_parts = []
259
+ total_confidence = 0.0
260
+ text_count = 0
261
+
262
+ for line in page_result:
263
+ if isinstance(line, (list, tuple)) and len(line) >= 2:
264
+ text_info = line[1]
265
+ if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
266
+ text, confidence = text_info[0], text_info[1]
267
+ if text:
268
+ text_parts.append(str(text))
269
+ total_confidence += float(confidence)
270
+ text_count += 1
271
+
272
+ content = "\n".join(text_parts)
273
+ avg_confidence = total_confidence / text_count if text_count > 0 else 0.0
274
+
275
+ return content, avg_confidence, text_count
276
+
277
+ @staticmethod
278
+ def _is_cuda_available() -> bool:
279
+ try:
280
+ import paddle # noqa: PLC0415
281
+
282
+ return bool(paddle.device.is_compiled_with_cuda())
283
+ except (ImportError, AttributeError):
284
+ return False