kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kreuzberg might be problematic. Click here for more details.
- kreuzberg/__init__.py +931 -0
- kreuzberg/__main__.py +160 -0
- kreuzberg/_internal_bindings.abi3.so +0 -0
- kreuzberg/_setup_lib_path.py +143 -0
- kreuzberg/exceptions.py +254 -0
- kreuzberg/ocr/__init__.py +25 -0
- kreuzberg/ocr/easyocr.py +371 -0
- kreuzberg/ocr/paddleocr.py +284 -0
- kreuzberg/ocr/protocol.py +150 -0
- kreuzberg/postprocessors/__init__.py +61 -0
- kreuzberg/postprocessors/protocol.py +83 -0
- kreuzberg/py.typed +0 -0
- kreuzberg/types.py +509 -0
- kreuzberg-4.0.6.dist-info/METADATA +470 -0
- kreuzberg-4.0.6.dist-info/RECORD +17 -0
- kreuzberg-4.0.6.dist-info/WHEEL +4 -0
- kreuzberg-4.0.6.dist-info/entry_points.txt +2 -0
kreuzberg/__init__.py
ADDED
|
@@ -0,0 +1,931 @@
|
|
|
1
|
+
"""Kreuzberg - Multi-language document intelligence framework.
|
|
2
|
+
|
|
3
|
+
This is a thin Python wrapper around a high-performance Rust core.
|
|
4
|
+
All extraction logic, chunking, quality processing, and language detection
|
|
5
|
+
are implemented in Rust for maximum performance.
|
|
6
|
+
|
|
7
|
+
Python-specific features:
|
|
8
|
+
- OCR backends: EasyOCR, PaddleOCR (Python-based OCR engines)
|
|
9
|
+
- Custom PostProcessors: Register your own Python processing logic
|
|
10
|
+
|
|
11
|
+
Architecture:
|
|
12
|
+
- Rust handles: Extraction, parsing, chunking, quality, language detection, NLP (keyword extraction), API server, MCP server, CLI
|
|
13
|
+
- Python adds: OCR backends (EasyOCR, PaddleOCR), custom postprocessors
|
|
14
|
+
|
|
15
|
+
Creating Custom PostProcessors:
|
|
16
|
+
>>> from kreuzberg import PostProcessorProtocol, register_post_processor, ExtractionResult
|
|
17
|
+
>>>
|
|
18
|
+
>>> class MyProcessor:
|
|
19
|
+
... def name(self) -> str:
|
|
20
|
+
... return "my_processor"
|
|
21
|
+
...
|
|
22
|
+
... def process(self, result: ExtractionResult) -> ExtractionResult:
|
|
23
|
+
... result.metadata["custom_field"] = "custom_value"
|
|
24
|
+
... return result
|
|
25
|
+
...
|
|
26
|
+
... def processing_stage(self) -> str:
|
|
27
|
+
... return "middle"
|
|
28
|
+
>>>
|
|
29
|
+
>>> register_post_processor(MyProcessor())
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import hashlib
|
|
35
|
+
import json
|
|
36
|
+
import threading
|
|
37
|
+
from importlib.metadata import version
|
|
38
|
+
from typing import TYPE_CHECKING, Any
|
|
39
|
+
|
|
40
|
+
# ~keep: This must be imported FIRST before any Rust bindings
|
|
41
|
+
# ~keep: It sets up dynamic library paths for bundled native libraries (pdfium, etc.)
|
|
42
|
+
from kreuzberg import _setup_lib_path # noqa: F401
|
|
43
|
+
from kreuzberg._internal_bindings import (
|
|
44
|
+
ChunkingConfig,
|
|
45
|
+
EmbeddingConfig,
|
|
46
|
+
EmbeddingModelType,
|
|
47
|
+
EmbeddingPreset,
|
|
48
|
+
ExtractedTable,
|
|
49
|
+
ExtractionConfig,
|
|
50
|
+
ExtractionResult,
|
|
51
|
+
HierarchyConfig,
|
|
52
|
+
ImageExtractionConfig,
|
|
53
|
+
ImagePreprocessingConfig,
|
|
54
|
+
KeywordAlgorithm,
|
|
55
|
+
KeywordConfig,
|
|
56
|
+
LanguageDetectionConfig,
|
|
57
|
+
OcrConfig,
|
|
58
|
+
PageConfig,
|
|
59
|
+
PdfConfig,
|
|
60
|
+
PostProcessorConfig,
|
|
61
|
+
RakeParams,
|
|
62
|
+
TesseractConfig,
|
|
63
|
+
TokenReductionConfig,
|
|
64
|
+
YakeParams,
|
|
65
|
+
_discover_extraction_config_impl,
|
|
66
|
+
_load_extraction_config_from_file_impl,
|
|
67
|
+
clear_document_extractors,
|
|
68
|
+
clear_ocr_backends,
|
|
69
|
+
clear_post_processors,
|
|
70
|
+
clear_validators,
|
|
71
|
+
config_get_field,
|
|
72
|
+
config_merge,
|
|
73
|
+
config_to_json,
|
|
74
|
+
detect_mime_type_from_bytes,
|
|
75
|
+
get_embedding_preset,
|
|
76
|
+
get_extensions_for_mime,
|
|
77
|
+
get_last_panic_context,
|
|
78
|
+
get_valid_binarization_methods,
|
|
79
|
+
get_valid_language_codes,
|
|
80
|
+
get_valid_ocr_backends,
|
|
81
|
+
get_valid_token_reduction_levels,
|
|
82
|
+
list_document_extractors,
|
|
83
|
+
list_embedding_presets,
|
|
84
|
+
list_ocr_backends,
|
|
85
|
+
list_post_processors,
|
|
86
|
+
list_validators,
|
|
87
|
+
unregister_document_extractor,
|
|
88
|
+
unregister_ocr_backend,
|
|
89
|
+
unregister_post_processor,
|
|
90
|
+
unregister_validator,
|
|
91
|
+
validate_binarization_method,
|
|
92
|
+
validate_chunking_params,
|
|
93
|
+
validate_confidence,
|
|
94
|
+
validate_dpi,
|
|
95
|
+
validate_language_code,
|
|
96
|
+
validate_mime_type,
|
|
97
|
+
validate_ocr_backend,
|
|
98
|
+
validate_output_format,
|
|
99
|
+
validate_tesseract_oem,
|
|
100
|
+
validate_tesseract_psm,
|
|
101
|
+
validate_token_reduction_level,
|
|
102
|
+
)
|
|
103
|
+
from kreuzberg._internal_bindings import (
|
|
104
|
+
batch_extract_bytes as batch_extract_bytes_impl,
|
|
105
|
+
)
|
|
106
|
+
from kreuzberg._internal_bindings import (
|
|
107
|
+
batch_extract_bytes_sync as batch_extract_bytes_sync_impl,
|
|
108
|
+
)
|
|
109
|
+
from kreuzberg._internal_bindings import (
|
|
110
|
+
batch_extract_files as batch_extract_files_impl,
|
|
111
|
+
)
|
|
112
|
+
from kreuzberg._internal_bindings import (
|
|
113
|
+
batch_extract_files_sync as batch_extract_files_sync_impl,
|
|
114
|
+
)
|
|
115
|
+
from kreuzberg._internal_bindings import (
|
|
116
|
+
classify_error as _classify_error_impl,
|
|
117
|
+
)
|
|
118
|
+
from kreuzberg._internal_bindings import (
|
|
119
|
+
detect_mime_type_from_path as _detect_mime_type_from_path_impl,
|
|
120
|
+
)
|
|
121
|
+
from kreuzberg._internal_bindings import (
|
|
122
|
+
error_code_name as _error_code_name_impl,
|
|
123
|
+
)
|
|
124
|
+
from kreuzberg._internal_bindings import (
|
|
125
|
+
extract_bytes as extract_bytes_impl,
|
|
126
|
+
)
|
|
127
|
+
from kreuzberg._internal_bindings import (
|
|
128
|
+
extract_bytes_sync as extract_bytes_sync_impl,
|
|
129
|
+
)
|
|
130
|
+
from kreuzberg._internal_bindings import (
|
|
131
|
+
extract_file as extract_file_impl,
|
|
132
|
+
)
|
|
133
|
+
from kreuzberg._internal_bindings import (
|
|
134
|
+
extract_file_sync as extract_file_sync_impl,
|
|
135
|
+
)
|
|
136
|
+
from kreuzberg._internal_bindings import (
|
|
137
|
+
get_error_details as _get_error_details_impl,
|
|
138
|
+
)
|
|
139
|
+
from kreuzberg._internal_bindings import (
|
|
140
|
+
get_last_error_code as _get_last_error_code_impl,
|
|
141
|
+
)
|
|
142
|
+
from kreuzberg._internal_bindings import (
|
|
143
|
+
register_ocr_backend as _register_ocr_backend_impl,
|
|
144
|
+
)
|
|
145
|
+
from kreuzberg._internal_bindings import (
|
|
146
|
+
register_post_processor as _register_post_processor_impl,
|
|
147
|
+
)
|
|
148
|
+
from kreuzberg._internal_bindings import (
|
|
149
|
+
register_validator as _register_validator_impl,
|
|
150
|
+
)
|
|
151
|
+
from kreuzberg.exceptions import (
|
|
152
|
+
CacheError,
|
|
153
|
+
ErrorCode,
|
|
154
|
+
ImageProcessingError,
|
|
155
|
+
KreuzbergError,
|
|
156
|
+
MissingDependencyError,
|
|
157
|
+
OCRError,
|
|
158
|
+
PanicContext,
|
|
159
|
+
ParsingError,
|
|
160
|
+
PluginError,
|
|
161
|
+
ValidationError,
|
|
162
|
+
)
|
|
163
|
+
from kreuzberg.postprocessors.protocol import PostProcessorProtocol
|
|
164
|
+
from kreuzberg.types import Chunk, ChunkMetadata, ExtractedImage, Metadata
|
|
165
|
+
|
|
166
|
+
if TYPE_CHECKING:
|
|
167
|
+
from pathlib import Path
|
|
168
|
+
|
|
169
|
+
from kreuzberg.ocr.easyocr import EasyOCRBackend # noqa: F401
|
|
170
|
+
from kreuzberg.ocr.paddleocr import PaddleOCRBackend # noqa: F401
|
|
171
|
+
|
|
172
|
+
__version__ = version("kreuzberg")
|
|
173
|
+
|
|
174
|
+
__all__ = [
|
|
175
|
+
"CacheError",
|
|
176
|
+
"Chunk",
|
|
177
|
+
"ChunkMetadata",
|
|
178
|
+
"ChunkingConfig",
|
|
179
|
+
"EmbeddingConfig",
|
|
180
|
+
"EmbeddingModelType",
|
|
181
|
+
"EmbeddingPreset",
|
|
182
|
+
"ErrorCode",
|
|
183
|
+
"ExtractedImage",
|
|
184
|
+
"ExtractedTable",
|
|
185
|
+
"ExtractionConfig",
|
|
186
|
+
"ExtractionResult",
|
|
187
|
+
"HierarchyConfig",
|
|
188
|
+
"ImageExtractionConfig",
|
|
189
|
+
"ImagePreprocessingConfig",
|
|
190
|
+
"ImageProcessingError",
|
|
191
|
+
"KeywordAlgorithm",
|
|
192
|
+
"KeywordConfig",
|
|
193
|
+
"KreuzbergError",
|
|
194
|
+
"LanguageDetectionConfig",
|
|
195
|
+
"Metadata",
|
|
196
|
+
"MissingDependencyError",
|
|
197
|
+
"OCRError",
|
|
198
|
+
"OcrConfig",
|
|
199
|
+
"PageConfig",
|
|
200
|
+
"PanicContext",
|
|
201
|
+
"ParsingError",
|
|
202
|
+
"PdfConfig",
|
|
203
|
+
"PluginError",
|
|
204
|
+
"PostProcessorConfig",
|
|
205
|
+
"PostProcessorProtocol",
|
|
206
|
+
"RakeParams",
|
|
207
|
+
"TesseractConfig",
|
|
208
|
+
"TokenReductionConfig",
|
|
209
|
+
"ValidationError",
|
|
210
|
+
"YakeParams",
|
|
211
|
+
"__version__",
|
|
212
|
+
"batch_extract_bytes",
|
|
213
|
+
"batch_extract_bytes_sync",
|
|
214
|
+
"batch_extract_files",
|
|
215
|
+
"batch_extract_files_sync",
|
|
216
|
+
"classify_error",
|
|
217
|
+
"clear_document_extractors",
|
|
218
|
+
"clear_ocr_backends",
|
|
219
|
+
"clear_post_processors",
|
|
220
|
+
"clear_validators",
|
|
221
|
+
"config_get_field",
|
|
222
|
+
"config_merge",
|
|
223
|
+
"config_to_json",
|
|
224
|
+
"detect_mime_type",
|
|
225
|
+
"detect_mime_type_from_path",
|
|
226
|
+
"discover_extraction_config",
|
|
227
|
+
"error_code_name",
|
|
228
|
+
"extract_bytes",
|
|
229
|
+
"extract_bytes_sync",
|
|
230
|
+
"extract_file",
|
|
231
|
+
"extract_file_sync",
|
|
232
|
+
"get_embedding_preset",
|
|
233
|
+
"get_error_details",
|
|
234
|
+
"get_extensions_for_mime",
|
|
235
|
+
"get_last_error_code",
|
|
236
|
+
"get_last_panic_context",
|
|
237
|
+
"get_valid_binarization_methods",
|
|
238
|
+
"get_valid_language_codes",
|
|
239
|
+
"get_valid_ocr_backends",
|
|
240
|
+
"get_valid_token_reduction_levels",
|
|
241
|
+
"list_document_extractors",
|
|
242
|
+
"list_embedding_presets",
|
|
243
|
+
"list_ocr_backends",
|
|
244
|
+
"list_post_processors",
|
|
245
|
+
"list_validators",
|
|
246
|
+
"load_extraction_config_from_file",
|
|
247
|
+
"register_ocr_backend",
|
|
248
|
+
"register_post_processor",
|
|
249
|
+
"register_validator",
|
|
250
|
+
"unregister_document_extractor",
|
|
251
|
+
"unregister_ocr_backend",
|
|
252
|
+
"unregister_post_processor",
|
|
253
|
+
"unregister_validator",
|
|
254
|
+
"validate_binarization_method",
|
|
255
|
+
"validate_chunking_params",
|
|
256
|
+
"validate_confidence",
|
|
257
|
+
"validate_dpi",
|
|
258
|
+
"validate_language_code",
|
|
259
|
+
"validate_mime_type",
|
|
260
|
+
"validate_ocr_backend",
|
|
261
|
+
"validate_output_format",
|
|
262
|
+
"validate_tesseract_oem",
|
|
263
|
+
"validate_tesseract_psm",
|
|
264
|
+
"validate_token_reduction_level",
|
|
265
|
+
]
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
_REGISTERED_OCR_BACKENDS: dict[tuple[str, str], Any] = {}
|
|
269
|
+
|
|
270
|
+
_OCR_CACHE_LOCK = threading.Lock()
|
|
271
|
+
|
|
272
|
+
_MAX_CACHE_SIZE = 10
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _hash_kwargs(kwargs: dict[str, Any]) -> str:
|
|
276
|
+
try:
|
|
277
|
+
serialized = json.dumps(kwargs, sort_keys=True, default=str)
|
|
278
|
+
return hashlib.md5(serialized.encode()).hexdigest() # noqa: S324
|
|
279
|
+
except (TypeError, ValueError):
|
|
280
|
+
return hashlib.md5(repr(kwargs).encode()).hexdigest() # noqa: S324
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _ensure_ocr_backend_registered(
|
|
284
|
+
config: ExtractionConfig,
|
|
285
|
+
easyocr_kwargs: dict[str, Any] | None,
|
|
286
|
+
paddleocr_kwargs: dict[str, Any] | None,
|
|
287
|
+
) -> None:
|
|
288
|
+
if config.ocr is None:
|
|
289
|
+
return
|
|
290
|
+
|
|
291
|
+
backend_name = config.ocr.backend
|
|
292
|
+
|
|
293
|
+
if backend_name == "tesseract":
|
|
294
|
+
return
|
|
295
|
+
|
|
296
|
+
kwargs_map = {
|
|
297
|
+
"easyocr": easyocr_kwargs or {},
|
|
298
|
+
"paddleocr": paddleocr_kwargs or {},
|
|
299
|
+
}
|
|
300
|
+
kwargs = kwargs_map.get(backend_name, {})
|
|
301
|
+
|
|
302
|
+
with _OCR_CACHE_LOCK:
|
|
303
|
+
cache_key = (backend_name, _hash_kwargs(kwargs))
|
|
304
|
+
|
|
305
|
+
if cache_key in _REGISTERED_OCR_BACKENDS:
|
|
306
|
+
return
|
|
307
|
+
|
|
308
|
+
if len(_REGISTERED_OCR_BACKENDS) >= _MAX_CACHE_SIZE:
|
|
309
|
+
oldest_key = next(iter(_REGISTERED_OCR_BACKENDS))
|
|
310
|
+
del _REGISTERED_OCR_BACKENDS[oldest_key]
|
|
311
|
+
|
|
312
|
+
backend: Any
|
|
313
|
+
if backend_name == "easyocr":
|
|
314
|
+
try:
|
|
315
|
+
from kreuzberg.ocr.easyocr import EasyOCRBackend # noqa: PLC0415
|
|
316
|
+
|
|
317
|
+
if "languages" not in kwargs:
|
|
318
|
+
kwargs["languages"] = [config.ocr.language]
|
|
319
|
+
|
|
320
|
+
backend = EasyOCRBackend(**kwargs)
|
|
321
|
+
except ImportError as e:
|
|
322
|
+
raise MissingDependencyError.create_for_package(
|
|
323
|
+
dependency_group="easyocr",
|
|
324
|
+
functionality="EasyOCR backend",
|
|
325
|
+
package_name="easyocr",
|
|
326
|
+
) from e
|
|
327
|
+
elif backend_name == "paddleocr":
|
|
328
|
+
try:
|
|
329
|
+
from kreuzberg.ocr.paddleocr import PaddleOCRBackend # noqa: PLC0415
|
|
330
|
+
|
|
331
|
+
if "lang" not in kwargs:
|
|
332
|
+
kwargs["lang"] = config.ocr.language
|
|
333
|
+
|
|
334
|
+
backend = PaddleOCRBackend(**kwargs)
|
|
335
|
+
except ImportError as e:
|
|
336
|
+
raise MissingDependencyError.create_for_package(
|
|
337
|
+
dependency_group="paddleocr",
|
|
338
|
+
functionality="PaddleOCR backend",
|
|
339
|
+
package_name="paddleocr",
|
|
340
|
+
) from e
|
|
341
|
+
else:
|
|
342
|
+
return
|
|
343
|
+
|
|
344
|
+
register_ocr_backend(backend)
|
|
345
|
+
_REGISTERED_OCR_BACKENDS[cache_key] = backend
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def extract_file_sync(
    file_path: str | Path,
    mime_type: str | None = None,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Run a blocking extraction on a single file.

    A default ``ExtractionConfig`` is created when none is given, the OCR
    backend named in the configuration is registered if needed, and the call
    is then delegated to the native implementation with the path as a string.

    Args:
        file_path: Location of the file (str or pathlib.Path)
        mime_type: Optional MIME type hint (auto-detected if None)
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options (languages, use_gpu, beam_width, etc.)
        paddleocr_kwargs: PaddleOCR initialization options (lang, use_angle_cls, show_log, etc.)

    Returns:
        ExtractionResult with content, metadata, and tables

    Example:
        >>> from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
        >>> result = extract_file_sync("document.pdf")
        >>>
        >>> # Selecting EasyOCR and passing backend options
        >>> config = ExtractionConfig(ocr=OcrConfig(backend="easyocr", language="eng"))
        >>> result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True})
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_text = str(file_path)
    return extract_file_sync_impl(path_text, mime_type, effective_config)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def extract_bytes_sync(
    data: bytes | bytearray,
    mime_type: str,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Run a blocking extraction on in-memory content.

    The input is converted with ``bytes(...)`` before being handed to the
    native implementation, so a ``bytearray`` is accepted as well.

    Args:
        data: File content as bytes or bytearray
        mime_type: MIME type of the data (required for format detection)
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        ExtractionResult with content, metadata, and tables
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payload = bytes(data)
    return extract_bytes_sync_impl(payload, mime_type, effective_config)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def batch_extract_files_sync(
    paths: list[str | Path],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Run a blocking extraction over several files in one call.

    Every path is converted to ``str`` before the batch is passed to the
    native implementation.

    Args:
        paths: List of file paths
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        List of ExtractionResults (one per file)
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_strings = list(map(str, paths))
    return batch_extract_files_sync_impl(path_strings, effective_config)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def batch_extract_bytes_sync(
    data_list: list[bytes | bytearray],
    mime_types: list[str],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Run a blocking extraction over several in-memory payloads in one call.

    Each item is converted with ``bytes(...)`` before the batch is passed to
    the native implementation together with ``mime_types``.

    Args:
        data_list: List of file contents as bytes/bytearray
        mime_types: List of MIME types (one per data item)
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        List of ExtractionResults (one per data item)
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payloads = list(map(bytes, data_list))
    return batch_extract_bytes_sync_impl(payloads, mime_types, effective_config)
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
async def extract_file(
    file_path: str | Path,
    mime_type: str | None = None,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Extract a single file, awaiting the native implementation.

    OCR backend registration happens synchronously before the awaited call.

    Args:
        file_path: Location of the file (str or pathlib.Path)
        mime_type: Optional MIME type hint (auto-detected if None)
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        ExtractionResult with content, metadata, and tables
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_text = str(file_path)
    outcome = await extract_file_impl(path_text, mime_type, effective_config)
    return outcome
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
async def extract_bytes(
    data: bytes | bytearray,
    mime_type: str,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Extract in-memory content, awaiting the native implementation.

    The input is converted with ``bytes(...)`` first, so a ``bytearray`` is
    accepted as well.

    Args:
        data: File content as bytes or bytearray
        mime_type: MIME type of the data (required for format detection)
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        ExtractionResult with content, metadata, and tables
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payload = bytes(data)
    outcome = await extract_bytes_impl(payload, mime_type, effective_config)
    return outcome
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
async def batch_extract_files(
    paths: list[str | Path],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Extract several files in one call, awaiting the native implementation.

    Every path is converted to ``str`` before the batch is submitted.

    Args:
        paths: List of file paths
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        List of ExtractionResults (one per file)
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_strings = list(map(str, paths))
    outcomes = await batch_extract_files_impl(path_strings, effective_config)
    return outcomes
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
async def batch_extract_bytes(
    data_list: list[bytes | bytearray],
    mime_types: list[str],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Extract several in-memory payloads in one call, awaiting the native implementation.

    Each item is converted with ``bytes(...)`` before the batch is submitted
    together with ``mime_types``.

    Args:
        data_list: List of file contents as bytes/bytearray
        mime_types: List of MIME types (one per data item)
        config: Extraction configuration (uses defaults if None)
        easyocr_kwargs: EasyOCR initialization options
        paddleocr_kwargs: PaddleOCR initialization options

    Returns:
        List of ExtractionResults (one per data item)
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payloads = list(map(bytes, data_list))
    outcomes = await batch_extract_bytes_impl(payloads, mime_types, effective_config)
    return outcomes
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def detect_mime_type(data: bytes | bytearray) -> str:
    r"""Identify the MIME type of in-memory file content.

    The input is converted with ``bytes(...)`` and passed to the native
    byte-based detector.

    Args:
        data: File content as bytes or bytearray

    Returns:
        Detected MIME type (e.g., "application/pdf", "image/png")

    Example:
        >>> from kreuzberg import detect_mime_type
        >>> pdf_bytes = b"%PDF-1.4\n"
        >>> mime_type = detect_mime_type(pdf_bytes)
        >>> assert "pdf" in mime_type.lower()
    """
    raw = bytes(data)
    return detect_mime_type_from_bytes(raw)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def detect_mime_type_from_path(path: str | Path) -> str:
    """Identify the MIME type of the file at ``path``.

    The path is converted to ``str`` and passed to the native path-based
    detector, which reads the file and applies magic number detection.

    Args:
        path: Location of the file (str or pathlib.Path)

    Returns:
        Detected MIME type (e.g., "application/pdf", "text/plain")

    Raises:
        OSError: If file cannot be read (file not found, permission denied, etc.)
        RuntimeError: If MIME type detection fails

    Example:
        >>> from kreuzberg import detect_mime_type_from_path
        >>> mime_type = detect_mime_type_from_path("document.pdf")
        >>> assert "pdf" in mime_type.lower()
    """
    path_text = str(path)
    return _detect_mime_type_from_path_impl(path_text)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def discover_extraction_config() -> ExtractionConfig | None:
    """Look for a Kreuzberg configuration file and load it if one exists.

    Lookup strategy:
        1. If the KREUZBERG_CONFIG_PATH environment variable is set, load from that path
        2. Otherwise, search the current directory and then each parent directory
           for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json
        3. Return None if no configuration file is found

    Priority during auto-discovery (when the env var is not set):
        - kreuzberg.toml (highest priority)
        - kreuzberg.yaml
        - kreuzberg.json (lowest priority)

    Returns:
        ExtractionConfig if a configuration file is found and valid, None otherwise

    Raises:
        RuntimeError: If the discovered config file is invalid or cannot be parsed
        IOError: If there's an error reading the config file

    Example:
        >>> from kreuzberg import discover_extraction_config
        >>> config = discover_extraction_config()
        >>> if config:
        ...     print(f"Loaded config with use_cache={config.use_cache}")
        ... else:
        ...     print("No config found, using defaults")
    """
    discovered = _discover_extraction_config_impl()
    return discovered
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def load_extraction_config_from_file(path: str | Path) -> ExtractionConfig:
    """Read an ExtractionConfig from an explicitly named file.

    The path is converted to ``str`` and passed to the native loader. The
    file format is determined by the file extension (.toml, .yaml, or .json).

    Args:
        path: Location of the configuration file (str or pathlib.Path).
            Supports absolute and relative paths.

    Returns:
        ExtractionConfig parsed from the file

    Raises:
        FileNotFoundError: If the configuration file does not exist
        RuntimeError: If the file cannot be read or parsed
        ValueError: If the file format is invalid or unsupported

    Example:
        >>> from kreuzberg import extract_file_sync, load_extraction_config_from_file
        >>> config = load_extraction_config_from_file("kreuzberg.toml")
        >>> result = extract_file_sync("document.pdf", config=config)
    """
    path_text = str(path)
    return _load_extraction_config_from_file_impl(path_text)
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def register_ocr_backend(backend: Any) -> None:
    """Hand a Python OCR backend object to the Rust core for registration.

    The object is passed unchanged to the native registration function, which
    validates it, wraps it in a Rust OcrBackend implementation, and adds it to
    the global OCR backend registry. Once registered, the backend can be used
    by the Rust CLI, API, and MCP server.

    Args:
        backend: Python object implementing the OCR backend protocol

    Required methods on the backend object:
        - name() -> str: Return backend name (must be non-empty)
        - supported_languages() -> list[str]: Return list of supported language codes
        - process_image(image_bytes: bytes, language: str) -> dict: Process image and return result dict

    Optional methods:
        - process_file(path: str, language: str) -> dict: Custom file processing
        - initialize(): Called when backend is registered
        - shutdown(): Called when backend is unregistered
        - version() -> str: Backend version (defaults to "1.0.0")

    Raises:
        TypeError: If backend is missing required methods (name, supported_languages, process_image)
        ValueError: If backend name is empty or already registered
        RuntimeError: If registration with the Rust registry fails

    Example:
        >>> from kreuzberg import register_ocr_backend
        >>> class MyOcrBackend:
        ...     def name(self) -> str:
        ...         return "my-ocr"
        ...
        ...     def supported_languages(self) -> list[str]:
        ...         return ["eng", "deu", "fra"]
        ...
        ...     def process_image(self, image_bytes: bytes, language: str) -> dict:
        ...         return {"content": "extracted text", "metadata": {"confidence": 0.95}, "tables": []}
        >>> register_ocr_backend(MyOcrBackend())
    """
    outcome = _register_ocr_backend_impl(backend)
    return outcome
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
def register_post_processor(processor: Any) -> None:
    """Hand a Python post-processor object to the Rust core for registration.

    The object is passed unchanged to the native registration function, which
    validates it, wraps it in a Rust PostProcessor implementation, and adds it
    to the global PostProcessor registry. Once registered, the processor will
    be called automatically after extraction to enrich results.

    Args:
        processor: Python object implementing the PostProcessor protocol

    Required methods on the processor object:
        - name() -> str: Return processor name (must be non-empty)
        - process(result: dict) -> dict: Process and enrich the extraction result
        - processing_stage() -> str: Return "early", "middle", or "late" (REQUIRED, not optional)

    Optional methods:
        - initialize(): Called when processor is registered
        - shutdown(): Called when processor is unregistered
        - version() -> str: Processor version (defaults to "1.0.0")

    Raises:
        TypeError: If processor is missing required methods (name, process, processing_stage)
        ValueError: If processor name is empty or already registered
        RuntimeError: If registration with the Rust registry fails

    Example:
        >>> from kreuzberg import register_post_processor
        >>> class EntityExtractor:
        ...     def name(self) -> str:
        ...         return "entity_extraction"
        ...
        ...     def processing_stage(self) -> str:
        ...         return "early"
        ...
        ...     def process(self, result: dict) -> dict:
        ...         entities = {"PERSON": ["John Doe"], "ORG": ["Microsoft"]}
        ...         result["metadata"]["entities"] = entities
        ...         return result
        >>> register_post_processor(EntityExtractor())
    """
    outcome = _register_post_processor_impl(processor)
    return outcome
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
def register_validator(validator: Any) -> None:
    """Add a Python-defined Validator to the Rust core's Validator registry.

    The supplied object is checked, wrapped in a Rust Validator
    implementation, and stored in the global Validator registry. After
    registration it is invoked automatically once extraction finishes so it
    can validate the result.

    Args:
        validator: Python object that implements the Validator protocol.

            Methods the object must provide:
                - name() -> str: validator name; must be non-empty.
                - validate(result: dict) -> None: check the extraction
                  result, raising an error to signal failure.

            Methods the object may provide:
                - should_validate(result: dict) -> bool: whether the
                  validator should run (defaults to True).
                - priority() -> int: ordering priority (defaults to 50;
                  higher runs first).
                - initialize(): called when the validator is registered.
                - shutdown(): called when the validator is unregistered.
                - version() -> str: validator version (defaults to "1.0.0").

    Raises:
        TypeError: The validator lacks a required method (name, validate).
        ValueError: The validator name is empty or already registered.
        RuntimeError: Registration with the Rust registry failed.

    Example:
        >>> from kreuzberg import register_validator
        >>> from kreuzberg.exceptions import ValidationError
        >>> class MinLengthValidator:
        ...     def name(self) -> str:
        ...         return "min_length_validator"
        ...
        ...     def priority(self) -> int:
        ...         return 100
        ...
        ...     def validate(self, result: dict) -> None:
        ...         if len(result["content"]) < 100:
        ...             raise ValidationError("Content too short")
        >>> register_validator(MinLengthValidator())
    """
    # Pure delegation: all checking and registration happens in the binding.
    return _register_validator_impl(validator)
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def get_last_error_code() -> int | None:
    """Get the last error code from the FFI layer.

    Reports the error code left behind by the most recent operation, which
    helps when debugging why an operation failed.

    Error codes:
        - 0 (SUCCESS): No error occurred
        - 1 (GENERIC_ERROR): Generic unspecified error
        - 2 (PANIC): A panic occurred in the Rust core
        - 3 (INVALID_ARGUMENT): Invalid argument provided
        - 4 (IO_ERROR): I/O operation failed
        - 5 (PARSING_ERROR): Document parsing failed
        - 6 (OCR_ERROR): OCR operation failed
        - 7 (MISSING_DEPENDENCY): Required dependency not available

    Returns:
        int | None: The error code (0 if no error has occurred).
        NOTE(review): the signature also allows None; presumably that means
        no code is available -- confirm against the native binding.

    Example:
        >>> from kreuzberg import get_last_error_code, ErrorCode
        >>> code = get_last_error_code()
        >>> if code == ErrorCode.SUCCESS:
        ...     print("No errors")
        ... elif code == ErrorCode.OCR_ERROR:
        ...     print("OCR operation failed")
        ... elif code == 2:
        ...     print("A panic occurred")
    """
    # Pure delegation: the code is read by the native binding.
    return _get_last_error_code_impl()
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
def get_error_details() -> dict[str, Any]:
    """Fetch structured information about the most recent FFI-layer error.

    Reads the thread-local error storage kept by the FFI layer and returns
    the details of the latest error: its message, code, type, and, when
    available, where in the source it originated.

    Returns:
        dict: Error details under these keys:
            - "message" (str): human-readable error message
            - "error_code" (int): numeric error code (0-7)
            - "error_type" (str): error type name (e.g., "validation", "ocr")
            - "source_file" (str | None): source file path, if available
            - "source_function" (str | None): function name, if available
            - "source_line" (int): line number (0 if unknown)
            - "context_info" (str | None): additional context, if available
            - "is_panic" (bool): whether the error came from a panic

    Example:
        >>> from kreuzberg import get_error_details
        >>> details = get_error_details()
        >>> print(f"Error: {details['message']} (code={details['error_code']})")
        >>> if details["source_file"]:
        ...     print(f"  at {details['source_file']}:{details['source_line']}")
    """
    # Pure delegation: the dictionary is built by the native binding.
    return _get_error_details_impl()
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def classify_error(message: str) -> int:
    """Map an error message onto a Kreuzberg error code.

    Inspects the text of an error message and reports the Kreuzberg error
    code (0-7) that most likely applies. Handy for sorting messages that
    come from external libraries or system calls into the standard Kreuzberg
    error categories.

    NOTE(review): the numbering listed below is not the same as the table
    documented on get_last_error_code (where 0 is SUCCESS and 6 is
    OCR_ERROR). Confirm which mapping applies before comparing codes
    returned by the two functions.

    Args:
        message: The error message to classify.

    Returns:
        int: Error code (0-7) representing the classification:
            - 0 (Validation): Invalid parameters, constraints, format mismatches
            - 1 (Parsing): Parse errors, corrupt data, malformed content
            - 2 (OCR): OCR processing failures
            - 3 (MissingDependency): Missing libraries or system dependencies
            - 4 (Io): File I/O, permissions, disk errors
            - 5 (Plugin): Plugin loading or registry errors
            - 6 (UnsupportedFormat): Unsupported MIME types or formats
            - 7 (Internal): Unknown or internal errors

    Example:
        >>> from kreuzberg import classify_error
        >>> code = classify_error("Failed to open file: permission denied")
        >>> if code == 4:
        ...     print("This is an I/O error")
        >>> code = classify_error("OCR processing failed")
        >>> if code == 2:
        ...     print("This is an OCR error")
    """
    # Pure delegation: classification is performed by the native binding.
    return _classify_error_impl(message)
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
def error_code_name(code: int) -> str:
    """Translate a numeric error code into its human-readable name.

    Args:
        code: Numeric error code (0-7).

    Returns:
        str: The name for the code (e.g., "validation", "ocr"), or
        "unknown" when the code lies outside the valid range.

    Example:
        >>> from kreuzberg import error_code_name
        >>> name = error_code_name(0)
        >>> print(name)  # output: "validation"
        >>> name = error_code_name(2)
        >>> print(name)  # output: "ocr"
        >>> name = error_code_name(99)
        >>> print(name)  # output: "unknown"
    """
    # Pure delegation: the name lookup lives in the native binding.
    return _error_code_name_impl(code)
|