kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kreuzberg might be problematic. Click here for more details.

kreuzberg/__init__.py ADDED
@@ -0,0 +1,931 @@
1
+ """Kreuzberg - Multi-language document intelligence framework.
2
+
3
+ This is a thin Python wrapper around a high-performance Rust core.
4
+ All extraction logic, chunking, quality processing, and language detection
5
+ are implemented in Rust for maximum performance.
6
+
7
+ Python-specific features:
8
+ - OCR backends: EasyOCR, PaddleOCR (Python-based OCR engines)
9
+ - Custom PostProcessors: Register your own Python processing logic
10
+
11
+ Architecture:
12
+ - Rust handles: Extraction, parsing, chunking, quality, language detection, NLP (keyword extraction), API server, MCP server, CLI
13
+ - Python adds: OCR backends (EasyOCR, PaddleOCR), custom postprocessors
14
+
15
+ Creating Custom PostProcessors:
16
+ >>> from kreuzberg import PostProcessorProtocol, register_post_processor, ExtractionResult
17
+ >>>
18
+ >>> class MyProcessor:
19
+ ... def name(self) -> str:
20
+ ... return "my_processor"
21
+ ...
22
+ ... def process(self, result: ExtractionResult) -> ExtractionResult:
23
+ ... result.metadata["custom_field"] = "custom_value"
24
+ ... return result
25
+ ...
26
+ ... def processing_stage(self) -> str:
27
+ ... return "middle"
28
+ >>>
29
+ >>> register_post_processor(MyProcessor())
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import hashlib
35
+ import json
36
+ import threading
37
+ from importlib.metadata import version
38
+ from typing import TYPE_CHECKING, Any
39
+
40
+ # ~keep: This must be imported FIRST before any Rust bindings
41
+ # ~keep: It sets up dynamic library paths for bundled native libraries (pdfium, etc.)
42
+ from kreuzberg import _setup_lib_path # noqa: F401
43
+ from kreuzberg._internal_bindings import (
44
+ ChunkingConfig,
45
+ EmbeddingConfig,
46
+ EmbeddingModelType,
47
+ EmbeddingPreset,
48
+ ExtractedTable,
49
+ ExtractionConfig,
50
+ ExtractionResult,
51
+ HierarchyConfig,
52
+ ImageExtractionConfig,
53
+ ImagePreprocessingConfig,
54
+ KeywordAlgorithm,
55
+ KeywordConfig,
56
+ LanguageDetectionConfig,
57
+ OcrConfig,
58
+ PageConfig,
59
+ PdfConfig,
60
+ PostProcessorConfig,
61
+ RakeParams,
62
+ TesseractConfig,
63
+ TokenReductionConfig,
64
+ YakeParams,
65
+ _discover_extraction_config_impl,
66
+ _load_extraction_config_from_file_impl,
67
+ clear_document_extractors,
68
+ clear_ocr_backends,
69
+ clear_post_processors,
70
+ clear_validators,
71
+ config_get_field,
72
+ config_merge,
73
+ config_to_json,
74
+ detect_mime_type_from_bytes,
75
+ get_embedding_preset,
76
+ get_extensions_for_mime,
77
+ get_last_panic_context,
78
+ get_valid_binarization_methods,
79
+ get_valid_language_codes,
80
+ get_valid_ocr_backends,
81
+ get_valid_token_reduction_levels,
82
+ list_document_extractors,
83
+ list_embedding_presets,
84
+ list_ocr_backends,
85
+ list_post_processors,
86
+ list_validators,
87
+ unregister_document_extractor,
88
+ unregister_ocr_backend,
89
+ unregister_post_processor,
90
+ unregister_validator,
91
+ validate_binarization_method,
92
+ validate_chunking_params,
93
+ validate_confidence,
94
+ validate_dpi,
95
+ validate_language_code,
96
+ validate_mime_type,
97
+ validate_ocr_backend,
98
+ validate_output_format,
99
+ validate_tesseract_oem,
100
+ validate_tesseract_psm,
101
+ validate_token_reduction_level,
102
+ )
103
+ from kreuzberg._internal_bindings import (
104
+ batch_extract_bytes as batch_extract_bytes_impl,
105
+ )
106
+ from kreuzberg._internal_bindings import (
107
+ batch_extract_bytes_sync as batch_extract_bytes_sync_impl,
108
+ )
109
+ from kreuzberg._internal_bindings import (
110
+ batch_extract_files as batch_extract_files_impl,
111
+ )
112
+ from kreuzberg._internal_bindings import (
113
+ batch_extract_files_sync as batch_extract_files_sync_impl,
114
+ )
115
+ from kreuzberg._internal_bindings import (
116
+ classify_error as _classify_error_impl,
117
+ )
118
+ from kreuzberg._internal_bindings import (
119
+ detect_mime_type_from_path as _detect_mime_type_from_path_impl,
120
+ )
121
+ from kreuzberg._internal_bindings import (
122
+ error_code_name as _error_code_name_impl,
123
+ )
124
+ from kreuzberg._internal_bindings import (
125
+ extract_bytes as extract_bytes_impl,
126
+ )
127
+ from kreuzberg._internal_bindings import (
128
+ extract_bytes_sync as extract_bytes_sync_impl,
129
+ )
130
+ from kreuzberg._internal_bindings import (
131
+ extract_file as extract_file_impl,
132
+ )
133
+ from kreuzberg._internal_bindings import (
134
+ extract_file_sync as extract_file_sync_impl,
135
+ )
136
+ from kreuzberg._internal_bindings import (
137
+ get_error_details as _get_error_details_impl,
138
+ )
139
+ from kreuzberg._internal_bindings import (
140
+ get_last_error_code as _get_last_error_code_impl,
141
+ )
142
+ from kreuzberg._internal_bindings import (
143
+ register_ocr_backend as _register_ocr_backend_impl,
144
+ )
145
+ from kreuzberg._internal_bindings import (
146
+ register_post_processor as _register_post_processor_impl,
147
+ )
148
+ from kreuzberg._internal_bindings import (
149
+ register_validator as _register_validator_impl,
150
+ )
151
+ from kreuzberg.exceptions import (
152
+ CacheError,
153
+ ErrorCode,
154
+ ImageProcessingError,
155
+ KreuzbergError,
156
+ MissingDependencyError,
157
+ OCRError,
158
+ PanicContext,
159
+ ParsingError,
160
+ PluginError,
161
+ ValidationError,
162
+ )
163
+ from kreuzberg.postprocessors.protocol import PostProcessorProtocol
164
+ from kreuzberg.types import Chunk, ChunkMetadata, ExtractedImage, Metadata
165
+
166
+ if TYPE_CHECKING:
167
+ from pathlib import Path
168
+
169
+ from kreuzberg.ocr.easyocr import EasyOCRBackend # noqa: F401
170
+ from kreuzberg.ocr.paddleocr import PaddleOCRBackend # noqa: F401
171
+
172
+ __version__ = version("kreuzberg")
173
+
174
+ __all__ = [
175
+ "CacheError",
176
+ "Chunk",
177
+ "ChunkMetadata",
178
+ "ChunkingConfig",
179
+ "EmbeddingConfig",
180
+ "EmbeddingModelType",
181
+ "EmbeddingPreset",
182
+ "ErrorCode",
183
+ "ExtractedImage",
184
+ "ExtractedTable",
185
+ "ExtractionConfig",
186
+ "ExtractionResult",
187
+ "HierarchyConfig",
188
+ "ImageExtractionConfig",
189
+ "ImagePreprocessingConfig",
190
+ "ImageProcessingError",
191
+ "KeywordAlgorithm",
192
+ "KeywordConfig",
193
+ "KreuzbergError",
194
+ "LanguageDetectionConfig",
195
+ "Metadata",
196
+ "MissingDependencyError",
197
+ "OCRError",
198
+ "OcrConfig",
199
+ "PageConfig",
200
+ "PanicContext",
201
+ "ParsingError",
202
+ "PdfConfig",
203
+ "PluginError",
204
+ "PostProcessorConfig",
205
+ "PostProcessorProtocol",
206
+ "RakeParams",
207
+ "TesseractConfig",
208
+ "TokenReductionConfig",
209
+ "ValidationError",
210
+ "YakeParams",
211
+ "__version__",
212
+ "batch_extract_bytes",
213
+ "batch_extract_bytes_sync",
214
+ "batch_extract_files",
215
+ "batch_extract_files_sync",
216
+ "classify_error",
217
+ "clear_document_extractors",
218
+ "clear_ocr_backends",
219
+ "clear_post_processors",
220
+ "clear_validators",
221
+ "config_get_field",
222
+ "config_merge",
223
+ "config_to_json",
224
+ "detect_mime_type",
225
+ "detect_mime_type_from_path",
226
+ "discover_extraction_config",
227
+ "error_code_name",
228
+ "extract_bytes",
229
+ "extract_bytes_sync",
230
+ "extract_file",
231
+ "extract_file_sync",
232
+ "get_embedding_preset",
233
+ "get_error_details",
234
+ "get_extensions_for_mime",
235
+ "get_last_error_code",
236
+ "get_last_panic_context",
237
+ "get_valid_binarization_methods",
238
+ "get_valid_language_codes",
239
+ "get_valid_ocr_backends",
240
+ "get_valid_token_reduction_levels",
241
+ "list_document_extractors",
242
+ "list_embedding_presets",
243
+ "list_ocr_backends",
244
+ "list_post_processors",
245
+ "list_validators",
246
+ "load_extraction_config_from_file",
247
+ "register_ocr_backend",
248
+ "register_post_processor",
249
+ "register_validator",
250
+ "unregister_document_extractor",
251
+ "unregister_ocr_backend",
252
+ "unregister_post_processor",
253
+ "unregister_validator",
254
+ "validate_binarization_method",
255
+ "validate_chunking_params",
256
+ "validate_confidence",
257
+ "validate_dpi",
258
+ "validate_language_code",
259
+ "validate_mime_type",
260
+ "validate_ocr_backend",
261
+ "validate_output_format",
262
+ "validate_tesseract_oem",
263
+ "validate_tesseract_psm",
264
+ "validate_token_reduction_level",
265
+ ]
266
+
267
+
268
+ _REGISTERED_OCR_BACKENDS: dict[tuple[str, str], Any] = {}
269
+
270
+ _OCR_CACHE_LOCK = threading.Lock()
271
+
272
+ _MAX_CACHE_SIZE = 10
273
+
274
+
275
+ def _hash_kwargs(kwargs: dict[str, Any]) -> str:
276
+ try:
277
+ serialized = json.dumps(kwargs, sort_keys=True, default=str)
278
+ return hashlib.md5(serialized.encode()).hexdigest() # noqa: S324
279
+ except (TypeError, ValueError):
280
+ return hashlib.md5(repr(kwargs).encode()).hexdigest() # noqa: S324
281
+
282
+
283
def _ensure_ocr_backend_registered(
    config: ExtractionConfig,
    easyocr_kwargs: dict[str, Any] | None,
    paddleocr_kwargs: dict[str, Any] | None,
) -> None:
    """Register the Python OCR backend selected by ``config`` if it is not cached yet.

    Nothing happens when OCR is not configured, or when the configured backend
    is anything other than ``"easyocr"`` or ``"paddleocr"`` (this includes
    ``"tesseract"``). Otherwise the backend class is imported lazily,
    instantiated with the matching keyword arguments, passed to
    ``register_ocr_backend`` and remembered in ``_REGISTERED_OCR_BACKENDS``
    under ``(backend name, hash of the caller-supplied kwargs)``.

    Args:
        config: Extraction configuration; ``config.ocr.backend`` selects the
            backend and ``config.ocr.language`` supplies the default language.
        easyocr_kwargs: Constructor options for ``EasyOCRBackend``. When
            ``"languages"`` is absent it defaults to ``[config.ocr.language]``.
        paddleocr_kwargs: Constructor options for ``PaddleOCRBackend``. When
            ``"lang"`` is absent it defaults to ``config.ocr.language``.

    Raises:
        MissingDependencyError: If importing or constructing the selected
            backend raises ``ImportError``.
    """
    if config.ocr is None:
        return

    backend_name = config.ocr.backend
    if backend_name == "easyocr":
        caller_kwargs = easyocr_kwargs or {}
    elif backend_name == "paddleocr":
        caller_kwargs = paddleocr_kwargs or {}
    else:
        # "tesseract" and unrecognised names need no Python-side backend.
        # Returning here also keeps such calls from evicting a cache entry.
        return

    # Work on a copy: the defaults injected below must not leak into the
    # caller's dict, otherwise a reused dict hashes differently on the next
    # call, misses the cache and triggers a second registration.
    kwargs = dict(caller_kwargs)

    with _OCR_CACHE_LOCK:
        # The key is derived from the caller-supplied options only, before any
        # default language is added.
        # NOTE(review): a change of config.ocr.language alone therefore reuses
        # the cached backend - confirm this is intended.
        cache_key = (backend_name, _hash_kwargs(kwargs))

        if cache_key in _REGISTERED_OCR_BACKENDS:
            return

        backend: Any
        if backend_name == "easyocr":
            try:
                from kreuzberg.ocr.easyocr import EasyOCRBackend  # noqa: PLC0415

                if "languages" not in kwargs:
                    kwargs["languages"] = [config.ocr.language]

                backend = EasyOCRBackend(**kwargs)
            except ImportError as e:
                raise MissingDependencyError.create_for_package(
                    dependency_group="easyocr",
                    functionality="EasyOCR backend",
                    package_name="easyocr",
                ) from e
        else:
            try:
                from kreuzberg.ocr.paddleocr import PaddleOCRBackend  # noqa: PLC0415

                if "lang" not in kwargs:
                    kwargs["lang"] = config.ocr.language

                backend = PaddleOCRBackend(**kwargs)
            except ImportError as e:
                raise MissingDependencyError.create_for_package(
                    dependency_group="paddleocr",
                    functionality="PaddleOCR backend",
                    package_name="paddleocr",
                ) from e

        register_ocr_backend(backend)

        # Evict only after the new backend was built and registered, so a
        # failure above never costs an existing cache entry. Dicts preserve
        # insertion order, so the first key is the oldest one.
        # NOTE(review): eviction only drops the Python-side reference; the
        # evicted backend is not unregistered here - confirm that is intended.
        if len(_REGISTERED_OCR_BACKENDS) >= _MAX_CACHE_SIZE:
            oldest_key = next(iter(_REGISTERED_OCR_BACKENDS))
            del _REGISTERED_OCR_BACKENDS[oldest_key]

        _REGISTERED_OCR_BACKENDS[cache_key] = backend
346
+
347
+
348
def extract_file_sync(
    file_path: str | Path,
    mime_type: str | None = None,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Run a blocking extraction on a single file.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. The OCR
    backend named by the configuration is registered on demand, then the call
    is handed to the native binding with the path converted to ``str``.

    Args:
        file_path: Location of the file, as ``str`` or ``pathlib.Path``.
        mime_type: Optional MIME type hint; detected automatically when ``None``.
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor
            (languages, use_gpu, beam_width, etc.).
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor
            (lang, use_angle_cls, show_log, etc.).

    Returns:
        ExtractionResult holding content, metadata, and tables.

    Example:
        >>> from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig, TesseractConfig
        >>> # Defaults only
        >>> result = extract_file_sync("document.pdf")
        >>>
        >>> # Tesseract with custom settings
        >>> config = ExtractionConfig(
        ...     ocr=OcrConfig(
        ...         backend="tesseract",
        ...         language="eng",
        ...         tesseract_config=TesseractConfig(
        ...             psm=6,
        ...             enable_table_detection=True,
        ...             tessedit_char_whitelist="0123456789",
        ...         ),
        ...     )
        ... )
        >>> result = extract_file_sync("invoice.pdf", config=config)
        >>>
        >>> # EasyOCR with constructor options
        >>> config = ExtractionConfig(ocr=OcrConfig(backend="easyocr", language="eng"))
        >>> result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True, "beam_width": 10})
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_text = str(file_path)
    return extract_file_sync_impl(path_text, mime_type, effective_config)
397
+
398
+
399
def extract_bytes_sync(
    data: bytes | bytearray,
    mime_type: str,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Run a blocking extraction on in-memory file content.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. The OCR
    backend named by the configuration is registered on demand, and ``data``
    is passed to the native binding as ``bytes``.

    Args:
        data: File content as ``bytes`` or ``bytearray``.
        mime_type: MIME type of ``data`` (required for format detection).
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        ExtractionResult holding content, metadata, and tables.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payload = bytes(data)
    return extract_bytes_sync_impl(payload, mime_type, effective_config)
425
+
426
+
427
def batch_extract_files_sync(
    paths: list[str | Path],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Run a blocking extraction over several files, processed in parallel.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. The OCR
    backend named by the configuration is registered on demand, and every path
    is converted to ``str`` before the native batch call.

    Args:
        paths: File paths to extract.
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        One ExtractionResult per input file.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_texts = list(map(str, paths))
    return batch_extract_files_sync_impl(path_texts, effective_config)
451
+
452
+
453
def batch_extract_bytes_sync(
    data_list: list[bytes | bytearray],
    mime_types: list[str],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Run a blocking extraction over several in-memory documents, processed in parallel.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. The OCR
    backend named by the configuration is registered on demand, and every item
    of ``data_list`` is passed to the native batch call as ``bytes``.

    Args:
        data_list: File contents as ``bytes``/``bytearray`` objects.
        mime_types: MIME types, one per entry of ``data_list``.
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        One ExtractionResult per data item.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payloads = list(map(bytes, data_list))
    return batch_extract_bytes_sync_impl(payloads, mime_types, effective_config)
479
+
480
+
481
async def extract_file(
    file_path: str | Path,
    mime_type: str | None = None,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Extract a single file and await the result.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. OCR
    backend registration runs synchronously before the native coroutine is
    awaited; the path is converted to ``str`` for the native call.

    Args:
        file_path: Location of the file, as ``str`` or ``pathlib.Path``.
        mime_type: Optional MIME type hint; detected automatically when ``None``.
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        ExtractionResult holding content, metadata, and tables.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_text = str(file_path)
    return await extract_file_impl(path_text, mime_type, effective_config)
507
+
508
+
509
async def extract_bytes(
    data: bytes | bytearray,
    mime_type: str,
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> ExtractionResult:
    """Extract in-memory file content and await the result.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. OCR
    backend registration runs synchronously before the native coroutine is
    awaited; ``data`` is passed to the native call as ``bytes``.

    Args:
        data: File content as ``bytes`` or ``bytearray``.
        mime_type: MIME type of ``data`` (required for format detection).
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        ExtractionResult holding content, metadata, and tables.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payload = bytes(data)
    return await extract_bytes_impl(payload, mime_type, effective_config)
535
+
536
+
537
async def batch_extract_files(
    paths: list[str | Path],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Extract several files in parallel and await the results.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. OCR
    backend registration runs synchronously before the native coroutine is
    awaited; every path is converted to ``str`` for the native batch call.

    Args:
        paths: File paths to extract.
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        One ExtractionResult per input file.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    path_texts = list(map(str, paths))
    return await batch_extract_files_impl(path_texts, effective_config)
561
+
562
+
563
async def batch_extract_bytes(
    data_list: list[bytes | bytearray],
    mime_types: list[str],
    config: ExtractionConfig | None = None,
    *,
    easyocr_kwargs: dict[str, Any] | None = None,
    paddleocr_kwargs: dict[str, Any] | None = None,
) -> list[ExtractionResult]:
    """Extract several in-memory documents in parallel and await the results.

    A default ``ExtractionConfig`` is used when ``config`` is omitted. OCR
    backend registration runs synchronously before the native coroutine is
    awaited; every item of ``data_list`` is passed on as ``bytes``.

    Args:
        data_list: File contents as ``bytes``/``bytearray`` objects.
        mime_types: MIME types, one per entry of ``data_list``.
        config: Extraction configuration; defaults apply when ``None``.
        easyocr_kwargs: Options forwarded to the EasyOCR backend constructor.
        paddleocr_kwargs: Options forwarded to the PaddleOCR backend constructor.

    Returns:
        One ExtractionResult per data item.
    """
    effective_config = ExtractionConfig() if config is None else config
    _ensure_ocr_backend_registered(effective_config, easyocr_kwargs, paddleocr_kwargs)
    payloads = list(map(bytes, data_list))
    return await batch_extract_bytes_impl(payloads, mime_types, effective_config)
589
+
590
+
591
def detect_mime_type(data: bytes | bytearray) -> str:
    r"""Identify the MIME type of in-memory file content.

    ``data`` is converted to ``bytes`` and handed to the native detector.

    Args:
        data: File content as ``bytes`` or ``bytearray``.

    Returns:
        The detected MIME type (e.g., "application/pdf", "image/png").

    Example:
        >>> from kreuzberg import detect_mime_type
        >>> pdf_bytes = b"%PDF-1.4\n"
        >>> mime_type = detect_mime_type(pdf_bytes)
        >>> assert "pdf" in mime_type.lower()
    """
    payload = bytes(data)
    return detect_mime_type_from_bytes(payload)
607
+
608
+
609
def detect_mime_type_from_path(path: str | Path) -> str:
    """Identify the MIME type of the file stored at ``path``.

    The path is converted to ``str`` and passed to the native detector, which
    reads the file and inspects its magic number.

    Args:
        path: Location of the file, as ``str`` or ``pathlib.Path``.

    Returns:
        The detected MIME type (e.g., "application/pdf", "text/plain").

    Raises:
        OSError: If the file cannot be read (file not found, permission denied, etc.).
        RuntimeError: If MIME type detection fails.

    Example:
        >>> from kreuzberg import detect_mime_type_from_path
        >>> mime_type = detect_mime_type_from_path("document.pdf")
        >>> assert "pdf" in mime_type.lower()
    """
    path_text = str(path)
    return _detect_mime_type_from_path_impl(path_text)
630
+
631
+
632
def discover_extraction_config() -> ExtractionConfig | None:
    """Look for a Kreuzberg configuration file in the environment.

    This is a direct call into the native discovery routine, which resolves
    the configuration as follows:

    1. When the KREUZBERG_CONFIG_PATH environment variable is set, that file
       is loaded.
    2. Otherwise the current directory and each of its parents are searched
       for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json.
    3. ``None`` is returned when no configuration file is found.

    Priority during auto-discovery (environment variable not set):
        - kreuzberg.toml (highest priority)
        - kreuzberg.yaml
        - kreuzberg.json (lowest priority)

    Returns:
        The parsed ExtractionConfig, or ``None`` when no file was found.

    Raises:
        RuntimeError: If the discovered config file is invalid or cannot be parsed.
        IOError: If the config file cannot be read.

    Example:
        >>> from kreuzberg import discover_extraction_config
        >>> config = discover_extraction_config()
        >>> if config:
        ...     print(f"Loaded config with use_cache={config.use_cache}")
        ... else:
        ...     print("No config found, using defaults")
    """
    discovered = _discover_extraction_config_impl()
    return discovered
662
+
663
+
664
def load_extraction_config_from_file(path: str | Path) -> ExtractionConfig:
    """Read an ExtractionConfig from one specific file.

    The path is converted to ``str`` and passed to the native loader, which
    picks the parser from the file extension (.toml, .yaml, or .json).

    Args:
        path: Configuration file location, as ``str`` or ``pathlib.Path``.
            Absolute and relative paths are both supported.

    Returns:
        The ExtractionConfig parsed from the file.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        RuntimeError: If the file cannot be read or parsed.
        ValueError: If the file format is invalid or unsupported.

    Example:
        >>> from kreuzberg import extract_file_sync, load_extraction_config_from_file
        >>> config = load_extraction_config_from_file("kreuzberg.toml")
        >>> result = extract_file_sync("document.pdf", config=config)
    """
    path_text = str(path)
    return _load_extraction_config_from_file_impl(path_text)
688
+
689
+
690
def register_ocr_backend(backend: Any) -> None:
    """Hand a Python OCR backend object to the Rust core for registration.

    The native side validates the object, wraps it in a Rust OcrBackend
    implementation and adds it to the global OCR backend registry, after
    which the Rust CLI, API, and MCP server can use it.

    Args:
        backend: Python object implementing the OCR backend protocol.

    Methods the backend must provide:
        - name() -> str: Backend name (must be non-empty).
        - supported_languages() -> list[str]: Supported language codes.
        - process_image(image_bytes: bytes, language: str) -> dict: Process an
          image and return a result dict.

    Methods the backend may provide:
        - process_file(path: str, language: str) -> dict: Custom file processing.
        - initialize(): Called when the backend is registered.
        - shutdown(): Called when the backend is unregistered.
        - version() -> str: Backend version (defaults to "1.0.0").

    Raises:
        TypeError: If a required method (name, supported_languages,
            process_image) is missing.
        ValueError: If the backend name is empty or already registered.
        RuntimeError: If registration with the Rust registry fails.

    Example:
        >>> from kreuzberg import register_ocr_backend
        >>> class MyOcrBackend:
        ...     def name(self) -> str:
        ...         return "my-ocr"
        ...
        ...     def supported_languages(self) -> list[str]:
        ...         return ["eng", "deu", "fra"]
        ...
        ...     def process_image(self, image_bytes: bytes, language: str) -> dict:
        ...         return {"content": "extracted text", "metadata": {"confidence": 0.95}, "tables": []}
        >>> register_ocr_backend(MyOcrBackend())
    """
    # The binding's return value is passed through unchanged.
    outcome = _register_ocr_backend_impl(backend)
    return outcome
730
+
731
+
732
def register_post_processor(processor: Any) -> None:
    """Hand a Python PostProcessor object to the Rust core for registration.

    The native side validates the object, wraps it in a Rust PostProcessor
    implementation and adds it to the global PostProcessor registry. From then
    on it is invoked automatically after extraction to enrich results.

    Args:
        processor: Python object implementing the PostProcessor protocol.

    Methods the processor must provide:
        - name() -> str: Processor name (must be non-empty).
        - process(result: dict) -> dict: Process and enrich the extraction result.
        - processing_stage() -> str: One of "early", "middle", or "late"
          (REQUIRED, not optional).

    Methods the processor may provide:
        - initialize(): Called when the processor is registered.
        - shutdown(): Called when the processor is unregistered.
        - version() -> str: Processor version (defaults to "1.0.0").

    Raises:
        TypeError: If a required method (name, process, processing_stage) is missing.
        ValueError: If the processor name is empty or already registered.
        RuntimeError: If registration with the Rust registry fails.

    Example:
        >>> from kreuzberg import register_post_processor
        >>> class EntityExtractor:
        ...     def name(self) -> str:
        ...         return "entity_extraction"
        ...
        ...     def processing_stage(self) -> str:
        ...         return "early"
        ...
        ...     def process(self, result: dict) -> dict:
        ...         entities = {"PERSON": ["John Doe"], "ORG": ["Microsoft"]}
        ...         result["metadata"]["entities"] = entities
        ...         return result
        >>> register_post_processor(EntityExtractor())
    """
    # The binding's return value is passed through unchanged.
    outcome = _register_post_processor_impl(processor)
    return outcome
773
+
774
+
775
def register_validator(validator: Any) -> None:
    """Hand a Python Validator object to the Rust core for registration.

    The native side validates the object, wraps it in a Rust Validator
    implementation and adds it to the global Validator registry. From then on
    it is invoked automatically after extraction to validate results.

    Args:
        validator: Python object implementing the Validator protocol.

    Methods the validator must provide:
        - name() -> str: Validator name (must be non-empty).
        - validate(result: dict) -> None: Validate the extraction result
          (raise an error to fail).

    Methods the validator may provide:
        - should_validate(result: dict) -> bool: Whether the validator should
          run (defaults to True).
        - priority() -> int: Priority (defaults to 50, higher runs first).
        - initialize(): Called when the validator is registered.
        - shutdown(): Called when the validator is unregistered.
        - version() -> str: Validator version (defaults to "1.0.0").

    Raises:
        TypeError: If a required method (name, validate) is missing.
        ValueError: If the validator name is empty or already registered.
        RuntimeError: If registration with the Rust registry fails.

    Example:
        >>> from kreuzberg import register_validator
        >>> from kreuzberg.exceptions import ValidationError
        >>> class MinLengthValidator:
        ...     def name(self) -> str:
        ...         return "min_length_validator"
        ...
        ...     def priority(self) -> int:
        ...         return 100
        ...
        ...     def validate(self, result: dict) -> None:
        ...         if len(result["content"]) < 100:
        ...             raise ValidationError("Content too short")
        >>> register_validator(MinLengthValidator())
    """
    # The binding's return value is passed through unchanged.
    outcome = _register_validator_impl(validator)
    return outcome
817
+
818
+
819
def get_last_error_code() -> int | None:
    """Fetch the error code recorded by the FFI layer for the most recent operation.

    Handy for debugging and for working out what went wrong after a failed call.

    Error codes:
        - 0 (SUCCESS): No error occurred
        - 1 (GENERIC_ERROR): Generic unspecified error
        - 2 (PANIC): A panic occurred in the Rust core
        - 3 (INVALID_ARGUMENT): Invalid argument provided
        - 4 (IO_ERROR): I/O operation failed
        - 5 (PARSING_ERROR): Document parsing failed
        - 6 (OCR_ERROR): OCR operation failed
        - 7 (MISSING_DEPENDENCY): Required dependency not available

    Returns:
        The error code (0 if no error has occurred).
        NOTE(review): the annotation also allows ``None``; when the binding
        returns ``None`` is not visible from this wrapper - confirm.

    Example:
        >>> from kreuzberg import get_last_error_code, ErrorCode
        >>> code = get_last_error_code()
        >>> if code == ErrorCode.SUCCESS:
        ...     print("No errors")
        ... elif code == ErrorCode.OCR_ERROR:
        ...     print("OCR operation failed")
        ... elif code == 2:
        ...     print("A panic occurred")
    """
    last_code = _get_last_error_code_impl()
    return last_code
849
+
850
+
851
def get_error_details() -> dict[str, Any]:
    """Fetch structured information about the most recent FFI-layer error.

    The details come from the thread-local error storage of the FFI layer and
    cover message, code, type, and source location where available.

    Returns:
        dict with the following keys:
            - "message" (str): Human-readable error message
            - "error_code" (int): Numeric error code (0-7)
            - "error_type" (str): Error type name (e.g., "validation", "ocr")
            - "source_file" (str | None): Source file path if available
            - "source_function" (str | None): Function name if available
            - "source_line" (int): Line number (0 if unknown)
            - "context_info" (str | None): Additional context if available
            - "is_panic" (bool): Whether the error came from a panic

    Example:
        >>> from kreuzberg import get_error_details
        >>> details = get_error_details()
        >>> print(f"Error: {details['message']} (code={details['error_code']})")
        >>> if details["source_file"]:
        ...     print(f"  at {details['source_file']}:{details['source_line']}")
    """
    details = _get_error_details_impl()
    return details
877
+
878
+
879
def classify_error(message: str) -> int:
    """Map an error message onto the most likely Kreuzberg error code.

    Useful for sorting messages from external libraries or system calls into
    the standard Kreuzberg error categories. The classification itself is done
    by the native binding.

    Args:
        message: The error message to classify.

    Returns:
        Error code (0-7) for the classification:
            - 0 (Validation): Invalid parameters, constraints, format mismatches
            - 1 (Parsing): Parse errors, corrupt data, malformed content
            - 2 (OCR): OCR processing failures
            - 3 (MissingDependency): Missing libraries or system dependencies
            - 4 (Io): File I/O, permissions, disk errors
            - 5 (Plugin): Plugin loading or registry errors
            - 6 (UnsupportedFormat): Unsupported MIME types or formats
            - 7 (Internal): Unknown or internal errors

    Example:
        >>> from kreuzberg import classify_error
        >>> code = classify_error("Failed to open file: permission denied")
        >>> if code == 4:
        ...     print("This is an I/O error")
        >>> code = classify_error("OCR processing failed")
        >>> if code == 2:
        ...     print("This is an OCR error")
    """
    category = _classify_error_impl(message)
    return category
910
+
911
+
912
def error_code_name(code: int) -> str:
    """Translate a numeric error code into its human-readable name.

    The lookup is performed by the native binding.

    Args:
        code: Numeric error code (0-7).

    Returns:
        The name of the error code (e.g., "validation", "ocr"), or "unknown"
        for codes outside the valid range.

    Example:
        >>> from kreuzberg import error_code_name
        >>> name = error_code_name(0)
        >>> print(name)  # output: "validation"
        >>> name = error_code_name(2)
        >>> print(name)  # output: "ocr"
        >>> name = error_code_name(99)
        >>> print(name)  # output: "unknown"
    """
    label = _error_code_name_impl(code)
    return label