kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kreuzberg might be problematic. Click here for more details.

@@ -0,0 +1,150 @@
1
+ """Protocol for Python OCR backends compatible with Rust FFI bridge.
2
+
3
+ This module defines the interface that Python OCR backends must implement
4
+ to be registered with the Rust extraction core via the FFI bridge.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Protocol
10
+
11
+
12
class OcrBackendProtocol(Protocol):
    """Structural interface for Python OCR backends driven through the Rust FFI bridge.

    Any object exposing the methods below can be passed to
    `register_ocr_backend()`. After registration the backend is usable from
    every extraction API (Python, Rust CLI, API server, MCP).

    Required methods:
        name: Backend identifier (e.g., 'easyocr', 'paddleocr').
        supported_languages: Language codes the backend can handle.
        process_image: Run OCR on raw image bytes and return the result dict.

    Optional methods:
        process_file: Run OCR on a file path (default behaviour is to read
            the file and delegate to process_image).
        initialize: Hook invoked on registration (e.g., load models).
        shutdown: Hook invoked on unregistration (e.g., cleanup resources).
        version: Backend version string (defaults to '1.0.0'). Note that
            this one is documented only; it is not declared as a member of
            this Protocol class.

    Example:
        >>> class MyOcrBackend:
        ...     def name(self) -> str:
        ...         return "my-ocr"
        ...
        ...     def supported_languages(self) -> list[str]:
        ...         return ["eng", "deu", "fra"]
        ...
        ...     def process_image(self, image_bytes: bytes, language: str) -> dict[str, Any]:
        ...         # Process image and extract text
        ...         return {
        ...             "content": "extracted text",
        ...             "metadata": {"confidence": 0.95, "width": 800, "height": 600},
        ...             "tables": [],
        ...         }
        >>> from kreuzberg import register_ocr_backend
        >>> backend = MyOcrBackend()
        >>> register_ocr_backend(backend)

    """

    def name(self) -> str:
        """Identify this backend (e.g., 'easyocr', 'paddleocr').

        The value is the key used to look up and select the backend for OCR
        work, so no two registered backends may share it.

        Returns:
            The backend name: lowercase, without spaces.

        """
        ...

    def supported_languages(self) -> list[str]:
        """List the language codes this backend accepts.

        Codes should be standard ISO 639 codes (e.g., 'eng', 'deu') or the
        codes of the underlying library (e.g., 'ch_sim', 'ch_tra' for
        Chinese).

        Returns:
            The supported language codes, as strings.

        """
        ...

    def process_image(self, image_bytes: bytes, language: str) -> dict[str, Any]:
        r"""Run OCR on in-memory image data.

        This is the central method of a backend: it receives the raw image
        and hands back the recognised text plus metadata. Implementations
        should cope with every image format the backend supports.

        Args:
            image_bytes: Raw image data (PNG, JPEG, TIFF, etc.)
            language: OCR language code (must be in supported_languages())

        Returns:
            A dictionary describing the extraction result:
                {
                    "content": "extracted text content",
                    "metadata": {
                        "width": 800,        # Optional: image width
                        "height": 600,       # Optional: image height
                        "confidence": 0.95,  # Optional: OCR confidence score
                        ...                  # Any other metadata
                    },
                    "tables": [  # Optional: extracted tables (list of dicts)
                        {
                            # Required: 2D array of strings
                            "cells": [["row1col1", "row1col2"], ["row2col1", "row2col2"]],
                            # Required: markdown representation
                            "markdown": "| Header |\n| ------ |\n| Cell |",
                            # Required: 1-indexed page number
                            "page_number": 1
                        }
                    ]
                }

        """
        ...

    def process_file(self, path: str, language: str) -> dict[str, Any]:
        """Run OCR on a file given by path (optional).

        When a backend does not provide this method, the default is to read
        the file and call process_image. Override it if the backend has an
        optimized file path that avoids loading the whole file into memory.

        Args:
            path: Path to image file
            language: Language code for OCR

        Returns:
            A dictionary in the same format as process_image().

        """
        ...

    def initialize(self) -> None:
        """Prepare the backend for use (optional, runs on registration).

        Typical work done here:
        - Load ML models
        - Initialize GPU/CUDA
        - Download required data files
        - Set up caching

        The hook is called exactly once, when the backend is registered via
        `register_ocr_backend()`.
        """
        ...

    def shutdown(self) -> None:
        """Tear the backend down (optional, runs on unregistration).

        Typical work done here:
        - Cleanup temporary files
        - Release GPU memory
        - Close file handles
        - Save cached data

        The hook is called when the backend is unregistered via
        `unregister_ocr_backend()` or when the program exits.
        """
        ...
@@ -0,0 +1,61 @@
1
+ """Custom PostProcessor support for Kreuzberg.
2
+
3
+ This module provides the protocol interface for creating custom post-processors
4
+ that can be registered with the Rust extraction pipeline.
5
+
6
+ Example:
7
+ >>> from kreuzberg import PostProcessorProtocol, register_post_processor, ExtractionResult
8
+ >>>
9
+ >>> class MyCustomProcessor:
10
+ ... '''Custom processor that adds custom metadata.'''
11
+ ...
12
+ ... def name(self) -> str:
13
+ ... return "my_custom_processor"
14
+ ...
15
+ ... def process(self, result: ExtractionResult) -> ExtractionResult:
16
+ ... # Add custom processing logic
17
+ ... word_count = len(result.content.split())
18
+ ... result.metadata["custom_word_count"] = word_count
19
+ ... result.metadata["custom_tag"] = "processed"
20
+ ... return result
21
+ ...
22
+ ... def processing_stage(self) -> str:
23
+ ... return "middle" # or "early" or "late"
24
+ ...
25
+ ... def initialize(self) -> None:
26
+ ... # Optional: Initialize resources (e.g., load ML models)
27
+ ... pass
28
+ ...
29
+ ... def shutdown(self) -> None:
30
+ ... # Optional: Release resources
31
+ ... pass
32
+ >>>
33
+ >>> # Register the processor
34
+ >>> processor = MyCustomProcessor()
35
+ >>> register_post_processor(processor)
36
+ >>>
37
+ >>> # Now it will be called automatically during extraction
38
+ >>> from kreuzberg import extract_file_sync
39
+ >>> result = extract_file_sync("document.pdf")
40
+ >>> print(result.metadata.get("custom_word_count"))
41
+ >>> print(result.metadata.get("custom_tag"))
42
+
43
+ Processing Stages:
44
+ - **early**: Runs first in the pipeline (e.g., language detection)
45
+ - **middle**: Runs in the middle (default, most processors)
46
+ - **late**: Runs last (e.g., final formatting, summaries)
47
+
48
+ Thread Safety:
49
+ Processors are called from the Rust core which may use threading.
50
+ Ensure your processor is thread-safe or uses appropriate locking.
51
+
52
+ Performance:
53
+ Keep processing fast - slow processors will impact extraction performance.
54
+ Consider lazy initialization for expensive resources (ML models, etc.).
55
+ """
56
+
57
+ from __future__ import annotations
58
+
59
+ from kreuzberg.postprocessors.protocol import PostProcessorProtocol
60
+
61
+ __all__ = ["PostProcessorProtocol"]
@@ -0,0 +1,83 @@
1
+ """PostProcessor protocol interface.
2
+
3
+ This module defines the protocol that all Python postprocessors must implement
4
+ to be registered with the Rust core via the FFI bridge.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import TYPE_CHECKING, Literal, Protocol
10
+
11
+ if TYPE_CHECKING:
12
+ from kreuzberg._internal_bindings import ExtractionResult
13
+
14
+
15
class PostProcessorProtocol(Protocol):
    """Structural interface for Python postprocessors.

    A postprocessor has to provide every method declared here in order to
    work with the Rust PostProcessor FFI bridge.
    """

    def name(self) -> str:
        """Identify this postprocessor.

        Returns:
            str: The unique processor name
                (e.g., "entity_extraction", "keyword_extraction")

        """
        ...

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Enrich an extraction result and hand it back.

        Args:
            result: ExtractionResult carrying the extracted content,
                metadata, and tables

        Returns:
            ExtractionResult: The modified result with enriched metadata.
                New metadata keys are added, existing keys are preserved.

        Note:
            Implementations are expected to write their output into
            result.metadata and return that same, modified
            ExtractionResult. The FFI bridge will not overwrite metadata
            keys that already exist.

        Example:
            >>> def process(self, result: ExtractionResult) -> ExtractionResult:
            ...     text = result.content
            ...     entities = extract_entities(text)
            ...     result.metadata["entities"] = entities
            ...     return result

        """
        ...

    def processing_stage(self) -> Literal["early", "middle", "late"]:
        """Tell the pipeline at which stage this processor should run.

        Returns:
            str: One of "early", "middle", or "late" (default: "middle")

        Note:
            The stage determines the order in which processors are called:
            - Early: Runs first (e.g., language detection)
            - Middle: Runs in the middle (default, e.g., entity extraction)
            - Late: Runs last (e.g., final formatting)

        """
        ...

    def initialize(self) -> None:
        """Set the processor up (e.g., load ML models).

        Invoked once, when the processor is registered.
        """
        ...

    def shutdown(self) -> None:
        """Stop the processor and free the resources it holds.

        Invoked when the processor is unregistered.
        """
        ...
kreuzberg/py.typed ADDED
File without changes