kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kreuzberg might be problematic. Click here for more details.
- kreuzberg/__init__.py +931 -0
- kreuzberg/__main__.py +160 -0
- kreuzberg/_internal_bindings.abi3.so +0 -0
- kreuzberg/_setup_lib_path.py +143 -0
- kreuzberg/exceptions.py +254 -0
- kreuzberg/ocr/__init__.py +25 -0
- kreuzberg/ocr/easyocr.py +371 -0
- kreuzberg/ocr/paddleocr.py +284 -0
- kreuzberg/ocr/protocol.py +150 -0
- kreuzberg/postprocessors/__init__.py +61 -0
- kreuzberg/postprocessors/protocol.py +83 -0
- kreuzberg/py.typed +0 -0
- kreuzberg/types.py +509 -0
- kreuzberg-4.0.6.dist-info/METADATA +470 -0
- kreuzberg-4.0.6.dist-info/RECORD +17 -0
- kreuzberg-4.0.6.dist-info/WHEEL +4 -0
- kreuzberg-4.0.6.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Protocol for Python OCR backends compatible with Rust FFI bridge.
|
|
2
|
+
|
|
3
|
+
This module defines the interface that Python OCR backends must implement
|
|
4
|
+
to be registered with the Rust extraction core via the FFI bridge.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Protocol
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OcrBackendProtocol(Protocol):
    """Structural interface for Python OCR backends used through the Rust FFI bridge.

    An object exposing these methods can be passed to ``register_ocr_backend()``.
    Once registered, it can be used by all extraction APIs (Python, Rust CLI,
    API server, MCP).

    Required methods:
        name: Return the backend name (e.g., 'easyocr', 'paddleocr').
        supported_languages: Return the list of supported language codes.
        process_image: Process image bytes and return the extraction result.

    Optional methods:
        process_file: Process a file from a path (defaults to reading the file
            and calling ``process_image``).
        initialize: Called when the backend is registered (e.g., load models).
        shutdown: Called when the backend is unregistered (e.g., cleanup
            resources).
        version: Return the backend version string (defaults to '1.0.0').

    Note:
        "Optional" describes the documented runtime contract only.
        ``process_file``, ``initialize`` and ``shutdown`` are declared on this
        protocol, so a static type checker still requires them for structural
        conformance. ``version`` is not declared on this protocol and is
        therefore never checked statically.

    Example:
        >>> class MyOcrBackend:
        ...     def name(self) -> str:
        ...         return "my-ocr"
        ...
        ...     def supported_languages(self) -> list[str]:
        ...         return ["eng", "deu", "fra"]
        ...
        ...     def process_image(self, image_bytes: bytes, language: str) -> dict[str, Any]:
        ...         # Process image and extract text
        ...         return {
        ...             "content": "extracted text",
        ...             "metadata": {"confidence": 0.95, "width": 800, "height": 600},
        ...             "tables": [],
        ...         }
        >>> from kreuzberg import register_ocr_backend
        >>> backend = MyOcrBackend()
        >>> register_ocr_backend(backend)

    """

    def name(self) -> str:
        """Return the backend name (e.g., 'easyocr', 'paddleocr').

        The name must be unique across all registered backends; it is the key
        used to identify and select this backend for OCR operations.

        Returns:
            Backend name as a lowercase string with no spaces.

        """
        ...

    def supported_languages(self) -> list[str]:
        """Return the language codes this backend can process.

        Codes should be standard ISO 639 codes (e.g., 'eng', 'deu') or
        library-specific codes (e.g., 'ch_sim', 'ch_tra' for Chinese).

        Returns:
            List of supported language code strings.

        """
        ...

    def process_image(self, image_bytes: bytes, language: str) -> dict[str, Any]:
        r"""Run OCR on raw image bytes and return the extraction result.

        This is the core OCR method: it takes raw image data and returns the
        extracted text together with metadata. Implementations should handle
        every image format the backend supports.

        Args:
            image_bytes: Raw image data (PNG, JPEG, TIFF, etc.).
            language: Language code for OCR (must be in supported_languages()).

        Returns:
            Dictionary with the extraction result::

                {
                    "content": "extracted text content",
                    "metadata": {
                        "width": 800,        # Optional: image width
                        "height": 600,       # Optional: image height
                        "confidence": 0.95,  # Optional: OCR confidence score
                        ...                  # Any other metadata
                    },
                    "tables": [  # Optional: extracted tables (list of dicts)
                        {
                            "cells": [["row1col1", "row1col2"], ["row2col1", "row2col2"]],  # Required: 2D array of strings
                            "markdown": "| Header |\n| ------ |\n| Cell |",  # Required: markdown representation
                            "page_number": 1,  # Required: 1-indexed page number
                        }
                    ],
                }

        """
        ...

    def process_file(self, path: str, language: str) -> dict[str, Any]:
        """Run OCR on an image file (optional; defaults to reading it and calling process_image).

        Backends can provide this method when they have optimized file
        processing that avoids loading the entire file into memory.

        Args:
            path: Path to the image file.
            language: Language code for OCR.

        Returns:
            Same format as process_image().

        """
        ...

    def initialize(self) -> None:
        """Initialize the backend (optional; called when the backend is registered).

        Use this method to:
            - Load ML models
            - Initialize GPU/CUDA
            - Download required data files
            - Set up caching

        This method is called exactly once when the backend is registered
        via `register_ocr_backend()`.
        """
        ...

    def shutdown(self) -> None:
        """Shut down the backend (optional; called when the backend is unregistered).

        Use this method to:
            - Clean up temporary files
            - Release GPU memory
            - Close file handles
            - Save cached data

        This method is called when the backend is unregistered via
        `unregister_ocr_backend()` or when the program exits.
        """
        ...
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Custom PostProcessor support for Kreuzberg.
|
|
2
|
+
|
|
3
|
+
This module provides the protocol interface for creating custom post-processors
|
|
4
|
+
that can be registered with the Rust extraction pipeline.
|
|
5
|
+
|
|
6
|
+
Example:
|
|
7
|
+
>>> from kreuzberg import PostProcessorProtocol, register_post_processor, ExtractionResult
|
|
8
|
+
>>>
|
|
9
|
+
>>> class MyCustomProcessor:
|
|
10
|
+
... '''Custom processor that adds custom metadata.'''
|
|
11
|
+
...
|
|
12
|
+
... def name(self) -> str:
|
|
13
|
+
... return "my_custom_processor"
|
|
14
|
+
...
|
|
15
|
+
... def process(self, result: ExtractionResult) -> ExtractionResult:
|
|
16
|
+
... # Add custom processing logic
|
|
17
|
+
... word_count = len(result.content.split())
|
|
18
|
+
... result.metadata["custom_word_count"] = word_count
|
|
19
|
+
... result.metadata["custom_tag"] = "processed"
|
|
20
|
+
... return result
|
|
21
|
+
...
|
|
22
|
+
... def processing_stage(self) -> str:
|
|
23
|
+
... return "middle" # or "early" or "late"
|
|
24
|
+
...
|
|
25
|
+
... def initialize(self) -> None:
|
|
26
|
+
... # Optional: Initialize resources (e.g., load ML models)
|
|
27
|
+
... pass
|
|
28
|
+
...
|
|
29
|
+
... def shutdown(self) -> None:
|
|
30
|
+
... # Optional: Release resources
|
|
31
|
+
... pass
|
|
32
|
+
>>>
|
|
33
|
+
>>> # Register the processor
|
|
34
|
+
>>> processor = MyCustomProcessor()
|
|
35
|
+
>>> register_post_processor(processor)
|
|
36
|
+
>>>
|
|
37
|
+
>>> # Now it will be called automatically during extraction
|
|
38
|
+
>>> from kreuzberg import extract_file_sync
|
|
39
|
+
>>> result = extract_file_sync("document.pdf")
|
|
40
|
+
>>> print(result.metadata.get("custom_word_count"))
|
|
41
|
+
>>> print(result.metadata.get("custom_tag"))
|
|
42
|
+
|
|
43
|
+
Processing Stages:
|
|
44
|
+
- **early**: Runs first in the pipeline (e.g., language detection)
|
|
45
|
+
- **middle**: Runs in the middle (default, most processors)
|
|
46
|
+
- **late**: Runs last (e.g., final formatting, summaries)
|
|
47
|
+
|
|
48
|
+
Thread Safety:
|
|
49
|
+
Processors are called from the Rust core which may use threading.
|
|
50
|
+
Ensure your processor is thread-safe or uses appropriate locking.
|
|
51
|
+
|
|
52
|
+
Performance:
|
|
53
|
+
Keep processing fast - slow processors will impact extraction performance.
|
|
54
|
+
Consider lazy initialization for expensive resources (ML models, etc.).
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
from __future__ import annotations
|
|
58
|
+
|
|
59
|
+
from kreuzberg.postprocessors.protocol import PostProcessorProtocol
|
|
60
|
+
|
|
61
|
+
# Public API of this package: re-exports the protocol imported above so that
# ``from kreuzberg.postprocessors import *`` exposes only PostProcessorProtocol.
__all__ = ["PostProcessorProtocol"]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""PostProcessor protocol interface.
|
|
2
|
+
|
|
3
|
+
This module defines the protocol that all Python postprocessors must implement
|
|
4
|
+
to be registered with the Rust core via the FFI bridge.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import TYPE_CHECKING, Literal, Protocol
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from kreuzberg._internal_bindings import ExtractionResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PostProcessorProtocol(Protocol):
    """Structural interface for Python postprocessors.

    Objects exposing these methods are compatible with the Rust PostProcessor
    FFI bridge.

    Core methods:
        name: Unique processor name.
        process: Enrich an extraction result and return it.
        processing_stage: Pipeline stage in which the processor runs.

    Lifecycle hooks:
        initialize: Called once when the processor is registered.
        shutdown: Called when the processor is unregistered.

    Note:
        The package-level usage example marks ``initialize`` and ``shutdown``
        as optional. They are nevertheless declared on this protocol, so a
        static type checker requires all five methods for structural
        conformance. Whether the bridge tolerates their absence at runtime is
        not visible from this module.
    """

    def name(self) -> str:
        """Return the unique name of this postprocessor.

        Returns:
            Processor name (e.g., "entity_extraction", "keyword_extraction").

        """
        ...

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Process and enrich an extraction result.

        Args:
            result: ExtractionResult with extracted content, metadata, and tables.

        Returns:
            The modified result with enriched metadata. New metadata keys are
            added, existing keys are preserved.

        Note:
            The processor should add its results to result.metadata and
            return the modified ExtractionResult. Existing metadata keys will
            not be overwritten by the FFI bridge.

        Example:
            >>> def process(self, result: ExtractionResult) -> ExtractionResult:
            ...     text = result.content
            ...     entities = extract_entities(text)
            ...     result.metadata["entities"] = entities
            ...     return result

        """
        ...

    def processing_stage(self) -> Literal["early", "middle", "late"]:
        """Return the processing stage for this processor.

        Returns:
            One of "early", "middle", or "late" (default: "middle").

        Note:
            Processing stages control the order in which processors are called:
                - Early: Runs first (e.g., language detection)
                - Middle: Runs in the middle (default, e.g., entity extraction)
                - Late: Runs last (e.g., final formatting)

        """
        ...

    def initialize(self) -> None:
        """Initialize the processor (e.g., load ML models).

        Called once when the processor is registered.
        """
        ...

    def shutdown(self) -> None:
        """Shut down the processor and release resources.

        Called when the processor is unregistered.
        """
        ...
|
kreuzberg/py.typed
ADDED
|
File without changes
|