kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kreuzberg might be problematic. Click here for more details.

kreuzberg/__main__.py ADDED
@@ -0,0 +1,160 @@
1
+ """Proxy entry point that forwards to the Rust-based Kreuzberg CLI.
2
+
3
+ This keeps `python -m kreuzberg` and the `kreuzberg` console script working
4
+ without shipping an additional Python CLI implementation.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import shutil
10
+ import subprocess
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING
14
+
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Sequence
17
+
18
+
19
# Subcommand name -> cargo feature set passed to `cargo build --features` when a
# locally built CLI has to be rebuilt to provide that subcommand.
_FEATURE_SUBCOMMANDS: dict[str, str] = dict(serve="all", mcp="all")
20
+
21
+
22
def _iter_dev_cli_candidates(workspace_root: Path) -> list[Path]:
    """Collect the CLI binaries that exist under the workspace's ``target`` directory.

    Release builds are listed before debug builds, and within each build
    directory ``kreuzberg-cli`` is listed before ``kreuzberg``. On Windows the
    ``.exe`` suffix is appended to each name.

    Args:
        workspace_root: Directory expected to contain the cargo ``target`` tree.

    Returns:
        Existing candidate paths in priority order (possibly empty).
    """
    extension = ".exe" if sys.platform == "win32" else ""
    possible_paths = (
        workspace_root / build_dir / f"{binary_name}{extension}"
        for build_dir in ("target/release", "target/debug")
        for binary_name in ("kreuzberg-cli", "kreuzberg")
    )
    return [path for path in possible_paths if path.exists()]
35
+
36
+
37
def _binary_supports_subcommand(binary: Path, subcommand: str) -> bool:
    """Probe whether ``binary`` accepts ``subcommand``.

    Runs ``<binary> <subcommand> --help`` with a two-second timeout.

    Args:
        binary: Path of the CLI executable to probe.
        subcommand: Subcommand name to check.

    Returns:
        True when the probe exits with status 0, or exits non-zero for a reason
        other than rejecting the subcommand. False when stderr both names the
        subcommand and contains "unrecognized subcommand", when the probe
        times out, or when the binary cannot be launched at all.
    """
    try:
        probe = subprocess.run(
            [str(binary), subcommand, "--help"],
            capture_output=True,
            text=True,
            check=False,
            timeout=2,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing or non-executable file; such a candidate
        # cannot serve the subcommand, so reject it instead of crashing.
        return False

    if probe.returncode == 0:
        return True

    stderr = probe.stderr.lower()
    # stderr has been lower-cased, so the subcommand must be lower-cased too;
    # otherwise a mixed-case subcommand could never be found in it.
    return subcommand.lower() not in stderr or "unrecognized subcommand" not in stderr
54
+
55
+
56
def _build_cli_with_features(workspace_root: Path, feature: str) -> bool:
    """Run ``cargo build`` for the ``kreuzberg-cli`` package with ``feature`` enabled.

    Args:
        workspace_root: Directory used as the working directory for cargo.
        feature: Value passed to ``--features``.

    Returns:
        True only when ``cargo`` is found on PATH and the build exits with
        status 0 within the 300-second timeout. A missing cargo, a launch
        failure, a timeout or a non-zero exit all yield False.
    """
    cargo_path = shutil.which("cargo")
    if cargo_path is None:
        return False

    command = [cargo_path, "build", "-p", "kreuzberg-cli", "--features", feature]
    try:
        build = subprocess.run(
            command,
            cwd=workspace_root,
            check=False,
            capture_output=True,
            timeout=300,
        )
    except (subprocess.TimeoutExpired, OSError):
        return False
    return build.returncode == 0
73
+
74
+
75
def _discover_dev_cli_binary(requested_subcommand: str | None) -> str | None:
    """Return the path to a locally built CLI binary if available.

    The workspace root is taken to be three directory levels above the
    directory containing this file.

    Args:
        requested_subcommand: Subcommand the caller intends to run, or None
            when no subcommand was given.

    Returns:
        Without a subcommand: the first existing candidate binary, or None.
        With a subcommand: the first candidate that supports it. If none does
        and the workspace has a ``Cargo.toml`` and the subcommand maps to a
        feature in ``_FEATURE_SUBCOMMANDS``, the CLI is rebuilt with that
        feature and the candidates are probed again. None in every other case.
    """
    try:
        workspace_root = Path(__file__).resolve().parents[3]
    except IndexError:
        # The package sits too close to the filesystem root for a workspace
        # three levels up to exist; report "not found" instead of crashing.
        return None

    candidates = _iter_dev_cli_candidates(workspace_root)

    if requested_subcommand is None:
        return str(candidates[0]) if candidates else None

    for candidate in candidates:
        if _binary_supports_subcommand(candidate, requested_subcommand):
            return str(candidate)

    # No existing build provides the subcommand; only attempt a rebuild when
    # this really is a cargo workspace and the subcommand is feature-gated.
    if not (workspace_root / "Cargo.toml").exists():
        return None

    feature = _FEATURE_SUBCOMMANDS.get(requested_subcommand)
    if feature is None:
        return None

    if not _build_cli_with_features(workspace_root, feature):
        return None

    # Rescan: the build may have produced binaries that were absent before.
    for candidate in _iter_dev_cli_candidates(workspace_root):
        if _binary_supports_subcommand(candidate, requested_subcommand):
            return str(candidate)

    return None
104
+
105
+
106
def _find_packaged_cli_binary() -> str | None:
    """Search the package directory, then the interpreter's directory, for the CLI.

    The package directory (the one containing this file) is checked first for
    ``kreuzberg-cli``, ``kreuzberg`` and their ``.exe`` variants; only regular
    files count. The directory holding ``sys.executable`` is checked next for
    ``kreuzberg-cli`` and ``kreuzberg``; files that cannot be read or whose
    first two bytes are ``#!`` are skipped.

    Returns:
        The path of the first match as a string, or None when nothing matches.
    """
    # NOTE(review): the "#!" check presumably filters out the Python console-script
    # wrapper so this proxy does not invoke itself - confirm against the packaging setup.
    here = Path(__file__).parent
    for filename in ("kreuzberg-cli", "kreuzberg", "kreuzberg-cli.exe", "kreuzberg.exe"):
        bundled = here / filename
        if bundled.exists() and bundled.is_file():
            return str(bundled)

    interpreter_dir = Path(sys.executable).parent
    for filename in ("kreuzberg-cli", "kreuzberg"):
        installed = interpreter_dir / filename
        if not installed.exists():
            continue
        try:
            with installed.open("rb") as handle:
                starts_with_shebang = handle.read(2) == b"#!"
        except OSError:
            continue
        if starts_with_shebang:
            continue
        return str(installed)
    return None
127
+
128
+
129
def main(argv: Sequence[str] | None = None) -> int:
    """Locate the Rust CLI binary and run it with the given arguments.

    Args:
        argv: Full argument vector whose first element is skipped; defaults to
            ``sys.argv`` when None.

    Returns:
        The exit code of the CLI process, or 1 (after writing a message to
        stderr) when no CLI binary could be located.
    """
    raw_argv = sys.argv if argv is None else argv
    args = list(raw_argv[1:])

    # Only a leading non-option argument counts as the requested subcommand.
    requested_subcommand: str | None = None
    if args and not args[0].startswith("-"):
        requested_subcommand = args[0]

    # Lookup order: PATH, then the installed package, then a local dev build.
    locators = (
        lambda: shutil.which("kreuzberg-cli"),
        _find_packaged_cli_binary,
        lambda: _discover_dev_cli_binary(requested_subcommand),
    )
    cli_path = None
    for locate in locators:
        cli_path = locate()
        if cli_path is not None:
            break

    if cli_path is None:
        sys.stderr.write(
            "The embedded Kreuzberg CLI binary could not be located. "
            "This indicates a packaging issue with the wheel; please open an issue at "
            "https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\n",
        )
        return 1

    return subprocess.run([cli_path, *args], check=False).returncode
157
+
158
+
159
# Script entry point: exit the interpreter with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
Binary file
@@ -0,0 +1,143 @@
1
+ """Set up dynamic library search paths for bundled native libraries.
2
+
3
+ This module must be imported before _internal_bindings to ensure pdfium
4
+ and other native libraries can be found at runtime without requiring users
5
+ to manually set DYLD_LIBRARY_PATH (macOS), LD_LIBRARY_PATH (Linux), or
6
+ PATH (Windows).
7
+
8
+ Additionally, on macOS, this module fixes the library install names if needed
9
+ using install_name_tool, ensuring @loader_path is used for relative references.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import contextlib
15
+ import os
16
+ import platform
17
+ import subprocess
18
+ import sys
19
+ from pathlib import Path
20
+
21
+
22
def setup_library_paths() -> None:
    """Make the package directory visible to the platform's dynamic loader.

    The resolved directory containing this file is handed to the setup
    routine(s) for the current ``platform.system()`` value: Darwin (install-name
    fix, then search paths), Linux, or Windows. Any other system is a no-op.
    """
    package_dir = Path(__file__).parent.resolve()

    steps_by_system = {
        "Darwin": (_fix_macos_install_names, _setup_macos_paths),
        "Linux": (_setup_linux_paths,),
        "Windows": (_setup_windows_paths,),
    }
    for step in steps_by_system.get(platform.system(), ()):
        step(package_dir)
39
+
40
+
41
def _fix_macos_install_names(package_dir: Path) -> None:
    """Rewrite the pdfium reference in the extension module to use ``@loader_path``.

    Does nothing unless both ``_internal_bindings.abi3.so`` and
    ``libpdfium.dylib`` exist in ``package_dir``. ``otool -L`` is used to list
    the extension's library references; if it references ``./libpdfium.dylib``
    (and not already ``@loader_path/libpdfium.dylib``), ``install_name_tool``
    is run to change that reference in place.

    This is best effort: the function is invoked while the module is being
    imported, so any failure to run either tool is swallowed.

    Args:
        package_dir: Directory containing the extension module and pdfium.
    """
    so_file = package_dir / "_internal_bindings.abi3.so"
    pdfium_lib = package_dir / "libpdfium.dylib"

    if not (so_file.exists() and pdfium_lib.exists()):
        return

    # OSError (not just FileNotFoundError) so that e.g. a non-executable tool
    # on PATH cannot break the import of the package.
    tool_errors = (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError)

    try:
        result = subprocess.run(
            ["otool", "-L", str(so_file)],  # noqa: S607
            capture_output=True,
            text=True,
            check=True,
            timeout=5,
        )
    except tool_errors:
        return

    if "@loader_path/libpdfium.dylib" in result.stdout:
        # Already fixed; nothing to do.
        return

    if "./libpdfium.dylib" in result.stdout:
        with contextlib.suppress(*tool_errors):
            subprocess.run(
                [  # noqa: S607
                    "install_name_tool",
                    "-change",
                    "./libpdfium.dylib",
                    "@loader_path/libpdfium.dylib",
                    str(so_file),
                ],
                check=True,
                timeout=5,
                capture_output=True,
            )
76
+
77
+
78
def _setup_macos_paths(package_dir: Path) -> None:
    """Prepend ``package_dir`` to the macOS dyld search-path environment variables.

    ``DYLD_LIBRARY_PATH`` and ``DYLD_FALLBACK_LIBRARY_PATH`` are each updated
    unless ``package_dir`` is already one of their ``:``-separated entries.
    When ``DYLD_FALLBACK_LIBRARY_PATH`` is unset or empty, ``/usr/local/lib``
    and ``/usr/lib`` are appended after the package directory.

    Args:
        package_dir: Directory containing the bundled native libraries.
    """
    # NOTE(review): dyld may only read these variables at process start; confirm
    # whether this affects loads in the current process or only child processes.
    package_str = str(package_dir)

    def _prepend(variable: str, default_tail: str = "") -> None:
        current = os.environ.get(variable, "")
        # Compare whole entries: a substring test would treat "/opt/pkg" as
        # present when only "/opt/pkg_extra" is listed.
        if package_str in current.split(":"):
            return
        tail = current or default_tail
        os.environ[variable] = f"{package_str}:{tail}" if tail else package_str

    _prepend("DYLD_LIBRARY_PATH")
    _prepend("DYLD_FALLBACK_LIBRARY_PATH", "/usr/local/lib:/usr/lib")
94
+
95
+
96
def _setup_linux_paths(package_dir: Path) -> None:
    """Prepend ``package_dir`` to ``LD_LIBRARY_PATH`` and preload the bundled pdfium.

    The variable is updated unless ``package_dir`` is already one of its
    ``:``-separated entries. Afterwards, if ``libpdfium.so`` exists in
    ``package_dir``, it is loaded with ``ctypes.CDLL``; a failure to import
    ctypes or to load the library is ignored.

    Args:
        package_dir: Directory containing the bundled native libraries.
    """
    current_path = os.environ.get("LD_LIBRARY_PATH", "")
    package_str = str(package_dir)

    # Compare whole entries: a substring test would treat "/opt/pkg" as present
    # when only "/opt/pkg_extra" is listed.
    if package_str not in current_path.split(":"):
        os.environ["LD_LIBRARY_PATH"] = f"{package_str}:{current_path}" if current_path else package_str

    try:
        import ctypes  # noqa: PLC0415
        import ctypes.util  # noqa: PLC0415

        lib_path = package_dir / "libpdfium.so"
        if lib_path.exists():
            # Best effort: an unloadable library must not break the import.
            with contextlib.suppress(OSError):
                ctypes.CDLL(str(lib_path))
    except (ImportError, AttributeError):
        pass
116
+
117
+
118
def _setup_windows_paths(package_dir: Path) -> None:
    """Expose ``package_dir`` to the Windows DLL search and preload the bundled pdfium.

    Three steps, each best effort:

    1. Prepend ``package_dir`` to ``PATH`` unless it is already one of its
       ``;``-separated entries.
    2. Register the directory with ``os.add_dll_directory`` when that function
       is available.
    3. If ``pdfium.dll`` exists in ``package_dir``, load it with ``ctypes.CDLL``.

    Args:
        package_dir: Directory containing the bundled native libraries.
    """
    package_str = str(package_dir)

    current_path = os.environ.get("PATH", "")
    # Compare whole entries: a substring test would treat "C:\pkg" as present
    # when only "C:\pkg_extra" is listed.
    if package_str not in current_path.split(";"):
        os.environ["PATH"] = f"{package_str};{current_path}" if current_path else package_str

    if hasattr(os, "add_dll_directory"):
        with contextlib.suppress(OSError, AttributeError):
            os.add_dll_directory(str(package_dir))

    try:
        import ctypes  # noqa: PLC0415

        lib_path = package_dir / "pdfium.dll"
        if lib_path.exists():
            # Best effort: an unloadable library must not break the import.
            with contextlib.suppress(OSError):
                ctypes.CDLL(str(lib_path))
    except (ImportError, AttributeError):
        pass
141
+
142
+
143
+ setup_library_paths()
@@ -0,0 +1,254 @@
1
+ """Exception classes for Kreuzberg.
2
+
3
+ All Kreuzberg exceptions inherit from KreuzbergError and support optional context
4
+ for debugging information.
5
+ """
6
+
7
+ import json
8
+ from dataclasses import dataclass
9
+ from enum import IntEnum
10
+ from typing import Any
11
+
12
+
13
class ErrorCode(IntEnum):
    """Integer status codes reported by the Kreuzberg FFI layer.

    Members:
        SUCCESS (0): No error occurred.
        GENERIC_ERROR (1): Generic or unknown error.
        PANIC (2): A panic occurred in the library.
        INVALID_ARGUMENT (3): An invalid argument was provided.
        IO_ERROR (4): An I/O operation failed.
        PARSING_ERROR (5): Document parsing failed.
        OCR_ERROR (6): OCR processing failed.
        MISSING_DEPENDENCY (7): A required dependency was not found.

    Example:
        >>> from kreuzberg import get_last_error_code, ErrorCode
        >>> code = get_last_error_code()
        >>> if code == ErrorCode.PANIC:
        ...     print("A panic occurred")
    """

    SUCCESS = 0
    GENERIC_ERROR = 1
    PANIC = 2
    INVALID_ARGUMENT = 3
    IO_ERROR = 4
    PARSING_ERROR = 5
    OCR_ERROR = 6
    MISSING_DEPENDENCY = 7
41
+
42
+
43
+ @dataclass(frozen=True, slots=True)
44
+ class PanicContext:
45
+ """Structured panic context information from FFI layer.
46
+
47
+ Attributes:
48
+ file: Source file where panic occurred
49
+ line: Line number in source file
50
+ function: Function name where panic occurred
51
+ message: Panic message
52
+ timestamp_secs: Unix timestamp (seconds since epoch) when panic occurred
53
+
54
+ Example:
55
+ >>> import json
56
+ >>> from kreuzberg import get_last_panic_context, PanicContext
57
+ >>> context_json = get_last_panic_context()
58
+ >>> if context_json:
59
+ ... data = json.loads(context_json)
60
+ ... context = PanicContext(**data)
61
+ ... print(f"Panic at {context.file}:{context.line}")
62
+ """
63
+
64
+ file: str
65
+ line: int
66
+ function: str
67
+ message: str
68
+ timestamp_secs: int
69
+
70
+ @classmethod
71
+ def from_json(cls, json_str: str) -> "PanicContext":
72
+ """Parse panic context from JSON string.
73
+
74
+ Args:
75
+ json_str: JSON string with panic context (as returned by get_last_panic_context)
76
+
77
+ Returns:
78
+ PanicContext dataclass instance
79
+
80
+ Raises:
81
+ ValueError: If JSON is invalid or missing required fields
82
+ """
83
+ data = json.loads(json_str)
84
+ return cls(**data)
85
+
86
+
87
+ class KreuzbergError(Exception):
88
+ """Base exception class for all Kreuzberg errors.
89
+
90
+ All Kreuzberg exceptions support an optional context dictionary for debugging
91
+ information. The context is serialized to JSON when the exception is converted
92
+ to a string.
93
+
94
+ Args:
95
+ message: Human-readable error message
96
+ context: Optional dictionary with debugging context (file paths, config, etc.)
97
+
98
+ Example:
99
+ >>> raise KreuzbergError("Failed to parse document", context={"file": "document.pdf", "page": 5})
100
+
101
+ """
102
+
103
+ def __init__(self, message: str, *, context: dict[str, Any] | None = None) -> None:
104
+ super().__init__(message)
105
+ self.message = message
106
+ self.context = context
107
+
108
+ def __str__(self) -> str:
109
+ """Format error with context as JSON."""
110
+ error_name = self.__class__.__name__
111
+ if self.context:
112
+ serialized_context = self._serialize_context(self.context)
113
+ context_json = json.dumps(serialized_context, sort_keys=True)
114
+ return f"{error_name}: {self.message}\nContext: {context_json}"
115
+ return f"{error_name}: {self.message}"
116
+
117
+ @staticmethod
118
+ def _serialize_context(context: dict[str, Any]) -> dict[str, Any]:
119
+ def serialize_value(value: Any) -> Any:
120
+ if isinstance(value, bytes):
121
+ return value.decode("utf-8", errors="replace")
122
+ if isinstance(value, Exception):
123
+ return {"type": type(value).__name__, "message": str(value)}
124
+ if isinstance(value, tuple):
125
+ return [serialize_value(item) for item in value]
126
+ if isinstance(value, list):
127
+ return [serialize_value(item) for item in value]
128
+ if isinstance(value, dict):
129
+ return {k: serialize_value(v) for k, v in value.items()}
130
+ return value
131
+
132
+ serialized: dict[str, Any] = serialize_value(context)
133
+ return serialized
134
+
135
+
136
class ValidationError(KreuzbergError):
    """Signals that supplied input failed validation.

    Covers invalid configuration, invalid parameters, and invalid input data.

    Example:
        >>> raise ValidationError("Invalid language code", context={"language": "xyz", "supported": ["en", "de"]})

    """
146
+
147
+
148
class ParsingError(KreuzbergError):
    """Signals that a document could not be parsed.

    Raised for extractor failures such as corrupt files or unsupported
    document features.

    Example:
        >>> raise ParsingError("Failed to parse PDF", context={"file": "document.pdf", "extractor": "pdf"})

    """
158
+
159
+
160
class OCRError(KreuzbergError):
    """Signals that OCR processing failed.

    Raised for OCR backend failures while extracting text from images.

    Example:
        >>> raise OCRError("OCR processing failed", context={"backend": "tesseract", "language": "en"})

    """
169
+
170
+
171
class MissingDependencyError(KreuzbergError):
    """Raised when a required dependency is not installed.

    This includes missing Python packages and missing system dependencies
    (tesseract, pandoc, etc.).

    Example:
        >>> raise MissingDependencyError(
        ...     "EasyOCR not installed",
        ...     context={"package": "easyocr", "install_command": 'pip install "kreuzberg[easyocr]"'},
        ... )

    """

    @classmethod
    def create_for_package(
        cls,
        *,
        dependency_group: str,
        functionality: str,
        package_name: str,
    ) -> "MissingDependencyError":
        """Create a MissingDependencyError for a missing package.

        This is a convenience method for creating standardized error messages
        for missing optional dependencies.

        Args:
            dependency_group: The optional dependency group (e.g., "ocr", "api", "cli")
            functionality: Description of what functionality requires this package
            package_name: Name of the missing package

        Returns:
            MissingDependencyError whose message names the package, the
            functionality and the install command, and whose context holds the
            keys ``package``, ``dependency_group``, ``functionality`` and
            ``install_command``

        Example:
            >>> error = MissingDependencyError.create_for_package(
            ...     dependency_group="easyocr", functionality="EasyOCR backend", package_name="easyocr"
            ... )
            >>> raise error

        """
        # The requirement is quoted so the command can be pasted into shells
        # that treat unquoted square brackets as a glob pattern (e.g. zsh).
        install_cmd = f'pip install "kreuzberg[{dependency_group}]"'
        message = f"Missing required dependency '{package_name}' for {functionality}. Install with: {install_cmd}"
        context = {
            "package": package_name,
            "dependency_group": dependency_group,
            "functionality": functionality,
            "install_command": install_cmd,
        }
        return cls(message, context=context)
221
+
222
+
223
class CacheError(KreuzbergError):
    """Signals that a cache operation failed.

    Raised for failed cache reads, writes, or invalidations.

    Example:
        >>> raise CacheError("Failed to write cache", context={"path": "/tmp/cache", "operation": "write"})

    """
232
+
233
+
234
class ImageProcessingError(KreuzbergError):
    """Signals that an image manipulation step failed.

    Raised for failures in resizing, format conversion, or other image
    processing operations.

    Example:
        >>> raise ImageProcessingError("Failed to resize image", context={"width": 1920, "height": 1080})

    """
244
+
245
+
246
class PluginError(KreuzbergError):
    """Signals that a plugin operation failed.

    Raised for failures during plugin initialization, registration, or execution.

    Example:
        >>> raise PluginError("Plugin initialization failed", context={"plugin_name": "pdf-extractor"})

    """
@@ -0,0 +1,25 @@
1
+ """Python OCR backend implementations.
2
+
3
+ These backends can be imported and manually registered, or they will be
4
+ auto-registered when kreuzberg is imported (if their dependencies are installed).
5
+
6
+ Each backend has a separate optional dependency group:
7
+ - EasyOCR: pip install "kreuzberg[easyocr]"
8
+ - PaddleOCR: pip install "kreuzberg[paddleocr]"
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ __all__ = ["EasyOCRBackend", "OcrBackendProtocol", "PaddleOCRBackend"]
14
+
15
+ from kreuzberg.ocr.protocol import OcrBackendProtocol
16
+
17
+ try:
18
+ from kreuzberg.ocr.easyocr import EasyOCRBackend
19
+ except ImportError:
20
+ EasyOCRBackend = None # type: ignore[assignment,misc]
21
+
22
+ try:
23
+ from kreuzberg.ocr.paddleocr import PaddleOCRBackend
24
+ except ImportError:
25
+ PaddleOCRBackend = None # type: ignore[assignment,misc]