kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kreuzberg might be problematic. Click here for more details.
- kreuzberg/__init__.py +931 -0
- kreuzberg/__main__.py +160 -0
- kreuzberg/_internal_bindings.abi3.so +0 -0
- kreuzberg/_setup_lib_path.py +143 -0
- kreuzberg/exceptions.py +254 -0
- kreuzberg/ocr/__init__.py +25 -0
- kreuzberg/ocr/easyocr.py +371 -0
- kreuzberg/ocr/paddleocr.py +284 -0
- kreuzberg/ocr/protocol.py +150 -0
- kreuzberg/postprocessors/__init__.py +61 -0
- kreuzberg/postprocessors/protocol.py +83 -0
- kreuzberg/py.typed +0 -0
- kreuzberg/types.py +509 -0
- kreuzberg-4.0.6.dist-info/METADATA +470 -0
- kreuzberg-4.0.6.dist-info/RECORD +17 -0
- kreuzberg-4.0.6.dist-info/WHEEL +4 -0
- kreuzberg-4.0.6.dist-info/entry_points.txt +2 -0
kreuzberg/__main__.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Proxy entry point that forwards to the Rust-based Kreuzberg CLI.
|
|
2
|
+
|
|
3
|
+
This keeps `python -m kreuzberg` and the `kreuzberg` console script working
|
|
4
|
+
without shipping an additional Python CLI implementation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import shutil
|
|
10
|
+
import subprocess
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Sequence
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Maps a CLI subcommand to the cargo feature passed to `cargo build --features`
# when a development binary supporting that subcommand has to be built.
_FEATURE_SUBCOMMANDS: dict[str, str] = {"serve": "all", "mcp": "all"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _iter_dev_cli_candidates(workspace_root: Path) -> list[Path]:
|
|
23
|
+
suffixes = [".exe"] if sys.platform == "win32" else [""]
|
|
24
|
+
candidate_dirs = ("target/release", "target/debug")
|
|
25
|
+
candidate_names = ("kreuzberg-cli", "kreuzberg")
|
|
26
|
+
|
|
27
|
+
candidates: list[Path] = []
|
|
28
|
+
for directory in candidate_dirs:
|
|
29
|
+
for name in candidate_names:
|
|
30
|
+
for suffix in suffixes:
|
|
31
|
+
candidate = workspace_root / directory / f"{name}{suffix}"
|
|
32
|
+
if candidate.exists():
|
|
33
|
+
candidates.append(candidate)
|
|
34
|
+
return candidates
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _binary_supports_subcommand(binary: Path, subcommand: str) -> bool:
|
|
38
|
+
try:
|
|
39
|
+
probe = subprocess.run(
|
|
40
|
+
[str(binary), subcommand, "--help"],
|
|
41
|
+
capture_output=True,
|
|
42
|
+
text=True,
|
|
43
|
+
check=False,
|
|
44
|
+
timeout=2,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
if probe.returncode == 0:
|
|
48
|
+
return True
|
|
49
|
+
|
|
50
|
+
stderr = probe.stderr.lower()
|
|
51
|
+
return subcommand not in stderr or "unrecognized subcommand" not in stderr
|
|
52
|
+
except subprocess.TimeoutExpired:
|
|
53
|
+
return False
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _build_cli_with_features(workspace_root: Path, feature: str) -> bool:
|
|
57
|
+
"""Build CLI with specified features. Returns True if successful."""
|
|
58
|
+
cargo = shutil.which("cargo")
|
|
59
|
+
if cargo is None:
|
|
60
|
+
return False
|
|
61
|
+
|
|
62
|
+
try:
|
|
63
|
+
result = subprocess.run(
|
|
64
|
+
[cargo, "build", "-p", "kreuzberg-cli", "--features", feature],
|
|
65
|
+
cwd=workspace_root,
|
|
66
|
+
check=False,
|
|
67
|
+
capture_output=True,
|
|
68
|
+
timeout=300,
|
|
69
|
+
)
|
|
70
|
+
return result.returncode == 0
|
|
71
|
+
except (subprocess.TimeoutExpired, OSError):
|
|
72
|
+
return False
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _discover_dev_cli_binary(requested_subcommand: str | None) -> str | None:
    """Return the path to a locally built CLI binary if available.

    The workspace root is taken to be the fourth ancestor directory of this
    file. Existing binaries under its ``target`` tree are preferred; when a
    subcommand is requested and none of them supports it, a cargo build with
    the matching feature is attempted, provided the workspace has a
    ``Cargo.toml`` and the subcommand is listed in ``_FEATURE_SUBCOMMANDS``.

    Args:
        requested_subcommand: First positional CLI argument, or None when the
            invocation has no subcommand.

    Returns:
        Path of a usable binary as a string, or None if none was found or
        could be built.
    """
    ancestors = Path(__file__).resolve().parents
    if len(ancestors) < 4:
        # A shallow install location (e.g. /app/kreuzberg/__main__.py) has no
        # fourth ancestor; indexing would raise IndexError instead of letting
        # the caller report that no binary was found.
        return None
    workspace_root = ancestors[3]
    candidates = _iter_dev_cli_candidates(workspace_root)

    if requested_subcommand is None:
        return str(candidates[0]) if candidates else None

    for candidate in candidates:
        if _binary_supports_subcommand(candidate, requested_subcommand):
            return str(candidate)

    # Nothing suitable exists yet; only try to build inside a cargo workspace.
    if not (workspace_root / "Cargo.toml").exists():
        return None

    feature = _FEATURE_SUBCOMMANDS.get(requested_subcommand)
    if feature is None:
        return None

    if not _build_cli_with_features(workspace_root, feature):
        return None

    # Re-scan: the build may have produced a binary that was absent before.
    for candidate in _iter_dev_cli_candidates(workspace_root):
        if _binary_supports_subcommand(candidate, requested_subcommand):
            return str(candidate)

    return None
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _find_packaged_cli_binary() -> str | None:
|
|
107
|
+
"""Look for the CLI binary in common installation paths before building one."""
|
|
108
|
+
package_dir = Path(__file__).parent
|
|
109
|
+
for name in ("kreuzberg-cli", "kreuzberg", "kreuzberg-cli.exe", "kreuzberg.exe"):
|
|
110
|
+
candidate = package_dir / name
|
|
111
|
+
if candidate.exists() and candidate.is_file():
|
|
112
|
+
return str(candidate)
|
|
113
|
+
|
|
114
|
+
script_dir = Path(sys.executable).parent
|
|
115
|
+
for name in ("kreuzberg-cli", "kreuzberg"):
|
|
116
|
+
candidate = script_dir / name
|
|
117
|
+
if candidate.exists():
|
|
118
|
+
try:
|
|
119
|
+
with candidate.open("rb") as f:
|
|
120
|
+
header = f.read(2)
|
|
121
|
+
if header == b"#!":
|
|
122
|
+
continue
|
|
123
|
+
except OSError:
|
|
124
|
+
continue
|
|
125
|
+
return str(candidate)
|
|
126
|
+
return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Execute the Rust CLI with the provided arguments.

    The CLI binary is resolved by trying, in order: ``kreuzberg-cli`` on PATH,
    a binary shipped next to this package or the interpreter, and finally a
    locally built development binary.

    Args:
        argv: Full argument vector including the program name at index 0,
            which is dropped. Defaults to ``sys.argv``.

    Returns:
        The CLI's exit status, or 1 when no binary could be located.
    """
    raw_argv = sys.argv if argv is None else argv
    args = list(raw_argv[1:])

    # Only a leading non-option token counts as the subcommand.
    requested_subcommand: str | None = None
    if args and not args[0].startswith("-"):
        requested_subcommand = args[0]

    resolvers = (
        lambda: shutil.which("kreuzberg-cli"),
        _find_packaged_cli_binary,
        lambda: _discover_dev_cli_binary(requested_subcommand),
    )
    cli_path = None
    for resolve in resolvers:
        cli_path = resolve()
        if cli_path is not None:
            break

    if cli_path is None:
        sys.stderr.write(
            "The embedded Kreuzberg CLI binary could not be located. "
            "This indicates a packaging issue with the wheel; please open an issue at "
            "https://github.com/kreuzberg-dev/kreuzberg/issues so we can investigate.\n",
        )
        return 1

    return subprocess.run([cli_path, *args], check=False).returncode


if __name__ == "__main__":
    raise SystemExit(main())
|
|
Binary file
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Set up dynamic library search paths for bundled native libraries.
|
|
2
|
+
|
|
3
|
+
This module must be imported before _internal_bindings to ensure pdfium
|
|
4
|
+
and other native libraries can be found at runtime without requiring users
|
|
5
|
+
to manually set DYLD_LIBRARY_PATH (macOS), LD_LIBRARY_PATH (Linux), or
|
|
6
|
+
PATH (Windows).
|
|
7
|
+
|
|
8
|
+
Additionally, on macOS, this module fixes the library install names if needed
|
|
9
|
+
using install_name_tool, ensuring @loader_path is used for relative references.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import contextlib
|
|
15
|
+
import os
|
|
16
|
+
import platform
|
|
17
|
+
import subprocess
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def setup_library_paths() -> None:
    """Add the package directory to the dynamic library search path.

    Dispatches on ``platform.system()`` so that bundled native libraries
    (pdfium, etc.) next to this module can be located. Systems other than
    Darwin, Linux and Windows are left untouched.
    """
    bundle_dir = Path(__file__).parent.resolve()
    os_name = platform.system()

    if os_name == "Windows":
        _setup_windows_paths(bundle_dir)
    elif os_name == "Linux":
        _setup_linux_paths(bundle_dir)
    elif os_name == "Darwin":
        # Install names are rewritten first, then the search paths are set.
        _fix_macos_install_names(bundle_dir)
        _setup_macos_paths(bundle_dir)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _fix_macos_install_names(package_dir: Path) -> None:
|
|
42
|
+
so_file = package_dir / "_internal_bindings.abi3.so"
|
|
43
|
+
pdfium_lib = package_dir / "libpdfium.dylib"
|
|
44
|
+
|
|
45
|
+
if not so_file.exists() or not pdfium_lib.exists():
|
|
46
|
+
return
|
|
47
|
+
|
|
48
|
+
try:
|
|
49
|
+
result = subprocess.run(
|
|
50
|
+
["otool", "-L", str(so_file)], # noqa: S607
|
|
51
|
+
capture_output=True,
|
|
52
|
+
text=True,
|
|
53
|
+
check=True,
|
|
54
|
+
timeout=5,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
if "@loader_path/libpdfium.dylib" in result.stdout:
|
|
58
|
+
return
|
|
59
|
+
|
|
60
|
+
if "./libpdfium.dylib" in result.stdout:
|
|
61
|
+
with contextlib.suppress(subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
|
|
62
|
+
subprocess.run(
|
|
63
|
+
[ # noqa: S607
|
|
64
|
+
"install_name_tool",
|
|
65
|
+
"-change",
|
|
66
|
+
"./libpdfium.dylib",
|
|
67
|
+
"@loader_path/libpdfium.dylib",
|
|
68
|
+
str(so_file),
|
|
69
|
+
],
|
|
70
|
+
check=True,
|
|
71
|
+
timeout=5,
|
|
72
|
+
capture_output=True,
|
|
73
|
+
)
|
|
74
|
+
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
|
|
75
|
+
pass
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _setup_macos_paths(package_dir: Path) -> None:
|
|
79
|
+
current_path = os.environ.get("DYLD_LIBRARY_PATH", "")
|
|
80
|
+
package_str = str(package_dir)
|
|
81
|
+
|
|
82
|
+
if package_str not in current_path:
|
|
83
|
+
if current_path:
|
|
84
|
+
os.environ["DYLD_LIBRARY_PATH"] = f"{package_str}:{current_path}"
|
|
85
|
+
else:
|
|
86
|
+
os.environ["DYLD_LIBRARY_PATH"] = package_str
|
|
87
|
+
|
|
88
|
+
current_fallback = os.environ.get("DYLD_FALLBACK_LIBRARY_PATH", "")
|
|
89
|
+
if package_str not in current_fallback:
|
|
90
|
+
if current_fallback:
|
|
91
|
+
os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = f"{package_str}:{current_fallback}"
|
|
92
|
+
else:
|
|
93
|
+
os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = f"{package_str}:/usr/local/lib:/usr/lib"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _setup_linux_paths(package_dir: Path) -> None:
|
|
97
|
+
current_path = os.environ.get("LD_LIBRARY_PATH", "")
|
|
98
|
+
package_str = str(package_dir)
|
|
99
|
+
|
|
100
|
+
if package_str not in current_path:
|
|
101
|
+
if current_path:
|
|
102
|
+
os.environ["LD_LIBRARY_PATH"] = f"{package_str}:{current_path}"
|
|
103
|
+
else:
|
|
104
|
+
os.environ["LD_LIBRARY_PATH"] = package_str
|
|
105
|
+
|
|
106
|
+
try:
|
|
107
|
+
import ctypes # noqa: PLC0415
|
|
108
|
+
import ctypes.util # noqa: PLC0415
|
|
109
|
+
|
|
110
|
+
lib_path = package_dir / "libpdfium.so"
|
|
111
|
+
if lib_path.exists():
|
|
112
|
+
with contextlib.suppress(OSError):
|
|
113
|
+
ctypes.CDLL(str(lib_path))
|
|
114
|
+
except (ImportError, AttributeError):
|
|
115
|
+
pass
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _setup_windows_paths(package_dir: Path) -> None:
|
|
119
|
+
package_str = str(package_dir)
|
|
120
|
+
|
|
121
|
+
current_path = os.environ.get("PATH", "")
|
|
122
|
+
if package_str not in current_path:
|
|
123
|
+
if current_path:
|
|
124
|
+
os.environ["PATH"] = f"{package_str};{current_path}"
|
|
125
|
+
else:
|
|
126
|
+
os.environ["PATH"] = package_str
|
|
127
|
+
|
|
128
|
+
if sys.version_info >= (3, 8) and hasattr(os, "add_dll_directory"):
|
|
129
|
+
with contextlib.suppress(OSError, AttributeError):
|
|
130
|
+
os.add_dll_directory(str(package_dir))
|
|
131
|
+
|
|
132
|
+
try:
|
|
133
|
+
import ctypes # noqa: PLC0415
|
|
134
|
+
|
|
135
|
+
lib_path = package_dir / "pdfium.dll"
|
|
136
|
+
if lib_path.exists():
|
|
137
|
+
with contextlib.suppress(OSError):
|
|
138
|
+
ctypes.CDLL(str(lib_path))
|
|
139
|
+
except (ImportError, AttributeError):
|
|
140
|
+
pass
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Executed at import time: per the module docstring, this module must be
# imported before _internal_bindings so the search paths are already in place.
setup_library_paths()
|
kreuzberg/exceptions.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Exception classes for Kreuzberg.
|
|
2
|
+
|
|
3
|
+
All Kreuzberg exceptions inherit from KreuzbergError and support optional context
|
|
4
|
+
for debugging information.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from enum import IntEnum
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ErrorCode(IntEnum):
    """Integer status codes reported by the Kreuzberg FFI layer.

    Members:
        SUCCESS (0): No error occurred.
        GENERIC_ERROR (1): Generic or unknown error.
        PANIC (2): A panic occurred in the library.
        INVALID_ARGUMENT (3): An invalid argument was provided.
        IO_ERROR (4): An I/O operation failed.
        PARSING_ERROR (5): Document parsing failed.
        OCR_ERROR (6): OCR processing failed.
        MISSING_DEPENDENCY (7): A required dependency was not found.

    Example:
        >>> from kreuzberg import get_last_error_code, ErrorCode
        >>> code = get_last_error_code()
        >>> if code == ErrorCode.PANIC:
        ...     print("A panic occurred")
    """

    SUCCESS = 0
    GENERIC_ERROR = 1
    PANIC = 2
    INVALID_ARGUMENT = 3
    IO_ERROR = 4
    PARSING_ERROR = 5
    OCR_ERROR = 6
    MISSING_DEPENDENCY = 7
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True, slots=True)
|
|
44
|
+
class PanicContext:
|
|
45
|
+
"""Structured panic context information from FFI layer.
|
|
46
|
+
|
|
47
|
+
Attributes:
|
|
48
|
+
file: Source file where panic occurred
|
|
49
|
+
line: Line number in source file
|
|
50
|
+
function: Function name where panic occurred
|
|
51
|
+
message: Panic message
|
|
52
|
+
timestamp_secs: Unix timestamp (seconds since epoch) when panic occurred
|
|
53
|
+
|
|
54
|
+
Example:
|
|
55
|
+
>>> import json
|
|
56
|
+
>>> from kreuzberg import get_last_panic_context, PanicContext
|
|
57
|
+
>>> context_json = get_last_panic_context()
|
|
58
|
+
>>> if context_json:
|
|
59
|
+
... data = json.loads(context_json)
|
|
60
|
+
... context = PanicContext(**data)
|
|
61
|
+
... print(f"Panic at {context.file}:{context.line}")
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
file: str
|
|
65
|
+
line: int
|
|
66
|
+
function: str
|
|
67
|
+
message: str
|
|
68
|
+
timestamp_secs: int
|
|
69
|
+
|
|
70
|
+
@classmethod
|
|
71
|
+
def from_json(cls, json_str: str) -> "PanicContext":
|
|
72
|
+
"""Parse panic context from JSON string.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
json_str: JSON string with panic context (as returned by get_last_panic_context)
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
PanicContext dataclass instance
|
|
79
|
+
|
|
80
|
+
Raises:
|
|
81
|
+
ValueError: If JSON is invalid or missing required fields
|
|
82
|
+
"""
|
|
83
|
+
data = json.loads(json_str)
|
|
84
|
+
return cls(**data)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class KreuzbergError(Exception):
|
|
88
|
+
"""Base exception class for all Kreuzberg errors.
|
|
89
|
+
|
|
90
|
+
All Kreuzberg exceptions support an optional context dictionary for debugging
|
|
91
|
+
information. The context is serialized to JSON when the exception is converted
|
|
92
|
+
to a string.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
message: Human-readable error message
|
|
96
|
+
context: Optional dictionary with debugging context (file paths, config, etc.)
|
|
97
|
+
|
|
98
|
+
Example:
|
|
99
|
+
>>> raise KreuzbergError("Failed to parse document", context={"file": "document.pdf", "page": 5})
|
|
100
|
+
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
def __init__(self, message: str, *, context: dict[str, Any] | None = None) -> None:
|
|
104
|
+
super().__init__(message)
|
|
105
|
+
self.message = message
|
|
106
|
+
self.context = context
|
|
107
|
+
|
|
108
|
+
def __str__(self) -> str:
|
|
109
|
+
"""Format error with context as JSON."""
|
|
110
|
+
error_name = self.__class__.__name__
|
|
111
|
+
if self.context:
|
|
112
|
+
serialized_context = self._serialize_context(self.context)
|
|
113
|
+
context_json = json.dumps(serialized_context, sort_keys=True)
|
|
114
|
+
return f"{error_name}: {self.message}\nContext: {context_json}"
|
|
115
|
+
return f"{error_name}: {self.message}"
|
|
116
|
+
|
|
117
|
+
@staticmethod
|
|
118
|
+
def _serialize_context(context: dict[str, Any]) -> dict[str, Any]:
|
|
119
|
+
def serialize_value(value: Any) -> Any:
|
|
120
|
+
if isinstance(value, bytes):
|
|
121
|
+
return value.decode("utf-8", errors="replace")
|
|
122
|
+
if isinstance(value, Exception):
|
|
123
|
+
return {"type": type(value).__name__, "message": str(value)}
|
|
124
|
+
if isinstance(value, tuple):
|
|
125
|
+
return [serialize_value(item) for item in value]
|
|
126
|
+
if isinstance(value, list):
|
|
127
|
+
return [serialize_value(item) for item in value]
|
|
128
|
+
if isinstance(value, dict):
|
|
129
|
+
return {k: serialize_value(v) for k, v in value.items()}
|
|
130
|
+
return value
|
|
131
|
+
|
|
132
|
+
serialized: dict[str, Any] = serialize_value(context)
|
|
133
|
+
return serialized
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class ValidationError(KreuzbergError):
    """Signal that supplied input did not pass validation.

    Covers invalid configuration, invalid parameters and invalid input data.

    Example:
        >>> raise ValidationError("Invalid language code", context={"language": "xyz", "supported": ["en", "de"]})

    """
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class ParsingError(KreuzbergError):
    """Signal that a document could not be parsed.

    Raised for extractor failures such as corrupt files or unsupported
    document features.

    Example:
        >>> raise ParsingError("Failed to parse PDF", context={"file": "document.pdf", "extractor": "pdf"})

    """
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class OCRError(KreuzbergError):
    """Signal that OCR processing failed.

    Raised when an OCR backend fails while extracting text from images.

    Example:
        >>> raise OCRError("OCR processing failed", context={"backend": "tesseract", "language": "en"})

    """
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class MissingDependencyError(KreuzbergError):
    """Raised when a required dependency is not installed.

    This includes missing Python packages and missing system dependencies
    (tesseract, pandoc, etc.).

    Example:
        >>> raise MissingDependencyError(
        ...     "EasyOCR not installed",
        ...     context={"package": "easyocr", "install_command": 'pip install "kreuzberg[easyocr]"'},
        ... )

    """

    @classmethod
    def create_for_package(
        cls,
        *,
        dependency_group: str,
        functionality: str,
        package_name: str,
    ) -> "MissingDependencyError":
        """Create a MissingDependencyError for a missing package.

        This is a convenience method for creating standardized error messages
        for missing optional dependencies.

        Args:
            dependency_group: The optional dependency group (e.g., "ocr", "api", "cli")
            functionality: Description of what functionality requires this package
            package_name: Name of the missing package

        Returns:
            MissingDependencyError with formatted message and context. The
            context holds the keys ``package``, ``dependency_group``,
            ``functionality`` and ``install_command``.

        Example:
            >>> error = MissingDependencyError.create_for_package(
            ...     dependency_group="easyocr", functionality="EasyOCR backend", package_name="easyocr"
            ... )
            >>> raise error

        """
        # The requirement is quoted because shells such as zsh treat the
        # square brackets of an extras specifier as a glob pattern, so the
        # unquoted command fails when pasted there.
        install_cmd = f'pip install "kreuzberg[{dependency_group}]"'
        message = f"Missing required dependency '{package_name}' for {functionality}. Install with: {install_cmd}"
        context = {
            "package": package_name,
            "dependency_group": dependency_group,
            "functionality": functionality,
            "install_command": install_cmd,
        }
        return cls(message, context=context)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class CacheError(KreuzbergError):
    """Signal that a cache operation failed.

    Raised for failures while reading, writing or invalidating cache entries.

    Example:
        >>> raise CacheError("Failed to write cache", context={"path": "/tmp/cache", "operation": "write"})

    """
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class ImageProcessingError(KreuzbergError):
    """Signal that an image manipulation step failed.

    Raised for failures during resizing, format conversion or other image
    processing operations.

    Example:
        >>> raise ImageProcessingError("Failed to resize image", context={"width": 1920, "height": 1080})

    """
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class PluginError(KreuzbergError):
    """Signal that a plugin operation failed.

    Raised for failures during plugin initialization, registration or
    execution.

    Example:
        >>> raise PluginError("Plugin initialization failed", context={"plugin_name": "pdf-extractor"})

    """
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Python OCR backend implementations.
|
|
2
|
+
|
|
3
|
+
These backends can be imported and manually registered, or they will be
|
|
4
|
+
auto-registered when kreuzberg is imported (if their dependencies are installed).
|
|
5
|
+
|
|
6
|
+
Each backend has a separate optional dependency group:
|
|
7
|
+
- EasyOCR: pip install "kreuzberg[easyocr]"
|
|
8
|
+
- PaddleOCR: pip install "kreuzberg[paddleocr]"
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
__all__ = ["EasyOCRBackend", "OcrBackendProtocol", "PaddleOCRBackend"]
|
|
14
|
+
|
|
15
|
+
from kreuzberg.ocr.protocol import OcrBackendProtocol
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from kreuzberg.ocr.easyocr import EasyOCRBackend
|
|
19
|
+
except ImportError:
|
|
20
|
+
EasyOCRBackend = None # type: ignore[assignment,misc]
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
from kreuzberg.ocr.paddleocr import PaddleOCRBackend
|
|
24
|
+
except ImportError:
|
|
25
|
+
PaddleOCRBackend = None # type: ignore[assignment,misc]
|