kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_extractors/__init__.py +0 -0
- kreuzberg/_extractors/_base.py +92 -0
- kreuzberg/_extractors/_html.py +34 -0
- kreuzberg/_extractors/_image.py +74 -0
- kreuzberg/_extractors/_pandoc.py +613 -0
- kreuzberg/_extractors/_pdf.py +163 -0
- kreuzberg/_extractors/_presentation.py +233 -0
- kreuzberg/_extractors/_spread_sheet.py +125 -0
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_ocr/__init__.py +17 -0
- kreuzberg/_ocr/_base.py +54 -0
- kreuzberg/_ocr/_easyocr.py +376 -0
- kreuzberg/_ocr/_paddleocr.py +291 -0
- kreuzberg/_ocr/_tesseract.py +342 -0
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/_utils/__init__.py +0 -0
- kreuzberg/{_string.py → _utils/_string.py} +0 -2
- kreuzberg/_utils/_sync.py +121 -0
- kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.1.dist-info/METADATA +178 -0
- kreuzberg-3.0.1.dist-info/RECORD +32 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.2.dist-info/METADATA +0 -446
- kreuzberg-2.1.2.dist-info/RECORD +0 -21
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
kreuzberg/_registry.py
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from functools import lru_cache
|
4
|
+
from typing import TYPE_CHECKING, ClassVar
|
5
|
+
|
6
|
+
from kreuzberg._extractors._html import HTMLExtractor
|
7
|
+
from kreuzberg._extractors._image import ImageExtractor
|
8
|
+
from kreuzberg._extractors._pandoc import (
|
9
|
+
BibliographyExtractor,
|
10
|
+
EbookExtractor,
|
11
|
+
LaTeXExtractor,
|
12
|
+
MarkdownExtractor,
|
13
|
+
MiscFormatExtractor,
|
14
|
+
OfficeDocumentExtractor,
|
15
|
+
StructuredTextExtractor,
|
16
|
+
TabularDataExtractor,
|
17
|
+
XMLBasedExtractor,
|
18
|
+
)
|
19
|
+
from kreuzberg._extractors._pdf import PDFExtractor
|
20
|
+
from kreuzberg._extractors._presentation import PresentationExtractor
|
21
|
+
from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
|
22
|
+
|
23
|
+
if TYPE_CHECKING:
|
24
|
+
from kreuzberg._extractors._base import Extractor
|
25
|
+
from kreuzberg._types import ExtractionConfig
|
26
|
+
|
27
|
+
|
28
|
+
class ExtractorRegistry:
    """Resolves a MIME type to an extractor instance.

    Two class-level lists are consulted on lookup: extractors registered at
    runtime through ``add_extractor`` come first, followed by the built-in
    defaults. Lookups are memoized with ``functools.lru_cache`` and the cache
    is cleared whenever the list of registered extractors changes.
    """

    # Built-in extractors, consulted after any user-registered ones.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # Extractors added at runtime; they take precedence over the defaults.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Return an extractor able to handle the given MIME type.

        Because the result is memoized by ``lru_cache``, both arguments must be
        hashable and repeated calls with equal arguments return the same
        extractor instance.

        Args:
            mime_type: The mime type of the content. ``None`` or an empty
                string never matches.
            config: Extraction options handed to the extractor's constructor.

        Returns:
            An instance of the first extractor class whose
            ``supports_mimetype`` accepts ``mime_type``, or ``None`` when no
            extractor matches.
        """
        if not mime_type:
            return None

        # User-registered extractors are tried before the built-in defaults.
        for candidate in (*cls._registered_extractors, *cls._default_extractors):
            if candidate.supports_mimetype(mime_type):
                return candidate(mime_type=mime_type, config=config)

        return None

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Register an additional extractor class.

        Note:
            Extractors are tried in the order they are added: first added, first tried.

        Args:
            extractor: The extractor class to register.

        Returns:
            None
        """
        cls._registered_extractors.append(extractor)
        # Memoized lookups may now resolve differently, so drop them.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Unregister a previously added extractor class.

        Removing an extractor that was never registered is a silent no-op.

        Args:
            extractor: The extractor class to remove.

        Returns:
            None
        """
        try:
            cls._registered_extractors.remove(extractor)
        except ValueError:
            # Not registered: nothing changed, so the cache stays valid.
            return
        cls.get_extractor.cache_clear()
|
kreuzberg/_types.py
CHANGED
@@ -1,71 +1,168 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import sys
|
4
|
-
from
|
4
|
+
from collections.abc import Awaitable
|
5
|
+
from dataclasses import asdict, dataclass
|
6
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
|
7
|
+
|
8
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
9
|
+
from kreuzberg.exceptions import ValidationError
|
5
10
|
|
6
11
|
if sys.version_info < (3, 11): # pragma: no cover
|
7
12
|
from typing_extensions import NotRequired
|
8
13
|
else: # pragma: no cover
|
9
14
|
from typing import NotRequired
|
10
15
|
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
18
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
19
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
20
|
+
|
21
|
+
OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
|
22
|
+
|
11
23
|
|
12
24
|
class Metadata(TypedDict, total=False):
|
13
|
-
"""
|
25
|
+
"""Base metadata common to all document types.
|
14
26
|
|
15
|
-
All fields
|
27
|
+
All fields will only be included if they contain non-empty values.
|
16
28
|
Any field that would be empty or None is omitted from the dictionary.
|
17
|
-
|
18
|
-
Different documents and extraction methods will yield different metadata.
|
19
29
|
"""
|
20
30
|
|
21
|
-
title: NotRequired[str]
|
22
|
-
"""Document title."""
|
23
|
-
subtitle: NotRequired[str]
|
24
|
-
"""Document subtitle."""
|
25
|
-
abstract: NotRequired[str | list[str]]
|
26
|
-
"""Document abstract, summary or description."""
|
27
31
|
authors: NotRequired[list[str]]
|
28
32
|
"""List of document authors."""
|
29
|
-
date: NotRequired[str]
|
30
|
-
"""Document date as string to preserve original format."""
|
31
|
-
subject: NotRequired[str]
|
32
|
-
"""Document subject or topic."""
|
33
|
-
description: NotRequired[str]
|
34
|
-
"""Extended description."""
|
35
|
-
keywords: NotRequired[list[str]]
|
36
|
-
"""Keywords or tags."""
|
37
33
|
categories: NotRequired[list[str]]
|
38
34
|
"""Categories or classifications."""
|
39
|
-
version: NotRequired[str]
|
40
|
-
"""Version identifier."""
|
41
|
-
language: NotRequired[str]
|
42
|
-
"""Document language code."""
|
43
|
-
references: NotRequired[list[str]]
|
44
|
-
"""Reference entries."""
|
45
35
|
citations: NotRequired[list[str]]
|
46
36
|
"""Citation identifiers."""
|
37
|
+
comments: NotRequired[str]
|
38
|
+
"""General comments."""
|
47
39
|
copyright: NotRequired[str]
|
48
40
|
"""Copyright information."""
|
41
|
+
created_at: NotRequired[str]
|
42
|
+
"""Creation timestamp in ISO format."""
|
43
|
+
created_by: NotRequired[str]
|
44
|
+
"""Document creator."""
|
45
|
+
description: NotRequired[str]
|
46
|
+
"""Document description."""
|
47
|
+
fonts: NotRequired[list[str]]
|
48
|
+
"""List of fonts used in the document."""
|
49
|
+
height: NotRequired[int]
|
50
|
+
"""Height of the document page/slide/image, if applicable."""
|
51
|
+
identifier: NotRequired[str]
|
52
|
+
"""Unique document identifier."""
|
53
|
+
keywords: NotRequired[list[str]]
|
54
|
+
"""Keywords or tags."""
|
55
|
+
languages: NotRequired[list[str]]
|
56
|
+
"""Document language code."""
|
49
57
|
license: NotRequired[str]
|
50
58
|
"""License information."""
|
51
|
-
|
52
|
-
"""
|
59
|
+
modified_at: NotRequired[str]
|
60
|
+
"""Last modification timestamp in ISO format."""
|
61
|
+
modified_by: NotRequired[str]
|
62
|
+
"""Username of last modifier."""
|
63
|
+
organization: NotRequired[str | list[str]]
|
64
|
+
"""Organizational affiliation."""
|
53
65
|
publisher: NotRequired[str]
|
54
|
-
"""Publisher name."""
|
55
|
-
|
56
|
-
"""
|
57
|
-
|
58
|
-
"""Document
|
59
|
-
|
60
|
-
"""
|
66
|
+
"""Publisher or organization name."""
|
67
|
+
references: NotRequired[list[str]]
|
68
|
+
"""Reference entries."""
|
69
|
+
status: NotRequired[str]
|
70
|
+
"""Document status (e.g., draft, final)."""
|
71
|
+
subject: NotRequired[str]
|
72
|
+
"""Document subject or topic."""
|
73
|
+
subtitle: NotRequired[str]
|
74
|
+
"""Document subtitle."""
|
75
|
+
summary: NotRequired[str]
|
76
|
+
"""Document Summary"""
|
77
|
+
title: NotRequired[str]
|
78
|
+
"""Document title."""
|
79
|
+
version: NotRequired[str]
|
80
|
+
"""Version identifier or revision number."""
|
81
|
+
width: NotRequired[int]
|
82
|
+
"""Width of the document page/slide/image, if applicable."""
|
61
83
|
|
62
84
|
|
63
|
-
|
85
|
+
@dataclass
|
86
|
+
class ExtractionResult:
|
64
87
|
"""The result of a file extraction."""
|
65
88
|
|
66
89
|
content: str
|
67
90
|
"""The extracted content."""
|
91
|
+
chunks: list[str]
|
92
|
+
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
68
93
|
mime_type: str
|
69
|
-
"""The mime type of the content."""
|
94
|
+
"""The mime type of the extracted content. Is either text/plain or text/markdown."""
|
70
95
|
metadata: Metadata
|
71
96
|
"""The metadata of the content."""
|
97
|
+
|
98
|
+
|
99
|
+
PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
|
100
|
+
ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
|
101
|
+
|
102
|
+
|
103
|
+
@dataclass
class ExtractionConfig:
    """Configuration settings for an extraction run.

    Groups the options that control OCR (whether to force it, which backend
    to use and its backend-specific configuration), optional chunking of the
    extracted content, and the validation / post-processing hooks applied to
    the result.

    Instances are hashable so they can be used as cache keys (for example by
    the ``lru_cache`` on ``ExtractorRegistry.get_extractor``). The hash is
    derived from the current field values, so a config should not be mutated
    while it is used as a key.
    """

    force_ocr: bool = False
    """Whether to force OCR."""
    chunk_content: bool = False
    """Whether to chunk the content into smaller chunks."""
    max_chars: int = DEFAULT_MAX_CHARACTERS
    """The size of each chunk in characters."""
    max_overlap: int = DEFAULT_MAX_OVERLAP
    """The overlap between chunks in characters."""
    ocr_backend: OcrBackendType | None = "tesseract"
    """The OCR backend to use."""
    ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
    """Configuration to pass to the OCR backend."""
    post_processing_hooks: list[PostProcessingHook] | None = None
    """Post processing hooks to call after processing is done and before the final result is returned."""
    validators: list[ValidationHook] | None = None
    """Validation hooks to call after processing is done and before post-processing and result return."""

    def __post_init__(self) -> None:
        """Validate that ``ocr_config`` is consistent with ``ocr_backend``.

        Raises:
            ValidationError: If ``ocr_config`` is given while ``ocr_backend``
                is ``None``, or if ``ocr_config`` is not an instance of the
                config class belonging to the selected backend.
        """
        if self.ocr_config is None:
            # Nothing to cross-check; skip the backend imports entirely.
            return

        if self.ocr_backend is None:
            raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")

        # Imported lazily, as in the rest of this class, rather than at module level.
        from kreuzberg._ocr._easyocr import EasyOCRConfig
        from kreuzberg._ocr._paddleocr import PaddleOCRConfig
        from kreuzberg._ocr._tesseract import TesseractConfig

        expected_config_type = {
            "tesseract": TesseractConfig,
            "easyocr": EasyOCRConfig,
            "paddleocr": PaddleOCRConfig,
        }.get(self.ocr_backend)

        # An unrecognised backend name is not validated here (same as before).
        if expected_config_type is not None and not isinstance(self.ocr_config, expected_config_type):
            raise ValidationError(
                "incompatible 'ocr_config' value provided for 'ocr_backend'",
                context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
            )

    def __hash__(self) -> int:
        """Hash the configuration by value.

        The hook fields are declared as lists, which are unhashable; hashing
        the raw field tuple (what ``unsafe_hash=True`` generates) raises
        ``TypeError`` as soon as hooks or validators are supplied. The lists
        are therefore converted to tuples before hashing. Equal configs still
        produce equal hashes because equal lists convert to equal tuples.

        Returns:
            The hash of the current field values.
        """

        def _freeze(hooks: Any) -> Any:
            return tuple(hooks) if isinstance(hooks, list) else hooks

        return hash(
            (
                self.force_ocr,
                self.chunk_content,
                self.max_chars,
                self.max_overlap,
                self.ocr_backend,
                self.ocr_config,
                _freeze(self.post_processing_hooks),
                _freeze(self.validators),
            )
        )

    def get_config_dict(self) -> dict[str, Any]:
        """Return the OCR backend configuration as a plain dict.

        When no explicit ``ocr_config`` is set, the default configuration of
        the selected backend is used.

        Returns:
            A dict of the OCR configuration or an empty dict if no backend is provided.
        """
        if self.ocr_backend is None:
            return {}

        if self.ocr_config is not None:
            return asdict(self.ocr_config)

        if self.ocr_backend == "tesseract":
            from kreuzberg._ocr._tesseract import TesseractConfig

            return asdict(TesseractConfig())

        if self.ocr_backend == "easyocr":
            from kreuzberg._ocr._easyocr import EasyOCRConfig

            return asdict(EasyOCRConfig())

        # Any other backend value falls through to the PaddleOCR defaults.
        from kreuzberg._ocr._paddleocr import PaddleOCRConfig

        return asdict(PaddleOCRConfig())
|
File without changes
|
@@ -18,14 +18,12 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
18
18
|
if not byte_data:
|
19
19
|
return ""
|
20
20
|
|
21
|
-
# We try each encoding in order until one works
|
22
21
|
encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
|
23
22
|
|
24
23
|
for enc in [e for e in encodings if e]: # pragma: no cover
|
25
24
|
with suppress(UnicodeDecodeError, LookupError):
|
26
25
|
return byte_data.decode(enc)
|
27
26
|
|
28
|
-
# If all encodings fail, fall back to latin-1 which can handle any byte
|
29
27
|
return byte_data.decode("latin-1", errors="replace")
|
30
28
|
|
31
29
|
|
@@ -0,0 +1,121 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import sys
|
4
|
+
from functools import partial
|
5
|
+
from inspect import isawaitable, iscoroutinefunction
|
6
|
+
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
7
|
+
|
8
|
+
import anyio
|
9
|
+
from anyio import create_task_group
|
10
|
+
from anyio.to_thread import run_sync as any_io_run_sync
|
11
|
+
|
12
|
+
if TYPE_CHECKING: # pragma: no cover
|
13
|
+
from collections.abc import Awaitable, Callable
|
14
|
+
|
15
|
+
if sys.version_info >= (3, 10):
|
16
|
+
from typing import ParamSpec
|
17
|
+
else: # pragma: no cover
|
18
|
+
from typing_extensions import ParamSpec
|
19
|
+
|
20
|
+
T = TypeVar("T")
|
21
|
+
P = ParamSpec("P")
|
22
|
+
|
23
|
+
|
24
|
+
async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
    """Execute a blocking callable in a worker thread and await its result.

    Args:
        sync_fn: The synchronous function to run.
        *args: The positional arguments to pass to the function.
        **kwargs: The keyword arguments to pass to the function.

    Returns:
        The result of the synchronous function.
    """
    # anyio's thread runner only forwards positional arguments, so the
    # keyword arguments are bound to the callable up front.
    bound_fn = partial(sync_fn, **kwargs)
    outcome = await any_io_run_sync(bound_fn, *args, abandon_on_cancel=True)  # pyright: ignore [reportCallIssue]
    return cast("T", outcome)
|
37
|
+
|
38
|
+
|
39
|
+
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await several awaitables concurrently inside one anyio task group.

    Args:
        *async_tasks: The awaitables to run.

    Returns:
        The results, in the same order as the awaitables were passed in.
    """
    collected: list[Any] = [None for _ in async_tasks]

    async def _store(slot: int, awaitable: Awaitable[T]) -> None:
        # Each task writes to its own slot so the output order matches the input order.
        collected[slot] = await awaitable

    async with create_task_group() as task_group:
        for slot, awaitable in enumerate(async_tasks):
            task_group.start_soon(_store, slot, awaitable)

    return collected
|
58
|
+
|
59
|
+
|
60
|
+
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run awaitables concurrently in consecutive batches.

    Each batch is handed to ``run_taskgroup`` and fully awaited before the
    next batch starts.

    Args:
        *async_tasks: The awaitables to run.
        batch_size: The maximum number of awaitables per batch. Must be at
            least 1.

    Returns:
        The results of all awaitables, in the order they were passed in.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every task;
    # zero already raised ValueError from range(). Reject both explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
|
77
|
+
|
78
|
+
|
79
|
+
async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Call ``fn`` and resolve its result, whether or not it is awaitable.

    ``fn`` is invoked with the given positional and keyword arguments. If the
    value it returns is awaitable, that value is awaited and the awaited
    result is returned; otherwise the value is returned unchanged.

    Args:
        fn: The callable to execute. It may produce either a plain value or
            an awaitable.
        *args: Positional arguments to pass to ``fn``.
        **kwargs: Keyword arguments to pass to ``fn``.

    Returns:
        The value produced by ``fn``, awaited first if it was awaitable.
    """
    outcome = fn(*args, **kwargs)
    if not isawaitable(outcome):
        return outcome
    return cast("T", await outcome)
|
103
|
+
|
104
|
+
|
105
|
+
def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Run a synchronous or asynchronous callable from synchronous code.

    A coroutine function is executed to completion with ``anyio.run``. Any
    other callable is invoked directly; if it hands back an awaitable (for
    example a lambda wrapping a coroutine), that awaitable is also resolved
    with ``anyio.run`` so the caller always receives the final value.

    Note:
        ``anyio.run`` starts a new event loop, so the asynchronous paths
        cannot be used from a thread that already runs one.

    Args:
        fn: The function to be executed, which can either be synchronous or
            asynchronous.
        *args: Positional arguments to be passed to the function.
        **kwargs: Keyword arguments to be passed to the function.

    Returns:
        The return value of the executed function, resolved if asynchronous.
    """
    if iscoroutinefunction(fn):
        # anyio.run only forwards positional arguments, so keyword arguments
        # are bound to the callable first.
        return cast("T", anyio.run(partial(fn, **kwargs), *args))

    result = fn(*args, **kwargs)
    if isawaitable(result):
        # iscoroutinefunction() cannot see through arbitrary callables that
        # merely return an awaitable; resolve it instead of leaking it.
        async def _resolve() -> T:
            return cast("T", await result)

        return cast("T", anyio.run(_resolve))

    return cast("T", result)
|
kreuzberg/exceptions.py
CHANGED
@@ -51,6 +51,31 @@ class ValidationError(KreuzbergError):
|
|
51
51
|
class MissingDependencyError(KreuzbergError):
    """Raised when a dependency is missing."""

    @classmethod
    def create_for_package(
        cls, *, dependency_group: str, functionality: str, package_name: str
    ) -> MissingDependencyError:
        """Build an error for a missing optional package.

        The message names the missing package, the functionality that needs
        it, and the optional dependency group that installs it.

        Args:
            dependency_group: The name of the optional dependency group that includes
                the required package.
            functionality: The functionality that requires the missing package.
            package_name: The name of the missing package.

        Returns:
            MissingDependencyError: An error instance (of the class the method
                was called on) describing the missing dependency and how to
                resolve it.
        """
        # Instantiate through ``cls`` so that a subclass calling this
        # alternate constructor receives an instance of its own type.
        return cls(
            f"The package '{package_name}' is required to use {functionality}. You can install using the provided optional dependency group by installing `kreuzberg['{dependency_group}']`."
        )
|
78
|
+
|
54
79
|
|
55
80
|
class OCRError(KreuzbergError):
|
56
81
|
"""Raised when an OCR error occurs."""
|