kreuzberg 2.1.2__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.0.dist-info/METADATA +178 -0
- kreuzberg-3.0.0.dist-info/RECORD +15 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.0.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_string.py +0 -41
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_tmp.py +0 -37
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.2.dist-info/METADATA +0 -446
- kreuzberg-2.1.2.dist-info/RECORD +0 -21
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.0.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/_types.py
CHANGED
@@ -1,71 +1,168 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import sys
|
4
|
-
from
|
4
|
+
from collections.abc import Awaitable
|
5
|
+
from dataclasses import asdict, dataclass
|
6
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
|
7
|
+
|
8
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
9
|
+
from kreuzberg.exceptions import ValidationError
|
5
10
|
|
6
11
|
if sys.version_info < (3, 11): # pragma: no cover
|
7
12
|
from typing_extensions import NotRequired
|
8
13
|
else: # pragma: no cover
|
9
14
|
from typing import NotRequired
|
10
15
|
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
18
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
19
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
20
|
+
|
21
|
+
OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
|
22
|
+
|
11
23
|
|
12
24
|
class Metadata(TypedDict, total=False):
|
13
|
-
"""
|
25
|
+
"""Base metadata common to all document types.
|
14
26
|
|
15
|
-
All fields
|
27
|
+
All fields will only be included if they contain non-empty values.
|
16
28
|
Any field that would be empty or None is omitted from the dictionary.
|
17
|
-
|
18
|
-
Different documents and extraction methods will yield different metadata.
|
19
29
|
"""
|
20
30
|
|
21
|
-
title: NotRequired[str]
|
22
|
-
"""Document title."""
|
23
|
-
subtitle: NotRequired[str]
|
24
|
-
"""Document subtitle."""
|
25
|
-
abstract: NotRequired[str | list[str]]
|
26
|
-
"""Document abstract, summary or description."""
|
27
31
|
authors: NotRequired[list[str]]
|
28
32
|
"""List of document authors."""
|
29
|
-
date: NotRequired[str]
|
30
|
-
"""Document date as string to preserve original format."""
|
31
|
-
subject: NotRequired[str]
|
32
|
-
"""Document subject or topic."""
|
33
|
-
description: NotRequired[str]
|
34
|
-
"""Extended description."""
|
35
|
-
keywords: NotRequired[list[str]]
|
36
|
-
"""Keywords or tags."""
|
37
33
|
categories: NotRequired[list[str]]
|
38
34
|
"""Categories or classifications."""
|
39
|
-
version: NotRequired[str]
|
40
|
-
"""Version identifier."""
|
41
|
-
language: NotRequired[str]
|
42
|
-
"""Document language code."""
|
43
|
-
references: NotRequired[list[str]]
|
44
|
-
"""Reference entries."""
|
45
35
|
citations: NotRequired[list[str]]
|
46
36
|
"""Citation identifiers."""
|
37
|
+
comments: NotRequired[str]
|
38
|
+
"""General comments."""
|
47
39
|
copyright: NotRequired[str]
|
48
40
|
"""Copyright information."""
|
41
|
+
created_at: NotRequired[str]
|
42
|
+
"""Creation timestamp in ISO format."""
|
43
|
+
created_by: NotRequired[str]
|
44
|
+
"""Document creator."""
|
45
|
+
description: NotRequired[str]
|
46
|
+
"""Document description."""
|
47
|
+
fonts: NotRequired[list[str]]
|
48
|
+
"""List of fonts used in the document."""
|
49
|
+
height: NotRequired[int]
|
50
|
+
"""Height of the document page/slide/image, if applicable."""
|
51
|
+
identifier: NotRequired[str]
|
52
|
+
"""Unique document identifier."""
|
53
|
+
keywords: NotRequired[list[str]]
|
54
|
+
"""Keywords or tags."""
|
55
|
+
languages: NotRequired[list[str]]
|
56
|
+
"""Document language code."""
|
49
57
|
license: NotRequired[str]
|
50
58
|
"""License information."""
|
51
|
-
|
52
|
-
"""
|
59
|
+
modified_at: NotRequired[str]
|
60
|
+
"""Last modification timestamp in ISO format."""
|
61
|
+
modified_by: NotRequired[str]
|
62
|
+
"""Username of last modifier."""
|
63
|
+
organization: NotRequired[str | list[str]]
|
64
|
+
"""Organizational affiliation."""
|
53
65
|
publisher: NotRequired[str]
|
54
|
-
"""Publisher name."""
|
55
|
-
|
56
|
-
"""
|
57
|
-
|
58
|
-
"""Document
|
59
|
-
|
60
|
-
"""
|
66
|
+
"""Publisher or organization name."""
|
67
|
+
references: NotRequired[list[str]]
|
68
|
+
"""Reference entries."""
|
69
|
+
status: NotRequired[str]
|
70
|
+
"""Document status (e.g., draft, final)."""
|
71
|
+
subject: NotRequired[str]
|
72
|
+
"""Document subject or topic."""
|
73
|
+
subtitle: NotRequired[str]
|
74
|
+
"""Document subtitle."""
|
75
|
+
summary: NotRequired[str]
|
76
|
+
"""Document Summary"""
|
77
|
+
title: NotRequired[str]
|
78
|
+
"""Document title."""
|
79
|
+
version: NotRequired[str]
|
80
|
+
"""Version identifier or revision number."""
|
81
|
+
width: NotRequired[int]
|
82
|
+
"""Width of the document page/slide/image, if applicable."""
|
61
83
|
|
62
84
|
|
63
|
-
|
85
|
+
@dataclass
|
86
|
+
class ExtractionResult:
|
64
87
|
"""The result of a file extraction."""
|
65
88
|
|
66
89
|
content: str
|
67
90
|
"""The extracted content."""
|
91
|
+
chunks: list[str]
|
92
|
+
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
68
93
|
mime_type: str
|
69
|
-
"""The mime type of the content."""
|
94
|
+
"""The mime type of the extracted content. Is either text/plain or text/markdown."""
|
70
95
|
metadata: Metadata
|
71
96
|
"""The metadata of the content."""
|
97
|
+
|
98
|
+
|
99
|
+
PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
|
100
|
+
ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
|
101
|
+
|
102
|
+
|
103
|
+
@dataclass(unsafe_hash=True)
class ExtractionConfig:
    """Configuration settings for an extraction run.

    Groups the options that control OCR (whether to force it, which backend
    to use and that backend's own configuration), optional chunking of the
    extracted content, and user-supplied validation / post-processing hooks.

    Note:
        The class is declared with ``unsafe_hash=True``, so the generated
        ``__hash__`` hashes every field. Hashing an instance whose
        ``post_processing_hooks`` or ``validators`` is a list therefore raises
        ``TypeError`` (lists are unhashable); only instances whose field values
        are all hashable can be used as dict keys or set members.
    """

    force_ocr: bool = False
    """Whether to force OCR."""
    chunk_content: bool = False
    """Whether to chunk the content into smaller chunks."""
    max_chars: int = DEFAULT_MAX_CHARACTERS
    """The size of each chunk in characters."""
    max_overlap: int = DEFAULT_MAX_OVERLAP
    """The overlap between chunks in characters."""
    ocr_backend: OcrBackendType | None = "tesseract"
    """The OCR backend to use, or None when no backend is selected."""
    ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
    """Configuration to pass to the OCR backend. Must match 'ocr_backend'."""
    post_processing_hooks: list[PostProcessingHook] | None = None
    """Post processing hooks to call after processing is done and before the final result is returned."""
    validators: list[ValidationHook] | None = None
    """Validation hooks to call after processing is done and before post-processing and result return."""

    def __post_init__(self) -> None:
        """Check that ``ocr_config`` is compatible with ``ocr_backend``.

        Raises:
            ValidationError: If ``ocr_config`` is provided while ``ocr_backend``
                is None, or if ``ocr_config`` is not an instance of the config
                class that belongs to the selected backend.
        """
        if self.ocr_config is None:
            # Nothing to validate, so no backend module needs to be imported.
            return

        if self.ocr_backend is None:
            raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")

        # The backend modules are imported lazily, and only the one that is
        # actually selected, to keep construction cheap.
        expected_config_type: type[Any]
        if self.ocr_backend == "tesseract":
            from kreuzberg._ocr._tesseract import TesseractConfig

            expected_config_type = TesseractConfig
        elif self.ocr_backend == "easyocr":
            from kreuzberg._ocr._easyocr import EasyOCRConfig

            expected_config_type = EasyOCRConfig
        elif self.ocr_backend == "paddleocr":
            from kreuzberg._ocr._paddleocr import PaddleOCRConfig

            expected_config_type = PaddleOCRConfig
        else:
            # A backend name outside the three known ones is not validated here.
            return

        if not isinstance(self.ocr_config, expected_config_type):
            raise ValidationError(
                "incompatible 'ocr_config' value provided for 'ocr_backend'",
                context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
            )

    def get_config_dict(self) -> dict[str, Any]:
        """Return the OCR configuration as a plain dict.

        Returns:
            An empty dict when ``ocr_backend`` is None. Otherwise the result of
            ``asdict`` applied to ``ocr_config`` when it is set, or to a
            default-constructed config of the selected backend when it is not.
        """
        if self.ocr_backend is None:
            return {}

        if self.ocr_config is not None:
            return asdict(self.ocr_config)

        if self.ocr_backend == "tesseract":
            from kreuzberg._ocr._tesseract import TesseractConfig

            return asdict(TesseractConfig())

        if self.ocr_backend == "easyocr":
            from kreuzberg._ocr._easyocr import EasyOCRConfig

            return asdict(EasyOCRConfig())

        # Any remaining backend value falls through to the PaddleOCR defaults.
        from kreuzberg._ocr._paddleocr import PaddleOCRConfig

        return asdict(PaddleOCRConfig())
|
kreuzberg/exceptions.py
CHANGED
@@ -51,6 +51,31 @@ class ValidationError(KreuzbergError):
|
|
51
51
|
class MissingDependencyError(KreuzbergError):
    """Raised when a dependency is missing."""

    @classmethod
    def create_for_package(
        cls, *, dependency_group: str, functionality: str, package_name: str
    ) -> MissingDependencyError:
        """Create an error describing a missing optional package.

        The message names the missing package, the functionality that needs
        it, and the optional dependency group that provides it.

        Args:
            dependency_group: The name of the optional dependency group that
                includes the required package.
            functionality: The functionality that requires the missing package.
            package_name: The name of the missing package.

        Returns:
            An instance of the class the method was called on (this class or a
            subclass) carrying the formatted message.
        """
        # Instantiate via ``cls`` so that subclasses calling this factory get
        # an instance of their own type rather than the base class.
        return cls(
            f"The package '{package_name}' is required to use {functionality}. You can install using the provided optional dependency group by installing `kreuzberg['{dependency_group}']`."
        )
|
78
|
+
|
54
79
|
|
55
80
|
class OCRError(KreuzbergError):
|
56
81
|
"""Raised when an OCR error occurs."""
|