kreuzberg 2.1.2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_types.py CHANGED
@@ -1,71 +1,168 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from typing import NamedTuple, TypedDict
4
+ from collections.abc import Awaitable
5
+ from dataclasses import asdict, dataclass
6
+ from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
7
+
8
+ from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
9
+ from kreuzberg.exceptions import ValidationError
5
10
 
6
11
  if sys.version_info < (3, 11): # pragma: no cover
7
12
  from typing_extensions import NotRequired
8
13
  else: # pragma: no cover
9
14
  from typing import NotRequired
10
15
 
16
+ if TYPE_CHECKING:
17
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
18
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
19
+ from kreuzberg._ocr._tesseract import TesseractConfig
20
+
21
+ OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
22
+
11
23
 
12
24
  class Metadata(TypedDict, total=False):
13
- """Document metadata.
25
+ """Base metadata common to all document types.
14
26
 
15
- All fields are optional but will only be included if they contain non-empty values.
27
+ All fields will only be included if they contain non-empty values.
16
28
  Any field that would be empty or None is omitted from the dictionary.
17
-
18
- Different documents and extraction methods will yield different metadata.
19
29
  """
20
30
 
21
- title: NotRequired[str]
22
- """Document title."""
23
- subtitle: NotRequired[str]
24
- """Document subtitle."""
25
- abstract: NotRequired[str | list[str]]
26
- """Document abstract, summary or description."""
27
31
  authors: NotRequired[list[str]]
28
32
  """List of document authors."""
29
- date: NotRequired[str]
30
- """Document date as string to preserve original format."""
31
- subject: NotRequired[str]
32
- """Document subject or topic."""
33
- description: NotRequired[str]
34
- """Extended description."""
35
- keywords: NotRequired[list[str]]
36
- """Keywords or tags."""
37
33
  categories: NotRequired[list[str]]
38
34
  """Categories or classifications."""
39
- version: NotRequired[str]
40
- """Version identifier."""
41
- language: NotRequired[str]
42
- """Document language code."""
43
- references: NotRequired[list[str]]
44
- """Reference entries."""
45
35
  citations: NotRequired[list[str]]
46
36
  """Citation identifiers."""
37
+ comments: NotRequired[str]
38
+ """General comments."""
47
39
  copyright: NotRequired[str]
48
40
  """Copyright information."""
41
+ created_at: NotRequired[str]
42
+ """Creation timestamp in ISO format."""
43
+ created_by: NotRequired[str]
44
+ """Document creator."""
45
+ description: NotRequired[str]
46
+ """Document description."""
47
+ fonts: NotRequired[list[str]]
48
+ """List of fonts used in the document."""
49
+ height: NotRequired[int]
50
+ """Height of the document page/slide/image, if applicable."""
51
+ identifier: NotRequired[str]
52
+ """Unique document identifier."""
53
+ keywords: NotRequired[list[str]]
54
+ """Keywords or tags."""
55
+ languages: NotRequired[list[str]]
56
+ """Document language codes."""
49
57
  license: NotRequired[str]
50
58
  """License information."""
51
- identifier: NotRequired[str]
52
- """Document identifier."""
59
+ modified_at: NotRequired[str]
60
+ """Last modification timestamp in ISO format."""
61
+ modified_by: NotRequired[str]
62
+ """Username of last modifier."""
63
+ organization: NotRequired[str | list[str]]
64
+ """Organizational affiliation."""
53
65
  publisher: NotRequired[str]
54
- """Publisher name."""
55
- contributors: NotRequired[list[str]]
56
- """Additional contributors."""
57
- creator: NotRequired[str]
58
- """Document creator."""
59
- institute: NotRequired[str | list[str]]
60
- """Institute or organization."""
66
+ """Publisher or organization name."""
67
+ references: NotRequired[list[str]]
68
+ """Reference entries."""
69
+ status: NotRequired[str]
70
+ """Document status (e.g., draft, final)."""
71
+ subject: NotRequired[str]
72
+ """Document subject or topic."""
73
+ subtitle: NotRequired[str]
74
+ """Document subtitle."""
75
+ summary: NotRequired[str]
76
+ """Document summary."""
77
+ title: NotRequired[str]
78
+ """Document title."""
79
+ version: NotRequired[str]
80
+ """Version identifier or revision number."""
81
+ width: NotRequired[int]
82
+ """Width of the document page/slide/image, if applicable."""
61
83
 
62
84
 
63
- class ExtractionResult(NamedTuple):
85
+ @dataclass
86
+ class ExtractionResult:
64
87
  """The result of a file extraction."""
65
88
 
66
89
  content: str
67
90
  """The extracted content."""
91
+ chunks: list[str]
92
+ """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
68
93
  mime_type: str
69
- """The mime type of the content."""
94
+ """The mime type of the extracted content. Is either text/plain or text/markdown."""
70
95
  metadata: Metadata
71
96
  """The metadata of the content."""
97
+
98
+
99
+ PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
100
+ ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
101
+
102
+
103
+ @dataclass(unsafe_hash=True)
104
+ class ExtractionConfig:
105
+ """Represents configuration settings for an extraction process.
106
+
107
+ This class encapsulates the configuration options for extracting text
108
+ from images or documents using Optical Character Recognition (OCR). It
109
+ provides options to customize the OCR behavior, select the backend
110
+ engine, and configure engine-specific parameters.
111
+ """
112
+
113
+ force_ocr: bool = False
114
+ """Whether to force OCR."""
115
+ chunk_content: bool = False
116
+ """Whether to chunk the content into smaller chunks."""
117
+ max_chars: int = DEFAULT_MAX_CHARACTERS
118
+ """The size of each chunk in characters."""
119
+ max_overlap: int = DEFAULT_MAX_OVERLAP
120
+ """The overlap between chunks in characters."""
121
+ ocr_backend: OcrBackendType | None = "tesseract"
122
+ """The OCR backend to use."""
123
+ ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
124
+ """Configuration to pass to the OCR backend."""
125
+ post_processing_hooks: list[PostProcessingHook] | None = None
126
+ """Post processing hooks to call after processing is done and before the final result is returned."""
127
+ validators: list[ValidationHook] | None = None
128
+ """Validation hooks to call after processing is done and before post-processing and result return."""
129
+
130
+ def __post_init__(self) -> None:
131
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
132
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
133
+ from kreuzberg._ocr._tesseract import TesseractConfig
134
+
135
+ if self.ocr_backend is None and self.ocr_config is not None:
136
+ raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
137
+
138
+ if self.ocr_config is not None and (
139
+ (self.ocr_backend == "tesseract" and not isinstance(self.ocr_config, TesseractConfig))
140
+ or (self.ocr_backend == "easyocr" and not isinstance(self.ocr_config, EasyOCRConfig))
141
+ or (self.ocr_backend == "paddleocr" and not isinstance(self.ocr_config, PaddleOCRConfig))
142
+ ):
143
+ raise ValidationError(
144
+ "incompatible 'ocr_config' value provided for 'ocr_backend'",
145
+ context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
146
+ )
147
+
148
+ def get_config_dict(self) -> dict[str, Any]:
149
+ """Returns the OCR configuration as a dict, based on the backend specified.
150
+
151
+ Returns:
152
+ A dict of the OCR configuration or an empty dict if no backend is provided.
153
+ """
154
+ if self.ocr_backend is not None:
155
+ if self.ocr_config is not None:
156
+ return asdict(self.ocr_config)
157
+ if self.ocr_backend == "tesseract":
158
+ from kreuzberg._ocr._tesseract import TesseractConfig
159
+
160
+ return asdict(TesseractConfig())
161
+ if self.ocr_backend == "easyocr":
162
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
163
+
164
+ return asdict(EasyOCRConfig())
165
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
166
+
167
+ return asdict(PaddleOCRConfig())
168
+ return {}
kreuzberg/exceptions.py CHANGED
@@ -51,6 +51,31 @@ class ValidationError(KreuzbergError):
51
51
  class MissingDependencyError(KreuzbergError):
52
52
  """Raised when a dependency is missing."""
53
53
 
54
+ @classmethod
55
+ def create_for_package(
56
+ cls, *, dependency_group: str, functionality: str, package_name: str
57
+ ) -> MissingDependencyError:
58
+ """Creates a MissingDependencyError for a specified package and functionality.
59
+
60
+ This class method generates an error message to notify users about a
61
+ missing package dependency required for specific functionality. The error
62
+ message includes details about the missing package and the optional
63
+ dependency group required for installation.
64
+
65
+ Args:
66
+ dependency_group: The name of the optional dependency group that includes
67
+ the required package.
68
+ functionality: The functionality that requires the missing package.
69
+ package_name: The name of the missing package.
70
+
71
+ Returns:
72
+ MissingDependencyError: A customized error indicating the missing
73
+ dependency and how to resolve it.
74
+ """
75
+ return MissingDependencyError(
76
+ f"The package '{package_name}' is required to use {functionality}. You can install using the provided optional dependency group by installing `kreuzberg['{dependency_group}']`."
77
+ )
78
+
54
79
 
55
80
  class OCRError(KreuzbergError):
56
81
  """Raised when an OCR error occurs."""