kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. kreuzberg/__init__.py +16 -2
  2. kreuzberg/_chunker.py +51 -0
  3. kreuzberg/_constants.py +2 -3
  4. kreuzberg/_extractors/__init__.py +0 -0
  5. kreuzberg/_extractors/_base.py +92 -0
  6. kreuzberg/_extractors/_html.py +34 -0
  7. kreuzberg/_extractors/_image.py +74 -0
  8. kreuzberg/_extractors/_pandoc.py +613 -0
  9. kreuzberg/_extractors/_pdf.py +163 -0
  10. kreuzberg/_extractors/_presentation.py +233 -0
  11. kreuzberg/_extractors/_spread_sheet.py +125 -0
  12. kreuzberg/_mime_types.py +19 -26
  13. kreuzberg/_ocr/__init__.py +17 -0
  14. kreuzberg/_ocr/_base.py +54 -0
  15. kreuzberg/_ocr/_easyocr.py +376 -0
  16. kreuzberg/_ocr/_paddleocr.py +291 -0
  17. kreuzberg/_ocr/_tesseract.py +342 -0
  18. kreuzberg/_playa.py +276 -0
  19. kreuzberg/_registry.py +108 -0
  20. kreuzberg/_types.py +133 -36
  21. kreuzberg/_utils/__init__.py +0 -0
  22. kreuzberg/{_string.py → _utils/_string.py} +0 -2
  23. kreuzberg/_utils/_sync.py +121 -0
  24. kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
  25. kreuzberg/exceptions.py +25 -0
  26. kreuzberg/extraction.py +114 -227
  27. kreuzberg-3.0.1.dist-info/METADATA +178 -0
  28. kreuzberg-3.0.1.dist-info/RECORD +32 -0
  29. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
  30. kreuzberg/_html.py +0 -31
  31. kreuzberg/_pandoc.py +0 -366
  32. kreuzberg/_pdf.py +0 -190
  33. kreuzberg/_pptx.py +0 -88
  34. kreuzberg/_sync.py +0 -74
  35. kreuzberg/_tesseract.py +0 -231
  36. kreuzberg/_xlsx.py +0 -88
  37. kreuzberg-2.1.2.dist-info/METADATA +0 -446
  38. kreuzberg-2.1.2.dist-info/RECORD +0 -21
  39. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
  40. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
kreuzberg/_registry.py ADDED
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from typing import TYPE_CHECKING, ClassVar
5
+
6
+ from kreuzberg._extractors._html import HTMLExtractor
7
+ from kreuzberg._extractors._image import ImageExtractor
8
+ from kreuzberg._extractors._pandoc import (
9
+ BibliographyExtractor,
10
+ EbookExtractor,
11
+ LaTeXExtractor,
12
+ MarkdownExtractor,
13
+ MiscFormatExtractor,
14
+ OfficeDocumentExtractor,
15
+ StructuredTextExtractor,
16
+ TabularDataExtractor,
17
+ XMLBasedExtractor,
18
+ )
19
+ from kreuzberg._extractors._pdf import PDFExtractor
20
+ from kreuzberg._extractors._presentation import PresentationExtractor
21
+ from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
22
+
23
+ if TYPE_CHECKING:
24
+ from kreuzberg._extractors._base import Extractor
25
+ from kreuzberg._types import ExtractionConfig
26
+
27
+
28
+ class ExtractorRegistry:
29
+ """Manages extractors for different MIME types and their configurations.
30
+
31
+ This class provides functionality to register, unregister, and retrieve
32
+ extractors based on MIME types. Lookups are cached, and the cache is cleared
32
+ whenever an extractor is added or removed. A default set of extractors is also
34
+ maintained alongside user-registered extractors.
35
+ """
36
+
37
+ _default_extractors: ClassVar[list[type[Extractor]]] = [
38
+ PDFExtractor,
39
+ OfficeDocumentExtractor,
40
+ PresentationExtractor,
41
+ SpreadSheetExtractor,
42
+ HTMLExtractor,
43
+ MarkdownExtractor,
44
+ ImageExtractor,
45
+ BibliographyExtractor,
46
+ EbookExtractor,
47
+ LaTeXExtractor,
48
+ MiscFormatExtractor,
49
+ StructuredTextExtractor,
50
+ TabularDataExtractor,
51
+ XMLBasedExtractor,
52
+ ]
53
+ _registered_extractors: ClassVar[list[type[Extractor]]] = []
54
+
55
+ @classmethod
56
+ @lru_cache
57
+ def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
58
+ """Gets the extractor for the mimetype.
59
+
60
+ Args:
61
+ mime_type: The mime type of the content.
62
+ config: The extraction configuration passed to the matching extractor.
63
+
64
+ Returns:
65
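+ An instance of the first extractor (registered extractors first, then defaults) that supports the mime type, or None if no extractor matches or no mime type is given.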
66
+ """
67
+ extractors: list[type[Extractor]] = [
68
+ *cls._registered_extractors,
69
+ *cls._default_extractors,
70
+ ]
71
+ if mime_type:
72
+ for extractor in extractors:
73
+ if extractor.supports_mimetype(mime_type):
74
+ return extractor(mime_type=mime_type, config=config)
75
+
76
+ return None
77
+
78
+ @classmethod
79
+ def add_extractor(cls, extractor: type[Extractor]) -> None:
80
+ """Add an extractor to the registry.
81
+
82
+ Note:
83
+ Registered extractors are tried in the order they are added (first added, first tried) and before the default extractors.
84
+
85
+ Args:
86
+ extractor: The extractor to add.
87
+
88
+ Returns:
89
+ None
90
+ """
91
+ cls._registered_extractors.append(extractor)
92
+ cls.get_extractor.cache_clear()
93
+
94
+ @classmethod
95
+ def remove_extractor(cls, extractor: type[Extractor]) -> None:
96
+ """Remove an extractor from the registry.
97
+
98
+ Args:
99
+ extractor: The extractor to remove.
100
+
101
+ Returns:
102
+ None
103
+ """
104
+ try:
105
+ cls._registered_extractors.remove(extractor)
106
+ cls.get_extractor.cache_clear()
107
+ except ValueError:
108
+ pass
kreuzberg/_types.py CHANGED
@@ -1,71 +1,168 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from typing import NamedTuple, TypedDict
4
+ from collections.abc import Awaitable
5
+ from dataclasses import asdict, dataclass
6
+ from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
7
+
8
+ from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
9
+ from kreuzberg.exceptions import ValidationError
5
10
 
6
11
  if sys.version_info < (3, 11): # pragma: no cover
7
12
  from typing_extensions import NotRequired
8
13
  else: # pragma: no cover
9
14
  from typing import NotRequired
10
15
 
16
+ if TYPE_CHECKING:
17
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
18
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
19
+ from kreuzberg._ocr._tesseract import TesseractConfig
20
+
21
+ OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
22
+
11
23
 
12
24
  class Metadata(TypedDict, total=False):
13
- """Document metadata.
25
+ """Base metadata common to all document types.
14
26
 
15
- All fields are optional but will only be included if they contain non-empty values.
27
+ All fields will only be included if they contain non-empty values.
16
28
  Any field that would be empty or None is omitted from the dictionary.
17
-
18
- Different documents and extraction methods will yield different metadata.
19
29
  """
20
30
 
21
- title: NotRequired[str]
22
- """Document title."""
23
- subtitle: NotRequired[str]
24
- """Document subtitle."""
25
- abstract: NotRequired[str | list[str]]
26
- """Document abstract, summary or description."""
27
31
  authors: NotRequired[list[str]]
28
32
  """List of document authors."""
29
- date: NotRequired[str]
30
- """Document date as string to preserve original format."""
31
- subject: NotRequired[str]
32
- """Document subject or topic."""
33
- description: NotRequired[str]
34
- """Extended description."""
35
- keywords: NotRequired[list[str]]
36
- """Keywords or tags."""
37
33
  categories: NotRequired[list[str]]
38
34
  """Categories or classifications."""
39
- version: NotRequired[str]
40
- """Version identifier."""
41
- language: NotRequired[str]
42
- """Document language code."""
43
- references: NotRequired[list[str]]
44
- """Reference entries."""
45
35
  citations: NotRequired[list[str]]
46
36
  """Citation identifiers."""
37
+ comments: NotRequired[str]
38
+ """General comments."""
47
39
  copyright: NotRequired[str]
48
40
  """Copyright information."""
41
+ created_at: NotRequired[str]
42
+ """Creation timestamp in ISO format."""
43
+ created_by: NotRequired[str]
44
+ """Document creator."""
45
+ description: NotRequired[str]
46
+ """Document description."""
47
+ fonts: NotRequired[list[str]]
48
+ """List of fonts used in the document."""
49
+ height: NotRequired[int]
50
+ """Height of the document page/slide/image, if applicable."""
51
+ identifier: NotRequired[str]
52
+ """Unique document identifier."""
53
+ keywords: NotRequired[list[str]]
54
+ """Keywords or tags."""
55
+ languages: NotRequired[list[str]]
56
+ """Document language codes."""
49
57
  license: NotRequired[str]
50
58
  """License information."""
51
- identifier: NotRequired[str]
52
- """Document identifier."""
59
+ modified_at: NotRequired[str]
60
+ """Last modification timestamp in ISO format."""
61
+ modified_by: NotRequired[str]
62
+ """Username of last modifier."""
63
+ organization: NotRequired[str | list[str]]
64
+ """Organizational affiliation."""
53
65
  publisher: NotRequired[str]
54
- """Publisher name."""
55
- contributors: NotRequired[list[str]]
56
- """Additional contributors."""
57
- creator: NotRequired[str]
58
- """Document creator."""
59
- institute: NotRequired[str | list[str]]
60
- """Institute or organization."""
66
+ """Publisher or organization name."""
67
+ references: NotRequired[list[str]]
68
+ """Reference entries."""
69
+ status: NotRequired[str]
70
+ """Document status (e.g., draft, final)."""
71
+ subject: NotRequired[str]
72
+ """Document subject or topic."""
73
+ subtitle: NotRequired[str]
74
+ """Document subtitle."""
75
+ summary: NotRequired[str]
76
+ """Document summary."""
77
+ title: NotRequired[str]
78
+ """Document title."""
79
+ version: NotRequired[str]
80
+ """Version identifier or revision number."""
81
+ width: NotRequired[int]
82
+ """Width of the document page/slide/image, if applicable."""
61
83
 
62
84
 
63
- class ExtractionResult(NamedTuple):
85
+ @dataclass
86
+ class ExtractionResult:
64
87
  """The result of a file extraction."""
65
88
 
66
89
  content: str
67
90
  """The extracted content."""
91
+ chunks: list[str]
92
+ """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
68
93
  mime_type: str
69
- """The mime type of the content."""
94
+ """The mime type of the extracted content. Is either text/plain or text/markdown."""
70
95
  metadata: Metadata
71
96
  """The metadata of the content."""
97
+
98
+
99
+ PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
100
+ ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
101
+
102
+
103
+ @dataclass(unsafe_hash=True)
104
+ class ExtractionConfig:
105
+ """Represents configuration settings for an extraction process.
106
+
107
+ This class encapsulates the configuration options for extracting text
108
+ from images or documents using Optical Character Recognition (OCR). It
109
+ provides options to customize the OCR behavior, select the backend
110
+ engine, and configure engine-specific parameters.
111
+ """
112
+
113
+ force_ocr: bool = False
114
+ """Whether to force OCR."""
115
+ chunk_content: bool = False
116
+ """Whether to chunk the content into smaller chunks."""
117
+ max_chars: int = DEFAULT_MAX_CHARACTERS
118
+ """The size of each chunk in characters."""
119
+ max_overlap: int = DEFAULT_MAX_OVERLAP
120
+ """The overlap between chunks in characters."""
121
+ ocr_backend: OcrBackendType | None = "tesseract"
122
+ """The OCR backend to use."""
123
+ ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
124
+ """Configuration to pass to the OCR backend."""
125
+ post_processing_hooks: list[PostProcessingHook] | None = None
126
+ """Post processing hooks to call after processing is done and before the final result is returned."""
127
+ validators: list[ValidationHook] | None = None
128
+ """Validation hooks to call after processing is done and before post-processing and result return."""
129
+
130
+ def __post_init__(self) -> None:
131
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
132
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
133
+ from kreuzberg._ocr._tesseract import TesseractConfig
134
+
135
+ if self.ocr_backend is None and self.ocr_config is not None:
136
+ raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
137
+
138
+ if self.ocr_config is not None and (
139
+ (self.ocr_backend == "tesseract" and not isinstance(self.ocr_config, TesseractConfig))
140
+ or (self.ocr_backend == "easyocr" and not isinstance(self.ocr_config, EasyOCRConfig))
141
+ or (self.ocr_backend == "paddleocr" and not isinstance(self.ocr_config, PaddleOCRConfig))
142
+ ):
143
+ raise ValidationError(
144
+ "incompatible 'ocr_config' value provided for 'ocr_backend'",
145
+ context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
146
+ )
147
+
148
+ def get_config_dict(self) -> dict[str, Any]:
149
+ """Returns the OCR configuration for the selected backend as a dict, using the backend's default configuration when 'ocr_config' is not set.
150
+
151
+ Returns:
152
+ A dict of the OCR configuration or an empty dict if no backend is provided.
153
+ """
154
+ if self.ocr_backend is not None:
155
+ if self.ocr_config is not None:
156
+ return asdict(self.ocr_config)
157
+ if self.ocr_backend == "tesseract":
158
+ from kreuzberg._ocr._tesseract import TesseractConfig
159
+
160
+ return asdict(TesseractConfig())
161
+ if self.ocr_backend == "easyocr":
162
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
163
+
164
+ return asdict(EasyOCRConfig())
165
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
166
+
167
+ return asdict(PaddleOCRConfig())
168
+ return {}
File without changes
@@ -18,14 +18,12 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
18
18
  if not byte_data:
19
19
  return ""
20
20
 
21
- # We try each encoding in order until one works
22
21
  encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
23
22
 
24
23
  for enc in [e for e in encodings if e]: # pragma: no cover
25
24
  with suppress(UnicodeDecodeError, LookupError):
26
25
  return byte_data.decode(enc)
27
26
 
28
- # If all encodings fail, fall back to latin-1 which can handle any byte
29
27
  return byte_data.decode("latin-1", errors="replace")
30
28
 
31
29
 
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from functools import partial
5
+ from inspect import isawaitable, iscoroutinefunction
6
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
7
+
8
+ import anyio
9
+ from anyio import create_task_group
10
+ from anyio.to_thread import run_sync as any_io_run_sync
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from collections.abc import Awaitable, Callable
14
+
15
+ if sys.version_info >= (3, 10):
16
+ from typing import ParamSpec
17
+ else: # pragma: no cover
18
+ from typing_extensions import ParamSpec
19
+
20
+ T = TypeVar("T")
21
+ P = ParamSpec("P")
22
+
23
+
24
+ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
25
+ """Run a synchronous function in an asynchronous context.
26
+
27
+ Args:
28
+ sync_fn: The synchronous function to run.
29
+ *args: The positional arguments to pass to the function.
30
+ **kwargs: The keyword arguments to pass to the function.
31
+
32
+ Returns:
33
+ The result of the synchronous function.
34
+ """
35
+ handler = partial(sync_fn, **kwargs)
36
+ return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
37
+
38
+
39
+ async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
40
+ """Run a list of coroutines concurrently.
41
+
42
+ Args:
43
+ *async_tasks: The list of coroutines to run.
44
+
45
+ Returns:
46
+ The results of the coroutines.
47
+ """
48
+ results: list[Any] = [None] * len(async_tasks)
49
+
50
+ async def run_task(index: int, task: Awaitable[T]) -> None:
51
+ results[index] = await task
52
+
53
+ async with create_task_group() as tg:
54
+ for i, t in enumerate(async_tasks):
55
+ tg.start_soon(run_task, i, t)
56
+
57
+ return results
58
+
59
+
60
+ async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
61
+ """Run a list of coroutines concurrently in batches.
62
+
63
+ Args:
64
+ *async_tasks: The list of coroutines to run.
65
+ batch_size: The size of each batch.
66
+
67
+ Returns:
68
+ The results of the coroutines.
69
+ """
70
+ results: list[Any] = []
71
+
72
+ for i in range(0, len(async_tasks), batch_size):
73
+ batch = async_tasks[i : i + batch_size]
74
+ results.extend(await run_taskgroup(*batch))
75
+
76
+ return results
77
+
78
+
79
+ async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
80
+ """Executes a callable function and handles both synchronous and asynchronous
81
+ results.
82
+
83
+ This function invokes the provided callable `fn` with the given
84
+ arguments and keyword arguments. If the result of `fn` is awaitable,
85
+ it awaits the result before returning it. Otherwise, the result is returned
86
+ directly.
87
+
88
+ Args:
89
+ fn: The callable to be executed. It can produce either a
90
+ synchronous or asynchronous result.
91
+ *args: Positional arguments to pass to `fn`.
92
+ **kwargs: Keyword arguments to pass to `fn`.
93
+
94
+ Returns:
95
+ The result of the `fn` invocation. If the result is awaitable, the
96
+ awaited value is returned. Otherwise, the synchronous result is
97
+ returned.
98
+ """
99
+ result = fn(*args, **kwargs)
100
+ if isawaitable(result):
101
+ return cast("T", await result)
102
+ return result
103
+
104
+
105
+ def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
106
+ """Runs a synchronous or asynchronous function, resolving the output.
107
+
108
+ Determines if the provided function is synchronous or asynchronous. If synchronous,
109
+ executes it directly. If asynchronous, it runs the function in a new event loop
110
+ started with anyio.run. The return value is resolved regardless of the function type.
111
+
112
+ Args:
113
+ fn: The function to be executed, which can
114
+ either be synchronous or asynchronous.
115
+ *args: Positional arguments to be passed to the function.
116
+ **kwargs: Keyword arguments to be passed to the function.
117
+
118
+ Returns:
119
+ T: The return value of the executed function, resolved if asynchronous.
120
+ """
121
+ return cast("T", fn(*args, **kwargs) if not iscoroutinefunction(fn) else anyio.run(partial(fn, **kwargs), *args))
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Callable
7
7
 
8
8
  from anyio import Path as AsyncPath
9
9
 
10
- from kreuzberg._sync import run_sync
10
+ from kreuzberg._utils._sync import run_sync
11
11
 
12
12
  if TYPE_CHECKING: # pragma: no cover
13
13
  from collections.abc import Coroutine
kreuzberg/exceptions.py CHANGED
@@ -51,6 +51,31 @@ class ValidationError(KreuzbergError):
51
51
  class MissingDependencyError(KreuzbergError):
52
52
  """Raised when a dependency is missing."""
53
53
 
54
+ @classmethod
55
+ def create_for_package(
56
+ cls, *, dependency_group: str, functionality: str, package_name: str
57
+ ) -> MissingDependencyError:
58
+ """Creates a MissingDependencyError for a specified package and functionality.
59
+
60
+ This class method generates an error message to notify users about a
61
+ missing package dependency required for specific functionality. The error
62
+ message includes details about the missing package and the optional
63
+ dependency group required for installation.
64
+
65
+ Args:
66
+ dependency_group: The name of the optional dependency group that includes
67
+ the required package.
68
+ functionality: The functionality that requires the missing package.
69
+ package_name: The name of the missing package.
70
+
71
+ Returns:
72
+ MissingDependencyError: A customized error indicating the missing
73
+ dependency and how to resolve it.
74
+ """
75
+ return MissingDependencyError(
76
+ f"The package '{package_name}' is required to use {functionality}. You can install using the provided optional dependency group by installing `kreuzberg['{dependency_group}']`."
77
+ )
78
+
54
79
 
55
80
  class OCRError(KreuzbergError):
56
81
  """Raised when an OCR error occurs."""