PyPI - kreuzberg - Versions diffs - 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl - Mend

kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

kreuzberg/__init__.py +16 -2
kreuzberg/_chunker.py +51 -0
kreuzberg/_constants.py +2 -3
kreuzberg/_extractors/__init__.py +0 -0
kreuzberg/_extractors/_base.py +92 -0
kreuzberg/_extractors/_html.py +34 -0
kreuzberg/_extractors/_image.py +74 -0
kreuzberg/_extractors/_pandoc.py +613 -0
kreuzberg/_extractors/_pdf.py +163 -0
kreuzberg/_extractors/_presentation.py +233 -0
kreuzberg/_extractors/_spread_sheet.py +125 -0
kreuzberg/_mime_types.py +19 -26
kreuzberg/_ocr/__init__.py +17 -0
kreuzberg/_ocr/_base.py +54 -0
kreuzberg/_ocr/_easyocr.py +376 -0
kreuzberg/_ocr/_paddleocr.py +291 -0
kreuzberg/_ocr/_tesseract.py +342 -0
kreuzberg/_playa.py +276 -0
kreuzberg/_registry.py +108 -0
kreuzberg/_types.py +133 -36
kreuzberg/_utils/__init__.py +0 -0
kreuzberg/{_string.py → _utils/_string.py} +0 -2
kreuzberg/_utils/_sync.py +121 -0
kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
kreuzberg/exceptions.py +25 -0
kreuzberg/extraction.py +114 -227
kreuzberg-3.0.1.dist-info/METADATA +178 -0
kreuzberg-3.0.1.dist-info/RECORD +32 -0
{kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
kreuzberg/_html.py +0 -31
kreuzberg/_pandoc.py +0 -366
kreuzberg/_pdf.py +0 -190
kreuzberg/_pptx.py +0 -88
kreuzberg/_sync.py +0 -74
kreuzberg/_tesseract.py +0 -231
kreuzberg/_xlsx.py +0 -88
kreuzberg-2.1.2.dist-info/METADATA +0 -446
kreuzberg-2.1.2.dist-info/RECORD +0 -21
{kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
{kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0

kreuzberg/_registry.py ADDED Viewed

@@ -0,0 +1,108 @@
+from __future__ import annotations
+from functools import lru_cache
+from typing import TYPE_CHECKING, ClassVar
+from kreuzberg._extractors._html import HTMLExtractor
+from kreuzberg._extractors._image import ImageExtractor
+from kreuzberg._extractors._pandoc import (
+    BibliographyExtractor,
+    EbookExtractor,
+    LaTeXExtractor,
+    MarkdownExtractor,
+    MiscFormatExtractor,
+    OfficeDocumentExtractor,
+    StructuredTextExtractor,
+    TabularDataExtractor,
+    XMLBasedExtractor,
+)
+from kreuzberg._extractors._pdf import PDFExtractor
+from kreuzberg._extractors._presentation import PresentationExtractor
+from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
+if TYPE_CHECKING:
+    from kreuzberg._extractors._base import Extractor
+    from kreuzberg._types import ExtractionConfig
+class ExtractorRegistry:
+    """Registry that maps MIME types to extractor classes.
+    Two class-level lists are kept: a fixed list of built-in extractors and a list
+    of user-registered extractors. Lookups try the user-registered extractors
+    first, then the built-in ones, and use the first class whose
+    ``supports_mimetype`` accepts the MIME type.
+    Both lists live on the class itself, so registrations are shared by every caller.
+    """
+    # Built-in extractors; list order is lookup order, so the first match wins.
+    _default_extractors: ClassVar[list[type[Extractor]]] = [
+        PDFExtractor,
+        OfficeDocumentExtractor,
+        PresentationExtractor,
+        SpreadSheetExtractor,
+        HTMLExtractor,
+        MarkdownExtractor,
+        ImageExtractor,
+        BibliographyExtractor,
+        EbookExtractor,
+        LaTeXExtractor,
+        MiscFormatExtractor,
+        StructuredTextExtractor,
+        TabularDataExtractor,
+        XMLBasedExtractor,
+    ]
+    # User-registered extractors; consulted before the built-in ones.
+    _registered_extractors: ClassVar[list[type[Extractor]]] = []
+    @classmethod
+    @lru_cache
+    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
+        """Gets the extractor for the mimetype.
+        Lookups are memoized with ``lru_cache``, so ``config`` must be hashable and a
+        repeated call with equal arguments may return the previously created extractor.
+        Args:
+            mime_type: The mime type of the content. A falsy value (``None`` or ``""``) yields ``None``.
+            config: Extraction options object, passed to the extractor's constructor.
+        Returns:
+            The first matching extractor instance, or ``None`` if no extractor supports the mime type.
+        """
+        # Registered extractors come first so they can take precedence over the built-in ones.
+        extractors: list[type[Extractor]] = [
+            *cls._registered_extractors,
+            *cls._default_extractors,
+        ]
+        if mime_type:
+            for extractor in extractors:
+                if extractor.supports_mimetype(mime_type):
+                    return extractor(mime_type=mime_type, config=config)
+        return None
+    @classmethod
+    def add_extractor(cls, extractor: type[Extractor]) -> None:
+        """Add an extractor to the registry.
+        Note:
+            Extractors are tried in the order they are added: first added, first tried.
+            Every registered extractor is tried before any built-in extractor.
+        Args:
+            extractor: The extractor to add.
+        Returns:
+            None
+        """
+        cls._registered_extractors.append(extractor)
+        # Drop memoized lookups so the newly added extractor is considered from now on.
+        cls.get_extractor.cache_clear()
+    @classmethod
+    def remove_extractor(cls, extractor: type[Extractor]) -> None:
+        """Remove an extractor from the registry.
+        Only user-registered extractors can be removed; removing an extractor that
+        is not registered is a silent no-op.
+        Args:
+            extractor: The extractor to remove.
+        Returns:
+            None
+        """
+        try:
+            cls._registered_extractors.remove(extractor)
+            # Only reached when something was removed: drop lookups that may reference it.
+            cls.get_extractor.cache_clear()
+        except ValueError:
+            # list.remove raises ValueError when the extractor is not registered; ignore it.
+            pass

kreuzberg/_types.py CHANGED Viewed

@@ -1,71 +1,168 @@
 from __future__ import annotations
 import sys
-from typing import NamedTuple, TypedDict
+from collections.abc import Awaitable
+from dataclasses import asdict, dataclass, field
+from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
+from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
+from kreuzberg.exceptions import ValidationError
 if sys.version_info < (3, 11):  # pragma: no cover
     from typing_extensions import NotRequired
 else:  # pragma: no cover
     from typing import NotRequired
+if TYPE_CHECKING:
+    from kreuzberg._ocr._easyocr import EasyOCRConfig
+    from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+    from kreuzberg._ocr._tesseract import TesseractConfig
+OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
 class Metadata(TypedDict, total=False):
-    """Document metadata.
+    """Base metadata common to all document types.
-    All fields are optional but will only be included if they contain non-empty values.
+    All fields will only be included if they contain non-empty values.
     Any field that would be empty or None is omitted from the dictionary.
+    Keys are declared in alphabetical order; ``languages`` holds a list of language codes.
-    Different documents and extraction methods will yield different metadata.
     """
-    title: NotRequired[str]
-    """Document title."""
-    subtitle: NotRequired[str]
-    """Document subtitle."""
-    abstract: NotRequired[str | list[str]]
-    """Document abstract, summary or description."""
     authors: NotRequired[list[str]]
     """List of document authors."""
-    date: NotRequired[str]
-    """Document date as string to preserve original format."""
-    subject: NotRequired[str]
-    """Document subject or topic."""
-    description: NotRequired[str]
-    """Extended description."""
-    keywords: NotRequired[list[str]]
-    """Keywords or tags."""
     categories: NotRequired[list[str]]
     """Categories or classifications."""
-    version: NotRequired[str]
-    """Version identifier."""
-    language: NotRequired[str]
-    """Document language code."""
-    references: NotRequired[list[str]]
-    """Reference entries."""
     citations: NotRequired[list[str]]
     """Citation identifiers."""
+    comments: NotRequired[str]
+    """General comments."""
     copyright: NotRequired[str]
     """Copyright information."""
+    created_at: NotRequired[str]
+    """Creation timestamp in ISO format."""
+    created_by: NotRequired[str]
+    """Document creator."""
+    description: NotRequired[str]
+    """Document description."""
+    fonts: NotRequired[list[str]]
+    """List of fonts used in the document."""
+    height: NotRequired[int]
+    """Height of the document page/slide/image, if applicable."""
+    identifier: NotRequired[str]
+    """Unique document identifier."""
+    keywords: NotRequired[list[str]]
+    """Keywords or tags."""
+    languages: NotRequired[list[str]]
+    """Document language code."""
     license: NotRequired[str]
     """License information."""
-    identifier: NotRequired[str]
-    """Document identifier."""
+    modified_at: NotRequired[str]
+    """Last modification timestamp in ISO format."""
+    modified_by: NotRequired[str]
+    """Username of last modifier."""
+    organization: NotRequired[str | list[str]]
+    """Organizational affiliation."""
     publisher: NotRequired[str]
-    """Publisher name."""
-    contributors: NotRequired[list[str]]
-    """Additional contributors."""
-    creator: NotRequired[str]
-    """Document creator."""
-    institute: NotRequired[str | list[str]]
-    """Institute or organization."""
+    """Publisher or organization name."""
+    references: NotRequired[list[str]]
+    """Reference entries."""
+    status: NotRequired[str]
+    """Document status (e.g., draft, final)."""
+    subject: NotRequired[str]
+    """Document subject or topic."""
+    subtitle: NotRequired[str]
+    """Document subtitle."""
+    summary: NotRequired[str]
+    """Document Summary"""
+    title: NotRequired[str]
+    """Document title."""
+    version: NotRequired[str]
+    """Version identifier or revision number."""
+    width: NotRequired[int]
+    """Width of the document page/slide/image, if applicable."""
-class ExtractionResult(NamedTuple):
+@dataclass
+class ExtractionResult:
     """The result of a file extraction."""
+    # Plain (non-frozen) dataclass: fields can be reassigned after construction, and
+    # positional construction follows the declaration order below. Unlike the removed
+    # NamedTuple, instances cannot be unpacked or indexed like a tuple.
     content: str
     """The extracted content."""
+    chunks: list[str]
+    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
     mime_type: str
-    """The mime type of the content."""
+    """The mime type of the extracted content. Is either text/plain or text/markdown."""
     metadata: Metadata
     """The metadata of the content."""
+# A post-processing hook takes the extraction result and returns an extraction result,
+# either directly or as an awaitable.
+PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
+# A validation hook takes the extraction result and returns nothing, either directly
+# or as an awaitable.
+ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
+@dataclass(unsafe_hash=True)
+class ExtractionConfig:
+    """Represents configuration settings for an extraction process.
+    This class encapsulates the configuration options for extracting text
+    from images or documents using Optical Character Recognition (OCR). It
+    provides options to customize the OCR behavior, select the backend
+    engine, and configure engine-specific parameters.
+    A hash is generated from the fields so that a config can serve as a cache key.
+    The two hook lists are excluded from that hash, because lists are unhashable and
+    would make hashing fail whenever hooks are supplied; they still take part in
+    equality comparisons.
+    Raises:
+        ValidationError: If 'ocr_config' is given while 'ocr_backend' is None, or if the
+            type of 'ocr_config' does not match the selected 'ocr_backend'.
+    """
+    force_ocr: bool = False
+    """Whether to force OCR."""
+    chunk_content: bool = False
+    """Whether to chunk the content into smaller chunks."""
+    max_chars: int = DEFAULT_MAX_CHARACTERS
+    """The size of each chunk in characters."""
+    max_overlap: int = DEFAULT_MAX_OVERLAP
+    """The overlap between chunks in characters."""
+    ocr_backend: OcrBackendType | None = "tesseract"
+    """The OCR backend to use."""
+    ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
+    """Configuration to pass to the OCR backend."""
+    post_processing_hooks: list[PostProcessingHook] | None = field(default=None, hash=False)
+    """Post processing hooks to call after processing is done and before the final result is returned."""
+    validators: list[ValidationHook] | None = field(default=None, hash=False)
+    """Validation hooks to call after processing is done and before post-processing and result return."""
+    def __post_init__(self) -> None:
+        """Validate that 'ocr_config' is consistent with 'ocr_backend'."""
+        # Imported here rather than at module level; at module level these names are
+        # only imported for type checking.
+        from kreuzberg._ocr._easyocr import EasyOCRConfig
+        from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+        from kreuzberg._ocr._tesseract import TesseractConfig
+        if self.ocr_backend is None and self.ocr_config is not None:
+            raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
+        # Each backend accepts only its own config class.
+        if self.ocr_config is not None and (
+            (self.ocr_backend == "tesseract" and not isinstance(self.ocr_config, TesseractConfig))
+            or (self.ocr_backend == "easyocr" and not isinstance(self.ocr_config, EasyOCRConfig))
+            or (self.ocr_backend == "paddleocr" and not isinstance(self.ocr_config, PaddleOCRConfig))
+        ):
+            raise ValidationError(
+                "incompatible 'ocr_config' value provided for 'ocr_backend'",
+                context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
+            )
+    def get_config_dict(self) -> dict[str, Any]:
+        """Returns the OCR configuration as a dict, based on the backend specified.
+        The explicitly provided 'ocr_config' is used when present; otherwise the
+        default configuration of the selected backend is used.
+        Returns:
+            A dict of the OCR configuration or an empty dict if no backend is provided.
+        """
+        if self.ocr_backend is not None:
+            if self.ocr_config is not None:
+                return asdict(self.ocr_config)
+            if self.ocr_backend == "tesseract":
+                from kreuzberg._ocr._tesseract import TesseractConfig
+                return asdict(TesseractConfig())
+            if self.ocr_backend == "easyocr":
+                from kreuzberg._ocr._easyocr import EasyOCRConfig
+                return asdict(EasyOCRConfig())
+            # Any remaining backend value falls through to the PaddleOCR defaults.
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+            return asdict(PaddleOCRConfig())
+        return {}

kreuzberg/_utils/__init__.py ADDED Viewed

File without changes

kreuzberg/{_string.py → _utils/_string.py} RENAMED Viewed

@@ -18,14 +18,12 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     if not byte_data:
         return ""
-    # We try each encoding in order until one works
     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
     for enc in [e for e in encodings if e]:  # pragma: no cover
         with suppress(UnicodeDecodeError, LookupError):
             return byte_data.decode(enc)
-    # If all encodings fail, fall back to latin-1 which can handle any byte
     return byte_data.decode("latin-1", errors="replace")

kreuzberg/_utils/_sync.py ADDED Viewed

@@ -0,0 +1,121 @@
+from __future__ import annotations
+import sys
+from functools import partial
+from inspect import isawaitable, iscoroutinefunction
+from typing import TYPE_CHECKING, Any, TypeVar, cast
+import anyio
+from anyio import create_task_group
+from anyio.to_thread import run_sync as any_io_run_sync
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Awaitable, Callable
+if sys.version_info >= (3, 10):
+    from typing import ParamSpec
+else:  # pragma: no cover
+    from typing_extensions import ParamSpec
+T = TypeVar("T")
+P = ParamSpec("P")
+async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
+    """Run a synchronous function in an asynchronous context.
+    The call is handed to anyio's ``to_thread.run_sync`` and its result is awaited.
+    Args:
+        sync_fn: The synchronous function to run.
+        *args: The positional arguments to pass to the function.
+        **kwargs: The keyword arguments to pass to the function.
+    Returns:
+        The result of the synchronous function.
+    """
+    # Keyword arguments are bound up front with partial; only positional arguments
+    # are forwarded through the anyio call below.
+    handler = partial(sync_fn, **kwargs)
+    return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
+    """Run a list of coroutines concurrently.
+    Args:
+        *async_tasks: The list of coroutines to run.
+    Returns:
+        The results of the coroutines, in the same order as ``async_tasks``.
+    """
+    # Pre-size the list so each task can store its result at its own input position,
+    # independent of the order in which the tasks finish.
+    results: list[Any] = [None] * len(async_tasks)
+    async def run_task(index: int, task: Awaitable[T]) -> None:
+        results[index] = await task
+    # Leaving the task group block waits for every started task.
+    async with create_task_group() as tg:
+        for i, t in enumerate(async_tasks):
+            tg.start_soon(run_task, i, t)
+    return results
+async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
+    """Run a list of coroutines concurrently in batches.
+    Batches are processed one after another; the awaitables inside a batch are
+    run together through ``run_taskgroup``.
+    Args:
+        *async_tasks: The list of coroutines to run.
+        batch_size: The size of each batch. Must be a positive integer.
+    Returns:
+        The results of the coroutines, batch by batch in input order.
+    Raises:
+        ValueError: If ``batch_size`` is less than 1.
+    """
+    # Reject non-positive sizes explicitly: 0 would make range() fail with an unrelated
+    # message, and a negative step would skip every task and silently return [].
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+    results: list[Any] = []
+    for i in range(0, len(async_tasks), batch_size):
+        batch = async_tasks[i : i + batch_size]
+        results.extend(await run_taskgroup(*batch))
+    return results
+async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
+    """Executes a callable function and handles both synchronous and asynchronous
+    results.
+    This function invokes the provided callable `fn` with the given
+    arguments and keyword arguments. If the result of `fn` is awaitable,
+    it awaits the result before returning it. Otherwise, the result is returned
+    directly.
+    Args:
+        fn: The callable to be executed. It can produce either a
+            synchronous or asynchronous result.
+        *args: Positional arguments to pass to `fn`.
+        **kwargs: Keyword arguments to pass to `fn`.
+    Returns:
+        The result of `fn` invocation. If the result is awaitable, the
+        awaited value is returned. Otherwise, the synchronous result is
+        returned.
+    """
+    # The callable is invoked directly; only its return value decides whether to await.
+    result = fn(*args, **kwargs)
+    if isawaitable(result):
+        return cast("T", await result)
+    return result
+def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
+    """Runs a synchronous or asynchronous function, resolving the output.
+    Determines if the provided function is synchronous or asynchronous. If synchronous,
+    executes it directly. If asynchronous, it is run to completion with ``anyio.run``.
+    Detection uses ``inspect.iscoroutinefunction``, so a plain callable that merely
+    returns an awaitable is called directly and its awaitable is returned un-awaited.
+    Args:
+        fn: The function to be executed, which can
+            either be synchronous or asynchronous.
+        *args: Positional arguments to be passed to the function.
+        **kwargs: Keyword arguments to be passed to the function.
+    Returns:
+        T: The return value of the executed function, resolved if asynchronous.
+    """
+    # For the async path, keyword arguments are bound with partial and only the
+    # positional arguments are passed through anyio.run.
+    return cast("T", fn(*args, **kwargs) if not iscoroutinefunction(fn) else anyio.run(partial(fn, **kwargs), *args))

kreuzberg/{_tmp.py → _utils/_tmp.py} RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Callable
 from anyio import Path as AsyncPath
-from kreuzberg._sync import run_sync
+from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Coroutine

kreuzberg/exceptions.py CHANGED Viewed

@@ -51,6 +51,31 @@ class ValidationError(KreuzbergError):
 class MissingDependencyError(KreuzbergError):
     """Raised when a dependency is missing."""
+    @classmethod
+    def create_for_package(
+        cls, *, dependency_group: str, functionality: str, package_name: str
+    ) -> MissingDependencyError:
+        """Creates a MissingDependencyError for a specified package and functionality.
+        This class method generates an error message to notify users about a
+        missing package dependency required for specific functionality. The error
+        message includes details about the missing package and the optional
+        dependency group required for installation.
+        The error is built through ``cls``, so calling this on a subclass yields an
+        instance of that subclass.
+        Args:
+            dependency_group: The name of the optional dependency group that includes
+                the required package.
+            functionality: The functionality that requires the missing package.
+            package_name: The name of the missing package.
+        Returns:
+            MissingDependencyError: A customized error indicating the missing
+            dependency and how to resolve it.
+        """
+        return cls(
+            f"The package '{package_name}' is required to use {functionality}. You can install using the provided optional dependency group by installing `kreuzberg['{dependency_group}']`."
+        )
 class OCRError(KreuzbergError):
     """Raised when an OCR error occurs."""

kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl