kreuzberg 2.1.1__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.0.dist-info/METADATA +178 -0
- kreuzberg-3.0.0.dist-info/RECORD +15 -0
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_string.py +0 -41
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_tmp.py +0 -37
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.1.dist-info/METADATA +0 -446
- kreuzberg-2.1.1.dist-info/RECORD +0 -21
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
2
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
3
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
4
|
+
|
5
|
+
from ._ocr._tesseract import PSMMode
|
6
|
+
from ._registry import ExtractorRegistry
|
7
|
+
from ._types import ExtractionConfig, ExtractionResult, Metadata
|
3
8
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
4
9
|
from .extraction import (
|
5
10
|
batch_extract_bytes,
|
@@ -7,22 +12,31 @@ from .extraction import (
|
|
7
12
|
batch_extract_file,
|
8
13
|
batch_extract_file_sync,
|
9
14
|
extract_bytes,
|
15
|
+
extract_bytes_sync,
|
10
16
|
extract_file,
|
17
|
+
extract_file_sync,
|
11
18
|
)
|
12
19
|
|
13
20
|
__all__ = [
|
21
|
+
"EasyOCRConfig",
|
22
|
+
"ExtractionConfig",
|
14
23
|
"ExtractionResult",
|
24
|
+
"ExtractorRegistry",
|
15
25
|
"KreuzbergError",
|
16
26
|
"Metadata",
|
17
27
|
"MissingDependencyError",
|
18
28
|
"OCRError",
|
19
29
|
"PSMMode",
|
30
|
+
"PaddleOCRConfig",
|
20
31
|
"ParsingError",
|
32
|
+
"TesseractConfig",
|
21
33
|
"ValidationError",
|
22
34
|
"batch_extract_bytes",
|
23
35
|
"batch_extract_bytes_sync",
|
24
36
|
"batch_extract_file",
|
25
37
|
"batch_extract_file_sync",
|
26
38
|
"extract_bytes",
|
39
|
+
"extract_bytes_sync",
|
27
40
|
"extract_file",
|
41
|
+
"extract_file_sync",
|
28
42
|
]
|
kreuzberg/_chunker.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
from kreuzberg import MissingDependencyError
|
6
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
7
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from semantic_text_splitter import MarkdownSplitter, TextSplitter
|
11
|
+
|
12
|
+
_chunkers: dict[tuple[int, int, str], MarkdownSplitter | TextSplitter] = {}
|
13
|
+
|
14
|
+
|
15
|
+
def get_chunker(
    mime_type: str,
    max_characters: int = DEFAULT_MAX_CHARACTERS,
    overlap_characters: int = DEFAULT_MAX_OVERLAP,
) -> MarkdownSplitter | TextSplitter:
    """Return a cached text splitter for the given mime type and chunk limits.

    Markdown content gets a ``MarkdownSplitter``; anything else gets a plain
    ``TextSplitter``.  Instances are memoized per (size, overlap, mime type).

    Args:
        mime_type: The mime type of the content.
        max_characters: Maximum number of characters allowed in each chunk.
        overlap_characters: Number of characters overlapping between two consecutive chunks.

    Raises:
        MissingDependencyError: if semantic-text-splitter is not installed.

    Returns:
        The configured splitter instance.
    """
    cache_key = (max_characters, overlap_characters, mime_type)
    cached = _chunkers.get(cache_key)
    if cached is not None:
        return cached

    try:
        # Import lazily so the optional dependency is only required when
        # chunking is actually used.
        if mime_type == MARKDOWN_MIME_TYPE:
            from semantic_text_splitter import MarkdownSplitter

            splitter: MarkdownSplitter | TextSplitter = MarkdownSplitter(max_characters, overlap_characters)
        else:
            from semantic_text_splitter import TextSplitter

            splitter = TextSplitter(max_characters, overlap_characters)
    except ImportError as e:
        raise MissingDependencyError.create_for_package(
            dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
        ) from e

    _chunkers[cache_key] = splitter
    return splitter
kreuzberg/_constants.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from multiprocessing import cpu_count
|
4
3
|
from typing import Final
|
5
4
|
|
6
|
-
DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
|
7
|
-
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
8
5
|
MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
|
6
|
+
DEFAULT_MAX_CHARACTERS: Final[int] = 2000
|
7
|
+
DEFAULT_MAX_OVERLAP: Final[int] = 100
|
kreuzberg/_mime_types.py
CHANGED
@@ -16,7 +16,7 @@ PDF_MIME_TYPE: Final = "application/pdf"
|
|
16
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
17
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
18
|
DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
19
|
-
|
19
|
+
|
20
20
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
21
21
|
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
22
22
|
EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
|
@@ -24,8 +24,8 @@ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macr
|
|
24
24
|
EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
|
25
25
|
EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
|
26
26
|
|
27
|
-
|
28
|
-
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
27
|
+
|
28
|
+
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
29
29
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
30
30
|
|
31
31
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -48,26 +48,7 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
|
|
48
48
|
"image/x-portable-pixmap",
|
49
49
|
"image/x-tiff",
|
50
50
|
}
|
51
|
-
|
52
|
-
"image/bmp": "bmp",
|
53
|
-
"image/x-bmp": "bmp",
|
54
|
-
"image/x-ms-bmp": "bmp",
|
55
|
-
"image/gif": "gif",
|
56
|
-
"image/jpeg": "jpg",
|
57
|
-
"image/pjpeg": "jpg",
|
58
|
-
"image/png": "png",
|
59
|
-
"image/tiff": "tiff",
|
60
|
-
"image/x-tiff": "tiff",
|
61
|
-
"image/jp2": "jp2",
|
62
|
-
"image/jpx": "jpx",
|
63
|
-
"image/jpm": "jpm",
|
64
|
-
"image/mj2": "mj2",
|
65
|
-
"image/webp": "webp",
|
66
|
-
"image/x-portable-anymap": "pnm",
|
67
|
-
"image/x-portable-bitmap": "pbm",
|
68
|
-
"image/x-portable-graymap": "pgm",
|
69
|
-
"image/x-portable-pixmap": "ppm",
|
70
|
-
}
|
51
|
+
|
71
52
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
72
53
|
"application/csl+json",
|
73
54
|
"application/docbook+xml",
|
@@ -162,13 +143,17 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
|
162
143
|
)
|
163
144
|
|
164
145
|
|
165
|
-
def validate_mime_type(
|
146
|
+
def validate_mime_type(
|
147
|
+
*, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
|
148
|
+
) -> str:
|
166
149
|
"""Validate and detect the MIME type for a given file.
|
167
150
|
|
168
151
|
Args:
|
169
152
|
file_path: The path to the file.
|
170
153
|
mime_type: Optional explicit MIME type. If provided, this will be validated.
|
171
154
|
If not provided, the function will attempt to detect the MIME type.
|
155
|
+
check_file_exists: Whether to check if the file exists. Default is True.
|
156
|
+
Set to False in tests where you want to validate a mime type without an actual file.
|
172
157
|
|
173
158
|
Raises:
|
174
159
|
ValidationError: If the MIME type is not supported or cannot be determined.
|
@@ -176,10 +161,18 @@ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = N
|
|
176
161
|
Returns:
|
177
162
|
The validated MIME type.
|
178
163
|
"""
|
179
|
-
|
164
|
+
if file_path and check_file_exists:
|
165
|
+
path = Path(file_path)
|
166
|
+
if not path.exists():
|
167
|
+
raise ValidationError("The file does not exist", context={"file_path": str(path)})
|
180
168
|
|
181
169
|
if not mime_type:
|
182
|
-
|
170
|
+
if not file_path:
|
171
|
+
raise ValidationError(
|
172
|
+
"Could not determine mime type.",
|
173
|
+
)
|
174
|
+
path = Path(file_path)
|
175
|
+
|
183
176
|
ext = path.suffix.lower()
|
184
177
|
mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
|
185
178
|
|
kreuzberg/_playa.py
ADDED
@@ -0,0 +1,276 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from datetime import datetime
|
4
|
+
from typing import TYPE_CHECKING, Any, cast
|
5
|
+
|
6
|
+
from playa import asobj, parse
|
7
|
+
from playa.utils import decode_text
|
8
|
+
|
9
|
+
from kreuzberg.exceptions import ParsingError
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from playa.document import Document
|
13
|
+
|
14
|
+
from kreuzberg._types import Metadata
|
15
|
+
|
16
|
+
|
17
|
+
GRAY_COMPONENTS = 1
|
18
|
+
RGB_COMPONENTS = 3
|
19
|
+
CMYK_COMPONENTS = 4
|
20
|
+
UTF16BE_BOM = b"\xfe\xff"
|
21
|
+
UTF16BE_ENCODING = "utf-16be"
|
22
|
+
MIN_DATE_LENGTH = 8
|
23
|
+
FULL_DATE_LENGTH = 14
|
24
|
+
BOM_CHAR = "\ufeff"
|
25
|
+
|
26
|
+
|
27
|
+
async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
    """Extract metadata from a PDF document.

    Args:
        pdf_content: The bytes of the PDF document.

    Raises:
        ParsingError: If the PDF metadata could not be extracted.

    Returns:
        A dictionary of metadata extracted from the PDF.
    """
    try:
        document = parse(pdf_content, max_workers=1)
        metadata: Metadata = {}

        # Each helper reads from the (lower-cased) info dict and writes into
        # the shared metadata mapping; earlier values are never overwritten.
        info_extractors = (
            _extract_basic_metadata,
            _extract_author_metadata,
            _extract_keyword_metadata,
            _extract_category_metadata,
            _extract_date_metadata,
            _extract_creator_metadata,
        )
        for raw_info in document.info:
            normalized_info = {key.lower(): value for key, value in asobj(raw_info).items()}
            for extractor in info_extractors:
                extractor(normalized_info, metadata)

        if document.pages:
            _extract_document_dimensions(document, metadata)

        if document.outline and "description" not in metadata:
            metadata["description"] = _generate_outline_description(document)

        if "summary" not in metadata:
            metadata["summary"] = _generate_document_summary(document)

        _extract_structure_information(document, metadata)
    except Exception as e:
        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
    return metadata
66
|
+
|
67
|
+
|
68
|
+
def _extract_basic_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Copy simple single-valued info-dictionary fields into *result*.

    Existing entries in *result* are never overwritten, and empty/falsy
    source values are ignored.
    """
    # Publisher keeps the original two-step lookup: a present "Publisher"
    # key takes precedence over the lowercase "publisher" key.
    if "publisher" not in result and (publisher := pdf_info.get("Publisher", pdf_info.get("publisher"))):
        result["publisher"] = decode_text(publisher)

    # target result key -> candidate info keys; the first truthy value wins.
    field_sources: dict[str, tuple[str, ...]] = {
        "title": ("title",),
        "subject": ("subject",),
        "copyright": ("copyright", "rights"),
        "comments": ("comments",),
        "identifier": ("identifier", "id"),
        "license": ("license",),
        "modified_by": ("modifiedby", "last_modified_by"),
        "version": ("version",),
    }
    for target, candidates in field_sources.items():
        if target in result:
            continue
        for key in candidates:
            if value := pdf_info.get(key):
                result[target] = decode_text(value)
                break
95
|
+
|
96
|
+
|
97
|
+
def _extract_author_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Parse the PDF ``author`` field into a list of author names."""
    raw = pdf_info.get("author")
    if not raw:
        return

    if isinstance(raw, (str, bytes)):
        # Normalize "A and B" into comma form, then split on ";" and ",".
        text = decode_text(raw).replace(" and ", ", ")
        names: list[str] = []
        for segment in text.split(";"):
            names += [name.strip() for name in segment.split(",") if name.strip()]
        result["authors"] = names
    elif isinstance(raw, list):
        result["authors"] = [decode_text(item) for item in raw]
111
|
+
|
112
|
+
|
113
|
+
def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Parse the PDF ``keywords`` field into a list of keywords."""
    raw = pdf_info.get("keywords")
    if not raw:
        return

    if isinstance(raw, (str, bytes)):
        text = decode_text(raw)
        # Split first on commas, then re-split the joined result on
        # semicolons, mirroring both common keyword separators.
        comma_parts = [part.strip() for part in text.split(",")]
        semi_parts = [part.strip() for part in " ".join(comma_parts).split(";")]
        result["keywords"] = [part for part in semi_parts if part]
    elif isinstance(raw, list):
        result["keywords"] = [decode_text(item) for item in raw]
122
|
+
|
123
|
+
|
124
|
+
def _extract_category_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Parse the ``categories``/``category`` info field into a list."""
    raw = pdf_info.get("categories") or pdf_info.get("category")
    if not raw:
        return

    if isinstance(raw, (str, bytes)):
        decoded = decode_text(raw)
        result["categories"] = [part.strip() for part in decoded.split(",") if part.strip()]
    elif isinstance(raw, list):
        result["categories"] = [decode_text(item) for item in raw]
132
|
+
|
133
|
+
|
134
|
+
def _parse_date_string(date_str: str) -> str:
    """Convert a PDF date string (``D:YYYYMMDDHHmmSS``) to an ISO-8601 string.

    Args:
        date_str: The raw PDF date value, optionally prefixed with ``D:``.

    Raises:
        ValueError: If the digits do not form a valid calendar date/time
            (callers catch this and fall back to the raw value).

    Returns:
        The ISO-8601 formatted timestamp, or the input unchanged when it is
        too short to contain a date.
    """
    date_str = date_str.removeprefix("D:")
    if len(date_str) >= MIN_DATE_LENGTH:
        year = date_str[0:4]
        month = date_str[4:6]
        day = date_str[6:8]
        time_part = ""
        if len(date_str) >= FULL_DATE_LENGTH:
            hour = date_str[8:10]
            minute = date_str[10:12]
            second = date_str[12:14]
            time_part = f"T{hour}:{minute}:{second}"
        # BUG FIX: the original parsed the dash/colon-formatted string with
        # "%Y%m%d%H%M%S", which can never match the string built above, so
        # every call raised ValueError and callers always fell back to the
        # raw PDF date.  Use a format matching what we actually construct.
        fmt = "%Y-%m-%dT%H:%M:%S" if time_part else "%Y-%m-%d"
        return datetime.strptime(f"{year}-{month}-{day}{time_part}", fmt).isoformat()  # noqa: DTZ007
    return date_str
148
|
+
|
149
|
+
|
150
|
+
def _extract_date_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Store creation and modification timestamps, ISO-formatted when possible.

    Falls back to the raw decoded value when the PDF date cannot be parsed.
    """
    for target, primary_key, fallback_key in (
        ("created_at", "creationdate", "createdate"),
        ("modified_at", "moddate", "modificationdate"),
    ):
        raw = pdf_info.get(primary_key) or pdf_info.get(fallback_key)
        if not raw:
            continue
        decoded = decode_text(raw)
        try:
            result[target] = _parse_date_string(decoded)
        except (ValueError, IndexError):
            result[target] = decoded
164
|
+
|
165
|
+
|
166
|
+
def _extract_creator_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Record creator/producer info under the ``created_by`` key."""
    if creator := pdf_info.get("creator"):
        result["created_by"] = decode_text(creator)

    producer = pdf_info.get("producer")
    if not producer:
        return
    producer_text = decode_text(producer)
    if "created_by" not in result:
        result["created_by"] = producer_text
    elif producer_text not in result["created_by"]:
        # Append the producer only when it isn't already part of the value.
        result["created_by"] = f"{result['created_by']} (Producer: {producer_text})"
176
|
+
|
177
|
+
|
178
|
+
def _extract_document_dimensions(document: Document, result: Metadata) -> None:
|
179
|
+
first_page = document.pages[0]
|
180
|
+
if hasattr(first_page, "width") and hasattr(first_page, "height"):
|
181
|
+
result["width"] = int(first_page.width)
|
182
|
+
result["height"] = int(first_page.height)
|
183
|
+
|
184
|
+
|
185
|
+
def _format_outline(entries: list[Any], level: int = 0) -> list[str]:
|
186
|
+
outline_text: list[str] = []
|
187
|
+
for entry in entries:
|
188
|
+
if hasattr(entry, "title") and entry.title:
|
189
|
+
indent = " " * level
|
190
|
+
outline_text.append(f"{indent}- {entry.title}")
|
191
|
+
if hasattr(entry, "children") and entry.children:
|
192
|
+
_format_outline(entry.children, level + 1)
|
193
|
+
|
194
|
+
return outline_text
|
195
|
+
|
196
|
+
|
197
|
+
def _generate_outline_description(document: Document) -> str:
    """Render the document outline as a "Table of Contents" block, or ""."""
    lines = _format_outline(cast("list[Any]", document.outline))
    if not lines:
        return ""
    return "Table of Contents:\n" + "\n".join(lines)
201
|
+
|
202
|
+
|
203
|
+
def _generate_document_summary(document: Document) -> str:
    """Build a short prose summary of the document's basic properties."""
    parts: list[str] = []

    n_pages = len(document.pages)
    suffix = "" if n_pages == 1 else "s"
    parts.append(f"PDF document with {n_pages} page{suffix}.")

    if hasattr(document, "pdf_version"):
        parts.append(f"PDF version {document.pdf_version}.")

    if getattr(document, "is_encrypted", False):
        parts.append("Document is encrypted.")

    if getattr(document, "encryption_method", None):
        parts.append(f"Encryption: {document.encryption_method}.")

    if granted := _collect_document_permissions(document):
        parts.append(f"Document is {', '.join(granted)}.")

    if getattr(document, "status", None):
        parts.append(f"Status: {decode_text(document.status)}.")

    if getattr(document, "is_pdf_a", False):
        level = getattr(document, "pdf_a_level", None)
        parts.append(f"PDF/A-{level} compliant." if level else "PDF/A compliant.")

    return " ".join(parts)
233
|
+
|
234
|
+
|
235
|
+
def _collect_document_permissions(document: Document) -> list[str]:
|
236
|
+
permissions = []
|
237
|
+
if document.is_printable:
|
238
|
+
permissions.append("printable")
|
239
|
+
if document.is_modifiable:
|
240
|
+
permissions.append("modifiable")
|
241
|
+
if document.is_extractable:
|
242
|
+
permissions.append("extractable")
|
243
|
+
return permissions
|
244
|
+
|
245
|
+
|
246
|
+
def _extract_structure_information(document: Document, result: Metadata) -> None:
|
247
|
+
"""Extract language and subtitle from document structure."""
|
248
|
+
if document.structure:
|
249
|
+
languages = set()
|
250
|
+
subtitle = None
|
251
|
+
|
252
|
+
def extract_languages(elements: list[Any]) -> None:
|
253
|
+
nonlocal subtitle
|
254
|
+
for element in elements:
|
255
|
+
if hasattr(element, "language") and element.language:
|
256
|
+
languages.add(element.language.lower())
|
257
|
+
|
258
|
+
if (
|
259
|
+
subtitle is None
|
260
|
+
and hasattr(element, "role")
|
261
|
+
and element.role == "H1"
|
262
|
+
and hasattr(element, "text")
|
263
|
+
and element.text
|
264
|
+
):
|
265
|
+
subtitle = decode_text(element.text)
|
266
|
+
|
267
|
+
if hasattr(element, "children") and element.children:
|
268
|
+
extract_languages(element.children)
|
269
|
+
|
270
|
+
extract_languages(cast("list[Any]", document.structure))
|
271
|
+
|
272
|
+
if languages:
|
273
|
+
result["languages"] = list(languages)
|
274
|
+
|
275
|
+
if subtitle and "title" in result and subtitle != result["title"]:
|
276
|
+
result["subtitle"] = subtitle
|
kreuzberg/_registry.py
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from functools import lru_cache
|
4
|
+
from typing import TYPE_CHECKING, ClassVar
|
5
|
+
|
6
|
+
from kreuzberg._extractors._html import HTMLExtractor
|
7
|
+
from kreuzberg._extractors._image import ImageExtractor
|
8
|
+
from kreuzberg._extractors._pandoc import (
|
9
|
+
BibliographyExtractor,
|
10
|
+
EbookExtractor,
|
11
|
+
LaTeXExtractor,
|
12
|
+
MarkdownExtractor,
|
13
|
+
MiscFormatExtractor,
|
14
|
+
OfficeDocumentExtractor,
|
15
|
+
StructuredTextExtractor,
|
16
|
+
TabularDataExtractor,
|
17
|
+
XMLBasedExtractor,
|
18
|
+
)
|
19
|
+
from kreuzberg._extractors._pdf import PDFExtractor
|
20
|
+
from kreuzberg._extractors._presentation import PresentationExtractor
|
21
|
+
from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
|
22
|
+
|
23
|
+
if TYPE_CHECKING:
|
24
|
+
from kreuzberg._extractors._base import Extractor
|
25
|
+
from kreuzberg._types import ExtractionConfig
|
26
|
+
|
27
|
+
|
28
|
+
class ExtractorRegistry:
    """Manages extractors for different MIME types and their configurations.

    This class provides functionality to register, unregister, and retrieve
    extractors based on MIME types. It supports both synchronous and asynchronous
    operations for managing extractors. A default set of extractors is also
    maintained alongside user-registered extractors.
    """

    # Built-in extractors, tried after any user-registered ones.  Order
    # matters: the first extractor whose supports_mimetype() returns True wins.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # User-registered extractors; these take precedence over the defaults.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    # NOTE(review): stacking @classmethod on top of @lru_cache relies on
    # classmethod wrapping another descriptor, which is deprecated in
    # Python 3.11 and removed in 3.13 — confirm the supported Python range.
    # The cache is also unbounded and keyed on (cls, mime_type, config), so
    # `config` must be hashable and each cached config object stays alive
    # until the cache is cleared.
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Gets the extractor for the mimetype.

        Args:
            mime_type: The mime type of the content.
            config: Extraction options object, defaults to the default object.

        Returns:
            The extractor, or None when no extractor supports the mime type
            (or when mime_type itself is None/empty).
        """
        # Registered extractors come first so user additions can override
        # the built-in defaults for the same mime type.
        extractors: list[type[Extractor]] = [
            *cls._registered_extractors,
            *cls._default_extractors,
        ]
        if mime_type:
            for extractor in extractors:
                if extractor.supports_mimetype(mime_type):
                    return extractor(mime_type=mime_type, config=config)

        return None

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Add an extractor to the registry.

        Note:
            Extractors are tried in the order they are added: first added, first tried.

        Args:
            extractor: The extractor to add.

        Returns:
            None
        """
        cls._registered_extractors.append(extractor)
        # The registry changed, so previously cached lookups are stale.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Remove an extractor from the registry.

        Args:
            extractor: The extractor to remove.

        Returns:
            None
        """
        try:
            cls._registered_extractors.remove(extractor)
            # Invalidate cached lookups that may reference the removed extractor.
            cls.get_extractor.cache_clear()
        except ValueError:
            # Removing an extractor that was never registered is a no-op.
            pass