PyPI - kreuzberg - Versions diffs - 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

kreuzberg/__init__.py +6 -2
kreuzberg/_constants.py +6 -0
kreuzberg/_html.py +32 -0
kreuzberg/_mime_types.py +109 -1
kreuzberg/_pandoc.py +122 -169
kreuzberg/_pdf.py +189 -0
kreuzberg/_pptx.py +88 -0
kreuzberg/_string.py +5 -8
kreuzberg/_sync.py +6 -1
kreuzberg/_tesseract.py +98 -71
kreuzberg/_tmp.py +37 -0
kreuzberg/_types.py +71 -0
kreuzberg/_xlsx.py +92 -0
kreuzberg/extraction.py +269 -64
kreuzberg-2.0.0.dist-info/METADATA +419 -0
kreuzberg-2.0.0.dist-info/RECORD +21 -0
kreuzberg/_extractors.py +0 -280
kreuzberg-1.7.0.dist-info/METADATA +0 -342
kreuzberg-1.7.0.dist-info/RECORD +0 -15
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0

kreuzberg/__init__.py CHANGED Viewed

@@ -1,9 +1,13 @@
-from .exceptions import KreuzbergError, ParsingError, ValidationError
-from .extraction import ExtractionResult, extract_bytes, extract_file
+from ._types import ExtractionResult, Metadata
+from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
+from .extraction import extract_bytes, extract_file
 __all__ = [
     "ExtractionResult",
     "KreuzbergError",
+    "Metadata",
+    "MissingDependencyError",
+    "OCRError",
     "ParsingError",
     "ValidationError",
     "extract_bytes",

kreuzberg/_constants.py ADDED Viewed

@@ -0,0 +1,6 @@
+from __future__ import annotations
+from multiprocessing import cpu_count
+from typing import Final
+DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)

kreuzberg/_html.py ADDED Viewed

@@ -0,0 +1,32 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import html_to_markdown
+from anyio import Path as AsyncPath
+from kreuzberg import ExtractionResult
+from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
+from kreuzberg._string import normalize_spaces, safe_decode
+from kreuzberg._sync import run_sync
+if TYPE_CHECKING:
+    from pathlib import Path
+async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
+    """Extract text from an HTML string.
+    Args:
+        file_path_or_contents: The HTML content.
+    Returns:
+        The extracted text content.
+    """
+    content = (
+        safe_decode(file_path_or_contents)
+        if isinstance(file_path_or_contents, bytes)
+        else await AsyncPath(file_path_or_contents).read_text()
+    )
+    result = await run_sync(html_to_markdown.convert_to_markdown, content)
+    return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})

kreuzberg/_mime_types.py CHANGED Viewed

@@ -1,16 +1,30 @@
 from __future__ import annotations
+from mimetypes import guess_type
+from pathlib import Path
 from typing import TYPE_CHECKING, Final
+from kreuzberg.exceptions import ValidationError
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
+    from os import PathLike
 HTML_MIME_TYPE: Final = "text/html"
 MARKDOWN_MIME_TYPE: Final = "text/markdown"
 PDF_MIME_TYPE: Final = "application/pdf"
 PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+# Excel formats
 EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
+EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
+EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macroEnabled.12"
+EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
+EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
+# OpenDocument spreadsheet format
+OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"  # ods
 PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
 IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -85,9 +99,103 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
     "text/x-rst",
 }
+SPREADSHEET_MIME_TYPES: Final[set[str]] = {
+    EXCEL_MIME_TYPE,
+    EXCEL_BINARY_MIME_TYPE,
+    EXCEL_MACRO_MIME_TYPE,
+    EXCEL_BINARY_2007_MIME_TYPE,
+    EXCEL_ADDON_MIME_TYPE,
+    EXCEL_TEMPLATE_MIME_TYPE,
+    OPENDOC_SPREADSHEET_MIME_TYPE,
+}
+EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
+    ".txt": PLAIN_TEXT_MIME_TYPE,
+    ".md": MARKDOWN_MIME_TYPE,
+    ".pdf": PDF_MIME_TYPE,
+    ".html": HTML_MIME_TYPE,
+    ".htm": HTML_MIME_TYPE,
+    ".xlsx": EXCEL_MIME_TYPE,
+    ".xls": EXCEL_BINARY_MIME_TYPE,
+    ".xlsm": EXCEL_MACRO_MIME_TYPE,
+    ".xlsb": EXCEL_BINARY_2007_MIME_TYPE,
+    ".xlam": EXCEL_ADDON_MIME_TYPE,
+    ".xla": EXCEL_TEMPLATE_MIME_TYPE,
+    ".ods": OPENDOC_SPREADSHEET_MIME_TYPE,
+    ".pptx": POWER_POINT_MIME_TYPE,
+    ".bmp": "image/bmp",
+    ".gif": "image/gif",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".png": "image/png",
+    ".tiff": "image/tiff",
+    ".tif": "image/tiff",
+    ".webp": "image/webp",
+    ".jp2": "image/jp2",
+    ".jpx": "image/jpx",
+    ".jpm": "image/jpm",
+    ".mj2": "image/mj2",
+    ".pnm": "image/x-portable-anymap",
+    ".pbm": "image/x-portable-bitmap",
+    ".pgm": "image/x-portable-graymap",
+    ".ppm": "image/x-portable-pixmap",
+    ".csv": "text/csv",
+    ".tsv": "text/tab-separated-values",
+    ".rst": "text/x-rst",
+    ".org": "text/x-org",
+    ".epub": "application/epub+zip",
+    ".rtf": "application/rtf",
+    ".odt": "application/vnd.oasis.opendocument.text",
+    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    ".bib": "application/x-bibtex",
+    ".ipynb": "application/x-ipynb+json",
+    ".tex": "application/x-latex",
+}
 SUPPORTED_MIME_TYPES: Final[set[str]] = (
     PLAIN_TEXT_MIME_TYPES
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
-    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
+    | SPREADSHEET_MIME_TYPES
+    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
 )
+def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
+    """Validate and detect the MIME type for a given file.
+    Args:
+        file_path: The path to the file.
+        mime_type: Optional explicit MIME type. If provided, this will be validated.
+            If not provided, the function will attempt to detect the MIME type.
+    Raises:
+        ValidationError: If the MIME type is not supported or cannot be determined.
+    Returns:
+        The validated MIME type.
+    """
+    path = Path(file_path)
+    if not mime_type:
+        # Try to determine MIME type from file extension first
+        ext = path.suffix.lower()
+        mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
+        if not mime_type:  # pragma: no cover
+            raise ValidationError(
+                "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
+                context={"input_file": str(path), "extension": ext},
+            )
+    if mime_type in SUPPORTED_MIME_TYPES:
+        return mime_type
+    for supported_mime_type in SUPPORTED_MIME_TYPES:
+        if mime_type.startswith(supported_mime_type):
+            return supported_mime_type
+    raise ValidationError(
+        f"Unsupported mime type: {mime_type}",
+        context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
+    )

kreuzberg/_pandoc.py CHANGED Viewed

@@ -1,26 +1,29 @@
 from __future__ import annotations
 import subprocess
-from asyncio import gather
-from dataclasses import dataclass
+import sys
+from functools import partial
 from json import JSONDecodeError, loads
-from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
+from typing import TYPE_CHECKING, Any, Final, Literal, cast
+from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
+from kreuzberg._tmp import create_temp_file
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
     from os import PathLike
-try:  # pragma: no cover
-    from typing import NotRequired  # type: ignore[attr-defined]
-except ImportError:  # pragma: no cover
-    from typing_extensions import NotRequired
+if sys.version_info < (3, 11):  # pragma: no cover
+    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 version_ref: Final[dict[str, bool]] = {"checked": False}
@@ -145,65 +148,6 @@ MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
 }
-class Metadata(TypedDict, total=False):
-    """Document metadata extracted from Pandoc document.
-    All fields are optional but will only be included if they contain non-empty values.
-    Any field that would be empty or None is omitted from the dictionary.
-    """
-    title: NotRequired[str]
-    """Document title."""
-    subtitle: NotRequired[str]
-    """Document subtitle."""
-    abstract: NotRequired[str | list[str]]
-    """Document abstract, summary or description."""
-    authors: NotRequired[list[str]]
-    """List of document authors."""
-    date: NotRequired[str]
-    """Document date as string to preserve original format."""
-    subject: NotRequired[str]
-    """Document subject or topic."""
-    description: NotRequired[str]
-    """Extended description."""
-    keywords: NotRequired[list[str]]
-    """Keywords or tags."""
-    categories: NotRequired[list[str]]
-    """Categories or classifications."""
-    version: NotRequired[str]
-    """Version identifier."""
-    language: NotRequired[str]
-    """Document language code."""
-    references: NotRequired[list[str]]
-    """Reference entries."""
-    citations: NotRequired[list[str]]
-    """Citation identifiers."""
-    copyright: NotRequired[str]
-    """Copyright information."""
-    license: NotRequired[str]
-    """License information."""
-    identifier: NotRequired[str]
-    """Document identifier."""
-    publisher: NotRequired[str]
-    """Publisher name."""
-    contributors: NotRequired[list[str]]
-    """Additional contributors."""
-    creator: NotRequired[str]
-    """Document creator."""
-    institute: NotRequired[str | list[str]]
-    """Institute or organization."""
-@dataclass
-class PandocResult:
-    """Result of a pandoc conversion including content and metadata."""
-    content: str
-    """The processed markdown content."""
-    metadata: Metadata
-    """Document metadata extracted from the source."""
 def _extract_inline_text(node: dict[str, Any]) -> str | None:
     if node_type := node.get(TYPE_FIELD):
         if node_type == INLINE_STR:
@@ -246,13 +190,14 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
         if node_type == META_LIST:
             results = []
             for value in [value for item in content if (value := _extract_meta_value(item))]:
-                if isinstance(value, list):
-                    results.extend(value)  # pragma: no cover
+                if isinstance(value, list):  # pragma: no cover
+                    results.extend(value)
                 else:
                     results.append(value)
             return results
-        if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
+        # This branch is only taken for complex metadata blocks which we don't use
+        if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:  # pragma: no cover
             block_texts = []
             for block in blocks:
                 block_content = block.get(CONTENT_FIELD, [])
@@ -317,134 +262,142 @@ async def _validate_pandoc_version() -> None:
         raise MissingDependencyError("Pandoc is not installed.") from e
-async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
+async def _handle_extract_metadata(
+    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
+) -> Metadata:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
+    metadata_file, unlink = await create_temp_file(".json")
+    try:
+        command = [
+            "pandoc",
+            str(input_file),
+            f"--from={pandoc_type}",
+            "--to=json",
+            "--standalone",
+            "--quiet",
+            "--output",
+            metadata_file,
+        ]
-    with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
-        try:
-            command = [
-                "pandoc",
-                str(input_file),
-                f"--from={pandoc_type}",
-                "--to=json",
-                "--standalone",
-                "--quiet",
-                "--output",
-                metadata_file.name,
-            ]
-            result = await run_sync(
-                subprocess.run,
-                command,
-                capture_output=True,
-            )
-            if result.returncode != 0:
-                raise ParsingError(
-                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
-                )
-            json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
-            return _extract_metadata(json_data)
-        except (RuntimeError, OSError, JSONDecodeError) as e:
-            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
-        finally:
-            metadata_file.close()
-            await AsyncPath(metadata_file.name).unlink()
+        result = await to_process.run_sync(
+            partial(subprocess.run, capture_output=True),
+            command,
+            cancellable=True,
+            limiter=CapacityLimiter(max_processes),
+        )
+        if result.returncode != 0:
+            raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
+        json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
+        return _extract_metadata(json_data)
+    except (RuntimeError, OSError, JSONDecodeError) as e:
+        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+    finally:
+        await unlink()
 async def _handle_extract_file(
-    input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
+    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
 ) -> str:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
+    output_path, unlink = await create_temp_file(".md")
+    try:
+        command = [
+            "pandoc",
+            str(input_file),
+            f"--from={pandoc_type}",
+            "--to=markdown",
+            "--standalone",
+            "--wrap=preserve",
+            "--quiet",
+        ]
+        command.extend(["--output", str(output_path)])
+        result = await to_process.run_sync(
+            partial(subprocess.run, capture_output=True),
+            command,
+            cancellable=True,
+            limiter=CapacityLimiter(max_processes),
+        )
+        if result.returncode != 0:
+            raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
+        text = await AsyncPath(output_path).read_text("utf-8")
+        return normalize_spaces(text)
+    except (RuntimeError, OSError) as e:
+        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+    finally:
+        await unlink()
-    with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
-        try:
-            command = [
-                "pandoc",
-                str(input_file),
-                f"--from={pandoc_type}",
-                "--to=markdown",
-                "--standalone",
-                "--wrap=preserve",
-                "--quiet",
-                "--output",
-                output_file.name,
-            ]
-            if extra_args:
-                command.extend(extra_args)
-            result = await run_sync(
-                subprocess.run,
-                command,
-                capture_output=True,
-            )
-            if result.returncode != 0:
-                raise ParsingError(
-                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
-                )
-            text = await AsyncPath(output_file.name).read_text("utf-8")
-            return normalize_spaces(text)
-        except (RuntimeError, OSError) as e:
-            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
-        finally:
-            output_file.close()
-            await AsyncPath(output_file.name).unlink()
-async def process_file(
-    input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
-) -> PandocResult:
+async def process_file_with_pandoc(
+    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
+) -> ExtractionResult:
     """Process a single file using Pandoc and convert to markdown.
     Args:
         input_file: The path to the file to process.
         mime_type: The mime type of the file.
-        extra_args: Additional Pandoc command line arguments.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+    Raises:
+        ParsingError: If the file data could not be extracted.
     Returns:
-        PandocResult containing processed content and metadata.
+        ExtractionResult
     """
     await _validate_pandoc_version()
-    metadata, content = await gather(
-        *[
-            _handle_extract_metadata(input_file, mime_type=mime_type),
-            _handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
-        ]
-    )
-    return PandocResult(
-        content=content,  # type: ignore[arg-type]
-        metadata=metadata,  # type: ignore[arg-type]
+    _get_pandoc_type_from_mime_type(mime_type)
+    metadata: Metadata = {}
+    content: str = ""
+    try:
+        async with create_task_group() as tg:
+            async def _get_metadata() -> None:
+                nonlocal metadata
+                metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
+            async def _get_content() -> None:
+                nonlocal content
+                content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
+            tg.start_soon(_get_metadata)
+            tg.start_soon(_get_content)
+    except ExceptionGroup as eg:
+        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
+    return ExtractionResult(
+        content=normalize_spaces(content),
+        metadata=metadata,
+        mime_type=MARKDOWN_MIME_TYPE,
     )
-async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
+async def process_content_with_pandoc(
+    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
+) -> ExtractionResult:
     """Process content using Pandoc and convert to markdown.
     Args:
         content: The content to process.
         mime_type: The mime type of the content.
-        extra_args: Additional Pandoc command line arguments.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Returns:
-        PandocResult containing processed content and metadata.
+        ExtractionResult
     """
     extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
+    input_file, unlink = await create_temp_file(f".{extension}")
-    with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
-        try:
-            await AsyncPath(input_file.name).write_bytes(content)
-            return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
+    await AsyncPath(input_file).write_bytes(content)
+    result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
-        finally:
-            input_file.close()
-            await AsyncPath(input_file.name).unlink()
+    await unlink()
+    return result

kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl