PyPI - kreuzberg - Versions diffs - 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

kreuzberg/__init__.py +17 -2
kreuzberg/_constants.py +6 -0
kreuzberg/_html.py +32 -0
kreuzberg/_mime_types.py +109 -1
kreuzberg/_pandoc.py +122 -169
kreuzberg/_pdf.py +189 -0
kreuzberg/_pptx.py +88 -0
kreuzberg/_string.py +5 -8
kreuzberg/_sync.py +6 -1
kreuzberg/_tesseract.py +97 -200
kreuzberg/_tmp.py +37 -0
kreuzberg/_types.py +71 -0
kreuzberg/_xlsx.py +92 -0
kreuzberg/extraction.py +269 -64
kreuzberg-2.0.1.dist-info/METADATA +451 -0
kreuzberg-2.0.1.dist-info/RECORD +21 -0
kreuzberg/_extractors.py +0 -280
kreuzberg-1.7.0.dist-info/METADATA +0 -342
kreuzberg-1.7.0.dist-info/RECORD +0 -15
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/LICENSE +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/WHEEL +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/top_level.txt +0 -0

kreuzberg/_tesseract.py CHANGED Viewed

@@ -2,152 +2,33 @@ from __future__ import annotations
 import re
 import subprocess
-from asyncio import gather
+import sys
 from enum import Enum
+from functools import partial
 from os import PathLike
-from tempfile import NamedTemporaryFile
-from typing import Any, Literal, TypeVar, Union
+from typing import Final, TypeVar, Union, cast
+from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
 from PIL.Image import Image
+from kreuzberg import ExtractionResult, ParsingError
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
+from kreuzberg._tmp import create_temp_file
 from kreuzberg.exceptions import MissingDependencyError, OCRError
+if sys.version_info < (3, 11):  # pragma: no cover
+    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
+MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
 version_ref = {"checked": False}
 T = TypeVar("T", bound=Union[Image, PathLike[str], str])
-SupportedLanguages = Literal[
-    "afr",
-    "amh",
-    "ara",
-    "asm",
-    "aze",
-    "aze_cyrl",
-    "bel",
-    "ben",
-    "bod",
-    "bos",
-    "bre",
-    "bul",
-    "cat",
-    "ceb",
-    "ces",
-    "chi_sim",
-    "chi_tra",
-    "chr",
-    "cos",
-    "cym",
-    "dan",
-    "dan_frak",
-    "deu",
-    "deu_frak",
-    "deu_latf",
-    "dzo",
-    "ell",
-    "eng",
-    "enm",
-    "epo",
-    "equ",
-    "est",
-    "eus",
-    "fao",
-    "fas",
-    "fil",
-    "fin",
-    "fra",
-    "frk",
-    "frm",
-    "fry",
-    "gla",
-    "gle",
-    "glg",
-    "grc",
-    "guj",
-    "hat",
-    "heb",
-    "hin",
-    "hrv",
-    "hun",
-    "hye",
-    "iku",
-    "ind",
-    "isl",
-    "ita",
-    "ita_old",
-    "jav",
-    "jpn",
-    "kan",
-    "kat",
-    "kat_old",
-    "kaz",
-    "khm",
-    "kir",
-    "kmr",
-    "kor",
-    "kor_vert",
-    "kur",
-    "lao",
-    "lat",
-    "lav",
-    "lit",
-    "ltz",
-    "mal",
-    "mar",
-    "mkd",
-    "mlt",
-    "mon",
-    "mri",
-    "msa",
-    "mya",
-    "nep",
-    "nld",
-    "nor",
-    "oci",
-    "ori",
-    "osd",
-    "pan",
-    "pol",
-    "por",
-    "pus",
-    "que",
-    "ron",
-    "rus",
-    "san",
-    "sin",
-    "slk",
-    "slk_frak",
-    "slv",
-    "snd",
-    "spa",
-    "spa_old",
-    "sqi",
-    "srp",
-    "srp_latn",
-    "sun",
-    "swa",
-    "swe",
-    "syr",
-    "tam",
-    "tat",
-    "tel",
-    "tgk",
-    "tgl",
-    "tha",
-    "tir",
-    "ton",
-    "tur",
-    "uig",
-    "ukr",
-    "urd",
-    "uzb",
-    "uzb_cyrl",
-    "vie",
-    "yid",
-    "yor",
-]
 class PSMMode(Enum):
     """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
@@ -189,7 +70,7 @@ async def validate_tesseract_version() -> None:
         command = ["tesseract", "--version"]
         result = await run_sync(subprocess.run, command, capture_output=True)
         version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
-        if not version_match or int(version_match.group(1)) < 5:
+        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError("Tesseract version 5 or above is required.")
         version_ref["checked"] = True
@@ -198,94 +79,96 @@ async def validate_tesseract_version() -> None:
 async def process_file(
-    input_file: str | PathLike[str], *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any
-) -> str:
+    input_file: str | PathLike[str],
+    *,
+    language: str,
+    psm: PSMMode,
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> ExtractionResult:
     """Process a single image file using Tesseract OCR.
     Args:
         input_file: The path to the image file to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         OCRError: If OCR fails to extract text from the image.
     Returns:
-        str: Extracted text from the image.
+        ExtractionResult: The extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
-        # this is needed because tesseract adds .txt to the output file
-        try:
-            output_file_name = output_file.name.replace(".txt", "")
-            command = [
-                "tesseract",
-                str(input_file),
-                output_file_name,
-                "-l",
-                language,
-                "--psm",
-                str(psm.value),
-            ]
-            for key, value in kwargs.items():
-                command.extend(["-c", f"{key}={value}"])
-            result = await run_sync(
-                subprocess.run,
-                command,
-                capture_output=True,
-            )
-            if not result.returncode == 0:
-                raise OCRError("OCR failed with a non-0 return code.")
-            output = await AsyncPath(output_file.name).read_text("utf-8")
-            return output.strip()
-        except (RuntimeError, OSError) as e:
-            raise OCRError("Failed to OCR using tesseract") from e
-        finally:
-            output_file.close()
-            await AsyncPath(output_file.name).unlink()
-async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
+    output_path, unlink = await create_temp_file(".txt")
+    try:
+        output_base = str(output_path).replace(".txt", "")
+        command = [
+            "tesseract",
+            str(input_file),
+            output_base,
+            "-l",
+            language,
+            "--psm",
+            str(psm.value),
+        ]
+        result = await to_process.run_sync(
+            partial(subprocess.run, capture_output=True),
+            command,
+            limiter=CapacityLimiter(max_processes),
+            cancellable=True,
+        )
+        if not result.returncode == 0:
+            raise OCRError("OCR failed with a non-0 return code.")
+        output = await AsyncPath(output_path).read_text("utf-8")
+        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+    except (RuntimeError, OSError) as e:
+        raise OCRError("Failed to OCR using tesseract") from e
+    finally:
+        await unlink()
+async def process_image(
+    image: Image,
+    *,
+    language: str,
+    psm: PSMMode,
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> ExtractionResult:
     """Process a single Pillow Image using Tesseract OCR.
     Args:
         image: The Pillow Image to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Returns:
-        str: Extracted text from the image.
+        ExtractionResult: The extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
-        try:
-            await run_sync(image.save, image_file.name, format="PNG")
-            return await process_file(image_file.name, language=language, psm=psm, **kwargs)
-        finally:
-            image_file.close()
-            await AsyncPath(image_file.name).unlink()
+    image_path, unlink = await create_temp_file(".png")
+    await run_sync(image.save, str(image_path), format="PNG")
+    result = await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
+    await unlink()
+    return result
 async def process_image_with_tesseract(
     image: Image | PathLike[str] | str,
     *,
-    language: SupportedLanguages = "eng",
+    language: str = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    **kwargs: Any,
-) -> str:
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> ExtractionResult:
     """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
     Args:
         image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -296,10 +179,10 @@ async def process_image_with_tesseract(
     await validate_tesseract_version()
     if isinstance(image, Image):
-        return await process_image(image, language=language, psm=psm, **kwargs)
+        return await process_image(image, language=language, psm=psm, max_processes=max_processes)
     if isinstance(image, (PathLike, str)):
-        return await process_file(image, language=language, psm=psm, **kwargs)
+        return await process_file(image, language=language, psm=psm, max_processes=max_processes)
     raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
@@ -307,22 +190,36 @@ async def process_image_with_tesseract(
 async def batch_process_images(
     images: list[T],
     *,
-    language: SupportedLanguages = "eng",
+    language: str = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    **kwargs: Any,
-) -> list[str]:
-    """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> list[ExtractionResult]:
+    """Run Tesseract OCR asynchronously on multiple images with controlled concurrency.
     Args:
         images: A list of Pillow Images, paths or strings to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+    Raises:
+        ParsingError: If OCR fails to extract text from any of the images.
     Returns:
-        Extracted text as a string (for single image) or a list of strings (for multiple images).
+        List of ExtractionResult objects, one per input image.
     """
     await validate_tesseract_version()
-    return await gather(
-        *[process_image_with_tesseract(image, language=language, psm=psm, **kwargs) for image in images]
-    )
+    results = cast(list[ExtractionResult], list(range(len(images))))
+    async def _process_image(index: int, image: T) -> None:
+        results[index] = await process_image_with_tesseract(
+            image, language=language, psm=psm, max_processes=max_processes
+        )
+    try:
+        async with create_task_group() as tg:
+            for i, image in enumerate(images):
+                tg.start_soon(_process_image, i, image)
+        return results
+    except ExceptionGroup as eg:
+        raise ParsingError("Failed to process images with Tesseract") from eg

kreuzberg/_tmp.py ADDED Viewed

@@ -0,0 +1,37 @@
+from __future__ import annotations
+from contextlib import suppress
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING, Callable
+from anyio import Path as AsyncPath
+from kreuzberg._sync import run_sync
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Coroutine
+async def create_temp_file(
+    extension: str, content: bytes | None = None
+) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
+    """Create a temporary file that is closed.
+    Args:
+        extension: The file extension.
+        content: The content to write to the file.
+    Returns:
+        The temporary file path.
+    """
+    file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
+    if content:
+        await AsyncPath(file.name).write_bytes(content)
+    await run_sync(file.close)
+    async def unlink() -> None:
+        with suppress(OSError, PermissionError):
+            await AsyncPath(file.name).unlink(missing_ok=True)
+    return Path(file.name), unlink

kreuzberg/_types.py ADDED Viewed

@@ -0,0 +1,71 @@
+from __future__ import annotations
+import sys
+from typing import NamedTuple, TypedDict
+if sys.version_info < (3, 11):  # pragma: no cover
+    from typing_extensions import NotRequired
+else:  # pragma: no cover
+    from typing import NotRequired
+class Metadata(TypedDict, total=False):
+    """Document metadata.
+    All fields are optional but will only be included if they contain non-empty values.
+    Any field that would be empty or None is omitted from the dictionary.
+    Different documents and extraction methods will yield different metadata.
+    """
+    title: NotRequired[str]
+    """Document title."""
+    subtitle: NotRequired[str]
+    """Document subtitle."""
+    abstract: NotRequired[str | list[str]]
+    """Document abstract, summary or description."""
+    authors: NotRequired[list[str]]
+    """List of document authors."""
+    date: NotRequired[str]
+    """Document date as string to preserve original format."""
+    subject: NotRequired[str]
+    """Document subject or topic."""
+    description: NotRequired[str]
+    """Extended description."""
+    keywords: NotRequired[list[str]]
+    """Keywords or tags."""
+    categories: NotRequired[list[str]]
+    """Categories or classifications."""
+    version: NotRequired[str]
+    """Version identifier."""
+    language: NotRequired[str]
+    """Document language code."""
+    references: NotRequired[list[str]]
+    """Reference entries."""
+    citations: NotRequired[list[str]]
+    """Citation identifiers."""
+    copyright: NotRequired[str]
+    """Copyright information."""
+    license: NotRequired[str]
+    """License information."""
+    identifier: NotRequired[str]
+    """Document identifier."""
+    publisher: NotRequired[str]
+    """Publisher name."""
+    contributors: NotRequired[list[str]]
+    """Additional contributors."""
+    creator: NotRequired[str]
+    """Document creator."""
+    institute: NotRequired[str | list[str]]
+    """Institute or organization."""
+class ExtractionResult(NamedTuple):
+    """The result of a file extraction."""
+    content: str
+    """The extracted content."""
+    mime_type: str
+    """The mime type of the content."""
+    metadata: Metadata
+    """The metadata of the content."""

kreuzberg/_xlsx.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+import csv
+from io import StringIO
+from typing import TYPE_CHECKING, cast
+from anyio import Path as AsyncPath
+from anyio import create_task_group
+from python_calamine import CalamineWorkbook
+from kreuzberg import ExtractionResult, ParsingError
+from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
+from kreuzberg._pandoc import process_file_with_pandoc
+from kreuzberg._string import normalize_spaces
+from kreuzberg._sync import run_sync
+from kreuzberg._tmp import create_temp_file
+if TYPE_CHECKING:  # pragma: no cover
+    from pathlib import Path
+async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
+    """Extract text from an XLSX file by converting it to CSV and then to markdown.
+    Args:
+        input_file: The path to the XLSX file.
+    Returns:
+        The extracted text content.
+    Raises:
+        ParsingError: If the XLSX file could not be parsed.
+    """
+    try:
+        workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
+        results = cast(list[str], [None] * len(workbook.sheet_names))
+        async def convert_sheet_to_text(sheet_name: str) -> None:
+            nonlocal results
+            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
+            csv_buffer = StringIO()
+            writer = csv.writer(csv_buffer)
+            for row in values:
+                writer.writerow(row)
+            csv_data = csv_buffer.getvalue()
+            csv_buffer.close()
+            from kreuzberg._tmp import create_temp_file
+            csv_path, unlink = await create_temp_file(".csv")
+            await AsyncPath(csv_path).write_text(csv_data)
+            result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
+            results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
+            await unlink()
+        async with create_task_group() as tg:
+            for sheet_name in workbook.sheet_names:
+                tg.start_soon(convert_sheet_to_text, sheet_name)
+        return ExtractionResult(
+            content="\n\n".join(results),
+            mime_type=MARKDOWN_MIME_TYPE,
+            metadata={},
+        )
+    except Exception as e:
+        raise ParsingError(
+            "Could not extract text from XLSX",
+            context={
+                "error": str(e),
+            },
+        ) from e
+async def extract_xlsx_content(content: bytes) -> ExtractionResult:
+    """Extract text from an XLSX file content.
+    Args:
+        content: The XLSX file content.
+    Returns:
+        The extracted text content.
+    """
+    xlsx_path, unlink = await create_temp_file(".xlsx")
+    await AsyncPath(xlsx_path).write_bytes(content)
+    result = await extract_xlsx_file(xlsx_path)
+    await unlink()
+    return result

kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl