PyPI - kreuzberg - Versions diffs - 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl - Mend

kreuzberg 2.0.1-py3-none-any.whl → 2.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

kreuzberg/__init__.py +2 -0
kreuzberg/_constants.py +3 -1
kreuzberg/_html.py +1 -2
kreuzberg/_mime_types.py +3 -2
kreuzberg/_pandoc.py +38 -75
kreuzberg/_pdf.py +20 -19
kreuzberg/_string.py +1 -1
kreuzberg/_sync.py +44 -3
kreuzberg/_tesseract.py +49 -43
kreuzberg/_xlsx.py +32 -36
kreuzberg/exceptions.py +20 -1
kreuzberg/extraction.py +4 -6
{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/METADATA +11 -16
kreuzberg-2.1.1.dist-info/RECORD +21 -0
{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/WHEEL +1 -1
kreuzberg-2.0.1.dist-info/RECORD +0 -21
{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/LICENSE +0 -0
{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/top_level.txt +0 -0

kreuzberg/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from ._tesseract import PSMMode
 from ._types import ExtractionResult, Metadata
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
@@ -15,6 +16,7 @@ __all__ = [
     "Metadata",
     "MissingDependencyError",
     "OCRError",
+    "PSMMode",
     "ParsingError",
     "ValidationError",
     "batch_extract_bytes",

kreuzberg/_constants.py CHANGED Viewed

@@ -3,4 +3,6 @@ from __future__ import annotations
 from multiprocessing import cpu_count
 from typing import Final
-DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
+DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
+MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
+MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2

kreuzberg/_html.py CHANGED Viewed

@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
 from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces, safe_decode
-from kreuzberg._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_text()
     )
-    result = await run_sync(html_to_markdown.convert_to_markdown, content)
+    result = html_to_markdown.convert_to_markdown(content)
     return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})

kreuzberg/_mime_types.py CHANGED Viewed

@@ -15,6 +15,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
 PDF_MIME_TYPE: Final = "application/pdf"
 PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 # Excel formats
 EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
@@ -73,7 +74,7 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
     "application/epub+zip",
     "application/rtf",
     "application/vnd.oasis.opendocument.text",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    DOCX_MIME_TYPE,
     "application/x-biblatex",
     "application/x-bibtex",
     "application/x-endnote+xml",
@@ -146,7 +147,7 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
     ".epub": "application/epub+zip",
     ".rtf": "application/rtf",
     ".odt": "application/vnd.oasis.opendocument.text",
-    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    ".docx": DOCX_MIME_TYPE,
     ".bib": "application/x-bibtex",
     ".ipynb": "application/x-ipynb+json",
     ".tex": "application/x-latex",

kreuzberg/_pandoc.py CHANGED Viewed

@@ -1,21 +1,21 @@
 from __future__ import annotations
-import subprocess
+import re
 import sys
-from functools import partial
 from json import JSONDecodeError, loads
 from typing import TYPE_CHECKING, Any, Final, Literal, cast
-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process
-from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg import ValidationError
+from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_taskgroup
 from kreuzberg._tmp import create_temp_file
 from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
@@ -24,10 +24,8 @@ if TYPE_CHECKING:  # pragma: no cover
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 version_ref: Final[dict[str, bool]] = {"checked": False}
 # Block-level node types in Pandoc AST
 BLOCK_HEADER: Final = "Header"  # Header with level, attributes and inline content
 BLOCK_PARA: Final = "Para"  # Paragraph containing inline content
@@ -229,20 +227,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
 def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
-    if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
-        mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
-    ):
-        raise ValidationError(
-            f"Unsupported mime type: {mime_type}",
-            context={
-                "mime_type": mime_type,
-                "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
-            },
+    if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
+        return pandoc_type
+    if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
+        return next(
+            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
         )
-    return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
-        MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
-    )
+    raise ValidationError(f"Unsupported mime type: {mime_type}")
 async def _validate_pandoc_version() -> None:
@@ -251,20 +244,19 @@ async def _validate_pandoc_version() -> None:
             return
         command = ["pandoc", "--version"]
-        result = await run_sync(subprocess.run, command, capture_output=True)
-        version = result.stdout.decode().split("\n")[0].split()[1]
-        if not version.startswith("3."):
-            raise MissingDependencyError("Pandoc version 3 or above is required.")
+        result = await run_process(command)
+        version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
+            raise MissingDependencyError("Pandoc version 2 or above is required")
         version_ref["checked"] = True
     except FileNotFoundError as e:
-        raise MissingDependencyError("Pandoc is not installed.") from e
+        raise MissingDependencyError("Pandoc is not installed") from e
-async def _handle_extract_metadata(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> Metadata:
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     metadata_file, unlink = await create_temp_file(".json")
     try:
@@ -276,15 +268,10 @@ async def _handle_extract_metadata(
             "--standalone",
             "--quiet",
             "--output",
-            metadata_file,
+            str(metadata_file),
         ]
-        result = await to_process.run_sync(
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +284,7 @@ async def _handle_extract_metadata(
         await unlink()
-async def _handle_extract_file(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> str:
+async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     output_path, unlink = await create_temp_file(".md")
     try:
@@ -315,12 +300,7 @@ async def _handle_extract_file(
         command.extend(["--output", str(output_path)])
-        result = await to_process.run_sync(
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +314,12 @@ async def _handle_extract_file(
         await unlink()
-async def process_file_with_pandoc(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
     """Process a single file using Pandoc and convert to markdown.
     Args:
         input_file: The path to the file to process.
         mime_type: The mime type of the file.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         ParsingError: If the file data could not be extracted.
@@ -354,41 +331,27 @@ async def process_file_with_pandoc(
     _get_pandoc_type_from_mime_type(mime_type)
-    metadata: Metadata = {}
-    content: str = ""
     try:
-        async with create_task_group() as tg:
-            async def _get_metadata() -> None:
-                nonlocal metadata
-                metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
-            async def _get_content() -> None:
-                nonlocal content
-                content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
-            tg.start_soon(_get_metadata)
-            tg.start_soon(_get_content)
+        metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
+        content_task = _handle_extract_file(input_file, mime_type=mime_type)
+        results = await run_taskgroup(metadata_task, content_task)
+        metadata, content = cast(tuple[Metadata, str], results)
+        return ExtractionResult(
+            content=normalize_spaces(content),
+            metadata=metadata,
+            mime_type=MARKDOWN_MIME_TYPE,
+        )
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
-    return ExtractionResult(
-        content=normalize_spaces(content),
-        metadata=metadata,
-        mime_type=MARKDOWN_MIME_TYPE,
-    )
+        raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
-async def process_content_with_pandoc(
-    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
     """Process content using Pandoc and convert to markdown.
     Args:
         content: The content to process.
         mime_type: The mime type of the content.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Returns:
         ExtractionResult
@@ -397,7 +360,7 @@ async def process_content_with_pandoc(
     input_file, unlink = await create_temp_file(f".{extension}")
     await AsyncPath(input_file).write_bytes(content)
-    result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
+    result = await process_file_with_pandoc(input_file, mime_type=mime_type)
     await unlink()
     return result

kreuzberg/_pdf.py CHANGED Viewed

@@ -24,32 +24,36 @@ if TYPE_CHECKING:  # pragma: no cover
 # - Control and non-printable characters
 # - Unicode replacement and invalid characters
 # - Zero-width spaces and other invisible characters
-CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
-    r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
-)
+CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
+SHORT_TEXT_THRESHOLD: Final[int] = 50
+MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
-def _validate_extracted_text(text: str) -> bool:
+def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
     """Check if text extracted from PDF is valid or corrupted.
-    This checks for common indicators of corrupted PDF text extraction:
+    This checks for indicators of corrupted PDF text extraction:
     1. Empty or whitespace-only text
-    2. Control characters and other non-printable characters
-    3. Unicode replacement characters
-    4. Zero-width spaces and other invisible characters
+    2. High concentration of control characters and null bytes
+    3. High concentration of Unicode replacement characters
     Args:
         text: The extracted text to validate
+        corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
+            characters (default: 0.05 or 5%)
     Returns:
         True if the text appears valid, False if it seems corrupted
     """
-    # Check for empty or whitespace-only text
     if not text or not text.strip():
         return False
-    # Check for corruption indicators
-    return not bool(CORRUPTED_PATTERN.search(text))
+    corruption_matches = CORRUPTED_PATTERN.findall(text)
+    if len(text) < SHORT_TEXT_THRESHOLD:
+        return len(corruption_matches) <= MINIMUM_CORRUPTED_RESULTS
+    return (len(corruption_matches) / len(text)) < corruption_threshold
 async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
@@ -67,7 +71,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
     document: pypdfium2.PdfDocument | None = None
     try:
         document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
+        return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -148,13 +152,10 @@ async def extract_pdf_file(
     Returns:
         The extracted text.
     """
-    if (
-        not force_ocr
-        and (content := await _extract_pdf_searchable_text(input_file))
-        and _validate_extracted_text(content)
-    ):
-        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+    if not force_ocr:
+        content = await _extract_pdf_searchable_text(input_file)
+        if _validate_extracted_text(content):
+            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)

kreuzberg/_string.py CHANGED Viewed

@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
     for enc in [e for e in encodings if e]:  # pragma: no cover
-        with suppress(UnicodeDecodeError):
+        with suppress(UnicodeDecodeError, LookupError):
             return byte_data.decode(enc)
     # If all encodings fail, fall back to latin-1 which can handle any byte

kreuzberg/_sync.py CHANGED Viewed

@@ -2,12 +2,13 @@ from __future__ import annotations
 import sys
 from functools import partial
-from typing import TYPE_CHECKING, TypeVar, cast
+from typing import TYPE_CHECKING, Any, TypeVar, cast
+from anyio import create_task_group
 from anyio.to_thread import run_sync as any_io_run_sync
 if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Callable
+    from collections.abc import Awaitable, Callable
 if sys.version_info >= (3, 10):
     from typing import ParamSpec
@@ -30,4 +31,44 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
         The result of the synchronous function.
     """
     handler = partial(sync_fn, **kwargs)
-    return cast(T, await any_io_run_sync(handler, *args))  # pyright: ignore [reportCallIssue]
+    return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
+    """Run a list of coroutines concurrently.
+    Args:
+        *async_tasks: The list of coroutines to run.
+    Returns:
+        The results of the coroutines.
+    """
+    results: list[Any] = [None] * len(async_tasks)
+    async def run_task(index: int, task: Awaitable[T]) -> None:
+        results[index] = await task
+    async with create_task_group() as tg:
+        for i, t in enumerate(async_tasks):
+            tg.start_soon(run_task, i, t)
+    return results
+async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
+    """Run a list of coroutines concurrently in batches.
+    Args:
+        *async_tasks: The list of coroutines to run.
+        batch_size: The size of each batch.
+    Returns:
+        The results of the coroutines.
+    """
+    results: list[Any] = []
+    for i in range(0, len(async_tasks), batch_size):
+        batch = async_tasks[i : i + batch_size]
+        results.extend(await run_taskgroup(*batch))
+    return results

kreuzberg/_tesseract.py CHANGED Viewed

@@ -1,30 +1,26 @@
 from __future__ import annotations
 import re
-import subprocess
 import sys
 from enum import Enum
-from functools import partial
 from os import PathLike
-from typing import Final, TypeVar, Union, cast
+from typing import Any, TypeVar, Union
-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process
 from PIL.Image import Image
-from kreuzberg import ExtractionResult, ParsingError
-from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_sync, run_taskgroup_batched
 from kreuzberg._tmp import create_temp_file
-from kreuzberg.exceptions import MissingDependencyError, OCRError
+from kreuzberg._types import ExtractionResult
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
-MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
 version_ref = {"checked": False}
 T = TypeVar("T", bound=Union[Image, PathLike[str], str])
@@ -68,14 +64,16 @@ async def validate_tesseract_version() -> None:
             return
         command = ["tesseract", "--version"]
-        result = await run_sync(subprocess.run, command, capture_output=True)
-        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
+        result = await run_process(command)
+        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
         if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError("Tesseract version 5 or above is required.")
         version_ref["checked"] = True
     except FileNotFoundError as e:
-        raise MissingDependencyError("Tesseract is not installed.") from e
+        raise MissingDependencyError(
+            "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
+        ) from e
 async def process_file(
@@ -83,7 +81,6 @@ async def process_file(
     *,
     language: str,
     psm: PSMMode,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Process a single image file using Tesseract OCR.
@@ -91,7 +88,6 @@ async def process_file(
         input_file: The path to the image file to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         OCRError: If OCR fails to extract text from the image.
@@ -102,6 +98,7 @@ async def process_file(
     output_path, unlink = await create_temp_file(".txt")
     try:
         output_base = str(output_path).replace(".txt", "")
         command = [
             "tesseract",
             str(input_file),
@@ -110,22 +107,44 @@ async def process_file(
             language,
             "--psm",
             str(psm.value),
+            "--oem",
+            "1",
+            "--loglevel",
+            "OFF",
+            "-c",
+            "thresholding_method=1",
+            "-c",
+            "tessedit_enable_dict_correction=1",
+            "-c",
+            "language_model_ngram_on=1",
+            "-c",
+            "textord_space_size_is_variable=1",
+            "-c",
+            "classify_use_pre_adapted_templates=1",
+            "-c",
+            "tessedit_dont_blkrej_good_wds=1",
+            "-c",
+            "tessedit_dont_rowrej_good_wds=1",
+            "-c",
+            "tessedit_use_primary_params_model=1",
         ]
-        result = await to_process.run_sync(
-            partial(subprocess.run, capture_output=True),
-            command,
-            limiter=CapacityLimiter(max_processes),
-            cancellable=True,
-        )
+        env: dict[str, Any] | None = None
+        if sys.platform.startswith("linux"):
+            env = {"OMP_THREAD_LIMIT": "1"}
+        result = await run_process(command, env=env)
         if not result.returncode == 0:
-            raise OCRError("OCR failed with a non-0 return code.")
+            raise OCRError(
+                "OCR failed with a non-0 return code.",
+                context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+            )
         output = await AsyncPath(output_path).read_text("utf-8")
         return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     except (RuntimeError, OSError) as e:
-        raise OCRError("Failed to OCR using tesseract") from e
+        raise OCRError(f"Failed to OCR using tesseract: {e}") from e
     finally:
         await unlink()
@@ -135,7 +154,6 @@ async def process_image(
     *,
     language: str,
     psm: PSMMode,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Process a single Pillow Image using Tesseract OCR.
@@ -143,14 +161,13 @@ async def process_image(
         image: The Pillow Image to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Returns:
         ExtractionResult: The extracted text from the image.
     """
     image_path, unlink = await create_temp_file(".png")
     await run_sync(image.save, str(image_path), format="PNG")
-    result = await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
+    result = await process_file(image_path, language=language, psm=psm)
     await unlink()
     return result
@@ -160,7 +177,6 @@ async def process_image_with_tesseract(
     *,
     language: str = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
@@ -168,7 +184,6 @@ async def process_image_with_tesseract(
         image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -179,10 +194,10 @@ async def process_image_with_tesseract(
     await validate_tesseract_version()
     if isinstance(image, Image):
-        return await process_image(image, language=language, psm=psm, max_processes=max_processes)
+        return await process_image(image, language=language, psm=psm)
     if isinstance(image, (PathLike, str)):
-        return await process_file(image, language=language, psm=psm, max_processes=max_processes)
+        return await process_file(image, language=language, psm=psm)
     raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
@@ -200,7 +215,7 @@ async def batch_process_images(
         images: A list of Pillow Images, paths or strings to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        max_processes: Maximum number of concurrent processes (default: CPU count / 2).
     Raises:
         ParsingError: If OCR fails to extract text from any of the images.
@@ -209,17 +224,8 @@ async def batch_process_images(
         List of ExtractionResult objects, one per input image.
     """
     await validate_tesseract_version()
-    results = cast(list[ExtractionResult], list(range(len(images))))
-    async def _process_image(index: int, image: T) -> None:
-        results[index] = await process_image_with_tesseract(
-            image, language=language, psm=psm, max_processes=max_processes
-        )
     try:
-        async with create_task_group() as tg:
-            for i, image in enumerate(images):
-                tg.start_soon(_process_image, i, image)
-        return results
+        tasks = [process_image_with_tesseract(image, language=language, psm=psm) for image in images]
+        return await run_taskgroup_batched(*tasks, batch_size=max_processes)
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to process images with Tesseract") from eg
+        raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg

kreuzberg/_xlsx.py CHANGED Viewed

@@ -1,23 +1,46 @@
 from __future__ import annotations
 import csv
+import sys
 from io import StringIO
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
 from anyio import Path as AsyncPath
-from anyio import create_task_group
 from python_calamine import CalamineWorkbook
 from kreuzberg import ExtractionResult, ParsingError
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._pandoc import process_file_with_pandoc
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_sync, run_taskgroup
 from kreuzberg._tmp import create_temp_file
 if TYPE_CHECKING:  # pragma: no cover
     from pathlib import Path
+if sys.version_info < (3, 11):  # pragma: no cover
+    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
+async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
+    values = workbook.get_sheet_by_name(sheet_name).to_python()
+    csv_buffer = StringIO()
+    writer = csv.writer(csv_buffer)
+    for row in values:
+        writer.writerow(row)
+    csv_data = csv_buffer.getvalue()
+    csv_buffer.close()
+    csv_path, unlink = await create_temp_file(".csv")
+    await AsyncPath(csv_path).write_text(csv_data)
+    result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
+    await unlink()
+    return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
 async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
     """Extract text from an XLSX file by converting it to CSV and then to markdown.
@@ -33,46 +56,19 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
     """
     try:
         workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
-        results = cast(list[str], [None] * len(workbook.sheet_names))
-        async def convert_sheet_to_text(sheet_name: str) -> None:
-            nonlocal results
-            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
-            csv_buffer = StringIO()
-            writer = csv.writer(csv_buffer)
-            for row in values:
-                writer.writerow(row)
-            csv_data = csv_buffer.getvalue()
-            csv_buffer.close()
-            from kreuzberg._tmp import create_temp_file
-            csv_path, unlink = await create_temp_file(".csv")
-            await AsyncPath(csv_path).write_text(csv_data)
-            result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
-            results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
-            await unlink()
-        async with create_task_group() as tg:
-            for sheet_name in workbook.sheet_names:
-                tg.start_soon(convert_sheet_to_text, sheet_name)
+        tasks = [convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]
+        results: list[str] = await run_taskgroup(*tasks)
         return ExtractionResult(
             content="\n\n".join(results),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata={},
         )
-    except Exception as e:
+    except ExceptionGroup as eg:
         raise ParsingError(
-            "Could not extract text from XLSX",
-            context={
-                "error": str(e),
-            },
-        ) from e
+            "Failed to extract file data",
+            context={"file": str(input_file), "errors": eg.exceptions},
+        ) from eg
 async def extract_xlsx_content(content: bytes) -> ExtractionResult:

kreuzberg/exceptions.py CHANGED Viewed

@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
         self.context = context
         super().__init__(message)
+    def _serialize_context(self, obj: Any) -> Any:
+        """Recursively serialize context objects to ensure JSON compatibility."""
+        if isinstance(obj, bytes):
+            return obj.decode("utf-8", errors="replace")
+        if isinstance(obj, dict):
+            return {k: self._serialize_context(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._serialize_context(x) for x in obj]
+        if isinstance(obj, Exception):
+            return {
+                "type": obj.__class__.__name__,
+                "message": str(obj),
+            }
+        return obj
     def __str__(self) -> str:
         """Return a string representation of the exception."""
-        ctx = f"\n\nContext: {dumps(self.context)}" if self.context else ""
+        if self.context:
+            serialized_context = self._serialize_context(self.context)
+            ctx = f"\n\nContext: {dumps(serialized_context)}"
+        else:
+            ctx = ""
         return f"{self.__class__.__name__}: {super().__str__()}{ctx}"

kreuzberg/extraction.py CHANGED Viewed

@@ -87,14 +87,12 @@ async def extract_bytes(
         return await extract_xlsx_content(content)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return await process_image_with_tesseract(
-            open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
-        )
+        return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
+        return await process_content_with_pandoc(content=content, mime_type=mime_type)
     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
         return await extract_pptx_file_content(content)
@@ -150,12 +148,12 @@ async def extract_file(
         return await extract_xlsx_file(Path(input_file))
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
+        return await process_image_with_tesseract(input_file, psm=psm, language=language)
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
+        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)
     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
         return await extract_pptx_file_content(Path(input_file))

{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 2.0.1
+Version: 2.1.1
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -31,7 +31,7 @@ Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: python-calamine>=0.3.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2; python_version < "3.11"
+Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
 # Kreuzberg
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
-- **Lightweight**: Has few curated dependencies and a minimal footprint
+- **Small Package Size**: Has few curated dependencies and a minimal footprint
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Modern Python**: Built with async/await, type hints, and functional first approach
 - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,8 +61,8 @@ pip install kreuzberg
 Kreuzberg requires two system level dependencies:
-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
 You can install these with:
@@ -75,7 +75,6 @@ sudo apt-get install pandoc tesseract-ocr
 #### MacOS
 ```shell
-# MacOS
 brew install tesseract pandoc
 ```
@@ -191,19 +190,15 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
 #### Processing Configuration
-- `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
-  Notes:
-  - Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
+- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
 ### Quick Start
 ```python
 from pathlib import Path
 from kreuzberg import extract_file
-from kreuzberg.extraction import ExtractionResult
-from kreuzberg._tesseract import PSMMode
+from kreuzberg import ExtractionResult
+from kreuzberg import PSMMode
 # Basic file extraction
@@ -232,7 +227,7 @@ async def extract_document():
 ```python
 from kreuzberg import extract_bytes
-from kreuzberg.extraction import ExtractionResult
+from kreuzberg import ExtractionResult
 async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -378,8 +373,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
 Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 ```python
-from kreuzberg import extract_file
-from kreuzberg.exceptions import (
+from kreuzberg import (
+    extract_file,
     ValidationError,
     ParsingError,
     OCRError,

kreuzberg-2.1.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,21 @@
+kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
+kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
+kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
+kreuzberg/_mime_types.py,sha256=Kuu0yWY4p0Eck8b_vdp9oamqRZc1RJaS_ZKikVD2Z2o,6431
+kreuzberg/_pandoc.py,sha256=YIXaFC11N2tgVHjBd3JD_21GZ6OOVQ0UY3aKrWNfK-I,12531
+kreuzberg/_pdf.py,sha256=AIwxlydZkJOU4878SaeF9cKUmzSN7o3X40Hye7z017U,6479
+kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
+kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
+kreuzberg/_sync.py,sha256=sDVH4GrpYW9SOnmu3BqKPL76xl0hxzHjTAC78aovbQA,2122
+kreuzberg/_tesseract.py,sha256=0BkguZJIKlOFHkrN2mjVgaycWwolmuEv6DwpQY7n7Os,7610
+kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
+kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
+kreuzberg/_xlsx.py,sha256=kSH7PJ33vdLgoh5LmL_bqbc4I0VgZlZUeF4ckKl6NJM,2675
+kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
+kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg-2.1.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-2.1.1.dist-info/METADATA,sha256=tWRsv1bx9os2dQnU5KrQpUd4fNeQ4x-J2fXWKdcuQAA,14842
+kreuzberg-2.1.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+kreuzberg-2.1.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-2.1.1.dist-info/RECORD,,

{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

kreuzberg-2.0.1.dist-info/RECORD DELETED Viewed

@@ -1,21 +0,0 @@
-kreuzberg/__init__.py,sha256=CBRHXPhjdslaSXaUjZO5V0k57uz5_x12cwo0HTtxOcU,647
-kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
-kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
-kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
-kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
-kreuzberg/_pdf.py,sha256=9YErIrRvMMFXKHckXzBDCEMzDAEnC0JVOR38gFhvHKQ,6227
-kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
-kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
-kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
-kreuzberg/_tesseract.py,sha256=SZsv0gFWvzR8iLaMyGr4Oc0lXE7atCR3sNxXR7TQzEE,7686
-kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
-kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
-kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
-kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
-kreuzberg/extraction.py,sha256=kuEKvOGhPBRcFeGX7eKmup9BukX6o55740F_KdZ15qQ,13214
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-2.0.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-2.0.1.dist-info/METADATA,sha256=KmKLubQ89i0_JwpK96kYbhuq1MuucrqHe2bCLNcbyic,15023
-kreuzberg-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-2.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-2.0.1.dist-info/RECORD,,

{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

kreuzberg 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl

kreuzberg 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl