PyPI - kreuzberg - Versions diffs - 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

kreuzberg 1.6.0py3-none-any.whl → 2.0.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

kreuzberg/__init__.py +6 -2
kreuzberg/_constants.py +6 -0
kreuzberg/_html.py +32 -0
kreuzberg/_mime_types.py +109 -1
kreuzberg/_pandoc.py +154 -167
kreuzberg/_pdf.py +189 -0
kreuzberg/_pptx.py +88 -0
kreuzberg/_string.py +5 -8
kreuzberg/_sync.py +6 -1
kreuzberg/_tesseract.py +101 -64
kreuzberg/_tmp.py +37 -0
kreuzberg/_types.py +71 -0
kreuzberg/_xlsx.py +92 -0
kreuzberg/extraction.py +269 -64
kreuzberg-2.0.0.dist-info/METADATA +419 -0
kreuzberg-2.0.0.dist-info/RECORD +21 -0
kreuzberg/_extractors.py +0 -247
kreuzberg-1.6.0.dist-info/METADATA +0 -317
kreuzberg-1.6.0.dist-info/RECORD +0 -15
{kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
{kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
{kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0

kreuzberg/_pdf.py ADDED Viewed

@@ -0,0 +1,189 @@
+from __future__ import annotations
+from re import Pattern
+from re import compile as compile_regex
+from typing import TYPE_CHECKING, Final, cast
+import pypdfium2
+from anyio import Path as AsyncPath
+from kreuzberg import ExtractionResult
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._string import normalize_spaces
+from kreuzberg._sync import run_sync
+from kreuzberg._tesseract import PSMMode, SupportedLanguage, batch_process_images
+from kreuzberg.exceptions import ParsingError
+if TYPE_CHECKING:  # pragma: no cover
+    from pathlib import Path
+    from PIL.Image import Image
+# Pattern to detect common PDF text extraction corruption:
+# - Control and non-printable characters: the C0 controls except tab (\x09) and
+#   line feed (\x0A), plus DEL and the C1 range (\x7F-\x9F). Note that carriage
+#   return (\x0D) and form feed (\x0C) lie inside \x0B-\x1F and are matched too.
+# - Unicode replacement and invalid characters (U+FFFD)
+# - Zero-width spaces and other invisible characters (U+200B-U+200F, U+2028-U+202F)
+# Each alternative matches exactly one character.
+CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
+    r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
+)
+def _validate_extracted_text(text: str) -> bool:
+    """Decide whether text pulled out of a PDF looks usable.
+    The text is rejected when it is empty or whitespace-only, or when it contains
+    at least one character matched by ``CORRUPTED_PATTERN`` (control characters,
+    the Unicode replacement character, zero-width and other invisible characters).
+    Args:
+        text: The extracted text to validate
+    Returns:
+        True if the text appears valid, False if it is blank or seems corrupted
+    """
+    if text and text.strip():
+        # Non-blank text is valid only when no corruption marker occurs anywhere in it.
+        return CORRUPTED_PATTERN.search(text) is None
+    return False
+async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
+    """Convert a PDF file to images, one per page.
+    Args:
+        input_file: The path to the PDF file.
+    Raises:
+        ParsingError: If pypdfium2 fails to open or render the PDF file.
+    Returns:
+        A list of Pillow Images, in the order the pages are iterated.
+    """
+    document: pypdfium2.PdfDocument | None = None
+    try:
+        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+        # Each page is rendered with a scale factor of 2.0 and converted to a Pillow image.
+        return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
+    except pypdfium2.PdfiumError as e:
+        raise ParsingError(
+            "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
+        ) from e
+    finally:
+        # Compare against None explicitly: PdfDocument implements __len__, so a document
+        # with zero pages is falsy and would otherwise never be closed.
+        if document is not None:
+            await run_sync(document.close)
+async def _extract_pdf_text_with_ocr(
+    input_file: Path,
+    *,
+    language: SupportedLanguage = "eng",
+    max_processes: int,
+    psm: PSMMode = PSMMode.AUTO,
+) -> ExtractionResult:
+    """Run OCR over every page of a PDF file and combine the recognised text.
+    The pages are first rendered to images and then handed to ``batch_process_images``.
+    Args:
+        input_file: The path to the PDF file.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes, forwarded to the OCR batch call. Required.
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        A plain-text ExtractionResult whose content is the per-image OCR output joined with newlines.
+    """
+    page_images = await _convert_pdf_to_images(input_file)
+    page_results = await batch_process_images(page_images, language=language, psm=psm, max_processes=max_processes)
+    combined_text = "\n".join(page_result.content for page_result in page_results)
+    return ExtractionResult(content=combined_text, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+async def _extract_pdf_searchable_text(input_file: Path) -> str:
+    """Extract text from a searchable PDF file using pypdfium2.
+    The text of every page is joined with newlines and then passed through ``normalize_spaces``.
+    Args:
+        input_file: The path to the PDF file.
+    Raises:
+        ParsingError: If the text could not be extracted from the PDF file.
+    Returns:
+        The extracted text.
+    """
+    document: pypdfium2.PdfDocument | None = None
+    try:
+        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+        text = "\n".join(page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document))
+        return normalize_spaces(text)
+    except pypdfium2.PdfiumError as e:
+        raise ParsingError(
+            "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
+        ) from e
+    finally:
+        # Compare against None explicitly: PdfDocument implements __len__, so a document
+        # with zero pages is falsy and would otherwise never be closed.
+        if document is not None:
+            await run_sync(document.close)
+async def extract_pdf_file(
+    input_file: Path,
+    *,
+    force_ocr: bool,
+    language: SupportedLanguage = "eng",
+    max_processes: int,
+    psm: PSMMode = PSMMode.AUTO,
+) -> ExtractionResult:
+    """Extract text from a PDF file, preferring its text layer over OCR.
+    Unless OCR is forced, the searchable text is extracted first and returned when it is
+    non-empty and passes ``_validate_extracted_text``. In every other case the file is OCRed.
+    Args:
+        input_file: The path to the PDF file.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes used for OCR. Required.
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        A plain-text ExtractionResult holding the extracted text.
+    """
+    if not force_ocr:
+        searchable_text = await _extract_pdf_searchable_text(input_file)
+        if searchable_text and _validate_extracted_text(searchable_text):
+            return ExtractionResult(content=searchable_text, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+    # Either OCR was requested explicitly or the text layer was empty or looked corrupted.
+    return await _extract_pdf_text_with_ocr(input_file, language=language, psm=psm, max_processes=max_processes)
+async def extract_pdf_content(
+    content: bytes,
+    *,
+    force_ocr: bool,
+    language: SupportedLanguage = "eng",
+    max_processes: int,
+    psm: PSMMode = PSMMode.AUTO,
+) -> ExtractionResult:
+    """Extract text from a PDF file content.
+    The bytes are written to a temporary ``.pdf`` file, which is processed by
+    ``extract_pdf_file`` and removed afterwards, whether or not extraction succeeds.
+    Args:
+        content: The PDF file content.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes used for OCR. Required.
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        The extracted text.
+    """
+    # Function-scope import kept from the original (presumably to avoid an import cycle - confirm).
+    from kreuzberg._tmp import create_temp_file
+    file_path, unlink = await create_temp_file(".pdf")
+    try:
+        await AsyncPath(file_path).write_bytes(content)
+        return await extract_pdf_file(
+            file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
+        )
+    finally:
+        # Always remove the temporary file, including when writing or extraction raises.
+        await unlink()

kreuzberg/_pptx.py ADDED Viewed

@@ -0,0 +1,88 @@
+from __future__ import annotations
+import re
+from contextlib import suppress
+from html import escape
+from io import BytesIO
+from typing import TYPE_CHECKING
+import pptx
+from anyio import Path as AsyncPath
+from pptx.enum.shapes import MSO_SHAPE_TYPE
+from kreuzberg import ExtractionResult
+from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
+from kreuzberg._string import normalize_spaces
+if TYPE_CHECKING:  # pragma: no cover
+    from pathlib import Path
+async def extract_pptx_file_content(file_path_or_contents: Path | bytes) -> ExtractionResult:
+    """Extract the content of a PPTX file as Markdown.
+    Each slide is introduced by an HTML comment carrying its 1-based number. Pictures become
+    Markdown image references, tables become HTML tables, text shapes contribute their text
+    (the slide title as a ``#`` heading), and notes are appended under a "### Notes:" heading.
+    Notes:
+        This function is based on code vendored from `markitdown`, which has an MIT license as well.
+    Args:
+        file_path_or_contents: The path to the PPTX file or its contents as bytes.
+    Returns:
+        An ExtractionResult with the Markdown mime type; its content is the accumulated
+        text passed through ``normalize_spaces``.
+    """
+    md_content = ""
+    # Bytes are used as-is; anything else is treated as a path and read asynchronously.
+    file_contents = (
+        file_path_or_contents
+        if isinstance(file_path_or_contents, bytes)
+        else await AsyncPath(file_path_or_contents).read_bytes()
+    )
+    presentation = pptx.Presentation(BytesIO(file_contents))
+    for index, slide in enumerate(presentation.slides):
+        # Slide marker, numbered from 1.
+        md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
+        title = slide.shapes.title
+        for shape in slide.shapes:
+            # Pictures, and placeholders that expose an "image" attribute, become image references.
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
+                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
+            ):
+                alt_text = ""
+                with suppress(AttributeError):
+                    # access non-visual properties
+                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
+                # The link target is only a name derived from the shape name (non-word characters
+                # removed, ".jpg" appended); no image data is written by this function.
+                filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                # Fall back to the shape name when no alt text is available.
+                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
+            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
+                # Tables are emitted as HTML; the first row uses <th> cells, later rows <td>.
+                html_table = "<table>"
+                first_row = True
+                for row in shape.table.rows:
+                    html_table += "<tr>"
+                    for cell in row.cells:
+                        tag = "th" if first_row else "td"
+                        # Cell text is HTML-escaped before being embedded.
+                        html_table += f"<{tag}>{escape(cell.text)}</{tag}>"
+                    html_table += "</tr>"
+                    first_row = False
+                html_table += "</table>"
+                md_content += "\n" + html_table + "\n"
+            elif shape.has_text_frame:
+                # The conditional expression spans the whole right-hand side: the title shape is
+                # written as a "# " heading, any other text shape as plain text.
+                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
+        # Trims the whole accumulated document, not just this slide; this also removes the
+        # leading newlines that precede the first slide marker.
+        md_content = md_content.strip()
+        if slide.has_notes_slide:
+            md_content += "\n\n### Notes:\n"
+            notes_frame = slide.notes_slide.notes_text_frame
+            if notes_frame is not None:  # pragma: no branch
+                md_content += notes_frame.text
+            # Trim again so an empty notes section leaves no trailing whitespace.
+            md_content = md_content.strip()
+    return ExtractionResult(content=normalize_spaces(md_content), mime_type=MARKDOWN_MIME_TYPE, metadata={})

kreuzberg/_string.py CHANGED Viewed

@@ -4,8 +4,6 @@ from contextlib import suppress
 from charset_normalizer import detect
-from kreuzberg.exceptions import ParsingError
 def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     """Decode a byte string safely, removing invalid sequences.
@@ -14,22 +12,21 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
         byte_data: The byte string to decode.
         encoding: The encoding to use when decoding the byte string.
-    Raises:
-        ParsingError: If the byte string could not be decoded.
     Returns:
         The decoded string.
     """
     if not byte_data:
         return ""
-    encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8", "latin-1"]
+    # We try each encoding in order until one works
+    encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
-    for enc in [e for e in encodings if e]:
+    for enc in [e for e in encodings if e]:  # pragma: no cover
         with suppress(UnicodeDecodeError):
             return byte_data.decode(enc)
-    raise ParsingError("Could not decode byte string. Please provide an encoding.")
+    # If all encodings fail, fall back to latin-1 which can handle any byte
+    return byte_data.decode("latin-1", errors="replace")
 def normalize_spaces(text: str) -> str:

kreuzberg/_sync.py CHANGED Viewed

@@ -1,14 +1,19 @@
 from __future__ import annotations
+import sys
 from functools import partial
 from typing import TYPE_CHECKING, TypeVar, cast
 from anyio.to_thread import run_sync as any_io_run_sync
-from typing_extensions import ParamSpec
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Callable
+if sys.version_info >= (3, 10):
+    from typing import ParamSpec
+else:  # pragma: no cover
+    from typing_extensions import ParamSpec
 T = TypeVar("T")
 P = ParamSpec("P")

kreuzberg/_tesseract.py CHANGED Viewed

@@ -2,23 +2,34 @@ from __future__ import annotations
 import re
 import subprocess
-from asyncio import gather
+import sys
 from enum import Enum
+from functools import partial
 from os import PathLike
-from tempfile import NamedTemporaryFile
-from typing import Any, Literal, TypeVar, Union
+from typing import Final, Literal, TypeVar, Union, cast
+from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
 from PIL.Image import Image
+from kreuzberg import ExtractionResult, ParsingError
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
+from kreuzberg._tmp import create_temp_file
 from kreuzberg.exceptions import MissingDependencyError, OCRError
+if sys.version_info < (3, 11):  # pragma: no cover
+    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
+MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
 version_ref = {"checked": False}
 T = TypeVar("T", bound=Union[Image, PathLike[str], str])
-SupportedLanguages = Literal[
+SupportedLanguage = Literal[
     "afr",
     "amh",
     "ara",
@@ -186,9 +197,10 @@ async def validate_tesseract_version() -> None:
         if version_ref["checked"]:
             return
-        result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
-        version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
-        if not version_match or int(version_match.group(1)) < 5:
+        command = ["tesseract", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
+        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
+        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError("Tesseract version 5 or above is required.")
         version_ref["checked"] = True
@@ -197,85 +209,96 @@ async def validate_tesseract_version() -> None:
 async def process_file(
-    input_file: str | PathLike[str], *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any
-) -> str:
+    input_file: str | PathLike[str],
+    *,
+    language: SupportedLanguage,
+    psm: PSMMode,
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> ExtractionResult:
     """Process a single image file using Tesseract OCR.
     Args:
         input_file: The path to the image file to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         OCRError: If OCR fails to extract text from the image.
     Returns:
-        str: Extracted text from the image.
+        ExtractionResult: The extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".txt") as output_file:
-        # this is needed because tesseract adds .txt to the output file
-        output_file_name = output_file.name.replace(".txt", "")
-        try:
-            command = [
-                "tesseract",
-                str(input_file),
-                output_file_name,
-                "-l",
-                language,
-                "--psm",
-                str(psm.value),
-            ]
-            for key, value in kwargs.items():
-                command.extend(["-c", f"{key}={value}"])
-            result = await run_sync(
-                subprocess.run,
-                command,
-                capture_output=True,
-            )
-            if not result.returncode == 0:
-                raise OCRError("OCR failed with a non-0 return code.")
-            output = await AsyncPath(output_file.name).read_text()
-            return output.strip()
-        except (RuntimeError, OSError) as e:
-            raise OCRError("Failed to OCR using tesseract") from e
-async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
+    output_path, unlink = await create_temp_file(".txt")
+    try:
+        output_base = str(output_path).replace(".txt", "")
+        command = [
+            "tesseract",
+            str(input_file),
+            output_base,
+            "-l",
+            language,
+            "--psm",
+            str(psm.value),
+        ]
+        result = await to_process.run_sync(
+            partial(subprocess.run, capture_output=True),
+            command,
+            limiter=CapacityLimiter(max_processes),
+            cancellable=True,
+        )
+        if not result.returncode == 0:
+            raise OCRError("OCR failed with a non-0 return code.")
+        output = await AsyncPath(output_path).read_text("utf-8")
+        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+    except (RuntimeError, OSError) as e:
+        raise OCRError("Failed to OCR using tesseract") from e
+    finally:
+        await unlink()
+async def process_image(
+    image: Image,
+    *,
+    language: SupportedLanguage,
+    psm: PSMMode,
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> ExtractionResult:
     """Process a single Pillow Image using Tesseract OCR.
     Args:
         image: The Pillow Image to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Returns:
-        str: Extracted text from the image.
+        ExtractionResult: The extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".png") as image_file:
-        await run_sync(image.save, image_file.name, format="PNG")
-        return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+    image_path, unlink = await create_temp_file(".png")
+    await run_sync(image.save, str(image_path), format="PNG")
+    result = await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
+    await unlink()
+    return result
 async def process_image_with_tesseract(
     image: Image | PathLike[str] | str,
     *,
-    language: SupportedLanguages = "eng",
+    language: SupportedLanguage = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    **kwargs: Any,
-) -> str:
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> ExtractionResult:
     """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
     Args:
         image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
     Raises:
         ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -286,10 +309,10 @@ async def process_image_with_tesseract(
     await validate_tesseract_version()
     if isinstance(image, Image):
-        return await process_image(image, language=language, psm=psm, **kwargs)
+        return await process_image(image, language=language, psm=psm, max_processes=max_processes)
     if isinstance(image, (PathLike, str)):
-        return await process_file(image, language=language, psm=psm, **kwargs)
+        return await process_file(image, language=language, psm=psm, max_processes=max_processes)
     raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
@@ -297,22 +320,36 @@ async def process_image_with_tesseract(
 async def batch_process_images(
     images: list[T],
     *,
-    language: SupportedLanguages = "eng",
+    language: SupportedLanguage = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    **kwargs: Any,
-) -> list[str]:
-    """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+) -> list[ExtractionResult]:
+    """Run Tesseract OCR asynchronously on multiple images with controlled concurrency.
     Args:
         images: A list of Pillow Images, paths or strings to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        **kwargs: Additional Tesseract configuration options as key-value pairs.
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+    Raises:
+        ParsingError: If OCR fails to extract text from any of the images.
     Returns:
-        Extracted text as a string (for single image) or a list of strings (for multiple images).
+        List of ExtractionResult objects, one per input image.
     """
     await validate_tesseract_version()
-    return await gather(
-        *[process_image_with_tesseract(image, language=language, psm=psm, **kwargs) for image in images]
-    )
+    results = cast(list[ExtractionResult], list(range(len(images))))
+    async def _process_image(index: int, image: T) -> None:
+        results[index] = await process_image_with_tesseract(
+            image, language=language, psm=psm, max_processes=max_processes
+        )
+    try:
+        async with create_task_group() as tg:
+            for i, image in enumerate(images):
+                tg.start_soon(_process_image, i, image)
+        return results
+    except ExceptionGroup as eg:
+        raise ParsingError("Failed to process images with Tesseract") from eg

kreuzberg/_tmp.py ADDED Viewed

@@ -0,0 +1,37 @@
+from __future__ import annotations
+from contextlib import suppress
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING, Callable
+from anyio import Path as AsyncPath
+from kreuzberg._sync import run_sync
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Coroutine
+async def create_temp_file(
+    extension: str, content: bytes | None = None
+) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
+    """Create a closed temporary file and return its path with a cleanup callback.
+    The file is created with ``delete=False``, so it is not removed automatically;
+    the caller is expected to await the returned callback when done.
+    Args:
+        extension: The file extension, used as the suffix of the temporary file name.
+        content: The content to write to the file. Missing or empty content leaves the file empty.
+    Raises:
+        OSError: If the content could not be written; the temporary file is removed first.
+    Returns:
+        A tuple of the temporary file path and an async callable that deletes the file,
+        ignoring any OSError raised while doing so.
+    """
+    file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
+    # Release the handle right away so it cannot leak if writing the content fails.
+    await run_sync(file.close)
+    async def unlink() -> None:
+        # PermissionError is a subclass of OSError, so suppressing OSError covers both.
+        with suppress(OSError):
+            await AsyncPath(file.name).unlink(missing_ok=True)
+    if content:
+        try:
+            await AsyncPath(file.name).write_bytes(content)
+        except OSError:
+            # Do not leave an orphaned temporary file behind when the write fails.
+            await unlink()
+            raise
+    return Path(file.name), unlink

kreuzberg 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

kreuzberg 1.6.0py3-none-any.whl → 2.0.0py3-none-any.whl