PyPI - kreuzberg - Versions diffs - 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

kreuzberg/_extractors.py +46 -81
kreuzberg/_mime_types.py +22 -31
kreuzberg/_pandoc.py +416 -0
kreuzberg/_string.py +9 -12
kreuzberg/_tesseract.py +318 -0
kreuzberg/exceptions.py +9 -1
kreuzberg/extraction.py +16 -16
kreuzberg-1.5.0.dist-info/METADATA +318 -0
kreuzberg-1.5.0.dist-info/RECORD +15 -0
kreuzberg-1.3.0.dist-info/METADATA +0 -306
kreuzberg-1.3.0.dist-info/RECORD +0 -13
{kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/LICENSE +0 -0
{kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/WHEEL +0 -0
{kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/top_level.txt +0 -0

kreuzberg/_extractors.py CHANGED Viewed

@@ -4,53 +4,62 @@ import re
 from contextlib import suppress
 from html import escape
 from io import BytesIO
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
+import html_to_markdown
+import pptx
+import pypdfium2
 from anyio import Path as AsyncPath
-from charset_normalizer import detect
-from html_to_markdown import convert_to_markdown
-from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE
-from pypandoc import convert_file, convert_text
-from pypdfium2 import PdfDocument, PdfiumError
-from pytesseract import TesseractError, image_to_string
-from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
+from kreuzberg._pandoc import process_content, process_file
 from kreuzberg._string import normalize_spaces, safe_decode
 from kreuzberg._sync import run_sync
+from kreuzberg._tesseract import batch_process_images
 from kreuzberg.exceptions import ParsingError
 if TYPE_CHECKING:  # pragma: no cover
     from pathlib import Path
+    from PIL.Image import Image
-def _extract_pdf_with_tesseract(file_path: Path) -> str:
-    """Extract text from a scanned PDF file using pytesseract.
+async def convert_pdf_to_images(file_path: Path) -> list[Image]:
+    """Convert a PDF file to images.
     Args:
         file_path: The path to the PDF file.
     Raises:
-        ParsingError: If the text could not be extracted from the PDF file.
+        ParsingError: If the PDF file could not be converted to images.
     Returns:
-        The extracted text.
+        A list of Pillow Images.
     """
     try:
-        # make it into an image here:
-        pdf = PdfDocument(str(file_path))
-        images = [page.render(scale=2.0).to_pil() for page in pdf]
-        text = "\n".join(image_to_string(img) for img in images)
-        return normalize_spaces(text)
-    except (PdfiumError, TesseractError) as e:
-        # TODO: add test case
+        pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
+        return [page.render(scale=2.0).to_pil() for page in pdf]
+    except pypdfium2.PdfiumError as e:
         raise ParsingError(
-            "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
+            "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
         ) from e
-def _extract_pdf_with_pdfium2(file_path: Path) -> str:
+async def extract_pdf_with_tesseract(file_path: Path) -> str:
+    """Extract text from a scanned PDF file using pytesseract.
+    Args:
+        file_path: The path to the PDF file.
+    Returns:
+        The extracted text.
+    """
+    images = await convert_pdf_to_images(file_path)
+    ocr_results = await batch_process_images(images)
+    return normalize_spaces("\n".join(ocr_results))
+async def extract_pdf_with_pdfium2(file_path: Path) -> str:
     """Extract text from a searchable PDF file using pypdfium2.
     Args:
@@ -63,17 +72,16 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
         The extracted text.
     """
     try:
-        document = PdfDocument(file_path)
+        document = await run_sync(pypdfium2.PdfDocument, file_path)
         text = "\n".join(page.get_textpage().get_text_range() for page in document)
         return normalize_spaces(text)
-    except PdfiumError as e:
-        # TODO: add test case
+    except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
-async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
+async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
     """Extract text from a PDF file.
     Args:
@@ -83,84 +91,41 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
     Returns:
         The extracted text.
     """
-    if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
+    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
         return normalize_spaces(content)
-    return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))
+    return await extract_pdf_with_tesseract(file_path)
-async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
+async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
     """Extract text using pandoc.
     Args:
         file_data: The content of the file.
         mime_type: The mime type of the file.
-        encoding: An optional encoding to use when decoding the string.
-    Raises:
-        ParsingError: If the text could not be extracted from the file using pandoc.
     Returns:
         The extracted text.
     """
-    ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
-    encoding = encoding or detect(file_data)["encoding"] or "utf-8"
-    try:
-        return normalize_spaces(
-            cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
-        )
-    except RuntimeError as e:
-        # TODO: add test case
-        raise ParsingError(
-            f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
-            context={"error": str(e)},
-        ) from e
+    result = await process_content(file_data, mime_type=mime_type)
+    return normalize_spaces(result.content)
-async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
+async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
     """Extract text using pandoc.
     Args:
         file_path: The path to the file.
         mime_type: The mime type of the file.
-    Raises:
-        ParsingError: If the text could not be extracted from the file using pandoc.
     Returns:
         The extracted text.
     """
-    ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
-    try:
-        return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
-    except RuntimeError as e:
-        raise ParsingError(
-            f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
-            context={"file_path": str(file_path), "error": str(e)},
-        ) from e
-async def _extract_image_with_tesseract(file_path: Path | str) -> str:
-    """Extract text from an image file.
-    Args:
-        file_path: The path to the image file.
-    Raises:
-        ParsingError: If the text could not be extracted from the image file.
-    Returns:
-        The extracted content.
-    """
-    try:
-        return normalize_spaces(cast(str, image_to_string(str(file_path))))
-    except TesseractError as e:
-        raise ParsingError(
-            "Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
-        ) from e
+    result = await process_file(file_path, mime_type=mime_type)
+    return normalize_spaces(result.content)
-async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
+async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     """Extract text from a PPTX file.
     Notes:
@@ -178,7 +143,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_bytes()
     )
-    presentation = Presentation(BytesIO(file_contents))
+    presentation = pptx.Presentation(BytesIO(file_contents))
     for index, slide in enumerate(presentation.slides):
         md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
@@ -230,7 +195,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     return normalize_spaces(md_content)
-async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
+async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
     """Extract text from an HTML string.
     Args:
@@ -244,4 +209,4 @@ async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_text()
     )
-    return normalize_spaces(await run_sync(convert_to_markdown, content))
+    return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))

kreuzberg/_mime_types.py CHANGED Viewed

@@ -54,44 +54,35 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
     "image/x-portable-pixmap": "ppm",
 }
 PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
-    "application/csv",
-    "application/latex",
+    "application/csl+json",
+    "application/docbook+xml",
+    "application/epub+zip",
     "application/rtf",
     "application/vnd.oasis.opendocument.text",
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    "application/x-csv",
+    "application/x-biblatex",
+    "application/x-bibtex",
+    "application/x-endnote+xml",
+    "application/x-fictionbook+xml",
+    "application/x-ipynb+json",
+    "application/x-jats+xml",
     "application/x-latex",
-    "application/x-rtf",
-    "application/x-vnd.oasis.opendocument.text",
+    "application/x-opml+xml",
+    "application/x-research-info-systems",
+    "application/x-typst",
     "text/csv",
-    "text/latex",
-    "text/rst",
-    "text/rtf",
     "text/tab-separated-values",
-    "text/x-csv",
-    "text/x-latex",
+    "text/troff",
+    "text/x-commonmark",
+    "text/x-dokuwiki",
+    "text/x-gfm",
+    "text/x-markdown",
+    "text/x-markdown-extra",
+    "text/x-mdoc",
+    "text/x-multimarkdown",
+    "text/x-org",
+    "text/x-pod",
     "text/x-rst",
-    "text/x-tsv",
-}
-PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
-    "application/csv": "csv",
-    "application/latex": "latex",
-    "application/rtf": "rtf",
-    "application/vnd.oasis.opendocument.text": "odt",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
-    "application/x-csv": "csv",
-    "application/x-latex": "latex",
-    "application/x-rtf": "rtf",
-    "application/x-vnd.oasis.opendocument.text": "odt",
-    "text/csv": "csv",
-    "text/latex": "latex",
-    "text/rst": "rst",
-    "text/rtf": "rtf",
-    "text/tab-separated-values": "tsv",
-    "text/x-csv": "csv",
-    "text/x-latex": "latex",
-    "text/x-rst": "rst",
-    "text/x-tsv": "tsv",
 }
 SUPPORTED_MIME_TYPES: Final[set[str]] = (

kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl