PyPI - kreuzberg - Versions diffs - 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

kreuzberg 1.1.0py3-none-any.whl → 1.3.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

kreuzberg/_extractors.py +105 -7
kreuzberg/_mime_types.py +9 -4
kreuzberg/_string.py +12 -0
kreuzberg/extraction.py +25 -0
{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/METADATA +45 -13
kreuzberg-1.3.0.dist-info/RECORD +13 -0
kreuzberg-1.1.0.dist-info/RECORD +0 -13
{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/LICENSE +0 -0
{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/WHEEL +0 -0
{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/top_level.txt +0 -0

kreuzberg/_extractors.py CHANGED Viewed

@@ -1,13 +1,22 @@
 from __future__ import annotations
+import re
+from contextlib import suppress
+from html import escape
+from io import BytesIO
 from typing import TYPE_CHECKING, cast
+from anyio import Path as AsyncPath
 from charset_normalizer import detect
+from html_to_markdown import convert_to_markdown
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
 from pypandoc import convert_file, convert_text
 from pypdfium2 import PdfDocument, PdfiumError
 from pytesseract import TesseractError, image_to_string
 from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
+from kreuzberg._string import normalize_spaces, safe_decode
 from kreuzberg._sync import run_sync
 from kreuzberg.exceptions import ParsingError
@@ -33,7 +42,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
         images = [page.render(scale=2.0).to_pil() for page in pdf]
         text = "\n".join(image_to_string(img) for img in images)
-        return text.strip()
+        return normalize_spaces(text)
     except (PdfiumError, TesseractError) as e:
         # TODO: add test case
         raise ParsingError(
@@ -56,7 +65,7 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
     try:
         document = PdfDocument(file_path)
         text = "\n".join(page.get_textpage().get_text_range() for page in document)
-        return text.strip()
+        return normalize_spaces(text)
     except PdfiumError as e:
         # TODO: add test case
         raise ParsingError(
@@ -75,9 +84,9 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
         The extracted text.
     """
     if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
-        return content
+        return normalize_spaces(content)
-    return await run_sync(_extract_pdf_with_tesseract, file_path)
+    return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))
 async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
@@ -97,7 +106,9 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
     ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
     encoding = encoding or detect(file_data)["encoding"] or "utf-8"
     try:
-        return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
+        return normalize_spaces(
+            cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
+        )
     except RuntimeError as e:
         # TODO: add test case
         raise ParsingError(
@@ -121,7 +132,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
     """
     ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
     try:
-        return cast(str, await run_sync(convert_file, file_path, to="md", format=ext))
+        return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
     except RuntimeError as e:
         raise ParsingError(
             f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
@@ -142,8 +153,95 @@ async def _extract_image_with_tesseract(file_path: Path | str) -> str:
         The extracted content.
     """
     try:
-        return cast(str, image_to_string(str(file_path)).strip())
+        return normalize_spaces(cast(str, image_to_string(str(file_path))))
     except TesseractError as e:
         raise ParsingError(
             "Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
+    """Extract text from a PPTX file.
+    Notes:
+        This function is based on code vendored from `markitdown`, which has an MIT license as well.
+    Args:
+        file_path_or_contents: The path to the PPTX file or its contents as bytes.
+    Returns:
+        The extracted text content
+    """
+    md_content = ""  # markdown accumulated across all slides
+    file_contents = (
+        file_path_or_contents  # bytes input is used as-is
+        if isinstance(file_path_or_contents, bytes)
+        else await AsyncPath(file_path_or_contents).read_bytes()  # anything else is treated as a path and read asynchronously
+    )
+    presentation = Presentation(BytesIO(file_contents))  # parsed from an in-memory buffer in both cases
+    for index, slide in enumerate(presentation.slides):
+        md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"  # slide numbers in the marker are 1-based
+        title = slide.shapes.title  # compared against each shape below to pick out the heading
+        for shape in slide.shapes:
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
+                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
+            ):
+                alt_text = ""  # stays empty if the attribute chain below is missing
+                with suppress(AttributeError):
+                    # access non-visual properties
+                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
+                filename = re.sub(r"\W", "", shape.name) + ".jpg"  # non-word characters removed from the shape name; the ".jpg" suffix is fixed, whatever the actual image format
+                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"  # the shape name is the fallback label when there is no alt text
+            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
+                html_table = "<table>"  # tables are emitted as inline HTML rather than markdown tables
+                first_row = True
+                for row in shape.table.rows:
+                    html_table += "<tr>"
+                    for cell in row.cells:
+                        tag = "th" if first_row else "td"  # only the first row is rendered as header cells
+                        html_table += f"<{tag}>{escape(cell.text)}</{tag}>"  # cell text is HTML-escaped
+                    html_table += "</tr>"
+                    first_row = False
+                html_table += "</table>"
+                md_content += "\n" + html_table + "\n"
+            elif shape.has_text_frame:
+                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"  # the conditional covers the whole right-hand side: the title gets a "# " prefix, other text is appended unchanged
+        md_content = md_content.strip()  # strips the whole accumulated string, not only this slide's part
+        if slide.has_notes_slide:
+            md_content += "\n\n### Notes:\n"  # the heading is added even when the notes text frame turns out to be None
+            notes_frame = slide.notes_slide.notes_text_frame
+            if notes_frame is not None:
+                md_content += notes_frame.text
+            md_content = md_content.strip()
+    return normalize_spaces(md_content)  # NOTE(review): normalize_spaces (kreuzberg/_string.py) joins on whitespace-split tokens, so the newlines added above end up as single spaces
+async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:  # NOTE: a non-bytes argument is treated as a file path, although the docstring only mentions content
+    """Extract text from an HTML string.
+    Args:
+        file_path_or_contents: The HTML content.
+    Returns:
+        The extracted text content.
+    """
+    content = (
+        safe_decode(file_path_or_contents)  # bytes input is decoded via safe_decode
+        if isinstance(file_path_or_contents, bytes)
+        else await AsyncPath(file_path_or_contents).read_text()  # path input is read as text; no explicit encoding is passed
+    )
+    return normalize_spaces(await run_sync(convert_to_markdown, content))  # the conversion is dispatched through run_sync, then whitespace-normalized

kreuzberg/_mime_types.py CHANGED Viewed

@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING, Final
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
-MARKDOWN_MIME_TYPE: Final[str] = "text/markdown"
-PLAIN_TEXT_MIME_TYPE: Final[str] = "text/plain"
-PDF_MIME_TYPE: Final[str] = "application/pdf"
+HTML_MIME_TYPE: Final = "text/html"
+MARKDOWN_MIME_TYPE: Final = "text/markdown"
+PDF_MIME_TYPE: Final = "application/pdf"
+PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
+POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
@@ -93,5 +95,8 @@ PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
 }
 SUPPORTED_MIME_TYPES: Final[set[str]] = (
-    PLAIN_TEXT_MIME_TYPES | IMAGE_MIME_TYPES | PANDOC_SUPPORTED_MIME_TYPES | {PDF_MIME_TYPE}
+    PLAIN_TEXT_MIME_TYPES
+    | IMAGE_MIME_TYPES
+    | PANDOC_SUPPORTED_MIME_TYPES
+    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
 )

kreuzberg/_string.py CHANGED Viewed

@@ -33,3 +33,15 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     # TODO: add test case
     return byte_data.decode("latin-1", errors="replace")
+def normalize_spaces(text: str) -> str:
+    """Normalize the spaces in a string.
+    Args:
+        text: The text to sanitize.
+    Returns:
+        The sanitized text.
+    """
+    return " ".join(text.strip().split())  # split() with no argument splits on any whitespace run, so newlines and tabs also collapse to one space; it already ignores leading/trailing whitespace, which makes the strip() redundant

kreuzberg/extraction.py CHANGED Viewed

@@ -1,3 +1,12 @@
+"""This module provides functions to extract textual content from files.
+It includes vendored code:
+- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
+    See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
+    Refer to the markitdown repository for its license (MIT).
+"""
 from __future__ import annotations
 from mimetypes import guess_type
@@ -10,16 +19,20 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors import (
     _extract_content_with_pandoc,
     _extract_file_with_pandoc,
+    _extract_html_string,
     _extract_image_with_tesseract,
     _extract_pdf_file,
+    _extract_pptx_file,
 )
 from kreuzberg._mime_types import (
+    HTML_MIME_TYPE,
     IMAGE_MIME_TYPE_EXT_MAP,
     IMAGE_MIME_TYPES,
     MARKDOWN_MIME_TYPE,
     PANDOC_SUPPORTED_MIME_TYPES,
     PDF_MIME_TYPE,
     PLAIN_TEXT_MIME_TYPE,
+    POWER_POINT_MIME_TYPE,
     SUPPORTED_MIME_TYPES,
 )
 from kreuzberg._string import safe_decode
@@ -76,6 +89,12 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
             content=await _extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
         )
+    if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
+        return ExtractionResult(content=await _extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
+    if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
+        return ExtractionResult(content=await _extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
     return ExtractionResult(
         content=safe_decode(content),
         mime_type=mime_type,
@@ -125,4 +144,10 @@ async def extract_file(
             content=await _extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
         )
+    if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
+        return ExtractionResult(content=await _extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
+    if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
+        return ExtractionResult(content=await _extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
     return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)

{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.1.0
+Version: 1.3.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
-Keywords: async,document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,tesseract,text-extraction,text-processing
+Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
@@ -25,9 +25,11 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anyio>=4.8.0
 Requires-Dist: charset-normalizer>=3.4.1
+Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypandoc>=1.15
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: pytesseract>=0.3.13
+Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.12.2
 # Kreuzberg
@@ -37,7 +39,7 @@ extraction.
 Why?
-I am building, like many do now, a RAG focused service. I have text extraction needs.
+I am building, like many do now, a RAG-focused service (check out https://grantflow.ai). I have text extraction needs.
 There are quite a lot of commercial options out there, and several open-source + paid options.
 But I wanted something simple, which does not require expensive round-trips to an external API.
 Furthermore, I wanted something that is easy to run locally, isn't very heavy, and doesn't require a GPU.
@@ -65,6 +67,43 @@ Hence, this library.
 - [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
 - [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
+## Dependencies and Philosophy
+This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
+high-order async abstraction on top of other tools; think of it like the library you would bake into your code base, but
+polished and well maintained.
+### Dependencies
+- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
+- Images are processed using Tesseract OCR
+- Office documents and other formats are processed using Pandoc
+- PPTX files are converted using python-pptx
+- HTML files are converted using html-to-markdown
+- Plain text files are read directly with appropriate encoding detection
+### Roadmap
+V1:
+- [x] - html file text extraction
+- [ ] - better PDF table extraction
+- [ ] - TBD
+V2:
+- [ ] - extra install groups (to make dependencies optional)
+- [ ] - metadata extraction (possible breaking change)
+- [ ] - TBD
+### Feature Requests
+Feel free to open a discussion or an issue on GitHub if you have any feature requests.
+### Contribution
+Contributions are welcome! Read the guidelines below.
 ## Supported File Types
 Kreuzberg supports a wide range of file formats:
@@ -72,7 +111,8 @@ Kreuzberg supports a wide range of file formats:
 ### Document Formats
 - PDF (`.pdf`) - both searchable and scanned documents
-- Word Documents (`.docx`)
+- Word Documents (`.docx`, `.doc`)
+- PowerPoint Presentations (`.pptx`)
 - OpenDocument Text (`.odt`)
 - Rich Text Format (`.rtf`)
@@ -92,6 +132,7 @@ Kreuzberg supports a wide range of file formats:
 #### Text and Markup Formats
+- HTML (`.html`, `.htm`)
 - Plain Text (`.txt`)
 - Markdown (`.md`)
 - reStructuredText (`.rst`)
@@ -102,13 +143,6 @@ Kreuzberg supports a wide range of file formats:
 - Comma-Separated Values (`.csv`)
 - Tab-Separated Values (`.tsv`)
-All formats support text extraction, with different processing methods:
-- PDFs are processed using pdfium2 for searchable PDFs and Tesseract OCR for scanned documents
-- Images are processed using Tesseract OCR
-- Office documents and other formats are processed using Pandoc
-- Plain text files are read directly with appropriate encoding detection
 ## Usage
 Kreuzberg exports two async functions:
@@ -116,8 +150,6 @@ Kreuzberg exports two async functions:
 - Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
 - Extract text from a byte-string using `extract_bytes()`
-Note - both of these functions are async and therefore should be used in an async context.
 ### Extract from File
 ```python

kreuzberg-1.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
+kreuzberg/_extractors.py,sha256=eiWPpjnZOZFDwlQL4XsgavJEWqxGtzLVvS8YU28RBAo,8095
+kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
+kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
+kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
+kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
+kreuzberg/extraction.py,sha256=cgX8uoCVXf-Va30g8T8DwrZUqsSPHIzmPfDgnWOqNNU,6148
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg-1.3.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-1.3.0.dist-info/METADATA,sha256=3wiaAuaiA865lg5oCjwlAKaZqRQn1w8VqaQXeoEdip4,8579
+kreuzberg-1.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kreuzberg-1.3.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-1.3.0.dist-info/RECORD,,

kreuzberg-1.1.0.dist-info/RECORD DELETED Viewed

@@ -1,13 +0,0 @@
-kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
-kreuzberg/_extractors.py,sha256=r8L9Bm3x7s1u7-T5HKkr1j6M6W3bUuwMAmDtAwX-s9g,4717
-kreuzberg/_mime_types.py,sha256=M5sKT4OkMf7pwtgs_jO2uhl6gC94wUurYzw_wbrIjU0,2739
-kreuzberg/_string.py,sha256=5s6BfTLQdYlDEt2PP4AdmBLV-ajroATOVYQQRcBYFD4,934
-kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
-kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
-kreuzberg/extraction.py,sha256=-a_msLQm7h5pHDhBuvfRP81-FtBwv7FGW-6YVJlXpUg,4926
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-1.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-1.1.0.dist-info/METADATA,sha256=nkDjE2MEqAE_-1MZvlBxnNuM7SKCOD2LvB7Ucb_W7U4,7775
-kreuzberg-1.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-1.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-1.1.0.dist-info/RECORD,,

{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-1.1.0.dist-info → kreuzberg-1.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

kreuzberg 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

kreuzberg 1.1.0py3-none-any.whl → 1.3.0py3-none-any.whl