PyPI - kreuzberg - Versions diffs - 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl - Mend

kreuzberg 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

kreuzberg/_extractors.py CHANGED Viewed

@@ -4,6 +4,8 @@ import re
 from contextlib import suppress
 from html import escape
 from io import BytesIO
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
 import html_to_markdown
@@ -11,6 +13,7 @@ import pptx
 import pypdfium2
 from anyio import Path as AsyncPath
 from pptx.enum.shapes import MSO_SHAPE_TYPE
+from xlsx2csv import Xlsx2csv
 from kreuzberg._pandoc import process_content, process_file
 from kreuzberg._string import normalize_spaces, safe_decode
@@ -19,8 +22,6 @@ from kreuzberg._tesseract import batch_process_images
 from kreuzberg.exceptions import ParsingError
 if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
     from PIL.Image import Image
@@ -195,6 +196,40 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     return normalize_spaces(md_content)
+async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
+    """Extract text from an XLSX file by converting it to CSV and then to markdown.
+    Args:
+        file_path_or_contents: The path to the XLSX file or its contents as bytes.
+    Returns:
+        The extracted text content.
+    Raises:
+        ParsingError: If the XLSX file could not be parsed.
+    """
+    try:
+        with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
+            if isinstance(file_path_or_contents, bytes):
+                xlsx_file.write(file_path_or_contents)
+                xlsx_file.flush()
+                xlsx_path = xlsx_file.name
+            else:
+                xlsx_path = str(file_path_or_contents)
+            await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
+            result = await process_file(csv_file.name, mime_type="text/csv")
+            return normalize_spaces(result.content)
+    except Exception as e:
+        raise ParsingError(
+            "Could not extract text from XLSX file",
+            context={
+                "error": str(e),
+                "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
+            },
+        ) from e
 async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
     """Extract text from an HTML string.

kreuzberg/_mime_types.py CHANGED Viewed

@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
 PDF_MIME_TYPE: Final = "application/pdf"
 PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
 IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -89,5 +89,5 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
     PLAIN_TEXT_MIME_TYPES
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
-    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
+    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
 )

kreuzberg/extraction.py CHANGED Viewed

@@ -22,8 +22,10 @@ from kreuzberg._extractors import (
     extract_html_string,
     extract_pdf_file,
     extract_pptx_file,
+    extract_xlsx_file,
 )
 from kreuzberg._mime_types import (
+    EXCEL_MIME_TYPE,
     HTML_MIME_TYPE,
     IMAGE_MIME_TYPE_EXT_MAP,
     IMAGE_MIME_TYPES,
@@ -75,6 +77,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
                 content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
             )
+    if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
+        return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
         with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
             temp_file.write(content)
@@ -134,6 +139,9 @@ async def extract_file(
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
         return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+    if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
+        return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
         return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)

{kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.5.0
+Version: 1.6.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -30,6 +30,7 @@ Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
+Requires-Dist: xlsx2csv>=0.8.4
 # Kreuzberg
@@ -68,16 +69,12 @@ pip install kreuzberg
 ### 2. Install System Dependencies
-Kreuzberg requires two open-source tools:
+Kreuzberg requires two system level dependencies:
 - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-  - GPL v2.0 licensed (used via CLI only)
-  - Handles office documents and markup formats
 - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
-  - Apache License
-  - Required for scanned documents and images
+Please install these using their respective installation guides.
 ## Architecture
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
   - `pdfium2` for searchable PDFs
   - Tesseract OCR for scanned content
 - **Document Conversion**:
-  - Pandoc for office documents and markup
+  - Pandoc for many document and markup formats
   - `python-pptx` for PowerPoint files
   - `html-to-markdown` for HTML content
+  - `xlsx2csv` for Excel spreadsheets
 - **Text Processing**:
   - Smart encoding detection
   - Markdown and plain text handling
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
 #### Data and Research Formats
+- Excel spreadsheets (`.xlsx`)
 - CSV (`.csv`) and TSV (`.tsv`) files
 - Jupyter Notebooks (`.ipynb`)
 - BibTeX (`.bib`) and BibLaTeX (`.bib`)

{kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,15 @@
 kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
-kreuzberg/_extractors.py,sha256=k6xO_2ItaftPmlqzfXyxTn8rdaWdwrJHGziBbo7gCio,6599
-kreuzberg/_mime_types.py,sha256=0ZYtRrMAaKpCMDkhpTbWAXHCsVob5MFRMGlbni8iYSA,2573
+kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
+kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
 kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
 kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
 kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
 kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
 kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
-kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
+kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-1.5.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-1.5.0.dist-info/METADATA,sha256=O462ss7M6Cb8cO6fJXwqsOdzkzaZekqa1oGwb7Vrgx8,9641
-kreuzberg-1.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-1.5.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-1.5.0.dist-info/RECORD,,
+kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
+kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-1.6.0.dist-info/RECORD,,

{kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

kreuzberg 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

kreuzberg 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl