PyPI - kreuzberg - Versions diffs - 1.5.0__tar.gz → 1.7.0__tar.gz - Mend

kreuzberg 1.5.0.tar.gz → 1.7.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.5.0
+Version: 1.7.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -29,7 +29,8 @@ Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
+Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: xlsx2csv>=0.8.4
 # Kreuzberg
@@ -68,16 +69,12 @@ pip install kreuzberg
 ### 2. Install System Dependencies
-Kreuzberg requires two open-source tools:
+Kreuzberg requires two system level dependencies:
 - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-  - GPL v2.0 licensed (used via CLI only)
-  - Handles office documents and markup formats
 - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
-  - Apache License
-  - Required for scanned documents and images
+Please install these using their respective installation guides.
 ## Architecture
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
   - `pdfium2` for searchable PDFs
   - Tesseract OCR for scanned content
 - **Document Conversion**:
-  - Pandoc for office documents and markup
+  - Pandoc for many document and markup formats
   - `python-pptx` for PowerPoint files
   - `html-to-markdown` for HTML content
+  - `xlsx2csv` for Excel spreadsheets
 - **Text Processing**:
   - Smart encoding detection
   - Markdown and plain text handling
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
 #### Data and Research Formats
+- Excel spreadsheets (`.xlsx`)
 - CSV (`.csv`) and TSV (`.tsv`) files
 - Jupyter Notebooks (`.ipynb`)
 - BibTeX (`.bib`) and BibLaTeX (`.bib`)
@@ -232,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
 ### Error Handling
-Kreuzberg provides detailed error handling with two main exception types:
+Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 ```python
 from kreuzberg import extract_file
-from kreuzberg.exceptions import ValidationError, ParsingError
+from kreuzberg.exceptions import (
+    ValidationError,
+    ParsingError,
+    OCRError,
+    MissingDependencyError
+)
 async def safe_extract(path: str) -> str:
     try:
@@ -244,20 +248,31 @@ async def safe_extract(path: str) -> str:
         return result.content
     except ValidationError as e:
-        # Handles input validation issues:
-        # - Unsupported file types
+        # Input validation issues
+        # - Unsupported or undetectable MIME types
         # - Missing files
-        # - Invalid MIME types
-        print(f"Invalid input: {e.message}")
-        print(f"Details: {e.context}")
+        # - Invalid input parameters
+        print(f"Validation failed: {e}")
+    except OCRError as e:
+        # OCR-specific issues
+        # - Tesseract processing failures
+        # - Image conversion problems
+        print(f"OCR failed: {e}")
+    except MissingDependencyError as e:
+        # System dependency issues
+        # - Missing Tesseract OCR
+        # - Missing Pandoc
+        # - Incompatible versions
+        print(f"Dependency missing: {e}")
     except ParsingError as e:
-        # Handles processing errors:
+        # General processing errors
         # - PDF parsing failures
-        # - OCR errors
         # - Format conversion issues
-        print(f"Processing failed: {e.message}")
-        print(f"Details: {e.context}")
+        # - Encoding problems
+        print(f"Processing failed: {e}")
     return ""
@@ -265,24 +280,33 @@ async def safe_extract(path: str) -> str:
 try:
     result = await extract_file("document.xyz")
 except ValidationError as e:
-    # e.context might contain:
-    # {
+    # Error will include context:
+    # ValidationError: Unsupported mime type
+    # Context: {
     #    "file_path": "document.xyz",
-    #    "error": "Unsupported file type",
-    #    "supported_types": ["pdf", "docx", ...]
+    #    "supported_mimetypes": ["application/pdf", ...]
     # }
+    print(e)
 try:
-    result = await extract_file("scan.pdf")
-except ParsingError as e:
-    # e.context might contain:
-    # {
-    #    "file_path": "scan.pdf",
-    #    "error": "OCR processing failed",
-    #    "details": "Tesseract error: Unable to process image"
+    result = await extract_file("scan.jpg")
+except OCRError as e:
+    # Error will include context:
+    # OCRError: OCR failed with a non-0 return code
+    # Context: {
+    #    "file_path": "scan.jpg",
+    #    "tesseract_version": "5.3.0"
     # }
+    print(e)
 ```
+All exceptions provide:
+- A descriptive error message
+- Relevant context in the `context` attribute
+- String representation with both message and context
+- Proper exception chaining for debugging
 ## Roadmap
 V1:

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/README.md RENAMED Viewed

@@ -35,16 +35,12 @@ pip install kreuzberg
 ### 2. Install System Dependencies
-Kreuzberg requires two open-source tools:
+Kreuzberg requires two system level dependencies:
 - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-  - GPL v2.0 licensed (used via CLI only)
-  - Handles office documents and markup formats
 - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
-  - Apache License
-  - Required for scanned documents and images
+Please install these using their respective installation guides.
 ## Architecture
@@ -54,9 +50,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
   - `pdfium2` for searchable PDFs
   - Tesseract OCR for scanned content
 - **Document Conversion**:
-  - Pandoc for office documents and markup
+  - Pandoc for many document and markup formats
   - `python-pptx` for PowerPoint files
   - `html-to-markdown` for HTML content
+  - `xlsx2csv` for Excel spreadsheets
 - **Text Processing**:
   - Smart encoding detection
   - Markdown and plain text handling
@@ -88,6 +85,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
 #### Data and Research Formats
+- Excel spreadsheets (`.xlsx`)
 - CSV (`.csv`) and TSV (`.tsv`) files
 - Jupyter Notebooks (`.ipynb`)
 - BibTeX (`.bib`) and BibLaTeX (`.bib`)
@@ -199,11 +197,16 @@ async def process_document(path: str) -> tuple[str, str]:
 ### Error Handling
-Kreuzberg provides detailed error handling with two main exception types:
+Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 ```python
 from kreuzberg import extract_file
-from kreuzberg.exceptions import ValidationError, ParsingError
+from kreuzberg.exceptions import (
+    ValidationError,
+    ParsingError,
+    OCRError,
+    MissingDependencyError
+)
 async def safe_extract(path: str) -> str:
     try:
@@ -211,20 +214,31 @@ async def safe_extract(path: str) -> str:
         return result.content
     except ValidationError as e:
-        # Handles input validation issues:
-        # - Unsupported file types
+        # Input validation issues
+        # - Unsupported or undetectable MIME types
         # - Missing files
-        # - Invalid MIME types
-        print(f"Invalid input: {e.message}")
-        print(f"Details: {e.context}")
+        # - Invalid input parameters
+        print(f"Validation failed: {e}")
+    except OCRError as e:
+        # OCR-specific issues
+        # - Tesseract processing failures
+        # - Image conversion problems
+        print(f"OCR failed: {e}")
+    except MissingDependencyError as e:
+        # System dependency issues
+        # - Missing Tesseract OCR
+        # - Missing Pandoc
+        # - Incompatible versions
+        print(f"Dependency missing: {e}")
     except ParsingError as e:
-        # Handles processing errors:
+        # General processing errors
         # - PDF parsing failures
-        # - OCR errors
         # - Format conversion issues
-        print(f"Processing failed: {e.message}")
-        print(f"Details: {e.context}")
+        # - Encoding problems
+        print(f"Processing failed: {e}")
     return ""
@@ -232,24 +246,33 @@ async def safe_extract(path: str) -> str:
 try:
     result = await extract_file("document.xyz")
 except ValidationError as e:
-    # e.context might contain:
-    # {
+    # Error will include context:
+    # ValidationError: Unsupported mime type
+    # Context: {
     #    "file_path": "document.xyz",
-    #    "error": "Unsupported file type",
-    #    "supported_types": ["pdf", "docx", ...]
+    #    "supported_mimetypes": ["application/pdf", ...]
     # }
+    print(e)
 try:
-    result = await extract_file("scan.pdf")
-except ParsingError as e:
-    # e.context might contain:
-    # {
-    #    "file_path": "scan.pdf",
-    #    "error": "OCR processing failed",
-    #    "details": "Tesseract error: Unable to process image"
+    result = await extract_file("scan.jpg")
+except OCRError as e:
+    # Error will include context:
+    # OCRError: OCR failed with a non-0 return code
+    # Context: {
+    #    "file_path": "scan.jpg",
+    #    "tesseract_version": "5.3.0"
     # }
+    print(e)
 ```
+All exceptions provide:
+- A descriptive error message
+- Relevant context in the `context` attribute
+- String representation with both message and context
+- Proper exception chaining for debugging
 ## Roadmap
 V1:

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_extractors.py RENAMED Viewed

@@ -1,9 +1,12 @@
 from __future__ import annotations
 import re
+from asyncio import gather
 from contextlib import suppress
 from html import escape
 from io import BytesIO
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
 import html_to_markdown
@@ -11,6 +14,7 @@ import pptx
 import pypdfium2
 from anyio import Path as AsyncPath
 from pptx.enum.shapes import MSO_SHAPE_TYPE
+from xlsx2csv import Xlsx2csv
 from kreuzberg._pandoc import process_content, process_file
 from kreuzberg._string import normalize_spaces, safe_decode
@@ -19,8 +23,6 @@ from kreuzberg._tesseract import batch_process_images
 from kreuzberg.exceptions import ParsingError
 if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
     from PIL.Image import Image
@@ -36,13 +38,18 @@ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
     Returns:
         A list of Pillow Images.
     """
+    pdf = None
+    resolved_path = str(await AsyncPath(file_path).resolve())
     try:
-        pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
+        pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
         return [page.render(scale=2.0).to_pil() for page in pdf]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+    finally:
+        if pdf is not None:
+            pdf.close()
 async def extract_pdf_with_tesseract(file_path: Path) -> str:
@@ -71,30 +78,49 @@ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
     Returns:
         The extracted text.
     """
+    document = None
+    resolved_path = str(await AsyncPath(file_path).resolve())
     try:
-        document = await run_sync(pypdfium2.PdfDocument, file_path)
-        text = "\n".join(page.get_textpage().get_text_range() for page in document)
+        document = await run_sync(pypdfium2.PdfDocument, resolved_path)
+        text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
         return normalize_spaces(text)
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+    finally:
+        if document is not None:
+            document.close()
-async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
+async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = False) -> str:
     """Extract text from a PDF file.
     Args:
-        file_path: The path to the PDF file.
+        file_path_or_contents: The path to the PDF file or its contents as bytes.
         force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
     Returns:
         The extracted text.
     """
-    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
+    if isinstance(file_path_or_contents, bytes):
+        with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
+            try:
+                file_path = Path(pdf_file.name)
+                await AsyncPath(file_path).write_bytes(file_path_or_contents)
+                if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
+                    return normalize_spaces(content)
+                return await extract_pdf_with_tesseract(file_path)
+            finally:
+                pdf_file.close()
+                await AsyncPath(pdf_file.name).unlink()
+    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
         return normalize_spaces(content)
-    return await extract_pdf_with_tesseract(file_path)
+    return await extract_pdf_with_tesseract(file_path_or_contents)
 async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
@@ -121,7 +147,8 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
     Returns:
         The extracted text.
     """
-    result = await process_file(file_path, mime_type=mime_type)
+    resolved_path = str(await AsyncPath(file_path).resolve())
+    result = await process_file(resolved_path, mime_type=mime_type)
     return normalize_spaces(result.content)
@@ -195,6 +222,47 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     return normalize_spaces(md_content)
+async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
+    """Extract text from an XLSX file by converting it to CSV and then to markdown.
+    Args:
+        file_path_or_contents: The path to the XLSX file or its contents as bytes.
+    Returns:
+        The extracted text content.
+    Raises:
+        ParsingError: If the XLSX file could not be parsed.
+    """
+    with (
+        NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
+        NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
+    ):
+        try:
+            if isinstance(file_path_or_contents, bytes):
+                xlsx_file.write(file_path_or_contents)
+                xlsx_file.flush()
+                xlsx_path = xlsx_file.name
+            else:
+                xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())
+            await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
+            result = await process_file(csv_file.name, mime_type="text/csv")
+            return normalize_spaces(result.content)
+        except Exception as e:
+            raise ParsingError(
+                "Could not extract text from XLSX file",
+                context={
+                    "error": str(e),
+                    "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
+                },
+            ) from e
+        finally:
+            xlsx_file.close()
+            csv_file.close()
+            await gather(AsyncPath(xlsx_file.name).unlink(), AsyncPath(csv_file.name).unlink())
 async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
     """Extract text from an HTML string.

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_mime_types.py RENAMED Viewed

@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
 PDF_MIME_TYPE: Final = "application/pdf"
 PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
 POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
 IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -89,5 +89,5 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
     PLAIN_TEXT_MIME_TYPES
     | IMAGE_MIME_TYPES
     | PANDOC_SUPPORTED_MIME_TYPES
-    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
+    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
 )

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_pandoc.py RENAMED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
-import json
 import subprocess
 from asyncio import gather
 from dataclasses import dataclass
+from json import JSONDecodeError, loads
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
@@ -13,7 +13,7 @@ from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
     from os import PathLike
@@ -80,7 +80,7 @@ NodeType = Literal[
     "MetaBlocks",
 ]
-PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
+MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
     "application/csl+json": "csljson",
     "application/docbook+xml": "docbook",
     "application/epub+zip": "epub",
@@ -112,6 +112,38 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
     "text/x-rst": "rst",
 }
+MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
+    "application/csl+json": "json",
+    "application/docbook+xml": "xml",
+    "application/epub+zip": "epub",
+    "application/rtf": "rtf",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/x-biblatex": "bib",
+    "application/x-bibtex": "bib",
+    "application/x-endnote+xml": "xml",
+    "application/x-fictionbook+xml": "fb2",
+    "application/x-ipynb+json": "ipynb",
+    "application/x-jats+xml": "xml",
+    "application/x-latex": "tex",
+    "application/x-opml+xml": "opml",
+    "application/x-research-info-systems": "ris",
+    "application/x-typst": "typst",
+    "text/csv": "csv",
+    "text/tab-separated-values": "tsv",
+    "text/troff": "1",
+    "text/x-commonmark": "md",
+    "text/x-dokuwiki": "wiki",
+    "text/x-gfm": "md",
+    "text/x-markdown": "md",
+    "text/x-markdown-extra": "md",
+    "text/x-mdoc": "md",
+    "text/x-multimarkdown": "md",
+    "text/x-org": "org",
+    "text/x-pod": "pod",
+    "text/x-rst": "rst",
+}
 class Metadata(TypedDict, total=False):
     """Document metadata extracted from Pandoc document.
@@ -232,7 +264,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
 def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
-    """Extract all non-empty metadata values from Pandoc AST metadata."""
     meta: Metadata = {}
     for key, value in raw_meta.items():
@@ -252,34 +283,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
     return meta
-def _get_extension_from_mime_type(mime_type: str) -> str:
-    if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
-        mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
+def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
+    if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
+        mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
     ):
         raise ValidationError(
             f"Unsupported mime type: {mime_type}",
             context={
                 "mime_type": mime_type,
-                "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
+                "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
             },
         )
-    return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
-        PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
+    return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
+        MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
     )
-async def validate_pandoc_version() -> None:
-    """Validate that Pandoc is installed and is version 3 or above.
-    Raises:
-        MissingDependencyError: If Pandoc is not installed or is below version 3.
-    """
+async def _validate_pandoc_version() -> None:
     try:
         if version_ref["checked"]:
             return
-        result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
+        command = ["pandoc", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
         version = result.stdout.decode().split("\n")[0].split()[1]
         if not version.startswith("3."):
             raise MissingDependencyError("Pandoc version 3 or above is required.")
@@ -290,27 +317,15 @@ async def validate_pandoc_version() -> None:
         raise MissingDependencyError("Pandoc is not installed.") from e
-async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
-    """Extract metadata from a document using pandoc.
-    Args:
-        input_file: The path to the file to process.
-        mime_type: The mime type of the file.
-    Raises:
-        ParsingError: If Pandoc fails to extract metadata.
-    Returns:
-        Dictionary containing document metadata.
-    """
-    extension = _get_extension_from_mime_type(mime_type)
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
+    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
-    with NamedTemporaryFile(suffix=".json") as metadata_file:
+    with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
         try:
             command = [
                 "pandoc",
                 str(input_file),
-                f"--from={extension}",
+                f"--from={pandoc_type}",
                 "--to=json",
                 "--standalone",
                 "--quiet",
@@ -329,46 +344,60 @@ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -
                     "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
                 )
-            json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
+            json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
             return _extract_metadata(json_data)
-        except (RuntimeError, OSError, json.JSONDecodeError) as e:
+        except (RuntimeError, OSError, JSONDecodeError) as e:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+        finally:
+            metadata_file.close()
+            await AsyncPath(metadata_file.name).unlink()
-async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
-    extension = _get_extension_from_mime_type(mime_type)
-    with NamedTemporaryFile(suffix=".md") as output_file:
-        command = [
-            "pandoc",
-            str(input_file),
-            f"--from={extension}",
-            "--to=markdown",
-            "--standalone",
-            "--wrap=preserve",
-            "--quiet",
-            "--output",
-            output_file.name,
-        ]
-        if extra_args:
-            command.extend(extra_args)
+async def _handle_extract_file(
+    input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
+) -> str:
+    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
-        result = await run_sync(
-            subprocess.run,
-            command,
-            capture_output=True,
-        )
+    with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
+        try:
+            command = [
+                "pandoc",
+                str(input_file),
+                f"--from={pandoc_type}",
+                "--to=markdown",
+                "--standalone",
+                "--wrap=preserve",
+                "--quiet",
+                "--output",
+                output_file.name,
+            ]
+            if extra_args:
+                command.extend(extra_args)
-        if result.returncode != 0:
-            raise ParsingError(
-                "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+            result = await run_sync(
+                subprocess.run,
+                command,
+                capture_output=True,
             )
-        text = await AsyncPath(output_file.name).read_text()
+            if result.returncode != 0:
+                raise ParsingError(
+                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+                )
-        return normalize_spaces(text)
+            text = await AsyncPath(output_file.name).read_text("utf-8")
+            return normalize_spaces(text)
+        except (RuntimeError, OSError) as e:
+            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+        finally:
+            output_file.close()
+            await AsyncPath(output_file.name).unlink()
 async def process_file(
@@ -384,12 +413,12 @@ async def process_file(
     Returns:
         PandocResult containing processed content and metadata.
     """
-    await validate_pandoc_version()
+    await _validate_pandoc_version()
     metadata, content = await gather(
         *[
-            extract_metadata(input_file, mime_type=mime_type),
-            _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
+            _handle_extract_metadata(input_file, mime_type=mime_type),
+            _handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
         ]
     )
     return PandocResult(
@@ -409,8 +438,13 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
     Returns:
         PandocResult containing processed content and metadata.
     """
-    extension = _get_extension_from_mime_type(mime_type)
+    extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
+    with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
+        try:
+            await AsyncPath(input_file.name).write_bytes(content)
+            return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
-    with NamedTemporaryFile(suffix=f".{extension}") as input_file:
-        await AsyncPath(input_file.name).write_bytes(content)
-        return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
+        finally:
+            input_file.close()
+            await AsyncPath(input_file.name).unlink()

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/_tesseract.py RENAMED Viewed

@@ -186,8 +186,9 @@ async def validate_tesseract_version() -> None:
         if version_ref["checked"]:
             return
-        result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
-        version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
+        command = ["tesseract", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
+        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
         if not version_match or int(version_match.group(1)) < 5:
             raise MissingDependencyError("Tesseract version 5 or above is required.")
@@ -213,10 +214,10 @@ async def process_file(
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".txt") as output_file:
+    with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
         # this is needed because tesseract adds .txt to the output file
-        output_file_name = output_file.name.replace(".txt", "")
         try:
+            output_file_name = output_file.name.replace(".txt", "")
             command = [
                 "tesseract",
                 str(input_file),
@@ -239,11 +240,15 @@ async def process_file(
             if not result.returncode == 0:
                 raise OCRError("OCR failed with a non-0 return code.")
-            output = await AsyncPath(output_file.name).read_text()
+            output = await AsyncPath(output_file.name).read_text("utf-8")
             return output.strip()
         except (RuntimeError, OSError) as e:
             raise OCRError("Failed to OCR using tesseract") from e
+        finally:
+            output_file.close()
+            await AsyncPath(output_file.name).unlink()
 async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
     """Process a single Pillow Image using Tesseract OCR.
@@ -257,9 +262,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".png") as image_file:
-        await run_sync(image.save, image_file.name, format="PNG")
-        return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+    with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
+        try:
+            await run_sync(image.save, image_file.name, format="PNG")
+            return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+        finally:
+            image_file.close()
+            await AsyncPath(image_file.name).unlink()
 async def process_image_with_tesseract(

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg/extraction.py RENAMED Viewed

@@ -20,10 +20,12 @@ from kreuzberg._extractors import (
     extract_content_with_pandoc,
     extract_file_with_pandoc,
     extract_html_string,
-    extract_pdf_file,
+    extract_pdf,
     extract_pptx_file,
+    extract_xlsx_file,
 )
 from kreuzberg._mime_types import (
+    EXCEL_MIME_TYPE,
     HTML_MIME_TYPE,
     IMAGE_MIME_TYPE_EXT_MAP,
     IMAGE_MIME_TYPES,
@@ -69,18 +71,21 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
         )
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        with NamedTemporaryFile(suffix=".pdf") as temp_file:
-            temp_file.write(content)
-            return ExtractionResult(
-                content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
-            )
+        return ExtractionResult(content=await extract_pdf(content, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+    if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
+        return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
-            temp_file.write(content)
-            return ExtractionResult(
-                content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
-            )
+        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
+            try:
+                await AsyncPath(temp_file.name).write_bytes(content)
+                return ExtractionResult(
+                    content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
+                )
+            finally:
+                temp_file.close()
+                await AsyncPath(temp_file.name).unlink()
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
@@ -132,7 +137,10 @@ async def extract_file(
         raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+        return ExtractionResult(content=await extract_pdf(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+    if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
+        return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
         return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.5.0
+Version: 1.7.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -29,7 +29,8 @@ Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
+Requires-Dist: typing-extensions>=4.12.2
+Requires-Dist: xlsx2csv>=0.8.4
 # Kreuzberg
@@ -68,16 +69,12 @@ pip install kreuzberg
 ### 2. Install System Dependencies
-Kreuzberg requires two open-source tools:
+Kreuzberg requires two system-level dependencies:
 - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-  - GPL v2.0 licensed (used via CLI only)
-  - Handles office documents and markup formats
 - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
-  - Apache License
-  - Required for scanned documents and images
+Please install these using their respective installation guides.
 ## Architecture
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
   - `pdfium2` for searchable PDFs
   - Tesseract OCR for scanned content
 - **Document Conversion**:
-  - Pandoc for office documents and markup
+  - Pandoc for many document and markup formats
   - `python-pptx` for PowerPoint files
   - `html-to-markdown` for HTML content
+  - `xlsx2csv` for Excel spreadsheets
 - **Text Processing**:
   - Smart encoding detection
   - Markdown and plain text handling
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
 #### Data and Research Formats
+- Excel spreadsheets (`.xlsx`)
 - CSV (`.csv`) and TSV (`.tsv`) files
 - Jupyter Notebooks (`.ipynb`)
 - BibTeX (`.bib`) and BibLaTeX (`.bib`)
@@ -232,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
 ### Error Handling
-Kreuzberg provides detailed error handling with two main exception types:
+Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 ```python
 from kreuzberg import extract_file
-from kreuzberg.exceptions import ValidationError, ParsingError
+from kreuzberg.exceptions import (
+    ValidationError,
+    ParsingError,
+    OCRError,
+    MissingDependencyError
+)
 async def safe_extract(path: str) -> str:
     try:
@@ -244,20 +248,31 @@ async def safe_extract(path: str) -> str:
         return result.content
     except ValidationError as e:
-        # Handles input validation issues:
-        # - Unsupported file types
+        # Input validation issues
+        # - Unsupported or undetectable MIME types
         # - Missing files
-        # - Invalid MIME types
-        print(f"Invalid input: {e.message}")
-        print(f"Details: {e.context}")
+        # - Invalid input parameters
+        print(f"Validation failed: {e}")
+    except OCRError as e:
+        # OCR-specific issues
+        # - Tesseract processing failures
+        # - Image conversion problems
+        print(f"OCR failed: {e}")
+    except MissingDependencyError as e:
+        # System dependency issues
+        # - Missing Tesseract OCR
+        # - Missing Pandoc
+        # - Incompatible versions
+        print(f"Dependency missing: {e}")
     except ParsingError as e:
-        # Handles processing errors:
+        # General processing errors
         # - PDF parsing failures
-        # - OCR errors
         # - Format conversion issues
-        print(f"Processing failed: {e.message}")
-        print(f"Details: {e.context}")
+        # - Encoding problems
+        print(f"Processing failed: {e}")
     return ""
@@ -265,24 +280,33 @@ async def safe_extract(path: str) -> str:
 try:
     result = await extract_file("document.xyz")
 except ValidationError as e:
-    # e.context might contain:
-    # {
+    # Error will include context:
+    # ValidationError: Unsupported mime type
+    # Context: {
     #    "file_path": "document.xyz",
-    #    "error": "Unsupported file type",
-    #    "supported_types": ["pdf", "docx", ...]
+    #    "supported_mimetypes": ["application/pdf", ...]
     # }
+    print(e)
 try:
-    result = await extract_file("scan.pdf")
-except ParsingError as e:
-    # e.context might contain:
-    # {
-    #    "file_path": "scan.pdf",
-    #    "error": "OCR processing failed",
-    #    "details": "Tesseract error: Unable to process image"
+    result = await extract_file("scan.jpg")
+except OCRError as e:
+    # Error will include context:
+    # OCRError: OCR failed with a non-0 return code
+    # Context: {
+    #    "file_path": "scan.jpg",
+    #    "tesseract_version": "5.3.0"
     # }
+    print(e)
 ```
+All exceptions provide:
+- A descriptive error message
+- Relevant context in the `context` attribute
+- String representation with both message and context
+- Proper exception chaining for debugging
 ## Roadmap
 V1:

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/kreuzberg.egg-info/requires.txt RENAMED Viewed

@@ -3,6 +3,5 @@ charset-normalizer>=3.4.1
 html-to-markdown>=1.2.0
 pypdfium2>=4.30.1
 python-pptx>=1.0.2
-[:python_version < "3.10"]
 typing-extensions>=4.12.2
+xlsx2csv>=0.8.4

{kreuzberg-1.5.0 → kreuzberg-1.7.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "kreuzberg"
-version = "1.5.0"
+version = "1.7.0"
 description = "A text extraction library supporting PDFs, images, office documents and more"
 readme = "README.md"
 keywords = [
@@ -36,27 +36,28 @@ classifiers = [
 ]
 dependencies = [
-  "anyio>=4.8.0",
-  "charset-normalizer>=3.4.1",
-  "html-to-markdown>=1.2.0",
-  "pypdfium2>=4.30.1",
-  "python-pptx>=1.0.2",
-  "typing-extensions>=4.12.2; python_version<'3.10'",
+    "anyio>=4.8.0",
+    "charset-normalizer>=3.4.1",
+    "html-to-markdown>=1.2.0",
+    "pypdfium2>=4.30.1",
+    "python-pptx>=1.0.2",
+    "typing-extensions>=4.12.2",
+    "xlsx2csv>=0.8.4",
 ]
 urls.homepage = "https://github.com/Goldziher/kreuzberg"
 [dependency-groups]
 dev = [
-  "covdefaults>=2.3.0",
-  "mypy>=1.15.0",
-  "pre-commit>=4.1.0",
-  "pytest>=8.3.4",
-  "pytest-asyncio>=0.25.3",
-  "pytest-cov>=6.0.0",
-  "pytest-mock>=3.14.0",
-  "pytest-timeout>=2.3.1",
-  "python-dotenv>=1.0.1",
-  "ruff>=0.9.5",
+    "covdefaults>=2.3.0",
+    "mypy>=1.15.0",
+    "pre-commit>=4.1.0",
+    "pytest>=8.3.4",
+    "pytest-asyncio>=0.25.3",
+    "pytest-cov>=6.0.0",
+    "pytest-mock>=3.14.0",
+    "pytest-timeout>=2.3.1",
+    "python-dotenv>=1.0.1",
+    "ruff>=0.9.6",
 ]
 [tool.setuptools.packages.find]