PyPI - kreuzberg - Versions diffs - 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

kreuzberg/__init__.py +17 -2
kreuzberg/_constants.py +6 -0
kreuzberg/_html.py +32 -0
kreuzberg/_mime_types.py +109 -1
kreuzberg/_pandoc.py +122 -169
kreuzberg/_pdf.py +189 -0
kreuzberg/_pptx.py +88 -0
kreuzberg/_string.py +5 -8
kreuzberg/_sync.py +6 -1
kreuzberg/_tesseract.py +97 -200
kreuzberg/_tmp.py +37 -0
kreuzberg/_types.py +71 -0
kreuzberg/_xlsx.py +92 -0
kreuzberg/extraction.py +269 -64
kreuzberg-2.0.1.dist-info/METADATA +451 -0
kreuzberg-2.0.1.dist-info/RECORD +21 -0
kreuzberg/_extractors.py +0 -280
kreuzberg-1.7.0.dist-info/METADATA +0 -342
kreuzberg-1.7.0.dist-info/RECORD +0 -15
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/LICENSE +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/WHEEL +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/top_level.txt +0 -0

kreuzberg/_extractors.py DELETED Viewed

@@ -1,280 +0,0 @@
-from __future__ import annotations
-import re
-from asyncio import gather
-from contextlib import suppress
-from html import escape
-from io import BytesIO
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING
-import html_to_markdown
-import pptx
-import pypdfium2
-from anyio import Path as AsyncPath
-from pptx.enum.shapes import MSO_SHAPE_TYPE
-from xlsx2csv import Xlsx2csv
-from kreuzberg._pandoc import process_content, process_file
-from kreuzberg._string import normalize_spaces, safe_decode
-from kreuzberg._sync import run_sync
-from kreuzberg._tesseract import batch_process_images
-from kreuzberg.exceptions import ParsingError
-if TYPE_CHECKING:  # pragma: no cover
-    from PIL.Image import Image
-async def convert_pdf_to_images(file_path: Path) -> list[Image]:
-    """Convert a PDF file to images.
-    Args:
-        file_path: The path to the PDF file.
-    Raises:
-        ParsingError: If the PDF file could not be converted to images.
-    Returns:
-        A list of Pillow Images.
-    """
-    pdf = None
-    resolved_path = str(await AsyncPath(file_path).resolve())
-    try:
-        pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
-        return [page.render(scale=2.0).to_pil() for page in pdf]
-    except pypdfium2.PdfiumError as e:
-        raise ParsingError(
-            "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
-        ) from e
-    finally:
-        if pdf is not None:
-            pdf.close()
-async def extract_pdf_with_tesseract(file_path: Path) -> str:
-    """Extract text from a scanned PDF file using pytesseract.
-    Args:
-        file_path: The path to the PDF file.
-    Returns:
-        The extracted text.
-    """
-    images = await convert_pdf_to_images(file_path)
-    ocr_results = await batch_process_images(images)
-    return normalize_spaces("\n".join(ocr_results))
-async def extract_pdf_with_pdfium2(file_path: Path) -> str:
-    """Extract text from a searchable PDF file using pypdfium2.
-    Args:
-        file_path: The path to the PDF file.
-    Raises:
-        ParsingError: If the text could not be extracted from the PDF file.
-    Returns:
-        The extracted text.
-    """
-    document = None
-    resolved_path = str(await AsyncPath(file_path).resolve())
-    try:
-        document = await run_sync(pypdfium2.PdfDocument, resolved_path)
-        text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
-        return normalize_spaces(text)
-    except pypdfium2.PdfiumError as e:
-        raise ParsingError(
-            "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
-        ) from e
-    finally:
-        if document is not None:
-            document.close()
-async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = False) -> str:
-    """Extract text from a PDF file.
-    Args:
-        file_path_or_contents: The path to the PDF file or its contents as bytes.
-        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
-    Returns:
-        The extracted text.
-    """
-    if isinstance(file_path_or_contents, bytes):
-        with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
-            try:
-                file_path = Path(pdf_file.name)
-                await AsyncPath(file_path).write_bytes(file_path_or_contents)
-                if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
-                    return normalize_spaces(content)
-                return await extract_pdf_with_tesseract(file_path)
-            finally:
-                pdf_file.close()
-                await AsyncPath(pdf_file.name).unlink()
-    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
-        return normalize_spaces(content)
-    return await extract_pdf_with_tesseract(file_path_or_contents)
-async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
-    """Extract text using pandoc.
-    Args:
-        file_data: The content of the file.
-        mime_type: The mime type of the file.
-    Returns:
-        The extracted text.
-    """
-    result = await process_content(file_data, mime_type=mime_type)
-    return normalize_spaces(result.content)
-async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
-    """Extract text using pandoc.
-    Args:
-        file_path: The path to the file.
-        mime_type: The mime type of the file.
-    Returns:
-        The extracted text.
-    """
-    resolved_path = str(await AsyncPath(file_path).resolve())
-    result = await process_file(resolved_path, mime_type=mime_type)
-    return normalize_spaces(result.content)
-async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
-    """Extract text from a PPTX file.
-    Notes:
-        This function is based on code vendored from `markitdown`, which has an MIT license as well.
-    Args:
-        file_path_or_contents: The path to the PPTX file or its contents as bytes.
-    Returns:
-        The extracted text content
-    """
-    md_content = ""
-    file_contents = (
-        file_path_or_contents
-        if isinstance(file_path_or_contents, bytes)
-        else await AsyncPath(file_path_or_contents).read_bytes()
-    )
-    presentation = pptx.Presentation(BytesIO(file_contents))
-    for index, slide in enumerate(presentation.slides):
-        md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
-        title = slide.shapes.title
-        for shape in slide.shapes:
-            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
-                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
-            ):
-                alt_text = ""
-                with suppress(AttributeError):
-                    # access non-visual properties
-                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
-                filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
-            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
-                html_table = "<table>"
-                first_row = True
-                for row in shape.table.rows:
-                    html_table += "<tr>"
-                    for cell in row.cells:
-                        tag = "th" if first_row else "td"
-                        html_table += f"<{tag}>{escape(cell.text)}</{tag}>"
-                    html_table += "</tr>"
-                    first_row = False
-                html_table += "</table>"
-                md_content += "\n" + html_table + "\n"
-            elif shape.has_text_frame:
-                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
-        md_content = md_content.strip()
-        if slide.has_notes_slide:
-            md_content += "\n\n### Notes:\n"
-            notes_frame = slide.notes_slide.notes_text_frame
-            if notes_frame is not None:
-                md_content += notes_frame.text
-            md_content = md_content.strip()
-    return normalize_spaces(md_content)
-async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
-    """Extract text from an XLSX file by converting it to CSV and then to markdown.
-    Args:
-        file_path_or_contents: The path to the XLSX file or its contents as bytes.
-    Returns:
-        The extracted text content.
-    Raises:
-        ParsingError: If the XLSX file could not be parsed.
-    """
-    with (
-        NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
-        NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
-    ):
-        try:
-            if isinstance(file_path_or_contents, bytes):
-                xlsx_file.write(file_path_or_contents)
-                xlsx_file.flush()
-                xlsx_path = xlsx_file.name
-            else:
-                xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())
-            await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
-            result = await process_file(csv_file.name, mime_type="text/csv")
-            return normalize_spaces(result.content)
-        except Exception as e:
-            raise ParsingError(
-                "Could not extract text from XLSX file",
-                context={
-                    "error": str(e),
-                    "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
-                },
-            ) from e
-        finally:
-            xlsx_file.close()
-            csv_file.close()
-            await gather(AsyncPath(xlsx_file.name).unlink(), AsyncPath(csv_file.name).unlink())
-async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
-    """Extract text from an HTML string.
-    Args:
-        file_path_or_contents: The HTML content.
-    Returns:
-        The extracted text content.
-    """
-    content = (
-        safe_decode(file_path_or_contents)
-        if isinstance(file_path_or_contents, bytes)
-        else await AsyncPath(file_path_or_contents).read_text()
-    )
-    return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))

kreuzberg-1.7.0.dist-info/METADATA DELETED Viewed

@@ -1,342 +0,0 @@
-Metadata-Version: 2.2
-Name: kreuzberg
-Version: 1.7.0
-Summary: A text extraction library supporting PDFs, images, office documents and more
-Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
-License: MIT
-Project-URL: homepage, https://github.com/Goldziher/kreuzberg
-Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Classifier: Topic :: Text Processing :: General
-Classifier: Topic :: Utilities
-Classifier: Typing :: Typed
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: anyio>=4.8.0
-Requires-Dist: charset-normalizer>=3.4.1
-Requires-Dist: html-to-markdown>=1.2.0
-Requires-Dist: pypdfium2>=4.30.1
-Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2
-Requires-Dist: xlsx2csv>=0.8.4
-# Kreuzberg
-Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
-## Why Kreuzberg?
-- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
-- **Local Processing**: No external API calls or cloud dependencies required
-- **Resource Efficient**: Lightweight processing without GPU requirements
-- **Format Support**: Comprehensive support for documents, images, and text formats
-- **Modern Python**: Built with async/await, type hints, and current best practices
-Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
-## Features
-- **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
-- **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
-- **Modern Python Design**:
-  - Async-first API using `anyio`
-  - Comprehensive type hints for better IDE support
-  - Detailed error handling with context information
-- **Production Ready**:
-  - Robust error handling
-  - Detailed debugging information
-  - Memory efficient processing
-## Installation
-### 1. Install the Python Package
-```shell
-pip install kreuzberg
-```
-### 2. Install System Dependencies
-Kreuzberg requires two system level dependencies:
-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
-Please install these using their respective installation guides.
-## Architecture
-Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
-- **PDF Processing**:
-  - `pdfium2` for searchable PDFs
-  - Tesseract OCR for scanned content
-- **Document Conversion**:
-  - Pandoc for many document and markup formats
-  - `python-pptx` for PowerPoint files
-  - `html-to-markdown` for HTML content
-  - `xlsx2csv` for Excel spreadsheets
-- **Text Processing**:
-  - Smart encoding detection
-  - Markdown and plain text handling
-### Supported Formats
-#### Document Formats
-- PDF (`.pdf`, both searchable and scanned documents)
-- Microsoft Word (`.docx`, `.doc`)
-- PowerPoint presentations (`.pptx`)
-- OpenDocument Text (`.odt`)
-- Rich Text Format (`.rtf`)
-- EPUB (`.epub`)
-- DocBook XML (`.dbk`, `.xml`)
-- FictionBook (`.fb2`)
-- LaTeX (`.tex`, `.latex`)
-- Typst (`.typ`)
-#### Markup and Text Formats
-- HTML (`.html`, `.htm`)
-- Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
-- reStructuredText (`.rst`)
-- Org-mode (`.org`)
-- DokuWiki (`.txt`)
-- Pod (`.pod`)
-- Man pages (`.1`, `.2`, etc.)
-#### Data and Research Formats
-- Excel spreadsheets (`.xlsx`)
-- CSV (`.csv`) and TSV (`.tsv`) files
-- Jupyter Notebooks (`.ipynb`)
-- BibTeX (`.bib`) and BibLaTeX (`.bib`)
-- CSL-JSON (`.json`)
-- EndNote XML (`.xml`)
-- RIS (`.ris`)
-- JATS XML (`.xml`)
-#### Image Formats
-- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
-- PNG (`.png`)
-- TIFF (`.tiff`, `.tif`)
-- BMP (`.bmp`)
-- GIF (`.gif`)
-- WebP (`.webp`)
-- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
-- Portable Anymap (`.pnm`)
-- Portable Bitmap (`.pbm`)
-- Portable Graymap (`.pgm`)
-- Portable Pixmap (`.ppm`)
-## Usage
-Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
-- `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
-- `extract_bytes()`: Extract text from bytes (accepts a byte string)
-### Quick Start
-```python
-from pathlib import Path
-from kreuzberg import extract_file, extract_bytes
-# Basic file extraction
-async def extract_document():
-    # Extract from a PDF file
-    pdf_result = await extract_file("document.pdf")
-    print(f"PDF text: {pdf_result.content}")
-    # Extract from an image
-    img_result = await extract_file("scan.png")
-    print(f"Image text: {img_result.content}")
-    # Extract from Word document
-    docx_result = await extract_file(Path("document.docx"))
-    print(f"Word text: {docx_result.content}")
-```
-### Processing Uploaded Files
-```python
-from kreuzberg import extract_bytes
-async def process_upload(file_content: bytes, mime_type: str):
-    """Process uploaded file content with known MIME type."""
-    result = await extract_bytes(file_content, mime_type=mime_type)
-    return result.content
-# Example usage with different file types
-async def handle_uploads():
-    # Process PDF upload
-    pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
-    # Process image upload
-    img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
-    # Process Word document upload
-    docx_result = await extract_bytes(docx_bytes,
-        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
-```
-### Advanced Features
-#### PDF Processing Options
-```python
-from kreuzberg import extract_file
-async def process_pdf():
-    # Force OCR for PDFs with embedded images or scanned content
-    result = await extract_file("document.pdf", force_ocr=True)
-    # Process a scanned PDF (automatically uses OCR)
-    scanned = await extract_file("scanned.pdf")
-```
-#### ExtractionResult Object
-All extraction functions return an `ExtractionResult` containing:
-- `content`: The extracted text (str)
-- `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
-```python
-from kreuzberg import ExtractionResult
-async def process_document(path: str) -> tuple[str, str]:
-    # Access as a named tuple
-    result: ExtractionResult = await extract_file(path)
-    print(f"Content: {result.content}")
-    print(f"Format: {result.mime_type}")
-    # Or unpack as a tuple
-    content, mime_type = await extract_file(path)
-    return content, mime_type
-```
-### Error Handling
-Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
-```python
-from kreuzberg import extract_file
-from kreuzberg.exceptions import (
-    ValidationError,
-    ParsingError,
-    OCRError,
-    MissingDependencyError
-)
-async def safe_extract(path: str) -> str:
-    try:
-        result = await extract_file(path)
-        return result.content
-    except ValidationError as e:
-        # Input validation issues
-        # - Unsupported or undetectable MIME types
-        # - Missing files
-        # - Invalid input parameters
-        print(f"Validation failed: {e}")
-    except OCRError as e:
-        # OCR-specific issues
-        # - Tesseract processing failures
-        # - Image conversion problems
-        print(f"OCR failed: {e}")
-    except MissingDependencyError as e:
-        # System dependency issues
-        # - Missing Tesseract OCR
-        # - Missing Pandoc
-        # - Incompatible versions
-        print(f"Dependency missing: {e}")
-    except ParsingError as e:
-        # General processing errors
-        # - PDF parsing failures
-        # - Format conversion issues
-        # - Encoding problems
-        print(f"Processing failed: {e}")
-    return ""
-# Example error contexts
-try:
-    result = await extract_file("document.xyz")
-except ValidationError as e:
-    # Error will include context:
-    # ValidationError: Unsupported mime type
-    # Context: {
-    #    "file_path": "document.xyz",
-    #    "supported_mimetypes": ["application/pdf", ...]
-    # }
-    print(e)
-try:
-    result = await extract_file("scan.jpg")
-except OCRError as e:
-    # Error will include context:
-    # OCRError: OCR failed with a non-0 return code
-    # Context: {
-    #    "file_path": "scan.jpg",
-    #    "tesseract_version": "5.3.0"
-    # }
-    print(e)
-```
-All exceptions provide:
-- A descriptive error message
-- Relevant context in the `context` attribute
-- String representation with both message and context
-- Proper exception chaining for debugging
-## Roadmap
-V1:
-- [x] - html file text extraction
-- [ ] - better PDF table extraction
-- [ ] - batch APIs
-- [ ] - sync APIs
-V2:
-- [ ] - metadata extraction (breaking change)
-- [ ] - TBD
-## Contribution
-This library is open to contribution. Feel free to open issues or submit PRs. Its better to discuss issues before
-submitting PRs to avoid disappointment.
-### Local Development
-1. Clone the repo
-2. Install the system dependencies
-3. Install the full dependencies with `uv sync`
-4. Install the pre-commit hooks with:
-   ```shell
-   pre-commit install && pre-commit install --hook-type commit-msg
-   ```
-5. Make your changes and submit a PR
-## License
-This library uses the MIT license.

kreuzberg-1.7.0.dist-info/RECORD DELETED Viewed

@@ -1,15 +0,0 @@
-kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
-kreuzberg/_extractors.py,sha256=3VP7oBz0VpmkkhlbKDPjRmnZdHBv4K_xqcyMeeDaetM,9283
-kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
-kreuzberg/_pandoc.py,sha256=zhNJ8_92JMs4gG_Fj-IVwdpZwWsyaK-VTrbLke6NGyU,15097
-kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
-kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
-kreuzberg/_tesseract.py,sha256=Yya15OxB4PBi2QqmrGXF70_SHBD7Luii9sBXzMJlCpU,8168
-kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
-kreuzberg/extraction.py,sha256=_vJ9O8t50a3p4co3hY8b3BdBIXV5S7XOUNl_kD9_FvM,6599
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-1.7.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-1.7.0.dist-info/METADATA,sha256=3wKe7X5G1IQfSPNzD0wnS0t81MqoWtQ-cgR-6MBoyec,10355
-kreuzberg-1.7.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-1.7.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-1.7.0.dist-info/RECORD,,

{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl