PyPI - kreuzberg - Versions diffs - 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

kreuzberg/__init__.py +6 -2
kreuzberg/_constants.py +6 -0
kreuzberg/_html.py +32 -0
kreuzberg/_mime_types.py +109 -1
kreuzberg/_pandoc.py +122 -169
kreuzberg/_pdf.py +189 -0
kreuzberg/_pptx.py +88 -0
kreuzberg/_string.py +5 -8
kreuzberg/_sync.py +6 -1
kreuzberg/_tesseract.py +98 -71
kreuzberg/_tmp.py +37 -0
kreuzberg/_types.py +71 -0
kreuzberg/_xlsx.py +92 -0
kreuzberg/extraction.py +269 -64
kreuzberg-2.0.0.dist-info/METADATA +419 -0
kreuzberg-2.0.0.dist-info/RECORD +21 -0
kreuzberg/_extractors.py +0 -280
kreuzberg-1.7.0.dist-info/METADATA +0 -342
kreuzberg-1.7.0.dist-info/RECORD +0 -15
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
{kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0

kreuzberg/_types.py ADDED Viewed

@@ -0,0 +1,71 @@
+from __future__ import annotations
+import sys
+from typing import NamedTuple, TypedDict
+if sys.version_info < (3, 11):  # pragma: no cover
+    from typing_extensions import NotRequired
+else:  # pragma: no cover
+    from typing import NotRequired
+class Metadata(TypedDict, total=False):
+    """Document metadata.
+    All fields are optional but will only be included if they contain non-empty values.
+    Any field that would be empty or None is omitted from the dictionary.
+    Different documents and extraction methods will yield different metadata.
+    """
+    title: NotRequired[str]
+    """Document title."""
+    subtitle: NotRequired[str]
+    """Document subtitle."""
+    abstract: NotRequired[str | list[str]]
+    """Document abstract, summary or description."""
+    authors: NotRequired[list[str]]
+    """List of document authors."""
+    date: NotRequired[str]
+    """Document date as string to preserve original format."""
+    subject: NotRequired[str]
+    """Document subject or topic."""
+    description: NotRequired[str]
+    """Extended description."""
+    keywords: NotRequired[list[str]]
+    """Keywords or tags."""
+    categories: NotRequired[list[str]]
+    """Categories or classifications."""
+    version: NotRequired[str]
+    """Version identifier."""
+    language: NotRequired[str]
+    """Document language code."""
+    references: NotRequired[list[str]]
+    """Reference entries."""
+    citations: NotRequired[list[str]]
+    """Citation identifiers."""
+    copyright: NotRequired[str]
+    """Copyright information."""
+    license: NotRequired[str]
+    """License information."""
+    identifier: NotRequired[str]
+    """Document identifier."""
+    publisher: NotRequired[str]
+    """Publisher name."""
+    contributors: NotRequired[list[str]]
+    """Additional contributors."""
+    creator: NotRequired[str]
+    """Document creator."""
+    institute: NotRequired[str | list[str]]
+    """Institute or organization."""
+class ExtractionResult(NamedTuple):
+    """The result of a file extraction."""
+    content: str
+    """The extracted content."""
+    mime_type: str
+    """The mime type of the content."""
+    metadata: Metadata
+    """The metadata of the content."""

kreuzberg/_xlsx.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+import csv
+from io import StringIO
+from typing import TYPE_CHECKING, cast
+from anyio import Path as AsyncPath
+from anyio import create_task_group
+from python_calamine import CalamineWorkbook
+from kreuzberg import ExtractionResult, ParsingError
+from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
+from kreuzberg._pandoc import process_file_with_pandoc
+from kreuzberg._string import normalize_spaces
+from kreuzberg._sync import run_sync
+from kreuzberg._tmp import create_temp_file
+if TYPE_CHECKING:  # pragma: no cover
+    from pathlib import Path
+async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
+    """Extract text from an XLSX file by converting it to CSV and then to markdown.
+    Args:
+        input_file: The path to the XLSX file.
+    Returns:
+        The extracted text content.
+    Raises:
+        ParsingError: If the XLSX file could not be parsed.
+    """
+    try:
+        workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
+        results = cast(list[str], [None] * len(workbook.sheet_names))
+        async def convert_sheet_to_text(sheet_name: str) -> None:
+            nonlocal results
+            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
+            csv_buffer = StringIO()
+            writer = csv.writer(csv_buffer)
+            for row in values:
+                writer.writerow(row)
+            csv_data = csv_buffer.getvalue()
+            csv_buffer.close()
+            from kreuzberg._tmp import create_temp_file
+            csv_path, unlink = await create_temp_file(".csv")
+            await AsyncPath(csv_path).write_text(csv_data)
+            result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
+            results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
+            await unlink()
+        async with create_task_group() as tg:
+            for sheet_name in workbook.sheet_names:
+                tg.start_soon(convert_sheet_to_text, sheet_name)
+        return ExtractionResult(
+            content="\n\n".join(results),
+            mime_type=MARKDOWN_MIME_TYPE,
+            metadata={},
+        )
+    except Exception as e:
+        raise ParsingError(
+            "Could not extract text from XLSX",
+            context={
+                "error": str(e),
+            },
+        ) from e
+async def extract_xlsx_content(content: bytes) -> ExtractionResult:
+    """Extract text from an XLSX file content.
+    Args:
+        content: The XLSX file content.
+    Returns:
+        The extracted text content.
+    """
+    xlsx_path, unlink = await create_temp_file(".xlsx")
+    await AsyncPath(xlsx_path).write_bytes(content)
+    result = await extract_xlsx_file(xlsx_path)
+    await unlink()
+    return result

kreuzberg/extraction.py CHANGED Viewed

@@ -9,54 +9,62 @@ It includes vendored code:
 from __future__ import annotations
-from mimetypes import guess_type
+from functools import partial
+from io import BytesIO
 from pathlib import Path
-from tempfile import NamedTemporaryFile
-from typing import NamedTuple
+from typing import TYPE_CHECKING, cast
+import anyio
 from anyio import Path as AsyncPath
+from PIL.Image import open as open_image
-from kreuzberg._extractors import (
-    extract_content_with_pandoc,
-    extract_file_with_pandoc,
-    extract_html_string,
-    extract_pdf,
-    extract_pptx_file,
-    extract_xlsx_file,
-)
+from kreuzberg import ExtractionResult
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._html import extract_html_string
 from kreuzberg._mime_types import (
     EXCEL_MIME_TYPE,
     HTML_MIME_TYPE,
-    IMAGE_MIME_TYPE_EXT_MAP,
     IMAGE_MIME_TYPES,
-    MARKDOWN_MIME_TYPE,
     PANDOC_SUPPORTED_MIME_TYPES,
     PDF_MIME_TYPE,
-    PLAIN_TEXT_MIME_TYPE,
     POWER_POINT_MIME_TYPE,
     SUPPORTED_MIME_TYPES,
+    validate_mime_type,
+)
+from kreuzberg._pandoc import process_content_with_pandoc, process_file_with_pandoc
+from kreuzberg._pdf import (
+    extract_pdf_content,
+    extract_pdf_file,
 )
+from kreuzberg._pptx import extract_pptx_file_content
 from kreuzberg._string import safe_decode
-from kreuzberg._tesseract import process_image_with_tesseract
+from kreuzberg._tesseract import PSMMode, SupportedLanguage, process_image_with_tesseract
+from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
 from kreuzberg.exceptions import ValidationError
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from os import PathLike
-class ExtractionResult(NamedTuple):
-    """The result of a file extraction."""
-    content: str
-    """The extracted content."""
-    mime_type: str
-    """The mime type of the content."""
-async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
+async def extract_bytes(
+    content: bytes,
+    mime_type: str,
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> ExtractionResult:
     """Extract the textual content from a given byte string representing a file's contents.
     Args:
         content: The content to extract.
         mime_type: The mime type of the content.
-        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
     Raises:
         ValidationError: If the mime type is not supported.
@@ -71,50 +79,54 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
         )
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await extract_pdf(content, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+        return await extract_pdf_content(
+            content, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
+        )
     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
-        return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
+        return await extract_xlsx_content(content)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
-            try:
-                await AsyncPath(temp_file.name).write_bytes(content)
-                return ExtractionResult(
-                    content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
-                )
-            finally:
-                temp_file.close()
-                await AsyncPath(temp_file.name).unlink()
+        return await process_image_with_tesseract(
+            open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
+        )
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return ExtractionResult(
-            content=await extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
-        )
+        return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
-        return ExtractionResult(content=await extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
+        return await extract_pptx_file_content(content)
     if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
-        return ExtractionResult(content=await extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
+        return await extract_html_string(content)
     return ExtractionResult(
         content=safe_decode(content),
         mime_type=mime_type,
+        metadata={},
     )
 async def extract_file(
-    file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
+    file_path: PathLike[str] | str,
+    mime_type: str | None = None,
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
     """Extract the textual content from a given file.
     Args:
         file_path: The path to the file.
-        mime_type: The mime type of the file.
-        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
+        mime_type: The mime type of the content.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
     Raises:
         ValidationError: If the mime type is not supported.
@@ -122,40 +134,233 @@ async def extract_file(
     Returns:
         The extracted content and the mime type of the content.
     """
-    file_path = Path(file_path)
-    mime_type = mime_type or guess_type(file_path.name)[0]
-    if not mime_type:  # pragma: no cover
-        raise ValidationError("Could not determine the mime type of the file.", context={"file_path": str(file_path)})
+    input_file = await AsyncPath(file_path).resolve()
-    if mime_type not in SUPPORTED_MIME_TYPES or not any(mime_type.startswith(value) for value in SUPPORTED_MIME_TYPES):
-        raise ValidationError(
-            f"Unsupported mime type: {mime_type}",
-            context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
-        )
+    mime_type = validate_mime_type(input_file, mime_type)
-    if not await AsyncPath(file_path).exists():
-        raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
+    if not await input_file.exists():
+        raise ValidationError("The file does not exist.", context={"input_file": str(input_file)})
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await extract_pdf(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+        return await extract_pdf_file(
+            Path(input_file), force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
+        )
     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
-        return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
+        return await extract_xlsx_file(Path(input_file))
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
+        return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return ExtractionResult(
-            content=await extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
-        )
+        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
-        return ExtractionResult(content=await extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
+        return await extract_pptx_file_content(Path(input_file))
     if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
-        return ExtractionResult(content=await extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
+        return await extract_html_string(Path(input_file))
+    return ExtractionResult(content=safe_decode(await input_file.read_bytes()), mime_type=mime_type, metadata={})
+async def batch_extract_file(
+    file_paths: Sequence[PathLike[str] | str],
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> list[ExtractionResult]:
+    """Extract text from multiple files concurrently.
+    Args:
+        file_paths: A sequence of paths to files to extract text from.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        A list of extraction results in the same order as the input paths.
+    """
+    results = cast(list[ExtractionResult], ([None] * len(file_paths)))
+    async def _extract_file(path: PathLike[str] | str, index: int) -> None:
+        result = await extract_file(
+            path,
+            force_ocr=force_ocr,
+            max_processes=max_processes,
+            psm=psm,
+            language=language,
+        )
+        results[index] = result
+    async with anyio.create_task_group() as tg:
+        for i, path in enumerate(file_paths):
+            tg.start_soon(_extract_file, path, i)
+    return results
+async def batch_extract_bytes(
+    contents: Sequence[tuple[bytes, str]],
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> list[ExtractionResult]:
+    """Extract text from multiple byte contents concurrently.
+    Args:
+        contents: A sequence of tuples containing (content, mime_type) pairs.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        A list of extraction results in the same order as the input contents.
+    """
+    results = cast(list[ExtractionResult], [None] * len(contents))
+    async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
+        result = await extract_bytes(
+            content,
+            mime_type,
+            force_ocr=force_ocr,
+            max_processes=max_processes,
+            psm=psm,
+            language=language,
+        )
+        results[index] = result
+    async with anyio.create_task_group() as tg:
+        for i, (content, mime_type) in enumerate(contents):
+            tg.start_soon(_extract_bytes, content, mime_type, i)
+    return results
+### Sync proxies
-    return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
+def extract_bytes_sync(
+    content: bytes,
+    mime_type: str,
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> ExtractionResult:
+    """Synchronous version of extract_bytes.
+    Args:
+        content: The content to extract.
+        mime_type: The mime type of the content.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        The extracted content and the mime type of the content.
+    """
+    handler = partial(
+        extract_bytes, content, mime_type, max_processes=max_processes, force_ocr=force_ocr, language=language, psm=psm
+    )
+    return anyio.run(handler)
+def extract_file_sync(
+    file_path: Path | str,
+    mime_type: str | None = None,
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> ExtractionResult:
+    """Synchronous version of extract_file.
+    Args:
+        file_path: The path to the file.
+        mime_type: The mime type of the content.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        The extracted content and the mime type of the content.
+    """
+    handler = partial(
+        extract_file, file_path, mime_type, max_processes=max_processes, force_ocr=force_ocr, language=language, psm=psm
+    )
+    return anyio.run(handler)
+def batch_extract_file_sync(
+    file_paths: Sequence[PathLike[str] | str],
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> list[ExtractionResult]:
+    """Synchronous version of batch_extract_file.
+    Args:
+        file_paths: A sequence of paths to files to extract text from.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        A list of extraction results in the same order as the input paths.
+    """
+    handler = partial(
+        batch_extract_file,
+        file_paths,
+        force_ocr=force_ocr,
+        max_processes=max_processes,
+        language=language,
+        psm=psm,
+    )
+    return anyio.run(handler)
+def batch_extract_bytes_sync(
+    contents: Sequence[tuple[bytes, str]],
+    *,
+    force_ocr: bool = False,
+    language: SupportedLanguage = "eng",
+    max_processes: int = DEFAULT_MAX_PROCESSES,
+    psm: PSMMode = PSMMode.AUTO,
+) -> list[ExtractionResult]:
+    """Synchronous version of batch_extract_bytes.
+    Args:
+        contents: A sequence of tuples containing (content, mime_type) pairs.
+        force_ocr: Whether to force OCR on PDF files that have a text layer.
+        language: The language code for OCR. Defaults to "eng".
+        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
+        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
+    Returns:
+        A list of extraction results in the same order as the input contents.
+    """
+    handler = partial(
+        batch_extract_bytes,
+        contents,
+        force_ocr=force_ocr,
+        max_processes=max_processes,
+        language=language,
+        psm=psm,
+    )
+    return anyio.run(handler)

kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl