PyPI - kreuzberg - Versions diffs - 3.0.1__tar.gz → 3.1.0__tar.gz - Mend

kreuzberg 3.0.1.tar.gz → 3.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.0.1
+Version: 3.1.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
-Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
+Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
@@ -27,7 +27,7 @@ License-File: LICENSE
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
-Requires-Dist: html-to-markdown>=1.2.0
+Requires-Dist: html-to-markdown>=1.2.1
 Requires-Dist: playa-pdf>=0.4.1
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.1
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
 Provides-Extra: all
 Requires-Dist: easyocr>=1.7.2; extra == "all"
-Requires-Dist: numpy>=2.0.2; extra == "all"
+Requires-Dist: gmft>=0.4.1; extra == "all"
 Requires-Dist: paddleocr>=2.10.0; extra == "all"
-Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
-Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
+Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
+Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
 Requires-Dist: setuptools>=76.0.0; extra == "all"
 Provides-Extra: chunking
-Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
+Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
+Provides-Extra: gmft
+Requires-Dist: gmft>=0.4.1; extra == "gmft"
 Provides-Extra: paddleocr
-Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
 Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
-Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
+Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
 Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
 Dynamic: license-file
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Resource Efficient**: Lightweight processing without GPU requirements
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
+- **Metadata Extraction**: Get document metadata alongside text content
+- **Table Extraction**: Extract tables from documents using the excellent GMFT library
 - **Modern Python**: Built with async/await, type hints, and a functional-first approach
 - **Permissive OSS**: MIT licensed with permissively licensed dependencies
@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
 ### Local Development
 1. Clone the repo
 1. Install the system dependencies
 1. Install the full dependencies with `uv sync`
-1. Install the pre-commit hooks with:
-    ```shell
-    pre-commit install && pre-commit install --hook-type commit-msg
-    ```
+1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
 1. Make your changes and submit a PR
 ## License

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/README.md RENAMED Viewed

@@ -13,6 +13,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Resource Efficient**: Lightweight processing without GPU requirements
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
+- **Metadata Extraction**: Get document metadata alongside text content
+- **Table Extraction**: Extract tables from documents using the excellent GMFT library
 - **Modern Python**: Built with async/await, type hints, and a functional-first approach
 - **Permissive OSS**: MIT licensed with permissively licensed dependencies
@@ -107,17 +109,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
 ### Local Development
 1. Clone the repo
 1. Install the system dependencies
 1. Install the full dependencies with `uv sync`
-1. Install the pre-commit hooks with:
-    ```shell
-    pre-commit install && pre-commit install --hook-type commit-msg
-    ```
+1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
 1. Make your changes and submit a PR
 ## License

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/__init__.py RENAMED Viewed

@@ -1,10 +1,11 @@
+from kreuzberg._gmft import GMFTConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata
+from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -22,6 +23,7 @@ __all__ = [
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
+    "GMFTConfig",
     "KreuzbergError",
     "Metadata",
     "MissingDependencyError",
@@ -29,6 +31,7 @@ __all__ = [
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "TableData",
     "TesseractConfig",
     "ValidationError",
     "batch_extract_bytes",

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_pdf.py RENAMED Viewed

@@ -45,20 +45,28 @@ class PDFExtractor(Extractor):
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content_bytes = await AsyncPath(path).read_bytes()
-        metadata = await extract_pdf_metadata(content_bytes)
+        result: ExtractionResult | None = None
         if not self.config.force_ocr:
             content = await self._extract_pdf_searchable_text(path)
             if self._validate_extracted_text(content):
-                return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])
+                result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
-        if self.config.ocr_backend is not None:
+        if not result and self.config.ocr_backend is not None:
             result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
-            result.metadata = metadata
-            return result
+        if not result:
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+        result.metadata = await extract_pdf_metadata(content_bytes)
+        if self.config.extract_tables:
+            from kreuzberg._gmft import extract_tables
+            result.tables = await extract_tables(path, self.config.gmft_config)
-        return ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])
+        return result
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         return anyio.run(self.extract_bytes_async, content)

kreuzberg-3.1.0/kreuzberg/_gmft.py ADDED Viewed

@@ -0,0 +1,174 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal
+from kreuzberg._types import TableData
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError
+if TYPE_CHECKING:
+    from os import PathLike
+    from gmft.detectors.base import CroppedTable
+    from pandas import DataFrame
+@dataclass(unsafe_hash=True)
+class GMFTConfig:
+    """Configuration options for GMFT.
+    This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
+    """
+    verbosity: int = 0
+    """
+    Verbosity level for logging.
+    0: errors only
+    1: print warnings
+    2: print warnings and info
+    3: print warnings, info, and debug
+    """
+    formatter_base_threshold: float = 0.3
+    """
+    Base threshold for the confidence demanded of a table feature (row/column).
+    Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
+    """
+    cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
+        default_factory=lambda: {
+            0: 0.3,
+            1: 0.3,
+            2: 0.3,
+            3: 0.3,
+            4: 0.5,
+            5: 0.5,
+            6: 99,
+        },
+        hash=False,
+    )
+    """
+    Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
+    But low confidences may be better than too high confidence (see formatter_base_threshold)
+    """
+    detector_base_threshold: float = 0.9
+    """Minimum confidence score required for a table"""
+    remove_null_rows: bool = True
+    """
+    Flag to remove rows with no text.
+    """
+    enable_multi_header: bool = False
+    """
+    Enable multi-indices in the dataframe.
+    If false, then multiple headers will be merged column-wise.
+    """
+    semantic_spanning_cells: bool = False
+    """
+    [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
+    """
+    semantic_hierarchical_left_fill: str | None = "algorithm"
+    """
+    [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
+    Possible values: 'algorithm', 'deep', None.
+    'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
+    'deep': merges headers according to the spanning cells detected by the Table Transformer.
+    None: headers are not duplicated.
+    """
+    large_table_if_n_rows_removed: int = 8
+    """
+    If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
+    """
+    large_table_threshold: int = 10
+    """
+    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
+    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
+    """
+    large_table_row_overlap_threshold: float = 0.2
+    """
+    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
+    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
+    """
+    large_table_maximum_rows: int = 1000
+    """
+    Maximum number of rows allowed for a large table.
+    """
+    force_large_table_assumption: bool | None = None
+    """
+    Force the large table assumption to be applied, regardless of the number of rows and overlap.
+    """
+async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
+    """Extracts tables from a PDF file.
+    This function takes a file path to a PDF file, and an optional configuration object.
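+    It returns a list of TableData dictionaries, one per detected table, each holding the cropped table image, the page number, the table rendered as markdown text, and the table as a pandas DataFrame.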
+    Args:
+        file_path: The path to the PDF file.
+        config: An optional configuration object.
+    Raises:
+        MissingDependencyError: Raised when the required dependencies are not installed.
+    Returns:
+        A list of table data dictionaries.
+    """
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter
+        from gmft.detectors.tatr import TATRDetectorConfig
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        config = config or GMFTConfig()
+        formatter = AutoTableFormatter(
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))
+        doc = await run_sync(PyPDFium2Document, str(file_path))
+        cropped_tables: list[CroppedTable] = []
+        dataframes: list[DataFrame] = []
+        try:
+            for page in doc:
+                cropped_tables.extend(await run_sync(detector.extract, page))
+            for cropped_table in cropped_tables:
+                formatted_table = await run_sync(formatter.extract, cropped_table)
+                dataframes.append(await run_sync(formatted_table.df))
+            return [
+                TableData(
+                    cropped_image=cropped_table.image(),
+                    page_number=cropped_table.page.page_number,
+                    text=data_frame.to_markdown(),
+                    df=data_frame,
+                )
+                for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            ]
+        finally:
+            await run_sync(doc.close)
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="gmft", functionality="table extraction", package_name="gmft"
+        ) from e

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_ocr/_paddleocr.py RENAMED Viewed

@@ -1,7 +1,6 @@
 from __future__ import annotations
 import platform
-import sys
 from dataclasses import dataclass
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -233,17 +232,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             MissingDependencyError: If PaddleOCR is not installed.
             OCRError: If initialization fails.
-            ValidationError: If the python version is too high.
         """
         if cls._paddle_ocr is not None:
             return
-        if sys.version_info >= (3, 13):  # pragma: no cover
-            raise ValidationError(
-                "PaddleOCR is only available in python 3.12 and below. Please downgrade your Python or switch to a different OCR backend.",
-                context={"issue": "https://github.com/PaddlePaddle/Paddle/issues/71616"},
-            )
         try:
             from paddleocr import PaddleOCR
         except ImportError as e:

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_types.py RENAMED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import sys
 from collections.abc import Awaitable
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
@@ -14,6 +14,10 @@ else:  # pragma: no cover
     from typing import NotRequired
 if TYPE_CHECKING:
+    from pandas import DataFrame
+    from PIL.Image import Image
+    from kreuzberg._gmft import GMFTConfig
     from kreuzberg._ocr._easyocr import EasyOCRConfig
     from kreuzberg._ocr._paddleocr import PaddleOCRConfig
     from kreuzberg._ocr._tesseract import TesseractConfig
@@ -21,6 +25,19 @@ if TYPE_CHECKING:
 OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
+class TableData(TypedDict):
+    """Table data, returned from table extraction."""
+    cropped_image: Image
+    """The cropped image of the table."""
+    df: DataFrame
+    """The table data as a pandas DataFrame."""
+    page_number: int
+    """The page number of the table."""
+    text: str
+    """The table text as a markdown string."""
 class Metadata(TypedDict, total=False):
     """Base metadata common to all document types.
@@ -88,12 +105,14 @@ class ExtractionResult:
     content: str
     """The extracted content."""
-    chunks: list[str]
-    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
     mime_type: str
     """The mime type of the extracted content. Is either text/plain or text/markdown."""
     metadata: Metadata
     """The metadata of the content."""
+    tables: list[TableData] = field(default_factory=list)
+    """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
+    chunks: list[str] = field(default_factory=list)
+    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
 PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
@@ -114,14 +133,22 @@ class ExtractionConfig:
     """Whether to force OCR."""
     chunk_content: bool = False
     """Whether to chunk the content into smaller chunks."""
+    extract_tables: bool = False
+    """Whether to extract tables from the content. This requires the 'gmft' dependency."""
     max_chars: int = DEFAULT_MAX_CHARACTERS
     """The size of each chunk in characters."""
     max_overlap: int = DEFAULT_MAX_OVERLAP
     """The overlap between chunks in characters."""
     ocr_backend: OcrBackendType | None = "tesseract"
-    """The OCR backend to use."""
+    """The OCR backend to use.
+    Notes:
+        - If set to 'None', OCR will not be performed.
+    """
     ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
     """Configuration to pass to the OCR backend."""
+    gmft_config: GMFTConfig | None = None
+    """GMFT configuration."""
     post_processing_hooks: list[PostProcessingHook] | None = None
     """Post processing hooks to call after processing is done and before the final result is returned."""
     validators: list[ValidationHook] | None = None

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.0.1
+Version: 3.1.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
-Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
+Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
@@ -27,7 +27,7 @@ License-File: LICENSE
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
-Requires-Dist: html-to-markdown>=1.2.0
+Requires-Dist: html-to-markdown>=1.2.1
 Requires-Dist: playa-pdf>=0.4.1
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.1
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
 Provides-Extra: all
 Requires-Dist: easyocr>=1.7.2; extra == "all"
-Requires-Dist: numpy>=2.0.2; extra == "all"
+Requires-Dist: gmft>=0.4.1; extra == "all"
 Requires-Dist: paddleocr>=2.10.0; extra == "all"
-Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
-Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
+Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
+Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
 Requires-Dist: setuptools>=76.0.0; extra == "all"
 Provides-Extra: chunking
-Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
+Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
+Provides-Extra: gmft
+Requires-Dist: gmft>=0.4.1; extra == "gmft"
 Provides-Extra: paddleocr
-Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
 Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
-Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
+Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
 Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
 Dynamic: license-file
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Resource Efficient**: Lightweight processing without GPU requirements
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
+- **Metadata Extraction**: Get document metadata alongside text content
+- **Table Extraction**: Extract tables from documents using the excellent GMFT library
 - **Modern Python**: Built with async/await, type hints, and a functional-first approach
 - **Permissive OSS**: MIT licensed with permissively licensed dependencies
@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
 ### Local Development
 1. Clone the repo
 1. Install the system dependencies
 1. Install the full dependencies with `uv sync`
-1. Install the pre-commit hooks with:
-    ```shell
-    pre-commit install && pre-commit install --hook-type commit-msg
-    ```
+1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
 1. Make your changes and submit a PR
 ## License

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/SOURCES.txt RENAMED Viewed

@@ -4,6 +4,7 @@ pyproject.toml
 kreuzberg/__init__.py
 kreuzberg/_chunker.py
 kreuzberg/_constants.py
+kreuzberg/_gmft.py
 kreuzberg/_mime_types.py
 kreuzberg/_playa.py
 kreuzberg/_registry.py

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/requires.txt RENAMED Viewed

@@ -1,6 +1,6 @@
 anyio>=4.9.0
 charset-normalizer>=3.4.1
-html-to-markdown>=1.2.0
+html-to-markdown>=1.2.1
 playa-pdf>=0.4.1
 pypdfium2==4.30.0
 python-calamine>=0.3.1
@@ -14,24 +14,22 @@ typing-extensions>=4.12.2
 [all]
 easyocr>=1.7.2
-numpy>=2.0.2
+gmft>=0.4.1
 paddleocr>=2.10.0
-semantic-text-splitter>=0.24.1
+paddlepaddle>=3.0.0
+semantic-text-splitter>=0.25.1
 setuptools>=76.0.0
-[all:python_version < "3.13"]
-paddlepaddle>=2.6.2
 [chunking]
-semantic-text-splitter>=0.24.1
+semantic-text-splitter>=0.25.1
 [easyocr]
 easyocr>=1.7.2
+[gmft]
+gmft>=0.4.1
 [paddleocr]
-numpy>=2.0.2
 paddleocr>=2.10.0
+paddlepaddle>=3.0.0
 setuptools>=76.0.0
-[paddleocr:python_version < "3.13"]
-paddlepaddle>=2.6.2

{kreuzberg-3.0.1 → kreuzberg-3.1.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "kreuzberg"
-version = "3.0.1"
+version = "3.1.0"
 description = "A text extraction library supporting PDFs, images, office documents and more"
 readme = "README.md"
 keywords = [
@@ -10,6 +10,7 @@ keywords = [
   "pandoc",
   "pdf-extraction",
   "rag",
+  "table-extraction",
   "tesseract",
   "text-extraction",
   "text-processing",
@@ -39,7 +40,7 @@ dependencies = [
   "anyio>=4.9.0",
   "charset-normalizer>=3.4.1",
   "exceptiongroup>=1.2.2; python_version<'3.11'",
-  "html-to-markdown>=1.2.0",
+  "html-to-markdown>=1.2.1",
   "playa-pdf>=0.4.1",
   "pypdfium2==4.30.0",                                # pinned due to bug in 4.30.1, until v5 is stable
   "python-calamine>=0.3.1",
@@ -50,24 +51,27 @@ dependencies = [
 optional-dependencies.all = [
   # easyocr
   "easyocr>=1.7.2",
+  # gmft
+  "gmft>=0.4.1",
   # paddle
-  "numpy>=2.0.2",
   "paddleocr>=2.10.0",
-  "paddlepaddle>=2.6.2; python_version<'3.13'",
+  "paddlepaddle>=3.0.0",
   # chunking
-  "semantic-text-splitter>=0.24.1",
+  "semantic-text-splitter>=0.25.1",
   "setuptools>=76.0.0",
 ]
 optional-dependencies.chunking = [
-  "semantic-text-splitter>=0.24.1",
+  "semantic-text-splitter>=0.25.1",
 ]
 optional-dependencies.easyocr = [
   "easyocr>=1.7.2",
 ]
+optional-dependencies.gmft = [
+  "gmft>=0.4.1",
+]
 optional-dependencies.paddleocr = [
-  "numpy>=2.0.2",
   "paddleocr>=2.10.0",
-  "paddlepaddle>=2.6.2; python_version<'3.13'",
+  "paddlepaddle>=3.0.0",
   "setuptools>=76.0.0",
 ]
 urls.homepage = "https://github.com/Goldziher/kreuzberg"
@@ -83,6 +87,7 @@ dev = [
   "pytest-timeout>=2.3.1",
   "ruff>=0.11.2",
   "trio>=0.29.0",
+  "uv-bump",
 ]
 doc = [
   "mkdocs>=1.6.1",
@@ -121,6 +126,7 @@ lint.per-file-ignores."tests/**/*.*" = [
   "ARG001",
   "D",
   "N815",
+  "PD",
   "PGH003",
   "PLR0915",
   "PLR2004",
@@ -167,3 +173,6 @@ disable_error_code = 'import-untyped'
 implicit_reexport = false
 show_error_codes = true
 strict = true
+[tool.uv.sources]
+uv-bump = { git = "https://github.com/Goldziher/uv-bump" }