kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import contextlib
 from multiprocessing import cpu_count
+from pathlib import Path
 from re import Pattern
 from re import compile as compile_regex
 from typing import TYPE_CHECKING, ClassVar, cast
@@ -14,14 +16,13 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
-
     from PIL.Image import Image
 
 
@@ -69,10 +70,52 @@ class PDFExtractor(Extractor):
         return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
+        """Pure sync implementation of PDF extraction from bytes."""
+        import os
+        import tempfile
+
+        fd, temp_path = tempfile.mkstemp(suffix=".pdf")
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+
+            result = self.extract_path_sync(Path(temp_path))
+
+            from kreuzberg._playa import extract_pdf_metadata_sync
+
+            metadata = extract_pdf_metadata_sync(content)
+            result.metadata = metadata
+
+            return result
+        finally:
+            with contextlib.suppress(OSError):
+                Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-
+        """Pure sync implementation of PDF extraction from path."""
+        text = self._extract_pdf_searchable_text_sync(path)
+
+        if self.config.force_ocr or not self._validate_extracted_text(text):
+            text = self._extract_pdf_with_ocr_sync(path)
+
+        tables = []
+        if self.config.extract_tables:
+            try:
+                from kreuzberg._gmft import extract_tables_sync
+
+                tables = extract_tables_sync(path)
+            except ImportError:
+                pass
+
+        text = normalize_spaces(text)
+
+        return ExtractionResult(
+            content=text,
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata={},
+            tables=tables,
+            chunks=[],
+        )
 
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
         """Check if text extracted from PDF is valid or corrupted.
@@ -112,17 +155,45 @@ class PDFExtractor(Extractor):
         Returns:
             A list of Pillow Images.
         """
+        from kreuzberg._utils._errors import create_error_context, should_retry
+
         document: pypdfium2.PdfDocument | None = None
-
-
-
-
-
-
-
-
-
-
+        last_error = None
+
+        for attempt in range(3):  # Try up to 3 times  # ~keep
+            try:
+                with pypdfium_file_lock(input_file):
+                    document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+                    return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
+            except pypdfium2.PdfiumError as e:  # noqa: PERF203
+                last_error = e
+                if not should_retry(e, attempt + 1):
+                    raise ParsingError(
+                        "Could not convert PDF to images",
+                        context=create_error_context(
+                            operation="convert_pdf_to_images",
+                            file_path=input_file,
+                            error=e,
+                            attempt=attempt + 1,
+                        ),
+                    ) from e
+                # Wait before retry with exponential backoff  # ~keep
+                await anyio.sleep(0.5 * (attempt + 1))
+            finally:
+                if document:
+                    with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
+                        await run_sync(document.close)
+
+        # All retries failed  # ~keep
+        raise ParsingError(
+            "Could not convert PDF to images after retries",
+            context=create_error_context(
+                operation="convert_pdf_to_images",
+                file_path=input_file,
+                error=last_error,
+                attempts=3,
+            ),
+        ) from last_error
 
     async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
         """Extract text from a scanned PDF file using OCR.
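Note on the retry loop above: `should_retry` classifies the `PdfiumError`, and the delay `0.5 * (attempt + 1)` grows linearly (0.5s, then 1.0s) despite the "exponential backoff" comment. A minimal standalone sketch of the same pattern, with a hypothetical `open_document` callable standing in for `pypdfium2.PdfDocument`:

import anyio


async def open_with_retry(open_document, path: str, attempts: int = 3):
    """Retry a flaky native call, sleeping longer after each failure."""
    last_error: Exception | None = None
    for attempt in range(attempts):
        try:
            return open_document(path)
        except OSError as e:  # stand-in for pypdfium2.PdfiumError
            last_error = e
            if attempt + 1 == attempts:
                break
            await anyio.sleep(0.5 * (attempt + 1))  # 0.5s, then 1.0s, ...
    raise RuntimeError(f"failed after {attempts} attempts") from last_error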
@@ -157,15 +228,124 @@ class PDFExtractor(Extractor):
         Returns:
             The extracted text.
         """
+        from kreuzberg._utils._errors import create_error_context
+
         document: pypdfium2.PdfDocument | None = None
         try:
-
-
-
+            with pypdfium_file_lock(input_file):
+                document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+            text_parts = []
+            page_errors = []
+
+            for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
+                try:
+                    text_page = page.get_textpage()
+                    text_parts.append(text_page.get_text_bounded())
+                except Exception as e:  # noqa: PERF203, BLE001
+                    page_errors.append({"page": i + 1, "error": str(e)})
+                    text_parts.append(f"[Error extracting page {i + 1}]")
+
+            text = "\n".join(text_parts)
+
+            if page_errors and text_parts:
+                return normalize_spaces(text)
+            if not text_parts:
+                raise ParsingError(
+                    "Could not extract any text from PDF",
+                    context=create_error_context(
+                        operation="extract_pdf_searchable_text",
+                        file_path=input_file,
+                        page_errors=page_errors,
+                    ),
+                )
+
+            return normalize_spaces(text)
         except pypdfium2.PdfiumError as e:
             raise ParsingError(
-                "Could not extract text from PDF file",
+                "Could not extract text from PDF file",
+                context=create_error_context(
+                    operation="extract_pdf_searchable_text",
+                    file_path=input_file,
+                    error=e,
+                ),
             ) from e
         finally:
             if document:
-
+                with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
+                    await run_sync(document.close)
+
+    def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
+        """Extract searchable text from PDF using pypdfium2 (sync version)."""
+        pdf = None
+        try:
+            with pypdfium_file_lock(path):
+                pdf = pypdfium2.PdfDocument(str(path))
+                text_parts = []
+                for page in pdf:
+                    text_page = page.get_textpage()
+                    text = text_page.get_text_range()
+                    text_parts.append(text)
+                    text_page.close()
+                    page.close()
+                return "".join(text_parts)
+        except Exception as e:
+            raise ParsingError(f"Failed to extract PDF text: {e}") from e
+        finally:
+            if pdf:
+                with pypdfium_file_lock(path), contextlib.suppress(Exception):
+                    pdf.close()
+
+    def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
+        """Extract text from PDF using OCR (sync version)."""
+        pdf = None
+        try:
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+
+            images = []
+            with pypdfium_file_lock(path):
+                pdf = pypdfium2.PdfDocument(str(path))
+                for page in pdf:
+                    bitmap = page.render(scale=200 / 72)
+                    pil_image = bitmap.to_pil()
+                    images.append(pil_image)
+                    bitmap.close()
+                    page.close()
+
+            import os
+            import tempfile
+
+            image_paths = []
+            temp_files = []
+
+            try:
+                for i, img in enumerate(images):
+                    fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
+                    temp_files.append((fd, temp_path))
+                    img.save(temp_path, format="PNG")
+                    os.close(fd)
+                    image_paths.append(temp_path)
+
+                if self.config.ocr_backend == "tesseract":
+                    from kreuzberg._ocr._tesseract import TesseractConfig
+
+                    if isinstance(self.config.ocr_config, TesseractConfig):
+                        config = self.config.ocr_config
+                    else:
+                        config = TesseractConfig()
+                    results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
+                    text_parts = [r.content for r in results]
+                    return "\n\n".join(text_parts)
+
+                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+
+            finally:
+                for _, temp_path in temp_files:
+                    with contextlib.suppress(OSError):
+                        Path(temp_path).unlink()
+
+        except Exception as e:
+            raise ParsingError(f"Failed to OCR PDF: {e}") from e
+        finally:
+            if pdf:
+                with pypdfium_file_lock(path), contextlib.suppress(Exception):
+                    pdf.close()
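Taken together, these hunks give PDFExtractor a pure-sync code path: pypdfium2 text extraction with an OCR fallback, table extraction via the new extract_tables_sync, and bytes input routed through a tempfile.mkstemp round-trip, all serialized around pypdfium2 by pypdfium_file_lock. A usage sketch; the constructor arguments and the ExtractionConfig import are assumed from the surrounding package, not shown in this diff, and "scanned-report.pdf" is a placeholder:

from pathlib import Path

from kreuzberg import ExtractionConfig  # assumed export
from kreuzberg._extractors._pdf import PDFExtractor

extractor = PDFExtractor(mime_type="application/pdf", config=ExtractionConfig())

# Path input: searchable text first, OCR only if the text layer looks corrupted.
result = extractor.extract_path_sync(Path("scanned-report.pdf"))
print(result.mime_type, len(result.content))

# Bytes input: written to a temp .pdf, extracted, then unlinked.
result = extractor.extract_bytes_sync(Path("scanned-report.pdf").read_bytes())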
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -202,7 +202,7 @@ class PresentationExtractor(Extractor):
         ("keywords", "keywords"),
         ("modified_by", "last_modified_by"),
         ("modified_at", "modified"),
-        ("version", "revision"),
+        ("version", "revision"),
         ("subject", "subject"),
         ("title", "title"),
         ("version", "version"),
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
+import contextlib
 import csv
 import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
-from typing import TYPE_CHECKING, Any, Union
+from pathlib import Path
+from typing import Any, Union
 
-import anyio
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
 
@@ -18,9 +19,6 @@ from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
 
-if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
-
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
@@ -64,10 +62,37 @@ class SpreadSheetExtractor(Extractor):
         ) from e
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
+        """Pure sync implementation of extract_bytes."""
+        import os
+        import tempfile
+
+        fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
+
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+
+            return self.extract_path_sync(Path(temp_path))
+        finally:
+            with contextlib.suppress(OSError):
+                Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-
+        """Pure sync implementation of extract_path."""
+        try:
+            workbook = CalamineWorkbook.from_path(str(path))
+            results = []
+
+            for sheet_name in workbook.sheet_names:
+                sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
+                results.append(sheet_text)
+
+            return ExtractionResult(content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+        except Exception as e:
+            raise ParsingError(
+                "Failed to extract file data",
+                context={"file": str(path), "error": str(e)},
+            ) from e
 
     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
@@ -123,3 +148,36 @@ class SpreadSheetExtractor(Extractor):
 
         await unlink()
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Synchronous version of _convert_sheet_to_text."""
+        values = workbook.get_sheet_by_name(sheet_name).to_python()
+
+        csv_buffer = StringIO()
+        writer = csv.writer(csv_buffer)
+
+        for row in values:
+            writer.writerow([self._convert_cell_to_str(cell) for cell in row])
+
+        csv_data = csv_buffer.getvalue()
+        csv_buffer.close()
+
+        csv_reader = csv.reader(StringIO(csv_data))
+        rows = list(csv_reader)
+        result = ""
+
+        if rows:
+            header = rows[0]
+            markdown_lines: list[str] = [
+                "| " + " | ".join(header) + " |",
+                "| " + " | ".join(["---" for _ in header]) + " |",
+            ]
+
+            for row in rows[1:]:  # type: ignore[assignment]
+                while len(row) < len(header):
+                    row.append("")
+                markdown_lines.append("| " + " | ".join(row) + " |")  # type: ignore[arg-type]
+
+            result = "\n".join(markdown_lines)
+
+        return f"## {sheet_name}\n\n{normalize_spaces(result)}"
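The sync sheet converter above goes sheet → CSV → markdown pipe table, padding ragged rows out to the header width. The core transform, reduced to a standalone function for illustration:

def rows_to_markdown(sheet_name: str, rows: list[list[str]]) -> str:
    """Render parsed rows as '## <sheet>' plus a pipe table, padding short rows."""
    if not rows:
        return f"## {sheet_name}\n\n"
    header = rows[0]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join("---" for _ in header) + " |",
    ]
    for row in rows[1:]:
        padded = row + [""] * (len(header) - len(row))  # pad without mutating input
        lines.append("| " + " | ".join(padded) + " |")
    return f"## {sheet_name}\n\n" + "\n".join(lines)


print(rows_to_markdown("Sheet1", [["name", "qty"], ["apples", "3"], ["pears"]]))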
kreuzberg/_gmft.py
CHANGED
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
+import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
@@ -69,7 +70,7 @@ class GMFTConfig:
     """
     [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
     """
-    semantic_hierarchical_left_fill:
+    semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
     """
     [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
 
@@ -103,9 +104,31 @@ class GMFTConfig:
     """
     Force the large table assumption to be applied, regardless of the number of rows and overlap.
     """
+    total_overlap_reject_threshold: float = 0.9
+    """
+    Reject if total overlap is > 90% of table area.
+    """
+    total_overlap_warn_threshold: float = 0.1
+    """
+    Warn if total overlap is > 10% of table area.
+    """
+    nms_warn_threshold: int = 5
+    """
+    Warn if non maxima suppression removes > 5 rows.
+    """
+    iob_reject_threshold: float = 0.05
+    """
+    Reject if iob between textbox and cell is < 5%.
+    """
+    iob_warn_threshold: float = 0.5
+    """
+    Warn if iob between textbox and cell is < 50%.
+    """
 
 
-async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
+async def extract_tables(  # noqa: PLR0915
+    file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
+) -> list[TableData]:
     """Extracts tables from a PDF file.
 
     This function takes a file path to a PDF file, and an optional configuration object.
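The five new thresholds are plain dataclass fields on GMFTConfig, so overriding one is just a constructor argument. A sketch; the values here are illustrative, not recommendations:

from kreuzberg._gmft import GMFTConfig

# Loosen textbox/cell IOB rejection from 5% to 2% and warn earlier on overlap.
config = GMFTConfig(iob_reject_threshold=0.02, total_overlap_warn_threshold=0.05)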
@@ -114,6 +137,8 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
     Args:
         file_path: The path to the PDF file.
         config: An optional configuration object.
+        use_isolated_process: Whether to use an isolated process for extraction.
+            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
 
     Raises:
         MissingDependencyError: Raised when the required dependencies are not installed.
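The use_isolated_process=None default defers to the environment; both the async and sync functions below repeat the same resolution, which amounts to:

import os


def resolve_isolated(flag: bool | None) -> bool:
    """Explicit argument wins; otherwise KREUZBERG_GMFT_ISOLATED decides (default: isolated)."""
    if flag is not None:
        return flag
    return os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")


assert resolve_isolated(False) is False      # caller override beats the environment
os.environ["KREUZBERG_GMFT_ISOLATED"] = "0"  # anything outside true/1/yes disables isolation
assert resolve_isolated(None) is False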
@@ -121,14 +146,189 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
     Returns:
         A list of table data dictionaries.
     """
+    from pathlib import Path
+
+    from kreuzberg._utils._cache import get_table_cache
+
+    # Determine if we should use isolated process  # ~keep
+    if use_isolated_process is None:
+        use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
+
+    path = Path(file_path)
+    try:
+        stat = path.stat()
+        file_info = {
+            "path": str(path.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        file_info = {
+            "path": str(path),
+            "size": 0,
+            "mtime": 0,
+        }
+
+    config = config or GMFTConfig()
+    cache_kwargs = {
+        "file_info": str(sorted(file_info.items())),
+        "extractor": "gmft",
+        "config": str(sorted(config.__dict__.items())),
+    }
+
+    table_cache = get_table_cache()
+    cached_result = await table_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result  # type: ignore[no-any-return]
+
+    if table_cache.is_processing(**cache_kwargs):
+        import anyio
+
+        event = table_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        # Try cache again after waiting for other process to complete  # ~keep
+        cached_result = await table_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result  # type: ignore[no-any-return]
+
+    table_cache.mark_processing(**cache_kwargs)
+
+    try:
+        if use_isolated_process:
+            from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
+
+            result = await extract_tables_isolated_async(file_path, config)
+
+            await table_cache.aset(result, **cache_kwargs)
+
+            return result
+
+        try:
+            from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+            from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+            from gmft.formatters.tatr import TATRFormatConfig
+            from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+            formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+                config=TATRFormatConfig(
+                    verbosity=config.verbosity,
+                    formatter_base_threshold=config.formatter_base_threshold,
+                    cell_required_confidence=config.cell_required_confidence,
+                    remove_null_rows=config.remove_null_rows,
+                    enable_multi_header=config.enable_multi_header,
+                    semantic_spanning_cells=config.semantic_spanning_cells,
+                    semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                    large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                    large_table_threshold=config.large_table_threshold,
+                    large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                    large_table_maximum_rows=config.large_table_maximum_rows,
+                    force_large_table_assumption=config.force_large_table_assumption,
+                )
+            )
+            detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+                config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+            )
+            doc = await run_sync(PyPDFium2Document, str(file_path))
+            cropped_tables: list[CroppedTable] = []
+            dataframes: list[DataFrame] = []
+            try:
+                for page in doc:
+                    cropped_tables.extend(await run_sync(detector.extract, page))
+
+                for cropped_table in cropped_tables:
+                    formatted_table = await run_sync(formatter.extract, cropped_table)
+                    dataframes.append(await run_sync(formatted_table.df))
+
+                result = [
+                    TableData(
+                        cropped_image=cropped_table.image(),
+                        page_number=cropped_table.page.page_number,
+                        text=data_frame.to_markdown(),
+                        df=data_frame,
+                    )
+                    for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                ]
+
+                await table_cache.aset(result, **cache_kwargs)
+
+                return result
+            finally:
+                await run_sync(doc.close)
+
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="gmft", functionality="table extraction", package_name="gmft"
+            ) from e
+    finally:
+        table_cache.mark_complete(**cache_kwargs)
+
+
+def extract_tables_sync(
+    file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
+) -> list[TableData]:
+    """Synchronous wrapper for extract_tables.
+
+    Args:
+        file_path: The path to the PDF file.
+        config: An optional configuration object.
+        use_isolated_process: Whether to use an isolated process for extraction.
+            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
+
+    Returns:
+        A list of table data dictionaries.
+    """
+    from pathlib import Path
+
+    from kreuzberg._utils._cache import get_table_cache
+
+    # Determine if we should use isolated process  # ~keep
+    if use_isolated_process is None:
+        use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
+
+    path = Path(file_path)
     try:
-
-
+        stat = path.stat()
+        file_info = {
+            "path": str(path.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        file_info = {
+            "path": str(path),
+            "size": 0,
+            "mtime": 0,
+        }
+
+    config = config or GMFTConfig()
+    cache_kwargs = {
+        "file_info": str(sorted(file_info.items())),
+        "extractor": "gmft",
+        "config": str(sorted(config.__dict__.items())),
+    }
+
+    table_cache = get_table_cache()
+    cached_result = table_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result  # type: ignore[no-any-return]
+
+    if use_isolated_process:
+        from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
+
+        result = extract_tables_isolated(file_path, config)
+
+        table_cache.set(result, **cache_kwargs)
+
+        return result
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
         from gmft.formatters.tatr import TATRFormatConfig
         from gmft.pdf_bindings.pdfium import PyPDFium2Document
 
-
-        formatter = AutoTableFormatter(
+        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
                 verbosity=config.verbosity,
                 formatter_base_threshold=config.formatter_base_threshold,
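Both the async and sync paths key the new table cache on file identity plus the full config, so a touched file (mtime) and a changed threshold each invalidate independently; mark_processing/is_processing additionally deduplicate concurrent extractions of the same file. A sketch of the key derivation in isolation (the real storage lives behind get_table_cache() in _utils/_cache.py, not shown in this diff):

from pathlib import Path


def table_cache_key(path: Path, config_fields: dict) -> tuple[str, str, str]:
    """File identity (resolved path, size, mtime) plus sorted config fields."""
    try:
        stat = path.stat()
        file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
    except OSError:  # unreadable file still gets a (weak) key
        file_info = {"path": str(path), "size": 0, "mtime": 0}
    return (str(sorted(file_info.items())), "gmft", str(sorted(config_fields.items())))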
@@ -144,19 +344,21 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
                     force_large_table_assumption=config.force_large_table_assumption,
                 )
             )
-        detector = AutoTableDetector(
-
-
-
+        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+            config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+        )
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables: list[Any] = []
+        dataframes: list[Any] = []
         try:
             for page in doc:
-                cropped_tables.extend(await run_sync(detector.extract, page))
+                cropped_tables.extend(detector.extract(page))
 
             for cropped_table in cropped_tables:
-                formatted_table = await run_sync(formatter.extract, cropped_table)
-                dataframes.append(await run_sync(formatted_table.df))
+                formatted_table = formatter.extract(cropped_table)
+                dataframes.append(formatted_table.df())
 
-            return [
+            result = [
                 TableData(
                     cropped_image=cropped_table.image(),
                     page_number=cropped_table.page.page_number,
@@ -165,8 +367,12 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
                 )
                 for data_frame, cropped_table in zip(dataframes, cropped_tables)
             ]
+
+            table_cache.set(result, **cache_kwargs)
+
+            return result
         finally:
-            await run_sync(doc.close)
+            doc.close()  # type: ignore[no-untyped-call]
 
     except ImportError as e:
         raise MissingDependencyError.create_for_package(