PyPI - kreuzberg - Versions diffs - 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl - Mend

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

kreuzberg/__init__.py +3 -0
kreuzberg/__main__.py +8 -0
kreuzberg/_cli_config.py +175 -0
kreuzberg/_extractors/_image.py +39 -4
kreuzberg/_extractors/_pandoc.py +158 -18
kreuzberg/_extractors/_pdf.py +199 -19
kreuzberg/_extractors/_presentation.py +1 -1
kreuzberg/_extractors/_spread_sheet.py +65 -7
kreuzberg/_gmft.py +222 -16
kreuzberg/_mime_types.py +62 -16
kreuzberg/_multiprocessing/__init__.py +6 -0
kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
kreuzberg/_multiprocessing/process_manager.py +188 -0
kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
kreuzberg/_ocr/_easyocr.py +66 -10
kreuzberg/_ocr/_paddleocr.py +86 -7
kreuzberg/_ocr/_tesseract.py +136 -46
kreuzberg/_playa.py +43 -0
kreuzberg/_utils/_cache.py +372 -0
kreuzberg/_utils/_device.py +356 -0
kreuzberg/_utils/_document_cache.py +220 -0
kreuzberg/_utils/_errors.py +232 -0
kreuzberg/_utils/_pdf_lock.py +72 -0
kreuzberg/_utils/_process_pool.py +100 -0
kreuzberg/_utils/_serialization.py +82 -0
kreuzberg/_utils/_string.py +1 -1
kreuzberg/_utils/_sync.py +21 -0
kreuzberg/cli.py +338 -0
kreuzberg/extraction.py +247 -36
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
kreuzberg-3.3.0.dist-info/RECORD +48 -0
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
kreuzberg-3.1.7.dist-info/RECORD +0 -33
kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/__init__.py CHANGED Viewed

@@ -18,6 +18,8 @@ from .extraction import (
     extract_file_sync,
 )
+__version__ = "3.2.0"
 __all__ = [
     "EasyOCRConfig",
     "ExtractionConfig",
@@ -34,6 +36,7 @@ __all__ = [
     "TableData",
     "TesseractConfig",
     "ValidationError",
+    "__version__",
     "batch_extract_bytes",
     "batch_extract_bytes_sync",
     "batch_extract_file",

kreuzberg/__main__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Entry point for running kreuzberg as a module with python -m kreuzberg."""
+from __future__ import annotations
+from kreuzberg.cli import cli
+if __name__ == "__main__":
+    cli()

kreuzberg/_cli_config.py ADDED Viewed

@@ -0,0 +1,175 @@
+"""Configuration parsing for the CLI."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    import tomli as tomllib  # type: ignore[import-not-found]
+from kreuzberg._gmft import GMFTConfig
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
+from kreuzberg._types import ExtractionConfig, OcrBackendType
+from kreuzberg.exceptions import ValidationError
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+def load_config_from_file(config_path: Path) -> dict[str, Any]:
+    """Load configuration from a TOML file.
+    Args:
+        config_path: Path to the configuration file.
+    Returns:
+        Dictionary containing the loaded configuration.
+    Raises:
+        ValidationError: If the file cannot be read or parsed.
+    """
+    try:
+        with config_path.open("rb") as f:
+            data = tomllib.load(f)
+    except FileNotFoundError as e:
+        raise ValidationError(f"Configuration file not found: {config_path}") from e
+    except tomllib.TOMLDecodeError as e:
+        raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
+    return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
+    """Merge two configuration dictionaries recursively.
+    Args:
+        base: Base configuration dictionary.
+        override: Configuration dictionary to override base values.
+    Returns:
+        Merged configuration dictionary.
+    """
+    result = base.copy()
+    for key, value in override.items():
+        if isinstance(value, dict) and key in result and isinstance(result[key], dict):
+            result[key] = merge_configs(result[key], value)
+        else:
+            result[key] = value
+    return result
+def parse_ocr_backend_config(
+    config_dict: dict[str, Any], backend: OcrBackendType
+) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
+    """Parse OCR backend-specific configuration.
+    Args:
+        config_dict: Configuration dictionary.
+        backend: The OCR backend type.
+    Returns:
+        Backend-specific configuration object or None.
+    """
+    if backend not in config_dict:
+        return None
+    backend_config = config_dict[backend]
+    if not isinstance(backend_config, dict):
+        return None
+    if backend == "tesseract":
+        return TesseractConfig(**backend_config)
+    if backend == "easyocr":
+        return EasyOCRConfig(**backend_config)
+    if backend == "paddleocr":
+        return PaddleOCRConfig(**backend_config)
+    return None
+def build_extraction_config(  # noqa: C901, PLR0912
+    file_config: dict[str, Any],
+    cli_args: MutableMapping[str, Any],
+) -> ExtractionConfig:
+    """Build ExtractionConfig from file config and CLI arguments.
+    Args:
+        file_config: Configuration loaded from file.
+        cli_args: CLI arguments.
+    Returns:
+        ExtractionConfig instance.
+    """
+    config_dict: dict[str, Any] = {}
+    if file_config:
+        for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
+            if field in file_config:
+                config_dict[field] = file_config[field]
+    for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
+        cli_key = field
+        if cli_key in cli_args and cli_args[cli_key] is not None:
+            config_dict[field] = cli_args[cli_key]
+    ocr_backend = config_dict.get("ocr_backend")
+    if ocr_backend and ocr_backend != "none":
+        ocr_config = None
+        if cli_args.get(f"{ocr_backend}_config"):
+            backend_args = cli_args[f"{ocr_backend}_config"]
+            if ocr_backend == "tesseract":
+                ocr_config = TesseractConfig(**backend_args)
+            elif ocr_backend == "easyocr":
+                ocr_config = EasyOCRConfig(**backend_args)  # type: ignore[assignment]
+            elif ocr_backend == "paddleocr":
+                ocr_config = PaddleOCRConfig(**backend_args)  # type: ignore[assignment]
+        if not ocr_config and file_config:
+            ocr_config = parse_ocr_backend_config(file_config, ocr_backend)  # type: ignore[assignment]
+        if ocr_config:
+            config_dict["ocr_config"] = ocr_config
+    if config_dict.get("extract_tables"):
+        gmft_config = None
+        if cli_args.get("gmft_config"):
+            gmft_config = GMFTConfig(**cli_args["gmft_config"])
+        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
+            gmft_config = GMFTConfig(**file_config["gmft"])
+        if gmft_config:
+            config_dict["gmft_config"] = gmft_config
+    if config_dict.get("ocr_backend") == "none":
+        config_dict["ocr_backend"] = None
+    return ExtractionConfig(**config_dict)
+def find_default_config() -> Path | None:
+    """Find the default configuration file (pyproject.toml).
+    Returns:
+        Path to the configuration file or None if not found.
+    """
+    current = Path.cwd()
+    while current != current.parent:
+        config_path = current / "pyproject.toml"
+        if config_path.exists():
+            try:
+                with config_path.open("rb") as f:
+                    data = tomllib.load(f)
+                if "tool" in data and "kreuzberg" in data["tool"]:
+                    return config_path
+            except Exception:  # noqa: BLE001
+                pass
+        current = current.parent
+    return None

kreuzberg/_extractors/_image.py CHANGED Viewed

@@ -2,7 +2,6 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, ClassVar
-import anyio
 from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
@@ -13,10 +12,12 @@ from kreuzberg.exceptions import ValidationError
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
-    from pathlib import Path
     from kreuzberg._types import ExtractionResult
+import contextlib
+from pathlib import Path
 class ImageExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -58,10 +59,44 @@ class ImageExtractor(Extractor):
         return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        return anyio.run(self.extract_bytes_async, content)
+        """Pure sync implementation of extract_bytes."""
+        import os
+        import tempfile
+        extension = self._get_extension_from_mime_type(self.mime_type)
+        fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+            return self.extract_path_sync(Path(temp_path))
+        finally:
+            with contextlib.suppress(OSError):
+                Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        return anyio.run(self.extract_path_async, path)
+        """Pure sync implementation of extract_path."""
+        if self.config.ocr_backend is None:
+            raise ValidationError("ocr_backend is None, cannot perform OCR")
+        from kreuzberg._ocr._tesseract import TesseractConfig
+        from kreuzberg._types import ExtractionResult
+        if self.config.ocr_backend == "tesseract":
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            if isinstance(self.config.ocr_config, TesseractConfig):
+                config = self.config.ocr_config
+            else:
+                config = TesseractConfig()
+            results = process_batch_images_sync_pure([str(path)], config)
+            if results:
+                return results[0]
+            return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
+        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
         if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:

kreuzberg/_extractors/_pandoc.py CHANGED Viewed

@@ -1,11 +1,12 @@
 from __future__ import annotations
+import contextlib
 import re
 import sys
 from json import JSONDecodeError, loads
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
-import anyio
 from anyio import Path as AsyncPath
 from anyio import run_process
@@ -21,7 +22,7 @@ from kreuzberg.exceptions import MissingDependencyError, ParsingError, Validatio
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
     from os import PathLike
-    from pathlib import Path
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
@@ -194,7 +195,7 @@ class PandocExtractor(Extractor):
             raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronous version of extract_bytes_async.
+        """Pure sync implementation of extract_bytes.
         Args:
             content: The content bytes to process.
@@ -202,18 +203,46 @@ class PandocExtractor(Extractor):
         Returns:
             ExtractionResult with the extracted text and metadata.
         """
-        return anyio.run(self.extract_bytes_async, content)
+        import os
+        import tempfile
+        from pathlib import Path
+        extension = self._get_pandoc_type_from_mime_type(self.mime_type)
+        fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+            return self.extract_path_sync(Path(temp_path))
+        finally:
+            with contextlib.suppress(OSError):
+                Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronous version of extract_path_async.
+        """Pure sync implementation of extract_path.
         Args:
             path: The path to the file to process.
         Returns:
             ExtractionResult with the extracted text and metadata.
+        Raises:
+            ParsingError: When file processing fails.
         """
-        return anyio.run(self.extract_path_async, path)
+        self._validate_pandoc_version_sync()
+        self._get_pandoc_type_from_mime_type(self.mime_type)
+        try:
+            metadata = self._extract_metadata_sync(path)
+            content = self._extract_file_sync(path)
+            return ExtractionResult(
+                content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE, chunks=[]
+            )
+        except Exception as e:
+            raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
     async def _validate_pandoc_version(self) -> None:
         """Validate that the installed Pandoc version meets the minimum requirement.
@@ -229,36 +258,26 @@ class PandocExtractor(Extractor):
             result = await run_process(command)
             stdout = result.stdout.decode()
-            # Try more inclusive patterns to detect the pandoc version
-            # Try common formats first
             version_match = re.search(
                 r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
             )
-            # Try version in parentheses format
             if not version_match:
                 version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
-            # Try hyphenated format
             if not version_match:
                 version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
-            # If still no match, check for version at the beginning of the output or any line
             if not version_match:
-                # Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
                 version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
-            # Try finding version-like patterns elsewhere in the text
             if not version_match:
-                # Search for version-like patterns at the beginning of lines or after spaces
                 version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
-            # As a last resort, check any sequence of digits that might be a version
             if not version_match:
                 out_lines = stdout.splitlines()
                 for line in out_lines:
                     for token in line.split():
-                        # Match standalone version patterns like 2.11 or 2.11.4
                         version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
                         if version_pattern:
                             version_match = version_pattern
@@ -266,12 +285,10 @@ class PandocExtractor(Extractor):
                     if version_match:
                         break
-            # If we found a version, check that the major version is at least the minimum required
             if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
                 self._checked_version = True
                 return
-            # If we get here, we either didn't find a version or it's too low
             raise MissingDependencyError(
                 "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
             )
@@ -560,6 +577,129 @@ class PandocExtractor(Extractor):
         return None
+    def _validate_pandoc_version_sync(self) -> None:
+        """Synchronous version of _validate_pandoc_version."""
+        import subprocess
+        try:
+            if self._checked_version:
+                return
+            result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False)  # noqa: S607
+            if result.returncode != 0:
+                raise MissingDependencyError(
+                    "Pandoc version 2 or above is a required system dependency. "
+                    "Please install it on your system and make sure its available in $PATH."
+                )
+            stdout = result.stdout
+            version_match = re.search(
+                r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
+            )
+            if not version_match:
+                version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
+            if not version_match:
+                version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
+            if not version_match:
+                version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
+            if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
+                self._checked_version = True
+                return
+            raise MissingDependencyError(
+                "Pandoc version 2 or above is a required system dependency. "
+                "Please install it on your system and make sure its available in $PATH."
+            )
+        except (subprocess.SubprocessError, FileNotFoundError) as e:
+            raise MissingDependencyError(
+                "Pandoc version 2 or above is a required system dependency. "
+                "Please install it on your system and make sure its available in $PATH."
+            ) from e
+    def _extract_metadata_sync(self, path: Path) -> Metadata:
+        """Synchronous version of _handle_extract_metadata."""
+        import os
+        import subprocess
+        import tempfile
+        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
+        fd, metadata_file = tempfile.mkstemp(suffix=".json")
+        os.close(fd)
+        try:
+            command = [
+                "pandoc",
+                str(path),
+                f"--from={pandoc_type}",
+                "--to=json",
+                "--standalone",
+                "--quiet",
+                "--output",
+                str(metadata_file),
+            ]
+            result = subprocess.run(command, capture_output=True, text=True, check=False)
+            if result.returncode != 0:
+                raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
+            with Path(metadata_file).open(encoding="utf-8") as f:
+                json_data = loads(f.read())
+            return self._extract_metadata(json_data)
+        except (OSError, JSONDecodeError) as e:
+            raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
+        finally:
+            with contextlib.suppress(OSError):
+                Path(metadata_file).unlink()
+    def _extract_file_sync(self, path: Path) -> str:
+        """Synchronous version of _handle_extract_file."""
+        import os
+        import subprocess
+        import tempfile
+        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
+        fd, output_path = tempfile.mkstemp(suffix=".md")
+        os.close(fd)
+        try:
+            command = [
+                "pandoc",
+                str(path),
+                f"--from={pandoc_type}",
+                "--to=markdown",
+                "--standalone",
+                "--wrap=preserve",
+                "--quiet",
+                "--output",
+                str(output_path),
+            ]
+            result = subprocess.run(command, capture_output=True, text=True, check=False)
+            if result.returncode != 0:
+                raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
+            with Path(output_path).open(encoding="utf-8") as f:
+                text = f.read()
+            return normalize_spaces(text)
+        except OSError as e:
+            raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
+        finally:
+            with contextlib.suppress(OSError):
+                Path(output_path).unlink()
 class MarkdownExtractor(PandocExtractor):
     """Extractor for Markdown-based document formats."""

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl