PyPI - kreuzberg - Versions diffs - 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl - Mend

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

kreuzberg/__init__.py +3 -0
kreuzberg/__main__.py +8 -0
kreuzberg/_cli_config.py +175 -0
kreuzberg/_extractors/_image.py +39 -4
kreuzberg/_extractors/_pandoc.py +158 -18
kreuzberg/_extractors/_pdf.py +199 -19
kreuzberg/_extractors/_presentation.py +1 -1
kreuzberg/_extractors/_spread_sheet.py +65 -7
kreuzberg/_gmft.py +222 -16
kreuzberg/_mime_types.py +62 -16
kreuzberg/_multiprocessing/__init__.py +6 -0
kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
kreuzberg/_multiprocessing/process_manager.py +188 -0
kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
kreuzberg/_ocr/_easyocr.py +66 -10
kreuzberg/_ocr/_paddleocr.py +86 -7
kreuzberg/_ocr/_tesseract.py +136 -46
kreuzberg/_playa.py +43 -0
kreuzberg/_utils/_cache.py +372 -0
kreuzberg/_utils/_device.py +356 -0
kreuzberg/_utils/_document_cache.py +220 -0
kreuzberg/_utils/_errors.py +232 -0
kreuzberg/_utils/_pdf_lock.py +72 -0
kreuzberg/_utils/_process_pool.py +100 -0
kreuzberg/_utils/_serialization.py +82 -0
kreuzberg/_utils/_string.py +1 -1
kreuzberg/_utils/_sync.py +21 -0
kreuzberg/cli.py +338 -0
kreuzberg/extraction.py +247 -36
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
kreuzberg-3.3.0.dist-info/RECORD +48 -0
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
kreuzberg-3.1.7.dist-info/RECORD +0 -33
kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
{kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import warnings
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -8,6 +9,7 @@ from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -55,7 +57,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "hr",
     "hu",
     "id",
-    "inh",  # codespell:ignore
+    "inh",
     "is",
     "it",
     "ja",
@@ -95,7 +97,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "sw",
     "ta",
     "tab",
-    "te",  # codespell:ignore
+    "te",
     "th",
     "tjk",
     "tl",
@@ -144,7 +146,13 @@ class EasyOCRConfig:
     text_threshold: float = 0.7
     """Text confidence threshold."""
     use_gpu: bool = False
-    """Whether to use GPU for inference."""
+    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
+    device: DeviceType = "auto"
+    """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
+    gpu_memory_limit: float | None = None
+    """Maximum GPU memory to use in GB. None for no limit."""
+    fallback_to_cpu: bool = True
+    """Whether to fallback to CPU if requested device is unavailable."""
     width_ths: float = 0.5
     """Maximum horizontal distance for merging boxes."""
     x_ths: float = 1.0
@@ -253,11 +261,12 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
                 content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
             )
+        # Group text boxes by lines based on Y coordinate  # ~keep
         sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
         line_groups: list[list[Any]] = []
         current_line: list[Any] = []
         prev_y_center: float | None = None
-        line_height_threshold = 20
+        line_height_threshold = 20  # Minimum distance to consider as new line  # ~keep
         for item in sorted_results:
             box, text, confidence = item
@@ -280,7 +289,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         confidence_count = 0
         for line in line_groups:
-            line_sorted = sorted(line, key=lambda x: x[0][0][0])
+            line_sorted = sorted(line, key=lambda x: x[0][0][0])  # Sort boxes by X coordinate within line  # ~keep
             for item in line_sorted:
                 _, text, confidence = item
@@ -336,8 +345,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             ) from e
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
-        has_gpu = cls._is_gpu_available()
-        kwargs.setdefault("gpu", has_gpu)
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type in ("cuda", "mps")
         kwargs.setdefault("detector", True)
         kwargs.setdefault("recognizer", True)
         kwargs.setdefault("download_enabled", True)
@@ -347,12 +358,59 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             cls._reader = await run_sync(
                 easyocr.Reader,
                 languages,
-                gpu=kwargs.get("use_gpu"),
+                gpu=use_gpu,
                 verbose=False,
             )
         except Exception as e:
             raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
+    @classmethod
+    def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
+        """Resolve device configuration with backward compatibility.
+        Args:
+            **kwargs: Configuration parameters including device settings.
+        Returns:
+            DeviceInfo object for the selected device.
+        Raises:
+            ValidationError: If requested device is not available and fallback is disabled.
+        """
+        use_gpu = kwargs.get("use_gpu", False)
+        device = kwargs.get("device", "auto")
+        memory_limit = kwargs.get("gpu_memory_limit")
+        fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
+        if use_gpu and device == "auto":
+            warnings.warn(
+                "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
+                "Use 'device=\"cuda\"' or 'device=\"auto\"' instead.",
+                DeprecationWarning,
+                stacklevel=4,
+            )
+            device = "auto" if use_gpu else "cpu"
+        elif use_gpu and device != "auto":
+            warnings.warn(
+                "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
+                "Using 'device' parameter value.",
+                DeprecationWarning,
+                stacklevel=4,
+            )
+        try:
+            return validate_device_request(
+                device,
+                "EasyOCR",
+                memory_limit=memory_limit,
+                fallback_to_cpu=fallback_to_cpu,
+            )
+        except ValidationError:
+            if not use_gpu and device == "cpu":
+                return DeviceInfo(device_type="cpu", name="CPU")
+            raise
     @staticmethod
     def _validate_language_code(language_codes: str | list[str]) -> list[str]:
         """Validate and normalize provided language codes.
@@ -367,10 +425,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             A list with the normalized language codes.
         """
         if isinstance(language_codes, str):
-            # Handle comma-separated language codes
             languages = [lang.strip().lower() for lang in language_codes.split(",")]
         else:
-            # Handle list of language codes
             languages = [lang.lower() for lang in language_codes]
         unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import platform
+import warnings
 from dataclasses import dataclass
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
@@ -10,6 +11,7 @@ from PIL import Image
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -91,7 +93,13 @@ class PaddleOCRConfig:
     use_angle_cls: bool = True
     """Whether to use text orientation classification model."""
     use_gpu: bool = False
-    """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package"""
+    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
+    device: DeviceType = "auto"
+    """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
+    gpu_memory_limit: float | None = None
+    """Maximum GPU memory to use in GB. None for no limit."""
+    fallback_to_cpu: bool = True
+    """Whether to fallback to CPU if requested device is unavailable."""
     use_space_char: bool = True
     """Whether to recognize spaces."""
     use_zero_copy_run: bool = False
@@ -117,6 +125,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         import numpy as np
         await self._init_paddle_ocr(**kwargs)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
         image_np = np.array(image)
         try:
             result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
@@ -145,7 +157,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
     @staticmethod
-    def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+    def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
         """Process PaddleOCR result into an ExtractionResult with metadata.
         Args:
@@ -163,6 +175,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             if not page_result:
                 continue
+            # Group text boxes by lines based on Y coordinate  # ~keep
             sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
             line_groups: list[list[Any]] = []
             current_line: list[Any] = []
@@ -171,7 +184,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             for box in sorted_boxes:
                 box_points, (_, _) = box
                 current_y = sum(point[1] for point in box_points) / 4
-                min_box_distance = 20
+                min_box_distance = 20  # Minimum distance to consider as new line  # ~keep
                 if prev_y is None or abs(current_y - prev_y) > min_box_distance:
                     if current_line:
@@ -186,7 +199,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 line_groups.append(current_line)
             for line in line_groups:
-                line_sorted = sorted(line, key=lambda x: x[0][0][0])
+                line_sorted = sorted(line, key=lambda x: x[0][0][0])  # Sort boxes by X coordinate within line  # ~keep
                 for box in line_sorted:
                     _, (text, confidence) = box
@@ -197,7 +210,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 text_content += "\n"
-        width, height = image.size
+        if hasattr(image, "width") and hasattr(image, "height"):
+            width = image.width
+            height = image.height
+        else:
+            width, height = image.size
         metadata = Metadata(
             width=width,
             height=height,
@@ -248,19 +265,81 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             ) from e
         language = cls._validate_language_code(kwargs.pop("language", "en"))
+        device_info = cls._resolve_device_config(**kwargs)
+        use_gpu = device_info.device_type == "cuda"
         has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
         kwargs.setdefault("use_angle_cls", True)
-        kwargs.setdefault("use_gpu", has_gpu_package)
-        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
+        kwargs["use_gpu"] = use_gpu and has_gpu_package
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
         kwargs.setdefault("det_db_thresh", 0.3)
         kwargs.setdefault("det_db_box_thresh", 0.5)
         kwargs.setdefault("det_db_unclip_ratio", 1.6)
+        if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
+            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
         try:
             cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
         except Exception as e:
             raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
+    @classmethod
+    def _resolve_device_config(cls, **kwargs: Unpack[PaddleOCRConfig]) -> DeviceInfo:
+        """Resolve device configuration with backward compatibility.
+        Args:
+            **kwargs: Configuration parameters including device settings.
+        Returns:
+            DeviceInfo object for the selected device.
+        Raises:
+            ValidationError: If requested device is not available and fallback is disabled.
+        """
+        use_gpu = kwargs.get("use_gpu", False)
+        device = kwargs.get("device", "auto")
+        memory_limit = kwargs.get("gpu_memory_limit")
+        fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
+        if use_gpu and device == "auto":
+            warnings.warn(
+                "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
+                "Use 'device=\"cuda\"' or 'device=\"auto\"' instead.",
+                DeprecationWarning,
+                stacklevel=4,
+            )
+            device = "auto" if use_gpu else "cpu"
+        elif use_gpu and device != "auto":
+            warnings.warn(
+                "Both 'use_gpu' and 'device' parameters specified. The 'use_gpu' parameter is deprecated. "
+                "Using 'device' parameter value.",
+                DeprecationWarning,
+                stacklevel=4,
+            )
+        if device == "mps":
+            warnings.warn(
+                "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
+                UserWarning,
+                stacklevel=4,
+            )
+            device = "cpu"
+        try:
+            return validate_device_request(
+                device,
+                "PaddleOCR",
+                memory_limit=memory_limit,
+                fallback_to_cpu=fallback_to_cpu,
+            )
+        except ValidationError:
+            if not use_gpu and device == "cpu":
+                return DeviceInfo(device_type="cpu", name="CPU")
+            raise
     @staticmethod
     def _validate_language_code(lang_code: str) -> str:
         """Convert a language code to PaddleOCR format.

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import hashlib
 import re
 import sys
 from dataclasses import dataclass
@@ -144,7 +145,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "tel",
     "tgk",
     "tgl",
-    "tha",  # codespell:ignore
+    "tha",
     "tir",
     "ton",
     "tur",
@@ -153,7 +154,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "urd",
     "uzb",
     "uzb_cyrl",
-    "vie",  # codespell:ignore
+    "vie",
     "yid",
     "yor",
 }
@@ -227,62 +228,151 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         image: Image,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-        await self._validate_tesseract_version()
-        image_path, unlink = await create_temp_file(".png")
-        await run_sync(image.save, str(image_path), format="PNG")
+        import io
+        from kreuzberg._utils._cache import get_ocr_cache
+        image_buffer = io.BytesIO()
+        await run_sync(image.save, image_buffer, format="PNG")
+        image_content = image_buffer.getvalue()
+        cache_kwargs = {
+            "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
+            "ocr_backend": "tesseract",
+            "ocr_config": str(sorted(kwargs.items())),
+        }
+        ocr_cache = get_ocr_cache()
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+        if ocr_cache.is_processing(**cache_kwargs):
+            import anyio
+            event = ocr_cache.mark_processing(**cache_kwargs)
+            await anyio.to_thread.run_sync(event.wait)
+            # Try cache again after waiting for other process to complete  # ~keep
+            cached_result = await ocr_cache.aget(**cache_kwargs)
+            if cached_result is not None:
+                return cached_result
+        ocr_cache.mark_processing(**cache_kwargs)
         try:
-            return await self.process_file(image_path, **kwargs)
+            await self._validate_tesseract_version()
+            image_path, unlink = await create_temp_file(".png")
+            await run_sync(image.save, str(image_path), format="PNG")
+            try:
+                result = await self.process_file(image_path, **kwargs)
+                await ocr_cache.aset(result, **cache_kwargs)
+                return result
+            finally:
+                await unlink()
         finally:
-            await unlink()
+            ocr_cache.mark_complete(**cache_kwargs)
     async def process_file(
         self,
         path: Path,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-        await self._validate_tesseract_version()
-        output_path, unlink = await create_temp_file(".txt")
-        language = self._validate_language_code(kwargs.pop("language", "eng"))
-        psm = kwargs.pop("psm", PSMMode.AUTO)
+        from kreuzberg._utils._cache import get_ocr_cache
         try:
-            output_base = str(output_path).replace(".txt", "")
-            command = [
-                "tesseract",
-                str(path),
-                output_base,
-                "-l",
-                language,
-                "--psm",
-                str(psm.value),
-                "--oem",
-                "1",
-                "--loglevel",
-                "OFF",
-            ]
-            for kwarg, value in kwargs.items():
-                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
-            env: dict[str, Any] | None = None
-            if sys.platform.startswith("linux"):
-                # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
-                env = {"OMP_THREAD_LIMIT": "1"}
-            result = await run_process(command, env=env)
-            if not result.returncode == 0:
-                raise OCRError(
-                    "OCR failed with a non-0 return code.",
-                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+            stat = path.stat()
+            file_info = {
+                "path": str(path.resolve()),
+                "size": stat.st_size,
+                "mtime": stat.st_mtime,
+            }
+        except OSError:
+            file_info = {
+                "path": str(path),
+                "size": 0,
+                "mtime": 0,
+            }
+        cache_kwargs = {
+            "file_info": str(sorted(file_info.items())),
+            "ocr_backend": "tesseract",
+            "ocr_config": str(sorted(kwargs.items())),
+        }
+        ocr_cache = get_ocr_cache()
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+        if ocr_cache.is_processing(**cache_kwargs):
+            import anyio
+            event = ocr_cache.mark_processing(**cache_kwargs)
+            await anyio.to_thread.run_sync(event.wait)
+            # Try cache again after waiting for other process to complete  # ~keep
+            cached_result = await ocr_cache.aget(**cache_kwargs)
+            if cached_result is not None:
+                return cached_result
+        ocr_cache.mark_processing(**cache_kwargs)
+        try:
+            await self._validate_tesseract_version()
+            output_path, unlink = await create_temp_file(".txt")
+            language = self._validate_language_code(kwargs.pop("language", "eng"))
+            psm = kwargs.pop("psm", PSMMode.AUTO)
+            try:
+                output_base = str(output_path).replace(".txt", "")
+                command = [
+                    "tesseract",
+                    str(path),
+                    output_base,
+                    "-l",
+                    language,
+                    "--psm",
+                    str(psm.value),
+                    "--oem",
+                    "1",
+                    "--loglevel",
+                    "OFF",
+                ]
+                for kwarg, value in kwargs.items():
+                    command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+                env: dict[str, Any] | None = None
+                if sys.platform.startswith("linux"):
+                    env = {"OMP_THREAD_LIMIT": "1"}
+                result = await run_process(command, env=env)
+                if not result.returncode == 0:
+                    raise OCRError(
+                        "OCR failed with a non-0 return code.",
+                        context={
+                            "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
+                        },
+                    )
+                output = await AsyncPath(output_path).read_text("utf-8")
+                extraction_result = ExtractionResult(
+                    content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
                 )
-            output = await AsyncPath(output_path).read_text("utf-8")
-            return ExtractionResult(
-                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-            )
-        except (RuntimeError, OSError) as e:
-            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+                final_cache_kwargs = cache_kwargs.copy()
+                final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
+                await ocr_cache.aset(extraction_result, **final_cache_kwargs)
+                return extraction_result
+            except (RuntimeError, OSError) as e:
+                raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+            finally:
+                await unlink()
         finally:
-            await unlink()
+            ocr_cache.mark_complete(**cache_kwargs)
     @classmethod
     async def _validate_tesseract_version(cls) -> None:

kreuzberg/_playa.py CHANGED Viewed

@@ -274,3 +274,46 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
         if subtitle and "title" in result and subtitle != result["title"]:
             result["subtitle"] = subtitle
+def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
+    """Synchronous version of extract_pdf_metadata.
+    Extract metadata from a PDF document without using async/await.
+    Args:
+        pdf_content: The bytes of the PDF document.
+    Raises:
+        ParsingError: If the PDF metadata could not be extracted.
+    Returns:
+        A dictionary of metadata extracted from the PDF.
+    """
+    try:
+        document = parse(pdf_content, max_workers=1)
+        metadata: Metadata = {}
+        for raw_info in document.info:
+            pdf_info = {k.lower(): v for k, v in asobj(raw_info).items()}
+            _extract_basic_metadata(pdf_info, metadata)
+            _extract_author_metadata(pdf_info, metadata)
+            _extract_keyword_metadata(pdf_info, metadata)
+            _extract_category_metadata(pdf_info, metadata)
+            _extract_date_metadata(pdf_info, metadata)
+            _extract_creator_metadata(pdf_info, metadata)
+        if document.pages:
+            _extract_document_dimensions(document, metadata)
+        if document.outline and "description" not in metadata:
+            metadata["description"] = _generate_outline_description(document)
+        if "summary" not in metadata:
+            metadata["summary"] = _generate_document_summary(document)
+        _extract_structure_information(document, metadata)
+        return metadata
+    except Exception as e:
+        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl