PyPI - kreuzberg - Versions diffs - 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl - Mend

kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

kreuzberg/__init__.py +3 -0
kreuzberg/__main__.py +8 -0
kreuzberg/_api/__init__.py +0 -0
kreuzberg/_api/main.py +87 -0
kreuzberg/_cli_config.py +175 -0
kreuzberg/_extractors/_image.py +39 -4
kreuzberg/_extractors/_pandoc.py +158 -18
kreuzberg/_extractors/_pdf.py +199 -19
kreuzberg/_extractors/_presentation.py +1 -1
kreuzberg/_extractors/_spread_sheet.py +65 -7
kreuzberg/_gmft.py +222 -16
kreuzberg/_mime_types.py +62 -16
kreuzberg/_multiprocessing/__init__.py +6 -0
kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
kreuzberg/_multiprocessing/process_manager.py +188 -0
kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
kreuzberg/_ocr/_easyocr.py +6 -12
kreuzberg/_ocr/_paddleocr.py +15 -13
kreuzberg/_ocr/_tesseract.py +136 -46
kreuzberg/_playa.py +43 -0
kreuzberg/_types.py +4 -0
kreuzberg/_utils/_cache.py +372 -0
kreuzberg/_utils/_device.py +10 -27
kreuzberg/_utils/_document_cache.py +220 -0
kreuzberg/_utils/_errors.py +232 -0
kreuzberg/_utils/_pdf_lock.py +72 -0
kreuzberg/_utils/_process_pool.py +100 -0
kreuzberg/_utils/_serialization.py +82 -0
kreuzberg/_utils/_string.py +1 -1
kreuzberg/_utils/_sync.py +21 -0
kreuzberg/cli.py +338 -0
kreuzberg/extraction.py +247 -36
kreuzberg-3.4.0.dist-info/METADATA +290 -0
kreuzberg-3.4.0.dist-info/RECORD +50 -0
{kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
kreuzberg-3.2.0.dist-info/METADATA +0 -166
kreuzberg-3.2.0.dist-info/RECORD +0 -34
kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
{kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -57,7 +57,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "hr",
     "hu",
     "id",
-    "inh",  # codespell:ignore
+    "inh",
     "is",
     "it",
     "ja",
@@ -97,7 +97,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "sw",
     "ta",
     "tab",
-    "te",  # codespell:ignore
+    "te",
     "th",
     "tjk",
     "tl",
@@ -261,11 +261,12 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
                 content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
             )
+        # Group text boxes by lines based on Y coordinate  # ~keep
         sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
         line_groups: list[list[Any]] = []
         current_line: list[Any] = []
         prev_y_center: float | None = None
-        line_height_threshold = 20
+        line_height_threshold = 20  # Minimum distance to consider as new line  # ~keep
         for item in sorted_results:
             box, text, confidence = item
@@ -288,7 +289,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         confidence_count = 0
         for line in line_groups:
-            line_sorted = sorted(line, key=lambda x: x[0][0][0])
+            line_sorted = sorted(line, key=lambda x: x[0][0][0])  # Sort boxes by X coordinate within line  # ~keep
             for item in line_sorted:
                 _, text, confidence = item
@@ -345,7 +346,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
-        # Handle device selection with backward compatibility
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
@@ -377,13 +377,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         Raises:
             ValidationError: If requested device is not available and fallback is disabled.
         """
-        # Handle deprecated use_gpu parameter
         use_gpu = kwargs.get("use_gpu", False)
         device = kwargs.get("device", "auto")
         memory_limit = kwargs.get("gpu_memory_limit")
         fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
-        # Check for deprecated parameter usage
         if use_gpu and device == "auto":
             warnings.warn(
                 "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
@@ -391,7 +389,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
                 DeprecationWarning,
                 stacklevel=4,
             )
-            # Convert deprecated use_gpu=True to device="auto"
             device = "auto" if use_gpu else "cpu"
         elif use_gpu and device != "auto":
             warnings.warn(
@@ -401,7 +399,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
                 stacklevel=4,
             )
-        # Validate and get device info
         try:
             return validate_device_request(
                 device,
@@ -410,7 +407,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
                 fallback_to_cpu=fallback_to_cpu,
             )
         except ValidationError:
-            # If device validation fails and we're using deprecated use_gpu=False, fallback to CPU
             if not use_gpu and device == "cpu":
                 return DeviceInfo(device_type="cpu", name="CPU")
             raise
@@ -429,10 +425,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             A list with the normalized language codes.
         """
         if isinstance(language_codes, str):
-            # Handle comma-separated language codes
             languages = [lang.strip().lower() for lang in language_codes.split(",")]
         else:
-            # Handle list of language codes
             languages = [lang.lower() for lang in language_codes]
         unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -125,6 +125,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         import numpy as np
         await self._init_paddle_ocr(**kwargs)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
         image_np = np.array(image)
         try:
             result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
@@ -153,7 +157,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
     @staticmethod
-    def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+    def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
         """Process PaddleOCR result into an ExtractionResult with metadata.
         Args:
@@ -171,6 +175,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             if not page_result:
                 continue
+            # Group text boxes by lines based on Y coordinate  # ~keep
             sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
             line_groups: list[list[Any]] = []
             current_line: list[Any] = []
@@ -179,7 +184,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             for box in sorted_boxes:
                 box_points, (_, _) = box
                 current_y = sum(point[1] for point in box_points) / 4
-                min_box_distance = 20
+                min_box_distance = 20  # Minimum distance to consider as new line  # ~keep
                 if prev_y is None or abs(current_y - prev_y) > min_box_distance:
                     if current_line:
@@ -194,7 +199,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 line_groups.append(current_line)
             for line in line_groups:
-                line_sorted = sorted(line, key=lambda x: x[0][0][0])
+                line_sorted = sorted(line, key=lambda x: x[0][0][0])  # Sort boxes by X coordinate within line  # ~keep
                 for box in line_sorted:
                     _, (text, confidence) = box
@@ -205,7 +210,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 text_content += "\n"
-        width, height = image.size
+        if hasattr(image, "width") and hasattr(image, "height"):
+            width = image.width
+            height = image.height
+        else:
+            width, height = image.size
         metadata = Metadata(
             width=width,
             height=height,
@@ -257,7 +266,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         language = cls._validate_language_code(kwargs.pop("language", "en"))
-        # Handle device selection with backward compatibility
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type == "cuda"
@@ -269,9 +277,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         kwargs.setdefault("det_db_box_thresh", 0.5)
         kwargs.setdefault("det_db_unclip_ratio", 1.6)
-        # Set GPU memory limit if specified
         if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
-            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)  # Convert GB to MB
+            kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
         try:
             cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
@@ -291,13 +298,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             ValidationError: If requested device is not available and fallback is disabled.
         """
-        # Handle deprecated use_gpu parameter
         use_gpu = kwargs.get("use_gpu", False)
         device = kwargs.get("device", "auto")
         memory_limit = kwargs.get("gpu_memory_limit")
         fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
-        # Check for deprecated parameter usage
         if use_gpu and device == "auto":
             warnings.warn(
                 "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
@@ -305,7 +310,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 DeprecationWarning,
                 stacklevel=4,
             )
-            # Convert deprecated use_gpu=True to device="auto"
             device = "auto" if use_gpu else "cpu"
         elif use_gpu and device != "auto":
             warnings.warn(
@@ -315,7 +320,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 stacklevel=4,
             )
-        # PaddlePaddle doesn't support MPS, so warn if requested
         if device == "mps":
             warnings.warn(
                 "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
@@ -324,7 +328,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
             )
             device = "cpu"
-        # Validate and get device info
         try:
             return validate_device_request(
                 device,
@@ -333,7 +336,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
                 fallback_to_cpu=fallback_to_cpu,
             )
         except ValidationError:
-            # If device validation fails and we're using deprecated use_gpu=False, fallback to CPU
             if not use_gpu and device == "cpu":
                 return DeviceInfo(device_type="cpu", name="CPU")
             raise

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import hashlib
 import re
 import sys
 from dataclasses import dataclass
@@ -144,7 +145,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "tel",
     "tgk",
     "tgl",
-    "tha",  # codespell:ignore
+    "tha",
     "tir",
     "ton",
     "tur",
@@ -153,7 +154,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
     "urd",
     "uzb",
     "uzb_cyrl",
-    "vie",  # codespell:ignore
+    "vie",
     "yid",
     "yor",
 }
@@ -227,62 +228,151 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         image: Image,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-        await self._validate_tesseract_version()
-        image_path, unlink = await create_temp_file(".png")
-        await run_sync(image.save, str(image_path), format="PNG")
+        import io
+        from kreuzberg._utils._cache import get_ocr_cache
+        image_buffer = io.BytesIO()
+        await run_sync(image.save, image_buffer, format="PNG")
+        image_content = image_buffer.getvalue()
+        cache_kwargs = {
+            "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
+            "ocr_backend": "tesseract",
+            "ocr_config": str(sorted(kwargs.items())),
+        }
+        ocr_cache = get_ocr_cache()
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+        if ocr_cache.is_processing(**cache_kwargs):
+            import anyio
+            event = ocr_cache.mark_processing(**cache_kwargs)
+            await anyio.to_thread.run_sync(event.wait)
+            # Try cache again after waiting for other process to complete  # ~keep
+            cached_result = await ocr_cache.aget(**cache_kwargs)
+            if cached_result is not None:
+                return cached_result
+        ocr_cache.mark_processing(**cache_kwargs)
         try:
-            return await self.process_file(image_path, **kwargs)
+            await self._validate_tesseract_version()
+            image_path, unlink = await create_temp_file(".png")
+            await run_sync(image.save, str(image_path), format="PNG")
+            try:
+                result = await self.process_file(image_path, **kwargs)
+                await ocr_cache.aset(result, **cache_kwargs)
+                return result
+            finally:
+                await unlink()
         finally:
-            await unlink()
+            ocr_cache.mark_complete(**cache_kwargs)
     async def process_file(
         self,
         path: Path,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-        await self._validate_tesseract_version()
-        output_path, unlink = await create_temp_file(".txt")
-        language = self._validate_language_code(kwargs.pop("language", "eng"))
-        psm = kwargs.pop("psm", PSMMode.AUTO)
+        from kreuzberg._utils._cache import get_ocr_cache
         try:
-            output_base = str(output_path).replace(".txt", "")
-            command = [
-                "tesseract",
-                str(path),
-                output_base,
-                "-l",
-                language,
-                "--psm",
-                str(psm.value),
-                "--oem",
-                "1",
-                "--loglevel",
-                "OFF",
-            ]
-            for kwarg, value in kwargs.items():
-                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
-            env: dict[str, Any] | None = None
-            if sys.platform.startswith("linux"):
-                # we have to prevent multithreading this way otherwise we will get deadlocks
-                env = {"OMP_THREAD_LIMIT": "1"}
-            result = await run_process(command, env=env)
-            if not result.returncode == 0:
-                raise OCRError(
-                    "OCR failed with a non-0 return code.",
-                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+            stat = path.stat()
+            file_info = {
+                "path": str(path.resolve()),
+                "size": stat.st_size,
+                "mtime": stat.st_mtime,
+            }
+        except OSError:
+            file_info = {
+                "path": str(path),
+                "size": 0,
+                "mtime": 0,
+            }
+        cache_kwargs = {
+            "file_info": str(sorted(file_info.items())),
+            "ocr_backend": "tesseract",
+            "ocr_config": str(sorted(kwargs.items())),
+        }
+        ocr_cache = get_ocr_cache()
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+        if ocr_cache.is_processing(**cache_kwargs):
+            import anyio
+            event = ocr_cache.mark_processing(**cache_kwargs)
+            await anyio.to_thread.run_sync(event.wait)
+            # Try cache again after waiting for other process to complete  # ~keep
+            cached_result = await ocr_cache.aget(**cache_kwargs)
+            if cached_result is not None:
+                return cached_result
+        ocr_cache.mark_processing(**cache_kwargs)
+        try:
+            await self._validate_tesseract_version()
+            output_path, unlink = await create_temp_file(".txt")
+            language = self._validate_language_code(kwargs.pop("language", "eng"))
+            psm = kwargs.pop("psm", PSMMode.AUTO)
+            try:
+                output_base = str(output_path).replace(".txt", "")
+                command = [
+                    "tesseract",
+                    str(path),
+                    output_base,
+                    "-l",
+                    language,
+                    "--psm",
+                    str(psm.value),
+                    "--oem",
+                    "1",
+                    "--loglevel",
+                    "OFF",
+                ]
+                for kwarg, value in kwargs.items():
+                    command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+                env: dict[str, Any] | None = None
+                if sys.platform.startswith("linux"):
+                    env = {"OMP_THREAD_LIMIT": "1"}
+                result = await run_process(command, env=env)
+                if not result.returncode == 0:
+                    raise OCRError(
+                        "OCR failed with a non-0 return code.",
+                        context={
+                            "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
+                        },
+                    )
+                output = await AsyncPath(output_path).read_text("utf-8")
+                extraction_result = ExtractionResult(
+                    content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
                 )
-            output = await AsyncPath(output_path).read_text("utf-8")
-            return ExtractionResult(
-                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-            )
-        except (RuntimeError, OSError) as e:
-            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+                final_cache_kwargs = cache_kwargs.copy()
+                final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
+                await ocr_cache.aset(extraction_result, **final_cache_kwargs)
+                return extraction_result
+            except (RuntimeError, OSError) as e:
+                raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+            finally:
+                await unlink()
         finally:
-            await unlink()
+            ocr_cache.mark_complete(**cache_kwargs)
     @classmethod
     async def _validate_tesseract_version(cls) -> None:

kreuzberg/_playa.py CHANGED Viewed

@@ -274,3 +274,46 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
         if subtitle and "title" in result and subtitle != result["title"]:
             result["subtitle"] = subtitle
+def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
+    """Synchronous version of extract_pdf_metadata.
+    Extract metadata from a PDF document without using async/await.
+    Args:
+        pdf_content: The bytes of the PDF document.
+    Raises:
+        ParsingError: If the PDF metadata could not be extracted.
+    Returns:
+        A dictionary of metadata extracted from the PDF.
+    """
+    try:
+        document = parse(pdf_content, max_workers=1)
+        metadata: Metadata = {}
+        for raw_info in document.info:
+            pdf_info = {k.lower(): v for k, v in asobj(raw_info).items()}
+            _extract_basic_metadata(pdf_info, metadata)
+            _extract_author_metadata(pdf_info, metadata)
+            _extract_keyword_metadata(pdf_info, metadata)
+            _extract_category_metadata(pdf_info, metadata)
+            _extract_date_metadata(pdf_info, metadata)
+            _extract_creator_metadata(pdf_info, metadata)
+        if document.pages:
+            _extract_document_dimensions(document, metadata)
+        if document.outline and "description" not in metadata:
+            metadata["description"] = _generate_outline_description(document)
+        if "summary" not in metadata:
+            metadata["summary"] = _generate_document_summary(document)
+        _extract_structure_information(document, metadata)
+        return metadata
+    except Exception as e:
+        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e

kreuzberg/_types.py CHANGED Viewed

@@ -114,6 +114,10 @@ class ExtractionResult:
     chunks: list[str] = field(default_factory=list)
     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+    def to_dict(self) -> dict[str, Any]:
+        """Converts the ExtractionResult to a dictionary."""
+        return asdict(self)
 PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
 ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]

kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl