kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +17 -8
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +6 -7
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +7 -2
- kreuzberg/_mcp/server.py +1 -22
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +47 -20
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +27 -26
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +146 -43
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +27 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.15.0.dist-info/RECORD +0 -60
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_structured.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar

@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None

+
 from anyio import Path as AsyncPath

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync

@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }

+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)

@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content =
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = yaml.safe_load(text_content)

-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}

+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
             else:
                 text_parts = [str(data)]

-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))

             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):

         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
             full_key = f"{prefix}.{key}" if prefix else key

             if isinstance(value, str) and value.strip():
-
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

                 key_lower = key.lower()
-
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value

             elif isinstance(value, (int, float, bool)):
-
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

             elif isinstance(value, dict):
-
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))

             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))

             elif value is not None:
-
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")

         return text_parts

@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"

             if isinstance(item, str) and item.strip():
-
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")

             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))

             elif item is not None:
-
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")

         return text_parts
kreuzberg/_gmft.py
CHANGED
@@ -99,7 +99,7 @@ async def extract_tables(
             "size": stat.st_size,
             "mtime": stat.st_mtime,
         }
-    except OSError:
+    except OSError:  # pragma: no cover
         file_info = {
             "path": str(path),
             "size": 0,
@@ -215,7 +215,7 @@ def extract_tables_sync(
             "size": stat.st_size,
             "mtime": stat.st_mtime,
         }
-    except OSError:
+    except OSError:  # pragma: no cover
         file_info = {
             "path": str(path),
             "size": 0,
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
     from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
     from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415

+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)

     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
kreuzberg/_mcp/server.py
CHANGED
@@ -22,7 +22,6 @@ from kreuzberg.extraction import (

 mcp = FastMCP("Kreuzberg Text Extraction")

-# Security and performance limits
 MAX_BATCH_SIZE = 100


@@ -40,13 +39,12 @@ def _validate_file_path(file_path: str) -> Path:
     """
     try:
         path = Path(file_path).resolve()
-    except (OSError, ValueError) as e:
+    except (OSError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid file path: {file_path}",
             context={"file_path": file_path, "error": str(e)},
         ) from e

-    # Check for path traversal attempts
     if ".." in file_path and not file_path.startswith("/"):
         raise ValidationError(
             "Path traversal detected in file path",
@@ -73,7 +71,6 @@ def _validate_file_path_with_context(file_path: str, index: int, total: int) ->
     try:
         return _validate_file_path(file_path)
     except ValidationError as e:
-        # Add context about which file in the batch failed
         e.context = e.context or {}
         e.context["batch_index"] = index
         e.context["total_files"] = total
@@ -99,7 +96,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
             context={"context": context_info},
         )

-    # Check for whitespace-only content
     if not content_base64.strip():
         raise ValidationError(
             "Base64 content cannot be whitespace only",
@@ -126,7 +122,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()

-    # Extract Tesseract-specific parameters from kwargs first
     tesseract_lang = kwargs.pop("tesseract_lang", None)
     tesseract_psm = kwargs.pop("tesseract_psm", None)
     tesseract_output_format = kwargs.pop("tesseract_output_format", None)
@@ -151,7 +146,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     }
     config_dict = config_dict | kwargs

-    # Handle Tesseract OCR configuration
     ocr_backend = config_dict.get("ocr_backend")
     if ocr_backend == "tesseract" and (
         tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
@@ -174,10 +168,8 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
         tesseract_config_dict["enable_table_detection"] = True

     if tesseract_config_dict:
-        # Merge with existing tesseract config if present
         existing_ocr_config = config_dict.get("ocr_config")
         if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
-            # Convert existing config to dict, merge, and recreate
             existing_dict = existing_ocr_config.to_dict()
             merged_dict = existing_dict | tesseract_config_dict
             config_dict["ocr_config"] = TesseractConfig(**merged_dict)
@@ -206,7 +198,6 @@ def extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -289,7 +280,6 @@ def batch_extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate batch size
     if len(file_paths) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -302,7 +292,6 @@ def batch_extract_document(  # noqa: PLR0913
             context={"file_paths": file_paths},
         )

-    # Validate all file paths for security
     validated_paths = []
     for i, file_path in enumerate(file_paths):
         validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
@@ -346,7 +335,6 @@ def batch_extract_bytes(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate input
     if not content_items:
         raise ValidationError("content_items cannot be empty", context={"content_items": content_items})

@@ -355,7 +343,6 @@ def batch_extract_bytes(  # noqa: PLR0913
             "content_items must be a list", context={"content_items_type": type(content_items).__name__}
         )

-    # Validate batch size
     if len(content_items) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -379,17 +366,14 @@ def batch_extract_bytes(  # noqa: PLR0913
         enable_table_detection=enable_table_detection,
     )

-    # Convert list of dicts to list of tuples (bytes, mime_type)
     contents = []
     for i, item in enumerate(content_items):
-        # Validate item structure
         if not isinstance(item, dict):
             raise ValidationError(
                 f"Item at index {i} must be a dictionary",
                 context={"item_index": i, "item_type": type(item).__name__, "item": item},
             )

-        # Check for required keys
         if "content_base64" not in item:
             raise ValidationError(
                 f"Item at index {i} is missing required key 'content_base64'",
@@ -405,11 +389,9 @@ def batch_extract_bytes(  # noqa: PLR0913
         content_base64 = item["content_base64"]
         mime_type = item["mime_type"]

-        # Validate base64 content
         try:
             content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
         except ValidationError as e:
-            # Add batch-specific context
             e.context = e.context or {}
             e.context["item_index"] = i
             e.context["total_items"] = len(content_items)
@@ -426,7 +408,6 @@ def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
     result = extract_file_sync(str(validated_path), mime_type, config)
@@ -467,7 +448,6 @@ def get_supported_formats() -> str:

 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())

@@ -481,7 +461,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:

 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         extract_entities=True,
kreuzberg/_mime_types.py
CHANGED
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -33,22 +33,37 @@ except ImportError: # pragma: no cover

 if TYPE_CHECKING:
     import easyocr
-    import numpy as np
     import torch
+else:
+    easyocr: Any = None
+    torch: Any = None
+
+HAS_EASYOCR: bool = False
+
+
+def _import_easyocr() -> tuple[Any, Any]:
+    global HAS_EASYOCR, easyocr, torch
+
+    if easyocr is not None:
+        return easyocr, torch
+
+    if not HAS_EASYOCR and easyocr is None:
+        return None, None

-HAS_EASYOCR: bool
-if not TYPE_CHECKING:
     try:
-        import easyocr
-        import numpy as np
-        import torch
+        import easyocr as _easyocr  # noqa: PLC0415

+        try:
+            import torch as _torch  # noqa: PLC0415
+        except ImportError:  # pragma: no cover
+            _torch = None  # type: ignore[assignment]
+
+        easyocr = _easyocr
+        torch = _torch
         HAS_EASYOCR = True
-
-
-
-        np: Any = None
-        torch: Any = None
+        return easyocr, torch
+    except ImportError:  # pragma: no cover
+        return None, None


 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +157,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None

     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:  # pragma: no cover
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)

         cache_kwargs = None
@@ -292,7 +312,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):

     @classmethod
     def _is_gpu_available(cls) -> bool:
-        if
+        if torch is None:
             return False
         return bool(torch.cuda.is_available())

@@ -301,13 +321,14 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return

-
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )

-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")

@@ -318,7 +339,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):

         try:
             cls._reader = await run_sync(
-
+                easyocr_module.Reader,
                 languages,
                 gpu=use_gpu,
                 verbose=False,
@@ -382,6 +403,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages

     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:  # pragma: no cover
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)

         cache_kwargs = None
@@ -453,13 +479,14 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return

-
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )

-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")

@@ -469,7 +496,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.setdefault("recog_network", "standard")

         try:
-            cls._reader =
+            cls._reader = easyocr_module.Reader(
                 languages,
                 gpu=use_gpu,
                 verbose=False,