PyPI - kreuzberg - Versions diffs - 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl - Mend

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +0 -124
kreuzberg/_document_classification.py +20 -39
kreuzberg/_entity_extraction.py +0 -29
kreuzberg/_extractors/_base.py +4 -66
kreuzberg/_extractors/_email.py +0 -4
kreuzberg/_extractors/_image.py +0 -2
kreuzberg/_extractors/_pandoc.py +0 -58
kreuzberg/_extractors/_pdf.py +0 -3
kreuzberg/_extractors/_presentation.py +0 -82
kreuzberg/_extractors/_spread_sheet.py +0 -2
kreuzberg/_gmft.py +0 -61
kreuzberg/_language_detection.py +0 -14
kreuzberg/_mime_types.py +0 -17
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +110 -85
kreuzberg/_ocr/_paddleocr.py +146 -138
kreuzberg/_ocr/_table_extractor.py +0 -76
kreuzberg/_ocr/_tesseract.py +0 -206
kreuzberg/_playa.py +0 -27
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +16 -119
kreuzberg/_utils/_cache.py +0 -52
kreuzberg/_utils/_device.py +0 -56
kreuzberg/_utils/_document_cache.py +0 -73
kreuzberg/_utils/_errors.py +0 -47
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -14
kreuzberg/_utils/_process_pool.py +0 -47
kreuzberg/_utils/_quality.py +0 -17
kreuzberg/_utils/_ref.py +0 -16
kreuzberg/_utils/_serialization.py +0 -25
kreuzberg/_utils/_string.py +0 -20
kreuzberg/_utils/_sync.py +0 -76
kreuzberg/_utils/_table.py +0 -45
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +2 -2
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
kreuzberg-3.13.2.dist-info/RECORD +57 -0
kreuzberg-3.13.0.dist-info/RECORD +0 -56
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_types.py CHANGED Viewed

@@ -35,18 +35,7 @@ OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
 class ConfigDict:
-    """Abstract base class for configuration objects that can be converted to dictionaries."""
     def to_dict(self, include_none: bool = False) -> dict[str, Any]:
-        """Convert configuration to dictionary.
-        Args:
-            include_none: If True, include fields with None values.
-                         If False (default), exclude None values.
-        Returns:
-            Dictionary representation of the configuration.
-        """
         result = msgspec.to_builtins(
             self,
             builtin_types=(type(None),),
@@ -60,8 +49,6 @@ class ConfigDict:
 class PSMMode(Enum):
-    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
     OSD_ONLY = 0
     """Orientation and script detection only."""
     AUTO_OSD = 1
@@ -88,8 +75,6 @@ class PSMMode(Enum):
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class TesseractConfig(ConfigDict):
-    """Configuration options for Tesseract OCR engine."""
     classify_use_pre_adapted_templates: bool = True
     """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
     language: str = "eng"
@@ -132,8 +117,6 @@ class TesseractConfig(ConfigDict):
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class EasyOCRConfig(ConfigDict):
-    """Configuration options for EasyOCR."""
     add_margin: float = 0.1
     """Extend bounding boxes in all directions."""
     adjust_contrast: float = 0.5
@@ -185,21 +168,16 @@ class EasyOCRConfig(ConfigDict):
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class PaddleOCRConfig(ConfigDict):
-    """Configuration options for PaddleOCR.
-    This dataclass provides type hints and documentation for all PaddleOCR parameters.
-    """
     cls_image_shape: str = "3,48,192"
     """Image shape for classification algorithm in format 'channels,height,width'."""
     det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
     """Detection algorithm."""
     det_db_box_thresh: float = 0.5
-    """Score threshold for detected boxes. Boxes below this value are discarded."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_box_thresh' instead. Score threshold for detected boxes."""
     det_db_thresh: float = 0.3
-    """Binarization threshold for DB output map."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_thresh' instead. Binarization threshold for DB output map."""
     det_db_unclip_ratio: float = 2.0
-    """Expansion ratio for detected text boxes."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_unclip_ratio' instead. Expansion ratio for detected text boxes."""
     det_east_cover_thresh: float = 0.1
     """Score threshold for EAST output boxes."""
     det_east_nms_thresh: float = 0.2
@@ -215,7 +193,7 @@ class PaddleOCRConfig(ConfigDict):
     enable_mkldnn: bool = False
     """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
     gpu_mem: int = 8000
-    """GPU memory size (in MB) to use for initialization."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. GPU memory size (in MB) to use for initialization."""
     language: str = "en"
     """Language to use for OCR."""
     max_text_length: int = 25
@@ -245,13 +223,13 @@ class PaddleOCRConfig(ConfigDict):
     table: bool = True
     """Whether to enable table recognition."""
     use_angle_cls: bool = True
-    """Whether to use text orientation classification model."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Use 'use_textline_orientation' instead. Whether to use text orientation classification model."""
     use_gpu: bool = False
-    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. Use hardware acceleration flags instead."""
     device: DeviceType = "auto"
     """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
     gpu_memory_limit: float | None = None
-    """Maximum GPU memory to use in GB. None for no limit."""
+    """DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. Maximum GPU memory to use in GB."""
     fallback_to_cpu: bool = True
     """Whether to fallback to CPU if requested device is unavailable."""
     use_space_char: bool = True
@@ -259,14 +237,18 @@ class PaddleOCRConfig(ConfigDict):
     use_zero_copy_run: bool = False
     """Whether to enable zero_copy_run for inference optimization."""
+    text_det_thresh: float = 0.3
+    """Binarization threshold for text detection output map (replaces det_db_thresh)."""
+    text_det_box_thresh: float = 0.5
+    """Score threshold for detected text boxes (replaces det_db_box_thresh)."""
+    text_det_unclip_ratio: float = 2.0
+    """Expansion ratio for detected text boxes (replaces det_db_unclip_ratio)."""
+    use_textline_orientation: bool = True
+    """Whether to use text line orientation classification model (replaces use_angle_cls)."""
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class GMFTConfig(ConfigDict):
-    """Configuration options for GMFT table extraction.
-    This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
-    """
     verbosity: int = 0
     """
     Verbosity level for logging.
@@ -369,8 +351,6 @@ class GMFTConfig(ConfigDict):
 @dataclass(frozen=True, slots=True)
 class LanguageDetectionConfig(ConfigDict):
-    """Configuration for language detection."""
     low_memory: bool = True
     """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
     Defaults to True for better memory efficiency."""
@@ -387,8 +367,6 @@ class LanguageDetectionConfig(ConfigDict):
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class SpacyEntityExtractionConfig(ConfigDict):
-    """Configuration for spaCy-based entity extraction."""
     model_cache_dir: str | Path | None = None
     """Directory to cache spaCy models. If None, uses spaCy's default."""
     language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
@@ -450,7 +428,6 @@ class SpacyEntityExtractionConfig(ConfigDict):
         }
     def get_model_for_language(self, language_code: str) -> str | None:
-        """Get the appropriate spaCy model for a language code."""
         if not self.language_models:
             return None
@@ -466,13 +443,10 @@ class SpacyEntityExtractionConfig(ConfigDict):
         return None
     def get_fallback_model(self) -> str | None:
-        """Get fallback multilingual model if enabled."""
         return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 class BoundingBox(TypedDict):
-    """Bounding box coordinates for text elements."""
     left: int
     """X coordinate of the left edge."""
     top: int
@@ -484,8 +458,6 @@ class BoundingBox(TypedDict):
 class TSVWord(TypedDict):
-    """Represents a word from Tesseract TSV output."""
     level: int
     """Hierarchy level (1=page, 2=block, 3=para, 4=line, 5=word)."""
     page_num: int
@@ -513,8 +485,6 @@ class TSVWord(TypedDict):
 class TableCell(TypedDict):
-    """Represents a cell in a reconstructed table."""
     row: int
     """Row index (0-based)."""
     col: int
@@ -528,8 +498,6 @@ class TableCell(TypedDict):
 class TableData(TypedDict):
-    """Table data, returned from table extraction."""
     cropped_image: Image
     """The cropped image of the table."""
     df: DataFrame | None
@@ -541,12 +509,6 @@ class TableData(TypedDict):
 class Metadata(TypedDict, total=False):
-    """Base metadata common to all document types.
-    All fields will only be included if they contain non-empty values.
-    Any field that would be empty or None is omitted from the dictionary.
-    """
     authors: NotRequired[list[str]]
     """List of document authors."""
     categories: NotRequired[list[str]]
@@ -674,10 +636,6 @@ _VALID_METADATA_KEYS = {
 def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
-    """Normalize any dict to proper Metadata TypedDict.
-    Filters out invalid keys and ensures type safety.
-    """
     if not data:
         return {}
@@ -691,8 +649,6 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
 @dataclass(frozen=True, slots=True)
 class Entity:
-    """Represents an extracted entity with type, text, and position."""
     type: str
     """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
     text: str
@@ -705,8 +661,6 @@ class Entity:
 @dataclass(slots=True)
 class ExtractionResult:
-    """The result of a file extraction."""
     content: str
     """The extracted content."""
     mime_type: str
@@ -731,15 +685,6 @@ class ExtractionResult:
     """Internal layout data from OCR, not for public use."""
     def to_dict(self, include_none: bool = False) -> dict[str, Any]:
-        """Converts the ExtractionResult to a dictionary.
-        Args:
-            include_none: If True, include fields with None values.
-                         If False (default), exclude None values.
-        Returns:
-            Dictionary representation of the ExtractionResult.
-        """
         result = msgspec.to_builtins(
             self,
             builtin_types=(type(None),),
@@ -752,33 +697,18 @@ class ExtractionResult:
         return {k: v for k, v in result.items() if v is not None}
     def export_tables_to_csv(self) -> list[str]:
-        """Export all tables to CSV format.
-        Returns:
-            List of CSV strings, one per table
-        """
         if not self.tables:  # pragma: no cover
             return []
         return [export_table_to_csv(table) for table in self.tables]
     def export_tables_to_tsv(self) -> list[str]:
-        """Export all tables to TSV format.
-        Returns:
-            List of TSV strings, one per table
-        """
         if not self.tables:  # pragma: no cover
             return []
         return [export_table_to_tsv(table) for table in self.tables]
     def get_table_summaries(self) -> list[dict[str, Any]]:
-        """Get structural information for all tables.
-        Returns:
-            List of table structure dictionaries
-        """
         if not self.tables:  # pragma: no cover
             return []
@@ -791,14 +721,6 @@ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 @dataclass(unsafe_hash=True, slots=True)
 class ExtractionConfig(ConfigDict):
-    """Represents configuration settings for an extraction process.
-    This class encapsulates the configuration options for extracting text
-    from images or documents using Optical Character Recognition (OCR). It
-    provides options to customize the OCR behavior, select the backend
-    engine, and configure engine-specific parameters.
-    """
     force_ocr: bool = False
     """Whether to force OCR."""
     chunk_content: bool = False
@@ -876,11 +798,6 @@ class ExtractionConfig(ConfigDict):
             )
     def get_config_dict(self) -> dict[str, Any]:
-        """Returns the OCR configuration object based on the backend specified.
-        Returns:
-            A dict of the OCR configuration or an empty dict if no backend is provided.
-        """
         if self.ocr_backend is None:
             return {"use_cache": self.use_cache}
@@ -904,15 +821,6 @@ class ExtractionConfig(ConfigDict):
                 return config_dict
     def to_dict(self, include_none: bool = False) -> dict[str, Any]:
-        """Convert configuration to dictionary recursively.
-        Args:
-            include_none: If True, include fields with None values.
-                         If False (default), exclude None values.
-        Returns:
-            Dictionary representation of the configuration with nested configs converted.
-        """
         result = msgspec.to_builtins(
             self,
             builtin_types=(type(None),),
@@ -931,13 +839,6 @@ class ExtractionConfig(ConfigDict):
 @dataclass(frozen=True)
 class HTMLToMarkdownConfig:
-    """Configuration for HTML to Markdown conversion.
-    This configuration class provides fine-grained control over how HTML content
-    is converted to Markdown format. Most fields have sensible defaults that work
-    well for typical document extraction scenarios.
-    """
     stream_processing: bool = False
     """Enable streaming mode for processing large HTML documents."""
     chunk_size: int = 1024
@@ -1004,8 +905,4 @@ class HTMLToMarkdownConfig:
     """Remove form elements from HTML."""
     def to_dict(self) -> dict[str, Any]:
-        """Convert config to dictionary for passing to convert_to_markdown.
-        Excludes None values and handles special cases.
-        """
         return {key: value for key, value in self.__dict__.items() if value is not None}

kreuzberg/_utils/_cache.py CHANGED Viewed

@@ -20,12 +20,6 @@ T = TypeVar("T")
 class KreuzbergCache(Generic[T]):
-    """File-based cache for Kreuzberg operations.
-    Provides both sync and async interfaces for caching extraction results,
-    OCR results, table data, and other expensive operations to disk.
-    """
     def __init__(
         self,
         cache_type: str,
@@ -33,14 +27,6 @@ class KreuzbergCache(Generic[T]):
         max_cache_size_mb: float = 500.0,
         max_age_days: int = 30,
     ) -> None:
-        """Initialize cache.
-        Args:
-            cache_type: Type of cache (e.g., 'ocr', 'tables', 'documents', 'mime')
-            cache_dir: Cache directory (defaults to .kreuzberg/{cache_type} in cwd)
-            max_cache_size_mb: Maximum cache size in MB (default: 500MB)
-            max_age_days: Maximum age of cached results in days (default: 30 days)
-        """
         if cache_dir is None:
             cache_dir = Path.cwd() / ".kreuzberg" / cache_type
@@ -159,14 +145,6 @@ class KreuzbergCache(Generic[T]):
             pass
     def get(self, **kwargs: Any) -> T | None:
-        """Get cached result (sync).
-        Args:
-            **kwargs: Key-value pairs to generate cache key from
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(**kwargs)
         cache_path = self._get_cache_path(cache_key)
@@ -183,12 +161,6 @@ class KreuzbergCache(Generic[T]):
             return None
     def set(self, result: T, **kwargs: Any) -> None:
-        """Cache result (sync).
-        Args:
-            result: Result to cache
-            **kwargs: Key-value pairs to generate cache key from
-        """
         cache_key = self._get_cache_key(**kwargs)
         cache_path = self._get_cache_path(cache_key)
@@ -203,14 +175,6 @@ class KreuzbergCache(Generic[T]):
             pass
     async def aget(self, **kwargs: Any) -> T | None:
-        """Get cached result (async).
-        Args:
-            **kwargs: Key-value pairs to generate cache key from
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(**kwargs)
         cache_path = AsyncPath(self._get_cache_path(cache_key))
@@ -227,12 +191,6 @@ class KreuzbergCache(Generic[T]):
             return None
     async def aset(self, result: T, **kwargs: Any) -> None:
-        """Cache result (async).
-        Args:
-            result: Result to cache
-            **kwargs: Key-value pairs to generate cache key from
-        """
         cache_key = self._get_cache_key(**kwargs)
         cache_path = AsyncPath(self._get_cache_path(cache_key))
@@ -247,13 +205,11 @@ class KreuzbergCache(Generic[T]):
             pass
     def is_processing(self, **kwargs: Any) -> bool:
-        """Check if operation is currently being processed."""
         cache_key = self._get_cache_key(**kwargs)
         with self._lock:
             return cache_key in self._processing
     def mark_processing(self, **kwargs: Any) -> threading.Event:
-        """Mark operation as being processed and return event to wait on."""
         cache_key = self._get_cache_key(**kwargs)
         with self._lock:
@@ -262,7 +218,6 @@ class KreuzbergCache(Generic[T]):
             return self._processing[cache_key]
     def mark_complete(self, **kwargs: Any) -> None:
-        """Mark operation processing as complete."""
         cache_key = self._get_cache_key(**kwargs)
         with self._lock:
@@ -271,7 +226,6 @@ class KreuzbergCache(Generic[T]):
                 event.set()
     def clear(self) -> None:
-        """Clear all cached results."""
         try:
             for cache_file in self.cache_dir.glob("*.msgpack"):
                 cache_file.unlink(missing_ok=True)
@@ -282,7 +236,6 @@ class KreuzbergCache(Generic[T]):
             pass
     def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics."""
         try:
             cache_files = list(self.cache_dir.glob("*.msgpack"))
             total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists())
@@ -328,7 +281,6 @@ _ocr_cache_ref = Ref("ocr_cache", _create_ocr_cache)
 def get_ocr_cache() -> KreuzbergCache[ExtractionResult]:
-    """Get the OCR cache instance."""
     return _ocr_cache_ref.get()
@@ -350,7 +302,6 @@ _document_cache_ref = Ref("document_cache", _create_document_cache)
 def get_document_cache() -> KreuzbergCache[ExtractionResult]:
-    """Get the document cache instance."""
     return _document_cache_ref.get()
@@ -372,7 +323,6 @@ _table_cache_ref = Ref("table_cache", _create_table_cache)
 def get_table_cache() -> KreuzbergCache[Any]:
-    """Get the table cache instance."""
     return _table_cache_ref.get()
@@ -394,12 +344,10 @@ _mime_cache_ref = Ref("mime_cache", _create_mime_cache)
 def get_mime_cache() -> KreuzbergCache[str]:
-    """Get the MIME type cache instance."""
     return _mime_cache_ref.get()
 def clear_all_caches() -> None:
-    """Clear all caches."""
     if _ocr_cache_ref.is_initialized():
         get_ocr_cache().clear()
     if _document_cache_ref.is_initialized():

kreuzberg/_utils/_device.py CHANGED Viewed

@@ -14,8 +14,6 @@ DeviceType = Literal["cpu", "cuda", "mps", "auto"]
 @dataclass(frozen=True, slots=True)
 class DeviceInfo:
-    """Information about a compute device."""
     device_type: Literal["cpu", "cuda", "mps"]
     """The type of device."""
     device_id: int | None = None
@@ -29,11 +27,6 @@ class DeviceInfo:
 def detect_available_devices() -> list[DeviceInfo]:
-    """Detect all available compute devices.
-    Returns:
-        List of available devices, with the most preferred device first.
-    """
     cpu_device = DeviceInfo(device_type="cpu", name="CPU")
     cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
@@ -46,11 +39,6 @@ def detect_available_devices() -> list[DeviceInfo]:
 def get_optimal_device() -> DeviceInfo:
-    """Get the optimal device for OCR processing.
-    Returns:
-        The best available device, preferring GPU over CPU.
-    """
     devices = detect_available_devices()
     return devices[0] if devices else DeviceInfo(device_type="cpu", name="CPU")
@@ -62,20 +50,6 @@ def validate_device_request(
     memory_limit: float | None = None,
     fallback_to_cpu: bool = True,
 ) -> DeviceInfo:
-    """Validate and resolve a device request.
-    Args:
-        requested: The requested device type.
-        backend: Name of the OCR backend requesting the device.
-        memory_limit: Optional memory limit in GB.
-        fallback_to_cpu: Whether to fallback to CPU if requested device unavailable.
-    Returns:
-        A validated DeviceInfo object.
-    Raises:
-        ValidationError: If the requested device is not available and fallback is disabled.
-    """
     available_devices = detect_available_devices()
     if requested == "auto":
@@ -115,14 +89,6 @@ def validate_device_request(
 def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | None]:
-    """Get memory information for a device.
-    Args:
-        device: The device to query.
-    Returns:
-        Tuple of (total_memory_gb, available_memory_gb). None values if unknown.
-    """
     if device.device_type == "cpu":
         return None, None
@@ -261,28 +227,11 @@ def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
 def is_backend_gpu_compatible(backend: str) -> bool:
-    """Check if an OCR backend supports GPU acceleration.
-    Args:
-        backend: Name of the OCR backend.
-    Returns:
-        True if the backend supports GPU acceleration.
-    """
     # EasyOCR and PaddleOCR support GPU, Tesseract does not  # ~keep
     return backend.lower() in ("easyocr", "paddleocr")
 def get_recommended_batch_size(device: DeviceInfo, input_size_mb: float = 10.0) -> int:
-    """Get recommended batch size for OCR processing.
-    Args:
-        device: The device to optimize for.
-        input_size_mb: Estimated input size per item in MB.
-    Returns:
-        Recommended batch size.
-    """
     if device.device_type == "cpu":
         # Conservative batch size for CPU  # ~keep
         return 1
@@ -304,11 +253,6 @@ def get_recommended_batch_size(device: DeviceInfo, input_size_mb: float = 10.0)
 def cleanup_device_memory(device: DeviceInfo) -> None:
-    """Clean up device memory.
-    Args:
-        device: The device to clean up.
-    """
     if device.device_type == "cuda":
         try:
             import torch  # noqa: PLC0415

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl