kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py CHANGED
@@ -1,9 +1,9 @@
  from __future__ import annotations

  import sys
- from collections.abc import Awaitable
+ from collections.abc import Awaitable, Callable
  from dataclasses import asdict, dataclass, field
- from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
+ from typing import TYPE_CHECKING, Any, Literal, TypedDict

  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
  from kreuzberg.exceptions import ValidationError
@@ -17,7 +17,9 @@ if TYPE_CHECKING:
      from pandas import DataFrame
      from PIL.Image import Image

+     from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
      from kreuzberg._gmft import GMFTConfig
+     from kreuzberg._language_detection import LanguageDetectionConfig
      from kreuzberg._ocr._easyocr import EasyOCRConfig
      from kreuzberg._ocr._paddleocr import PaddleOCRConfig
      from kreuzberg._ocr._tesseract import TesseractConfig
@@ -98,6 +100,110 @@ class Metadata(TypedDict, total=False):
      width: NotRequired[int]
      """Width of the document page/slide/image, if applicable."""

+     # Email-specific fields
+     email_from: NotRequired[str]
+     """Email sender (from field)."""
+     email_to: NotRequired[str]
+     """Email recipient (to field)."""
+     email_cc: NotRequired[str]
+     """Email carbon copy recipients."""
+     email_bcc: NotRequired[str]
+     """Email blind carbon copy recipients."""
+     date: NotRequired[str]
+     """Email date or document date."""
+     attachments: NotRequired[list[str]]
+     """List of attachment names."""
+
+     # Additional metadata fields for various extractors
+     content: NotRequired[str]
+     """Content metadata field."""
+     parse_error: NotRequired[str]
+     """Parse error information."""
+     warning: NotRequired[str]
+     """Warning messages."""
+
+     # Table extraction metadata
+     table_count: NotRequired[int]
+     """Number of tables extracted from the document."""
+     tables_summary: NotRequired[str]
+     """Summary of table extraction results."""
+     quality_score: NotRequired[float]
+     """Quality score for extracted content (0.0-1.0)."""
+
+
+ # Cache valid metadata keys at module level for performance
+ _VALID_METADATA_KEYS = {
+     "authors",
+     "categories",
+     "citations",
+     "comments",
+     "content",
+     "copyright",
+     "created_at",
+     "created_by",
+     "description",
+     "fonts",
+     "height",
+     "identifier",
+     "keywords",
+     "languages",
+     "license",
+     "modified_at",
+     "modified_by",
+     "organization",
+     "parse_error",
+     "publisher",
+     "references",
+     "status",
+     "subject",
+     "subtitle",
+     "summary",
+     "title",
+     "version",
+     "warning",
+     "width",
+     "email_from",
+     "email_to",
+     "email_cc",
+     "email_bcc",
+     "date",
+     "attachments",
+     "table_count",
+     "tables_summary",
+     "quality_score",
+ }
+
+
+ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
+     """Normalize any dict to proper Metadata TypedDict.
+
+     Filters out invalid keys and ensures type safety.
+     """
+     if not data:
+         return {}
+
+     # Filter and return only valid metadata
+     normalized: Metadata = {}
+     for key, value in data.items():
+         if key in _VALID_METADATA_KEYS and value is not None:
+             normalized[key] = value  # type: ignore[literal-required]
+
+     return normalized
+
+
+ @dataclass(frozen=True)
+ class Entity:
+     """Represents an extracted entity with type, text, and position."""
+
+     type: str
+     """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
+     text: str
+     """Extracted text"""
+     start: int
+     """Start character offset in the content"""
+     end: int
+     """End character offset in the content"""
+

  @dataclass
  class ExtractionResult:
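A minimal sketch of the new helpers, assuming the 3.8.1 module layout shown above; normalize_metadata drops unknown keys and None values:

    from kreuzberg._types import Entity, normalize_metadata

    raw = {"title": "Report", "x-vendor-junk": 1, "quality_score": 0.92, "subject": None}
    meta = normalize_metadata(raw)
    # {'title': 'Report', 'quality_score': 0.92}

    entity = Entity(type="EMAIL", text="jane@example.com", start=10, end=26)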
@@ -113,10 +219,59 @@ class ExtractionResult:
      """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
      chunks: list[str] = field(default_factory=list)
      """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+     entities: list[Entity] | None = None
+     """Extracted entities, if entity extraction is enabled."""
+     keywords: list[tuple[str, float]] | None = None
+     """Extracted keywords and their scores, if keyword extraction is enabled."""
+     detected_languages: list[str] | None = None
+     """Languages detected in the extracted content, if language detection is enabled."""
+
+     def to_dict(self) -> dict[str, Any]:
+         """Converts the ExtractionResult to a dictionary."""
+         return asdict(self)
+
+     def export_tables_to_csv(self) -> list[str]:
+         """Export all tables to CSV format.
+
+         Returns:
+             List of CSV strings, one per table
+         """
+         if not self.tables:
+             return []
+
+         from kreuzberg._utils._table import export_table_to_csv
+
+         return [export_table_to_csv(table) for table in self.tables]
+
+     def export_tables_to_tsv(self) -> list[str]:
+         """Export all tables to TSV format.
+
+         Returns:
+             List of TSV strings, one per table
+         """
+         if not self.tables:
+             return []
+
+         from kreuzberg._utils._table import export_table_to_tsv
+
+         return [export_table_to_tsv(table) for table in self.tables]
+
+     def get_table_summaries(self) -> list[dict[str, Any]]:
+         """Get structural information for all tables.
+
+         Returns:
+             List of table structure dictionaries
+         """
+         if not self.tables:
+             return []
+
+         from kreuzberg._utils._table import extract_table_structure_info
+
+         return [extract_table_structure_info(table) for table in self.tables]


- PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
- ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
+ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
+ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]


  @dataclass(unsafe_hash=True)
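Both hook aliases now use PEP 604 unions and accept sync or async callables. A minimal sketch of a synchronous post-processing hook plus the new table exporters, assuming the top-level ExtractionResult re-export:

    from kreuzberg import ExtractionResult

    def strip_whitespace(result: ExtractionResult) -> ExtractionResult:
        result.content = result.content.strip()
        return result

    # After an extraction call:
    # csv_per_table = result.export_tables_to_csv()
    # summaries = result.get_table_summaries()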
@@ -153,8 +308,30 @@ class ExtractionConfig:
      """Post processing hooks to call after processing is done and before the final result is returned."""
      validators: list[ValidationHook] | None = None
      """Validation hooks to call after processing is done and before post-processing and result return."""
+     extract_entities: bool = False
+     """Whether to extract named entities from the content."""
+     extract_keywords: bool = False
+     """Whether to extract keywords from the content."""
+     keyword_count: int = 10
+     """Number of keywords to extract if extract_keywords is True."""
+     custom_entity_patterns: frozenset[tuple[str, str]] | None = None
+     """Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
+     auto_detect_language: bool = False
+     """Whether to automatically detect language and configure OCR accordingly."""
+     language_detection_config: LanguageDetectionConfig | None = None
+     """Configuration for language detection. If None, uses default settings."""
+     spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
+     """Configuration for spaCy entity extraction. If None, uses default settings."""
+     enable_quality_processing: bool = True
+     """Whether to apply quality post-processing to improve extraction results."""

      def __post_init__(self) -> None:
+         if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
+             object.__setattr__(self, "custom_entity_patterns", frozenset(self.custom_entity_patterns.items()))
+         if self.post_processing_hooks is not None and isinstance(self.post_processing_hooks, list):
+             object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
+         if self.validators is not None and isinstance(self.validators, list):
+             object.__setattr__(self, "validators", tuple(self.validators))
          from kreuzberg._ocr._easyocr import EasyOCRConfig
          from kreuzberg._ocr._paddleocr import PaddleOCRConfig
          from kreuzberg._ocr._tesseract import TesseractConfig
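A sketch of the expanded 3.8.1 config surface, assuming the top-level ExtractionConfig re-export; note that __post_init__ above coerces a plain dict of patterns into the hashable frozenset form:

    from kreuzberg import ExtractionConfig

    config = ExtractionConfig(
        extract_entities=True,
        extract_keywords=True,
        keyword_count=5,
        custom_entity_patterns={"INVOICE_ID": r"INV-\d{6}"},  # coerced to frozenset of pairs
        auto_detect_language=True,
    )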
kreuzberg/_utils/_cache.py CHANGED
@@ -64,8 +64,23 @@ class KreuzbergCache(Generic[T]):
          Returns:
              Unique cache key string
          """
-         # Sort for consistent hashing  # ~keep
-         cache_str = str(sorted(kwargs.items()))
+         # Use more efficient string building for cache key
+         if not kwargs:
+             return "empty"
+
+         # Build key string efficiently
+         parts = []
+         for key in sorted(kwargs):
+             value = kwargs[key]
+             # Convert common types efficiently
+             if isinstance(value, (str, int, float, bool)):
+                 parts.append(f"{key}={value}")
+             elif isinstance(value, bytes):
+                 parts.append(f"{key}=bytes:{len(value)}")
+             else:
+                 parts.append(f"{key}={type(value).__name__}:{value!s}")
+
+         cache_str = "&".join(parts)
          return hashlib.sha256(cache_str.encode()).hexdigest()[:16]

      def _get_cache_path(self, cache_key: str) -> Path:
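The key scheme above is reproducible outside the class: sorted "key=value" parts joined with "&", SHA-256 hashed, truncated to 16 hex characters. A sketch with hypothetical kwargs:

    import hashlib

    parts = ["mime_type=application/pdf", "ocr_backend=tesseract"]  # already sorted
    cache_key = hashlib.sha256("&".join(parts).encode()).hexdigest()[:16]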
@@ -87,15 +102,48 @@ class KreuzbergCache(Generic[T]):

      def _serialize_result(self, result: T) -> dict[str, Any]:
          """Serialize result for caching with metadata."""
+         # Handle TableData objects that contain DataFrames
+         if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
+             serialized_data = []
+             for item in result:
+                 if isinstance(item, dict) and "df" in item:
+                     # Create a copy and serialize the DataFrame as CSV
+                     item_copy = item.copy()
+                     if hasattr(item["df"], "to_csv"):
+                         item_copy["df_csv"] = item["df"].to_csv(index=False)
+                     else:
+                         # Fallback for non-DataFrame objects
+                         item_copy["df_csv"] = str(item["df"])
+                     del item_copy["df"]
+                     serialized_data.append(item_copy)
+                 else:
+                     serialized_data.append(item)
+             return {"type": "TableDataList", "data": serialized_data, "cached_at": time.time()}
+
          return {"type": type(result).__name__, "data": result, "cached_at": time.time()}

      def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
          """Deserialize cached result."""
          data = cached_data["data"]

-         if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
-             from kreuzberg._types import ExtractionResult
+         if cached_data.get("type") == "TableDataList" and isinstance(data, list):
+             deserialized_data = []
+             for item in data:
+                 if isinstance(item, dict) and "df_csv" in item:
+                     # Restore the DataFrame from CSV
+                     item_copy = item.copy()
+                     from io import StringIO
+
+                     import pandas as pd

+                     item_copy["df"] = pd.read_csv(StringIO(item["df_csv"]))
+                     del item_copy["df_csv"]
+                     deserialized_data.append(item_copy)
+                 else:
+                     deserialized_data.append(item)
+             return deserialized_data  # type: ignore[return-value]
+
+         if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
              return ExtractionResult(**data)  # type: ignore[return-value]

          return data  # type: ignore[no-any-return]
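The DataFrame round-trip this relies on is plain pandas; a minimal standalone sketch:

    from io import StringIO

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    csv_text = df.to_csv(index=False)            # stored under "df_csv"
    restored = pd.read_csv(StringIO(csv_text))   # rebuilt under "df" on read
    assert restored.equals(df)

Dtypes that CSV cannot represent faithfully (datetimes, categoricals) come back as inferred plain types, a trade-off the CSV encoding accepts for a portable on-disk cache.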
kreuzberg/_utils/_device.py CHANGED
@@ -153,7 +153,7 @@ def _is_cuda_available() -> bool:
      try:
          import torch  # type: ignore[import-not-found,unused-ignore]

-         return torch.cuda.is_available()
+         return bool(torch.cuda.is_available())
      except ImportError:
          return False

@@ -163,7 +163,7 @@ def _is_mps_available() -> bool:
      try:
          import torch  # type: ignore[import-not-found,unused-ignore]

-         return torch.backends.mps.is_available()
+         return bool(torch.backends.mps.is_available())
      except ImportError:
          return False

kreuzberg/_utils/_errors.py CHANGED
@@ -5,12 +5,12 @@ from __future__ import annotations
  import platform
  import traceback
  from datetime import datetime, timezone
- from typing import TYPE_CHECKING, Any
+ from pathlib import Path
+ from typing import Any

  import psutil

- if TYPE_CHECKING:
-     from pathlib import Path
+ from kreuzberg.exceptions import ValidationError


  def create_error_context(
@@ -37,8 +37,6 @@ def create_error_context(
      }

      if file_path:
-         from pathlib import Path
-
          path = Path(file_path) if isinstance(file_path, str) else file_path
          context["file"] = {
              "path": str(path),
@@ -158,8 +156,6 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
      if attempt >= max_attempts:
          return False

-     from kreuzberg.exceptions import ValidationError
-
      if isinstance(error, ValidationError):
          return False

kreuzberg/_utils/_process_pool.py CHANGED
@@ -2,13 +2,20 @@

  from __future__ import annotations

+ import io
  import multiprocessing as mp
  from concurrent.futures import ProcessPoolExecutor
  from contextlib import contextmanager
- from typing import TYPE_CHECKING, Any, Callable, TypeVar
+ from typing import TYPE_CHECKING, Any, TypeVar
+
+ import anyio
+ import psutil
+ import pypdfium2
+ from typing_extensions import Self

  if TYPE_CHECKING:
-     from collections.abc import Generator
+     import types
+     from collections.abc import Callable, Generator

  T = TypeVar("T")

@@ -54,15 +61,13 @@ def shutdown_process_pool() -> None:

  def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
      """Worker function for extracting PDF text in a separate process."""
-     import pypdfium2
-
      pdf = None
      try:
          pdf = pypdfium2.PdfDocument(pdf_path)
          text_parts = []
          for page in pdf:
              text_page = page.get_textpage()
-             text = text_page.get_text_range()
+             text = text_page.get_text_bounded()
              text_parts.append(text)
              text_page.close()
              page.close()
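The one-line change in this hunk appears to track pypdfium2's text API: get_text_bounded() called with no bounds returns the whole page's text, and recent pypdfium2 releases steer whole-page callers toward it rather than get_text_range(). A minimal sketch of the same loop outside the worker (file name hypothetical):

    import pypdfium2

    pdf = pypdfium2.PdfDocument("sample.pdf")  # hypothetical input
    try:
        page = pdf[0]
        text_page = page.get_textpage()
        full_text = text_page.get_text_bounded()  # no bounds -> whole page
        text_page.close()
        page.close()
    finally:
        pdf.close()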
@@ -76,10 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:

  def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
      """Worker function for converting PDF to images in a separate process."""
-     import io
-
-     import pypdfium2
-
      pdf = None
      try:
          pdf = pypdfium2.PdfDocument(pdf_path)
@@ -98,3 +99,175 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
      finally:
          if pdf:
              pdf.close()
+
+
+ class ProcessPoolManager:
+     """Resource-aware process pool manager for CPU-intensive tasks."""
+
+     def __init__(
+         self,
+         max_processes: int | None = None,
+         memory_limit_gb: float | None = None,
+     ) -> None:
+         """Initialize the process pool manager.
+
+         Args:
+             max_processes: Maximum number of processes. Defaults to CPU count.
+             memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
+         """
+         self.max_processes = max_processes or mp.cpu_count()
+
+         if memory_limit_gb is None:
+             available_memory = psutil.virtual_memory().available
+             self.memory_limit_bytes = int(available_memory * 0.75)  # Use 75% of available  # ~keep
+         else:
+             self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
+
+         self._executor: ProcessPoolExecutor | None = None
+         self._active_tasks = 0
+
+     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
+         """Calculate optimal number of workers based on memory constraints.
+
+         Args:
+             task_memory_mb: Estimated memory usage per task in MB.
+
+         Returns:
+             Optimal number of workers.
+         """
+         task_memory_bytes = task_memory_mb * 1024**2
+         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
+
+         return min(self.max_processes, memory_based_limit)
+
+     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
+         """Ensure process pool executor is initialized."""
+         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
+             if self._executor is not None:
+                 self._executor.shutdown(wait=False)
+
+             workers = max_workers or self.max_processes
+             self._executor = ProcessPoolExecutor(max_workers=workers)
+
+         return self._executor
+
+     async def submit_task(
+         self,
+         func: Callable[..., T],
+         *args: Any,
+         task_memory_mb: float = 100,
+     ) -> T:
+         """Submit a task to the process pool.
+
+         Args:
+             func: Function to execute.
+             *args: Positional arguments for the function.
+             task_memory_mb: Estimated memory usage in MB.
+
+         Returns:
+             Result of the function execution.
+         """
+         workers = self.get_optimal_workers(task_memory_mb)
+         self._ensure_executor(workers)
+
+         self._active_tasks += 1
+
+         try:
+             return await anyio.to_thread.run_sync(func, *args)
+         finally:
+             self._active_tasks -= 1
+
+     async def submit_batch(
+         self,
+         func: Callable[..., T],
+         arg_batches: list[tuple[Any, ...]],
+         task_memory_mb: float = 100,
+         max_concurrent: int | None = None,
+     ) -> list[T]:
+         """Submit a batch of tasks to the process pool.
+
+         Args:
+             func: Function to execute.
+             arg_batches: List of argument tuples for each task.
+             task_memory_mb: Estimated memory usage per task in MB.
+             max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
+
+         Returns:
+             List of results in the same order as input.
+         """
+         if not arg_batches:
+             return []
+
+         workers = self.get_optimal_workers(task_memory_mb)
+         max_concurrent = max_concurrent or workers
+
+         self._ensure_executor(workers)
+
+         semaphore = anyio.CapacityLimiter(max_concurrent)
+
+         async def submit_single(args: tuple[Any, ...]) -> T:
+             async with semaphore:
+                 self._active_tasks += 1
+                 try:
+                     return await anyio.to_thread.run_sync(func, *args)
+                 finally:
+                     self._active_tasks -= 1
+
+         async with anyio.create_task_group() as tg:
+             results: list[T] = [None] * len(arg_batches)  # type: ignore[list-item]
+
+             async def run_task(idx: int, args: tuple[Any, ...]) -> None:
+                 results[idx] = await submit_single(args)
+
+             for idx, args in enumerate(arg_batches):
+                 tg.start_soon(run_task, idx, args)
+
+         return results
+
+     def get_system_info(self) -> dict[str, Any]:
+         """Get current system resource information."""
+         memory = psutil.virtual_memory()
+         cpu_percent = psutil.cpu_percent(interval=1)
+
+         return {
+             "cpu_count": mp.cpu_count(),
+             "cpu_percent": cpu_percent,
+             "memory_total": memory.total,
+             "memory_available": memory.available,
+             "memory_percent": memory.percent,
+             "active_tasks": self._active_tasks,
+             "max_processes": self.max_processes,
+             "memory_limit": self.memory_limit_bytes,
+         }
+
+     def shutdown(self, wait: bool = True) -> None:
+         """Shutdown the process pool."""
+         if self._executor is not None:
+             self._executor.shutdown(wait=wait)
+             self._executor = None
+
+     def __enter__(self) -> Self:
+         """Context manager entry."""
+         return self
+
+     def __exit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: types.TracebackType | None,
+     ) -> None:
+         """Context manager exit."""
+         self.shutdown()
+
+     async def __aenter__(self) -> Self:
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: types.TracebackType | None,
+     ) -> None:
+         """Async context manager exit."""
+         self.shutdown()
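A usage sketch for the new manager, assuming the module path above; per its signature, _extract_pdf_text_worker returns a (path, text) tuple, so batch results unpack directly (file names hypothetical):

    import anyio

    from kreuzberg._utils._process_pool import ProcessPoolManager, _extract_pdf_text_worker

    async def main() -> None:
        async with ProcessPoolManager(memory_limit_gb=2.0) as pool:
            batches = [("a.pdf",), ("b.pdf",)]  # one args tuple per task
            results = await pool.submit_batch(_extract_pdf_text_worker, batches, task_memory_mb=200)
            for path, text in results:
                print(path, len(text))

    anyio.run(main)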