kreuzberg-3.11.4-py3-none-any.whl → kreuzberg-3.13.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_cache.py
CHANGED
@@ -1,5 +1,3 @@
-"""General-purpose file-based caching layer for Kreuzberg."""
-
 from __future__ import annotations
 
 import hashlib
@@ -14,6 +12,7 @@ from typing import Any, Generic, TypeVar
 from anyio import Path as AsyncPath
 
 from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._ref import Ref
 from kreuzberg._utils._serialization import deserialize, serialize
 from kreuzberg._utils._sync import run_sync
 
@@ -57,22 +56,12 @@ class KreuzbergCache(Generic[T]):
         self._lock = threading.Lock()
 
     def _get_cache_key(self, **kwargs: Any) -> str:
-        """Generate cache key from kwargs.
-
-        Args:
-            **kwargs: Key-value pairs to generate cache key from
-
-        Returns:
-            Unique cache key string
-        """
         if not kwargs:
             return "empty"
 
-        # Build cache key using list + join (faster than StringIO)
         parts = []
         for key in sorted(kwargs):
             value = kwargs[key]
-            # Convert common types efficiently
             if isinstance(value, (str, int, float, bool)):
                 parts.append(f"{key}={value}")
             elif isinstance(value, bytes):
@@ -81,15 +70,12 @@ class KreuzbergCache(Generic[T]):
                 parts.append(f"{key}={type(value).__name__}:{value!s}")
 
         cache_str = "&".join(parts)
-        # SHA256 is secure and fast enough for cache keys
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _get_cache_path(self, cache_key: str) -> Path:
-        """Get cache file path for key."""
         return self.cache_dir / f"{cache_key}.msgpack"
 
     def _is_cache_valid(self, cache_path: Path) -> bool:
-        """Check if cached result is still valid."""
         try:
             if not cache_path.exists():
                 return False
@@ -102,18 +88,14 @@ class KreuzbergCache(Generic[T]):
             return False
 
     def _serialize_result(self, result: T) -> dict[str, Any]:
-        """Serialize result for caching with metadata."""
-        # Handle TableData objects that contain DataFrames
         if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
             serialized_data = []
             for item in result:
                 if isinstance(item, dict) and "df" in item:
-                    # Build new dict without unnecessary copy
                     serialized_item = {k: v for k, v in item.items() if k != "df"}
                     if hasattr(item["df"], "to_csv"):
                         serialized_item["df_csv"] = item["df"].to_csv(index=False)
                     else:
-                        # Fallback for non-DataFrame objects
                         serialized_item["df_csv"] = str(item["df"])
                     serialized_data.append(serialized_item)
                 else:
@@ -123,7 +105,6 @@ class KreuzbergCache(Generic[T]):
         return {"type": type(result).__name__, "data": result, "cached_at": time.time()}
 
     def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
-        """Deserialize cached result."""
         data = cached_data["data"]
 
         if cached_data.get("type") == "TableDataList" and isinstance(data, list):
@@ -132,7 +113,6 @@ class KreuzbergCache(Generic[T]):
             deserialized_data = []
             for item in data:
                 if isinstance(item, dict) and "df_csv" in item:
-                    # Build new dict without unnecessary copy
                     deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
                     deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
                     deserialized_data.append(deserialized_item)
@@ -146,7 +126,6 @@ class KreuzbergCache(Generic[T]):
         return data # type: ignore[no-any-return]
 
     def _cleanup_cache(self) -> None:
-        """Clean up old and oversized cache entries."""
         try:
             cache_files = list(self.cache_dir.glob("*.msgpack"))
 
@@ -331,87 +310,106 @@
         }
 
 
-
-
-
-
+def _create_ocr_cache() -> KreuzbergCache[ExtractionResult]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "ocr"
+
+    return KreuzbergCache[ExtractionResult](
+        cache_type="ocr",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_OCR_CACHE_SIZE_MB", "500")),
+        max_age_days=int(os.environ.get("KREUZBERG_OCR_CACHE_AGE_DAYS", "30")),
+    )
+
+
+_ocr_cache_ref = Ref("ocr_cache", _create_ocr_cache)
 
 
 def get_ocr_cache() -> KreuzbergCache[ExtractionResult]:
-    """Get the
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
+    """Get the OCR cache instance."""
+    return _ocr_cache_ref.get()
+
+
+def _create_document_cache() -> KreuzbergCache[ExtractionResult]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "documents"
+
+    return KreuzbergCache[ExtractionResult](
+        cache_type="documents",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_DOCUMENT_CACHE_SIZE_MB", "1000")),
+        max_age_days=int(os.environ.get("KREUZBERG_DOCUMENT_CACHE_AGE_DAYS", "7")),
+    )
+
+
+_document_cache_ref = Ref("document_cache", _create_document_cache)
 
 
 def get_document_cache() -> KreuzbergCache[ExtractionResult]:
-    """Get the
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
+    """Get the document cache instance."""
+    return _document_cache_ref.get()
+
+
+def _create_table_cache() -> KreuzbergCache[Any]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "tables"
+
+    return KreuzbergCache[Any](
+        cache_type="tables",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_TABLE_CACHE_SIZE_MB", "200")),
+        max_age_days=int(os.environ.get("KREUZBERG_TABLE_CACHE_AGE_DAYS", "30")),
+    )
+
+
+_table_cache_ref = Ref("table_cache", _create_table_cache)
 
 
 def get_table_cache() -> KreuzbergCache[Any]:
-    """Get the
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
+    """Get the table cache instance."""
+    return _table_cache_ref.get()
+
+
+def _create_mime_cache() -> KreuzbergCache[str]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "mime"
+
+    return KreuzbergCache[str](
+        cache_type="mime",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
+        max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
+    )
+
+
+_mime_cache_ref = Ref("mime_cache", _create_mime_cache)
 
 
 def get_mime_cache() -> KreuzbergCache[str]:
-    """Get the
-
-    if _mime_cache is None:
-        cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
-        cache_dir: Path | None = None
-        if cache_dir_str:
-            cache_dir = Path(cache_dir_str) / "mime"
-
-        _mime_cache = KreuzbergCache[str](
-            cache_type="mime",
-            cache_dir=cache_dir,
-            max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
-            max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
-        )
-    return _mime_cache
+    """Get the MIME type cache instance."""
+    return _mime_cache_ref.get()
 
 
 def clear_all_caches() -> None:
     """Clear all caches."""
-
-
-
-
+    if _ocr_cache_ref.is_initialized():
+        get_ocr_cache().clear()
+    if _document_cache_ref.is_initialized():
+        get_document_cache().clear()
+    if _table_cache_ref.is_initialized():
+        get_table_cache().clear()
+    if _mime_cache_ref.is_initialized():
+        get_mime_cache().clear()
+
+    _ocr_cache_ref.clear()
+    _document_cache_ref.clear()
+    _table_cache_ref.clear()
+    _mime_cache_ref.clear()
kreuzberg/_utils/_device.py
CHANGED
@@ -1,4 +1,3 @@
-"""Device detection and management utilities for GPU acceleration."""
 # ruff: noqa: BLE001 # ~keep
 
 from __future__ import annotations
@@ -35,7 +34,6 @@ def detect_available_devices() -> list[DeviceInfo]:
     Returns:
         List of available devices, with the most preferred device first.
     """
-    # Build device lists efficiently using generators
     cpu_device = DeviceInfo(device_type="cpu", name="CPU")
 
     cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
@@ -43,7 +41,6 @@ def detect_available_devices() -> list[DeviceInfo]:
     mps_device = _get_mps_device() if _is_mps_available() else None
     mps_devices = [mps_device] if mps_device else []
 
-    # Return GPU devices first, then CPU using itertools.chain
     gpu_devices = list(chain(cuda_devices, mps_devices))
     return [*gpu_devices, cpu_device]
 
@@ -139,7 +136,6 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | None]:
 
 
 def _is_cuda_available() -> bool:
-    """Check if CUDA is available."""
     try:
         import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
 
@@ -149,7 +145,6 @@ def _is_cuda_available() -> bool:
 
 
 def _is_mps_available() -> bool:
-    """Check if MPS (Apple Silicon) is available."""
     try:
         import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
 
@@ -159,7 +154,6 @@ def _is_mps_available() -> bool:
 
 
 def _get_cuda_devices() -> list[DeviceInfo]:
-    """Get information about available CUDA devices."""
     devices: list[DeviceInfo] = []
 
     try:
@@ -197,7 +191,6 @@ def _get_cuda_devices() -> list[DeviceInfo]:
 
 
 def _get_mps_device() -> DeviceInfo | None:
-    """Get information about the MPS device."""
     try:
         import torch # noqa: PLC0415
 
@@ -214,7 +207,6 @@ def _get_mps_device() -> DeviceInfo | None:
 
 
 def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
-    """Get CUDA memory information for a specific device."""
    try:
        import torch # noqa: PLC0415
 
@@ -237,20 +229,10 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
 
 
 def _get_mps_memory_info() -> tuple[float | None, float | None]:
-    """Get MPS memory information."""
     return None, None
 
 
 def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
-    """Validate that a device has enough memory for the requested limit.
-
-    Args:
-        device: The device to validate.
-        memory_limit: Required memory in GB.
-
-    Raises:
-        ValidationError: If the device doesn't have enough memory.
-    """
     if device.device_type == "cpu":
         # CPU memory validation is complex and OS-dependent, skip for now # ~keep
         return
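
Only docstrings and inline comments are removed from this module; the detection logic itself is unchanged. For orientation, a usage sketch against the signatures visible above (`_device` is a private module, so importing it like this is purely illustrative):

from kreuzberg._utils._device import detect_available_devices, get_device_memory_info

devices = detect_available_devices()  # GPU devices first, CPU always last
best = devices[0]
print(best.device_type, best.name)    # e.g. "cuda" / a GPU name, or "cpu" / "CPU"

mem = get_device_memory_info(best)    # two optional figures in GB; (None, None) when unknown, e.g. on MPS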
kreuzberg/_utils/_errors.py
CHANGED
@@ -1,5 +1,3 @@
-"""Enhanced error handling utilities."""
-
 from __future__ import annotations
 
 import platform
@@ -12,7 +10,6 @@ import psutil
 
 from kreuzberg.exceptions import ValidationError
 
-# Define error keywords as frozensets for O(1) membership testing
 _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
 _TRANSIENT_ERROR_PATTERNS = frozenset(
     {
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -1,5 +1,3 @@
-"""Process pool utilities for CPU-intensive operations."""
-
 from __future__ import annotations
 
 import io
@@ -13,6 +11,8 @@ import psutil
 import pypdfium2
 from typing_extensions import Self
 
+from kreuzberg._utils._ref import Ref
+
 if TYPE_CHECKING:
     import types
     from collections.abc import Callable, Generator
@@ -20,27 +20,29 @@ if TYPE_CHECKING:
 T = TypeVar("T")
 
 
-_PROCESS_POOL: ProcessPoolExecutor | None = None
 _POOL_SIZE = max(1, mp.cpu_count() - 1)
 
 
-def
-
-
-
-
-
+def _create_process_pool() -> ProcessPoolExecutor:
+    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
+
+
+_process_pool_ref = Ref("process_pool", _create_process_pool)
+
+
+def _get_process_pool() -> ProcessPoolExecutor:
+    return _process_pool_ref.get()
 
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-    """Get the
-    pool =
+    """Get the process pool."""
+    pool = _get_process_pool()
     try:
         yield pool
     except Exception: # noqa: BLE001
         shutdown_process_pool()
-        pool =
+        pool = _get_process_pool()
         yield pool
 
 
@@ -52,15 +54,14 @@ def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
 
 
 def shutdown_process_pool() -> None:
-    """Shutdown the
-
-
-
-
+    """Shutdown the process pool."""
+    if _process_pool_ref.is_initialized():
+        pool = _process_pool_ref.get()
+        pool.shutdown(wait=True)
+        _process_pool_ref.clear()
 
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
-    """Worker function for extracting PDF text in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -80,7 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
-    """Worker function for converting PDF to images in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
kreuzberg/_utils/_quality.py
CHANGED
@@ -1,14 +1,10 @@
-"""Quality post-processing utilities for extracted text."""
-
 from __future__ import annotations
 
 import re
 from functools import reduce
 from typing import Any
 
-# Pre-compiled patterns for performance
 _OCR_ARTIFACTS = {
-    # Common OCR misreads
     "scattered_chars": re.compile(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b"),
     "repeated_punctuation": re.compile(r"[.]{3,}|[-]{3,}|[_]{3,}"),
     "isolated_punctuation": re.compile(r"\s[.,;:!?]\s"),
@@ -17,7 +13,6 @@ _OCR_ARTIFACTS = {
     "broken_sentences": re.compile(r"[a-z]\s{3,}[A-Z][a-z]"),
 }
 
-# Combined pattern for faster OCR penalty calculation
 _COMBINED_OCR_PATTERN = re.compile(
     r"(?P<scattered>\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b)|"
     r"(?P<repeated>[.]{3,}|[-]{3,}|[_]{3,})|"
@@ -27,14 +22,12 @@ _COMBINED_OCR_PATTERN = re.compile(
     r"(?P<broken>[a-z]\s{3,}[A-Z][a-z])"
 )
 
-# Pre-compiled patterns for text normalization
 _WHITESPACE_NORMALIZE = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
 _NEWLINE_NORMALIZE = re.compile(r"\n\s*\n\s*\n+")
 _SENTENCE_DETECT = re.compile(r"[.!?]\s+[A-Z]")
 _PUNCTUATION_DETECT = re.compile(r"[.!?]")
 
 _SCRIPT_PATTERNS = {
-    # JavaScript and CSS content
     "js_functions": re.compile(r"function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}", re.IGNORECASE),
     "css_rules": re.compile(r"\.[a-zA-Z][\w-]*\s*\{[^}]*\}", re.IGNORECASE),
     "script_tags": re.compile(r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE),
@@ -63,27 +56,21 @@ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
     if not text or not text.strip():
         return 0.0
 
-    # Initialize score
     score = 1.0
     total_chars = len(text)
 
-    # Penalize OCR artifacts
     ocr_penalty = _calculate_ocr_penalty(text, total_chars)
     score -= ocr_penalty * 0.3
 
-    # Penalize script/style content
     script_penalty = _calculate_script_penalty(text, total_chars)
     score -= script_penalty * 0.2
 
-    # Penalize navigation content
     nav_penalty = _calculate_navigation_penalty(text, total_chars)
     score -= nav_penalty * 0.1
 
-    # Bonus for structure (sentences, paragraphs)
     structure_bonus = _calculate_structure_bonus(text)
     score += structure_bonus * 0.2
 
-    # Bonus for metadata richness
     if metadata:
         metadata_bonus = _calculate_metadata_bonus(metadata)
         score += metadata_bonus * 0.1
@@ -103,16 +90,12 @@ def clean_extracted_text(text: str) -> str:
     if not text:
         return text
 
-    # Remove script and style content using functools.reduce for single pass
     text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
 
-    # Clean OCR artifacts
     text = _clean_ocr_artifacts(text)
 
-    # Clean navigation elements
     text = _clean_navigation_elements(text)
 
-    # Normalize whitespace using pre-compiled patterns
     text = _WHITESPACE_NORMALIZE.sub(" ", text)
     text = _NEWLINE_NORMALIZE.sub("\n\n", text)
 
@@ -120,72 +103,57 @@ def clean_extracted_text(text: str) -> str:
 
 
 def _calculate_ocr_penalty(text: str, total_chars: int) -> float:
-    """Calculate penalty for OCR artifacts."""
     if total_chars == 0:
         return 0.0
 
-    # Use combined pattern for single-pass processing
     artifact_chars = sum(len(match.group()) for match in _COMBINED_OCR_PATTERN.finditer(text))
     return min(1.0, artifact_chars / total_chars)
 
 
 def _calculate_script_penalty(text: str, total_chars: int) -> float:
-    """Calculate penalty for script/style content."""
     if total_chars == 0:
         return 0.0
 
-    # Use sum with generator expression for single-pass calculation
     script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, script_chars / total_chars)
 
 
 def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
-    """Calculate penalty for navigation content."""
     if total_chars == 0:
         return 0.0
 
-    # Use sum with generator expression for single-pass calculation
     nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, nav_chars / total_chars)
 
 
 def _calculate_structure_bonus(text: str) -> float:
-    """Calculate bonus for proper text structure."""
     if not text:
         return 0.0
 
-    # Count sentences (rough heuristic)
     sentence_count = len(_SENTENCE_DETECT.findall(text))
 
-    # Count paragraphs
     paragraph_count = len(text.split("\n\n"))
 
-    # Calculate structure score
     words = len(text.split())
     if words == 0:
         return 0.0
 
-    # Good structure: reasonable sentence and paragraph distribution
     avg_words_per_sentence = words / max(1, sentence_count)
     avg_words_per_paragraph = words / max(1, paragraph_count)
 
     structure_score = 0.0
 
-    # Bonus for reasonable sentence length (10-30 words)
     if 10 <= avg_words_per_sentence <= 30:
         structure_score += 0.3
 
-    # Bonus for reasonable paragraph length (50-300 words)
     if 50 <= avg_words_per_paragraph <= 300:
         structure_score += 0.3
 
-    # Bonus for having multiple paragraphs
     if paragraph_count > 1:
         structure_score += 0.2
 
-    # Bonus for having punctuation
     if _PUNCTUATION_DETECT.search(text):
         structure_score += 0.2
 
@@ -193,7 +161,6 @@ def _calculate_structure_bonus(text: str) -> float:
 
 
 def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
-    """Calculate bonus for rich metadata."""
     if not metadata:
         return 0.0
 
@@ -204,30 +171,20 @@ def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
 
 
 def _clean_ocr_artifacts(text: str) -> str:
-    """Remove common OCR artifacts from text."""
-    # Fix scattered characters (likely OCR errors)
     text = _OCR_ARTIFACTS["scattered_chars"].sub(lambda m: m.group().replace(" ", ""), text)
 
-    # Clean repeated punctuation
     text = _OCR_ARTIFACTS["repeated_punctuation"].sub("...", text)
 
-    # Fix isolated punctuation
     text = _OCR_ARTIFACTS["isolated_punctuation"].sub(" ", text)
 
-    # Remove malformed words with numbers mixed in
     text = _OCR_ARTIFACTS["malformed_words"].sub(" ", text)
 
-    # Normalize excessive whitespace
     return _OCR_ARTIFACTS["excessive_whitespace"].sub(" ", text)
 
 
 def _clean_navigation_elements(text: str) -> str:
-    """Remove navigation elements from text."""
-    # Remove navigation words
     text = _NAVIGATION_PATTERNS["nav_words"].sub(" ", text)
 
-    # Remove breadcrumbs
     text = _NAVIGATION_PATTERNS["breadcrumbs"].sub(" ", text)
 
-    # Remove pagination
     return _NAVIGATION_PATTERNS["pagination"].sub(" ", text)
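
Only comments and docstrings are removed in this module; the scoring weights are unchanged. Pulled together from the hunks above: the score starts at 1.0, subtracts OCR (x0.3), script (x0.2), and navigation (x0.1) penalties, each clamped to [0, 1] via min(), and adds structure (x0.2) and metadata (x0.1) bonuses. A minimal usage sketch of the two public-facing helpers shown above:

from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text

raw = "Home > Products > Docs ..... T  h  e quick brown fox jumped."
cleaned = clean_extracted_text(raw)       # strips script/nav content, OCR artifacts, extra whitespace
score = calculate_quality_score(cleaned)  # float: 1.0 minus penalties, plus structure/metadata bonuses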