kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_quality.py
ADDED
@@ -0,0 +1,237 @@
+"""Quality post-processing utilities for extracted text."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# Pre-compiled patterns for performance
+_OCR_ARTIFACTS = {
+    # Common OCR misreads
+    "scattered_chars": re.compile(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b"),
+    "repeated_punctuation": re.compile(r"[.]{3,}|[-]{3,}|[_]{3,}"),
+    "isolated_punctuation": re.compile(r"\s[.,;:!?]\s"),
+    "malformed_words": re.compile(r"\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b"),
+    "excessive_whitespace": re.compile(r"\s{3,}"),
+    "broken_sentences": re.compile(r"[a-z]\s{3,}[A-Z][a-z]"),
+}
+
+# Combined pattern for faster OCR penalty calculation
+_COMBINED_OCR_PATTERN = re.compile(
+    r"(?P<scattered>\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b)|"
+    r"(?P<repeated>[.]{3,}|[-]{3,}|[_]{3,})|"
+    r"(?P<isolated>\s[.,;:!?]\s)|"
+    r"(?P<malformed>\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b)|"
+    r"(?P<whitespace>\s{3,})|"
+    r"(?P<broken>[a-z]\s{3,}[A-Z][a-z])"
+)
+
+# Pre-compiled patterns for text normalization
+_WHITESPACE_NORMALIZE = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
+_NEWLINE_NORMALIZE = re.compile(r"\n\s*\n\s*\n+")
+_SENTENCE_DETECT = re.compile(r"[.!?]\s+[A-Z]")
+_PUNCTUATION_DETECT = re.compile(r"[.!?]")
+
+_SCRIPT_PATTERNS = {
+    # JavaScript and CSS content
+    "js_functions": re.compile(r"function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}", re.IGNORECASE),
+    "css_rules": re.compile(r"\.[a-zA-Z][\w-]*\s*\{[^}]*\}", re.IGNORECASE),
+    "script_tags": re.compile(r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE),
+    "style_tags": re.compile(r"<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE),
+}
+
+_NAVIGATION_PATTERNS = {
+    "nav_words": re.compile(r"\b(?:Skip to main content|Back to top|Main navigation|Site navigation)\b", re.IGNORECASE),
+    "breadcrumbs": re.compile(r"(?:Home\s*[>»]\s*|[>»]\s*){2,}"),
+    "pagination": re.compile(
+        r"\b(?:Page \d+ of \d+|First page|Last page|Previous page|Next page|^\d+ of \d+$)\b", re.IGNORECASE
+    ),
+}
+
+
+def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
+    """Calculate overall quality score for extracted text.
+
+    Args:
+        text: The extracted text content
+        metadata: Optional metadata for additional scoring
+
+    Returns:
+        Quality score between 0.0 and 1.0
+    """
+    if not text or not text.strip():
+        return 0.0
+
+    # Initialize score
+    score = 1.0
+    total_chars = len(text)
+
+    # Penalize OCR artifacts
+    ocr_penalty = _calculate_ocr_penalty(text, total_chars)
+    score -= ocr_penalty * 0.3
+
+    # Penalize script/style content
+    script_penalty = _calculate_script_penalty(text, total_chars)
+    score -= script_penalty * 0.2
+
+    # Penalize navigation content
+    nav_penalty = _calculate_navigation_penalty(text, total_chars)
+    score -= nav_penalty * 0.1
+
+    # Bonus for structure (sentences, paragraphs)
+    structure_bonus = _calculate_structure_bonus(text)
+    score += structure_bonus * 0.2
+
+    # Bonus for metadata richness
+    if metadata:
+        metadata_bonus = _calculate_metadata_bonus(metadata)
+        score += metadata_bonus * 0.1
+
+    return max(0.0, min(1.0, score))
+
+
+def clean_extracted_text(text: str) -> str:
+    """Clean extracted text by removing artifacts and improving quality.
+
+    Args:
+        text: The raw extracted text
+
+    Returns:
+        Cleaned text with artifacts removed
+    """
+    if not text:
+        return text
+
+    # Remove script and style content
+    for pattern in _SCRIPT_PATTERNS.values():
+        text = pattern.sub(" ", text)
+
+    # Clean OCR artifacts
+    text = _clean_ocr_artifacts(text)
+
+    # Clean navigation elements
+    text = _clean_navigation_elements(text)
+
+    # Normalize whitespace using pre-compiled patterns
+    text = _WHITESPACE_NORMALIZE.sub(" ", text)
+    text = _NEWLINE_NORMALIZE.sub("\n\n", text)
+
+    return text.strip()
+
+
+def _calculate_ocr_penalty(text: str, total_chars: int) -> float:
+    """Calculate penalty for OCR artifacts."""
+    if total_chars == 0:
+        return 0.0
+
+    # Use combined pattern for single-pass processing
+    artifact_chars = sum(len(match.group()) for match in _COMBINED_OCR_PATTERN.finditer(text))
+    return min(1.0, artifact_chars / total_chars)
+
+
+def _calculate_script_penalty(text: str, total_chars: int) -> float:
+    """Calculate penalty for script/style content."""
+    if total_chars == 0:
+        return 0.0
+
+    script_chars = 0
+    for pattern in _SCRIPT_PATTERNS.values():
+        matches = pattern.findall(text)
+        script_chars += sum(len(match) for match in matches)
+
+    return min(1.0, script_chars / total_chars)
+
+
+def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
+    """Calculate penalty for navigation content."""
+    if total_chars == 0:
+        return 0.0
+
+    nav_chars = 0
+    for pattern in _NAVIGATION_PATTERNS.values():
+        matches = pattern.findall(text)
+        nav_chars += sum(len(match) for match in matches)
+
+    return min(1.0, nav_chars / total_chars)
+
+
+def _calculate_structure_bonus(text: str) -> float:
+    """Calculate bonus for proper text structure."""
+    if not text:
+        return 0.0
+
+    # Count sentences (rough heuristic)
+    sentence_count = len(_SENTENCE_DETECT.findall(text))
+
+    # Count paragraphs
+    paragraph_count = len(text.split("\n\n"))
+
+    # Calculate structure score
+    words = len(text.split())
+    if words == 0:
+        return 0.0
+
+    # Good structure: reasonable sentence and paragraph distribution
+    avg_words_per_sentence = words / max(1, sentence_count)
+    avg_words_per_paragraph = words / max(1, paragraph_count)
+
+    structure_score = 0.0
+
+    # Bonus for reasonable sentence length (10-30 words)
+    if 10 <= avg_words_per_sentence <= 30:
+        structure_score += 0.3
+
+    # Bonus for reasonable paragraph length (50-300 words)
+    if 50 <= avg_words_per_paragraph <= 300:
+        structure_score += 0.3
+
+    # Bonus for having multiple paragraphs
+    if paragraph_count > 1:
+        structure_score += 0.2
+
+    # Bonus for having punctuation
+    if _PUNCTUATION_DETECT.search(text):
+        structure_score += 0.2
+
+    return min(1.0, structure_score)
+
+
+def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
+    """Calculate bonus for rich metadata."""
+    if not metadata:
+        return 0.0
+
+    important_fields = {"title", "author", "subject", "description", "keywords"}
+    present_fields = sum(1 for field in important_fields if metadata.get(field))
+
+    return present_fields / len(important_fields)
+
+
+def _clean_ocr_artifacts(text: str) -> str:
+    """Remove common OCR artifacts from text."""
+    # Fix scattered characters (likely OCR errors)
+    text = _OCR_ARTIFACTS["scattered_chars"].sub(lambda m: m.group().replace(" ", ""), text)
+
+    # Clean repeated punctuation
+    text = _OCR_ARTIFACTS["repeated_punctuation"].sub("...", text)
+
+    # Fix isolated punctuation
+    text = _OCR_ARTIFACTS["isolated_punctuation"].sub(" ", text)
+
+    # Remove malformed words with numbers mixed in
+    text = _OCR_ARTIFACTS["malformed_words"].sub(" ", text)
+
+    # Normalize excessive whitespace
+    return _OCR_ARTIFACTS["excessive_whitespace"].sub(" ", text)
+
+
+def _clean_navigation_elements(text: str) -> str:
+    """Remove navigation elements from text."""
+    # Remove navigation words
+    text = _NAVIGATION_PATTERNS["nav_words"].sub(" ", text)
+
+    # Remove breadcrumbs
+    text = _NAVIGATION_PATTERNS["breadcrumbs"].sub(" ", text)
+
+    # Remove pagination
+    return _NAVIGATION_PATTERNS["pagination"].sub(" ", text)
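Note: a minimal usage sketch of the new quality helpers above. The import path follows the file list at the top of this diff; the sample string and metadata are invented for illustration.

    from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text

    raw = "Page 1 of 10\nT  h  e quick brown fox..... jumped over the lazy dog."
    cleaned = clean_extracted_text(raw)
    # The pagination marker "Page 1 of 10" is dropped, the scattered "T  h  e"
    # is rejoined to "The", and "....." collapses to "...".
    score = calculate_quality_score(cleaned, metadata={"title": "Fox", "author": "Aesop"})
    # Two of the five "important" metadata fields are present, contributing
    # 2/5 * 0.1 = 0.04 on top of the structure bonus; the result is clamped to [0.0, 1.0].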
kreuzberg/_utils/_serialization.py
CHANGED
@@ -29,8 +29,10 @@ def encode_hook(obj: Any) -> Any:
         "to_list",
         "tolist",
     ):
-        if hasattr(obj, key) and callable(getattr(obj, key)):
-            return getattr(obj, key)()
+        if hasattr(obj, key):
+            method = getattr(obj, key)  # Cache the attribute lookup
+            if callable(method):
+                return method()
 
     if is_dataclass(obj) and not isinstance(obj, type):
         return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
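Note: the encode_hook change above caches the attribute lookup so getattr runs once instead of twice per key. A sketch with a hypothetical Point class, assuming no earlier branch of encode_hook intercepts the object first:

    class Point:
        def __init__(self, x: int, y: int) -> None:
            self.x, self.y = x, y

        def to_dict(self) -> dict[str, int]:
            return {"x": self.x, "y": self.y}

    encode_hook(Point(1, 2))  # -> {"x": 1, "y": 2}, via the cached to_dict method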
kreuzberg/_utils/_string.py
CHANGED
@@ -1,39 +1,182 @@
 from __future__ import annotations
 
+import hashlib
+import re
 from contextlib import suppress
+from functools import lru_cache
 
-
+import chardetng_py
+
+# Compile regex patterns once at module level for performance
+_WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
+_NEWLINES_PATTERN = re.compile(r"\n+")
+_MOJIBAKE_PATTERNS = {
+    # Hebrew as Cyrillic patterns
+    "hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
+    # Control characters that shouldn't appear in text
+    "control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
+    # Unicode replacement characters
+    "replacement_chars": re.compile(r"\uFFFD+"),
+    # Isolated combining marks (likely encoding issues)
+    "isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
+}
+
+# Simple cache for encoding detection (in-memory, session-scoped)
+_encoding_cache: dict[str, str] = {}
+
+
+@lru_cache(maxsize=128)
+def _get_encoding_cache_key(data_hash: str, size: int) -> str:
+    """Generate cache key for encoding detection."""
+    return f"{data_hash}:{size}"
 
 
 def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
-    """Decode a byte string safely
+    """Decode a byte string safely with mojibake detection and correction.
 
     Args:
         byte_data: The byte string to decode.
         encoding: The encoding to use when decoding the byte string.
 
     Returns:
-        The decoded string.
+        The decoded string with mojibake detection and correction.
     """
     if not byte_data:
         return ""
 
-
+    # Try provided encoding first (fastest path)
+    if encoding:
+        with suppress(UnicodeDecodeError, LookupError):
+            decoded = byte_data.decode(encoding)
+            return _fix_mojibake(decoded)
 
-
+    # Check cache for similar content (performance optimization)
+    data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]  # Hash first 1KB
+    cache_key = _get_encoding_cache_key(data_hash, len(byte_data))
+
+    if cache_key in _encoding_cache:
+        cached_encoding = _encoding_cache[cache_key]
+        with suppress(UnicodeDecodeError, LookupError):
+            decoded = byte_data.decode(cached_encoding)
+            return _fix_mojibake(decoded)
+
+    # Use chardetng for better performance than charset-normalizer
+    detected_encoding = chardetng_py.detect(byte_data)
+    if detected_encoding:
         with suppress(UnicodeDecodeError, LookupError):
-
+            decoded = byte_data.decode(detected_encoding)
+            # Cache successful encoding detection
+            if len(_encoding_cache) < 1000:  # Prevent unlimited growth
+                _encoding_cache[cache_key] = detected_encoding
+            return _fix_mojibake(decoded)
+
+    # Try multiple encodings with confidence scoring
+    encodings_to_try = [
+        "utf-8",
+        "windows-1255",  # Hebrew
+        "iso-8859-8",  # Hebrew
+        "windows-1256",  # Arabic
+        "iso-8859-6",  # Arabic
+        "windows-1252",  # Western European
+        "cp1251",  # Cyrillic
+    ]
 
+    best_result = None
+    best_confidence = 0.0
+
+    for enc in encodings_to_try:
+        with suppress(UnicodeDecodeError, LookupError):
+            decoded = byte_data.decode(enc)
+            confidence = _calculate_text_confidence(decoded)
+            if confidence > best_confidence:
+                best_confidence = confidence
+                best_result = decoded
+
+    if best_result and best_confidence > 0.5:
+        return _fix_mojibake(best_result)
+
+    # Final fallback
     return byte_data.decode("latin-1", errors="replace")
 
 
+def _calculate_text_confidence(text: str) -> float:
+    """Calculate confidence score for decoded text quality."""
+    if not text:
+        return 0.0
+
+    # Check for common encoding problems
+    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
+    total_chars = len(text)
+
+    if total_chars == 0:
+        return 0.0
+
+    # Penalize replacement and control characters
+    penalty = (replacement_count + control_count * 2) / total_chars
+
+    # Bonus for readable character ranges
+    readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
+    readability_score = readable_chars / total_chars
+
+    # Check for suspicious Cyrillic that might be misencoded Hebrew
+    cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
+    if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
+        penalty += 0.3  # Heavy penalty for likely mojibake
+
+    return max(0.0, min(1.0, readability_score - penalty))
+
+
+def _fix_mojibake(text: str) -> str:
+    """Attempt to fix common mojibake patterns."""
+    if not text:
+        return text
+
+    # Remove control characters
+    text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)
+
+    # Remove replacement characters
+    text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)
+
+    # Remove isolated combining marks
+    text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)
+
+    # Try to fix Hebrew encoded as Cyrillic (common Windows-1255 -> CP1251 confusion)
+    if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
+        # This is a heuristic fix - in practice, you'd need actual character mapping
+        # For now, we flag it for manual review by keeping the text but adding a marker
+        pass
+
+    return text
+
+
 def normalize_spaces(text: str) -> str:
-    """Normalize
+    """Normalize spaces while preserving line breaks and paragraph structure.
 
     Args:
-        text: The text to
+        text: The text to normalize.
 
     Returns:
-        The
+        The normalized text with proper spacing.
     """
-
+    if not text or not text.strip():
+        return ""
+
+    # Split by double newlines to preserve paragraph breaks
+    paragraphs = text.split("\n\n")
+    normalized_paragraphs = []
+
+    for paragraph in paragraphs:
+        # Use pre-compiled patterns for better performance
+        # Replace multiple whitespace (except newlines) with single space
+        cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
+        # Clean up multiple newlines within paragraph (keep single newlines)
+        cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
+
+        # Strip and filter empty lines efficiently
+        lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+
+        if lines:
+            normalized_paragraphs.append("\n".join(lines))
+
+    return "\n\n".join(normalized_paragraphs)
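Note: a sketch of the rewritten safe_decode flow. The sample bytes are invented; chardetng_py.detect is the detector the new code imports, and detection may still fall through to the confidence-scored candidate list.

    from kreuzberg._utils._string import normalize_spaces, safe_decode

    data = "שלום עולם".encode("windows-1255")  # Hebrew sample text
    safe_decode(data, encoding="windows-1255")  # fast path: explicit encoding, then mojibake fixes
    safe_decode(data)  # detection path: cache lookup, then chardetng, then scored fallbacks

    normalize_spaces("a  b\n\n\nc")  # -> "a b\n\nc": paragraph break kept, space runs collapsed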
kreuzberg/_utils/_sync.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import sys
|
4
3
|
from functools import partial
|
5
4
|
from inspect import isawaitable, iscoroutinefunction
|
6
5
|
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
@@ -12,10 +11,7 @@ from anyio.to_thread import run_sync as any_io_run_sync
|
|
12
11
|
if TYPE_CHECKING: # pragma: no cover
|
13
12
|
from collections.abc import Awaitable, Callable
|
14
13
|
|
15
|
-
|
16
|
-
from typing import ParamSpec
|
17
|
-
else: # pragma: no cover
|
18
|
-
from typing_extensions import ParamSpec
|
14
|
+
from typing import ParamSpec
|
19
15
|
|
20
16
|
T = TypeVar("T")
|
21
17
|
P = ParamSpec("P")
|
@@ -32,8 +28,11 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
|
|
32
28
|
Returns:
|
33
29
|
The result of the synchronous function.
|
34
30
|
"""
|
35
|
-
|
36
|
-
|
31
|
+
# Optimize: only create partial if we have kwargs
|
32
|
+
if kwargs:
|
33
|
+
handler = partial(sync_fn, **kwargs)
|
34
|
+
return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
35
|
+
return cast("T", await any_io_run_sync(sync_fn, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
37
36
|
|
38
37
|
|
39
38
|
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
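Note: behaviour of the reworked run_sync dispatch. anyio.run drives the example; the add function is invented for illustration.

    import anyio

    from kreuzberg._utils._sync import run_sync

    def add(a: int, b: int = 0) -> int:
        return a + b

    async def main() -> None:
        print(await run_sync(add, 1, b=2))  # kwargs present: wrapped in functools.partial
        print(await run_sync(add, 1))       # no kwargs: dispatched directly, skipping the partial

    anyio.run(main)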
|