kreuzberg-3.17.3-py3-none-any.whl → kreuzberg-3.19.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- kreuzberg/_api/main.py +45 -3
- kreuzberg/_entity_extraction.py +108 -18
- kreuzberg/_error_handling.py +182 -0
- kreuzberg/_extractors/_base.py +2 -2
- kreuzberg/_extractors/_html.py +2 -2
- kreuzberg/_extractors/_pdf.py +33 -54
- kreuzberg/_extractors/_structured.py +1 -1
- kreuzberg/_language_detection.py +2 -0
- kreuzberg/_ocr/_tesseract.py +28 -6
- kreuzberg/_types.py +18 -0
- kreuzberg/cli.py +36 -22
- kreuzberg/extraction.py +251 -107
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/METADATA +7 -4
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/RECORD +17 -16
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
```diff
@@ -1113,6 +1113,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             **run_config["remaining_kwargs"],
             "language": run_config["language"],
             "psm": run_config["psm"],
+            "tesseract_format": run_config["tesseract_format"],
+            "ext": run_config["ext"],
+            "output_format": run_config["output_format"],
+            "enable_table_detection": run_config["enable_table_detection"],
         }
 
         optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
```
```diff
@@ -1222,13 +1226,21 @@ def _process_image_with_tesseract(
     config_dict: dict[str, Any],
 ) -> dict[str, Any]:
     try:
-
-
+        tesseract_format = config_dict.get("tesseract_format", "text")
+        ext = config_dict.get("ext", ".txt")
+        output_format = config_dict.get("output_format", "text")
+        config_dict.get("enable_table_detection", False)
+
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
+            output_base = tmp_file.name.replace(ext, "")
 
         try:
             language = config_dict.get("language", "eng")
             psm = config_dict.get("psm", 3)
 
+            # Convert PSM enum to integer value if needed
+            psm_value = psm.value if hasattr(psm, "value") else psm
+
             command = [
                 "tesseract",
                 image_path,
```
```diff
@@ -1236,13 +1248,16 @@ def _process_image_with_tesseract(
                 "-l",
                 language,
                 "--psm",
-                str(psm),
+                str(psm_value),
                 "--oem",
                 "1",
                 "--loglevel",
                 "OFF",
             ]
 
+            if tesseract_format != "text":
+                command.append(tesseract_format)
+
             boolean_options = [
                 "classify_use_pre_adapted_templates",
                 "language_model_ngram_on",
```
```diff
@@ -1275,10 +1290,17 @@ def _process_image_with_tesseract(
             if result.returncode != 0:
                 raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
 
-            output_file = output_base + ".txt"
+            output_file = output_base + ext
             with Path(output_file).open(encoding="utf-8") as f:
                 text = f.read()
 
+            # Process based on output format
+            if output_format == "markdown" and tesseract_format == "hocr":
+                # Import here to avoid circular dependency ~keep
+                from html_to_markdown import convert_to_markdown  # noqa: PLC0415
+
+                text = convert_to_markdown(text, heading_style="atx")
+
             text = normalize_spaces(text)
 
             return {
```
```diff
@@ -1289,8 +1311,8 @@ def _process_image_with_tesseract(
             }
 
         finally:
-            for
-                temp_file = output_base +
+            for possible_ext in [ext, ".txt", ".hocr", ".tsv"]:
+                temp_file = output_base + possible_ext
                 temp_path = Path(temp_file)
                 if temp_path.exists():
                     temp_path.unlink()
```
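Taken together, these hunks let `_process_image_with_tesseract` emit hOCR and post-process it into Markdown. A minimal standalone sketch of that flow, assuming a `tesseract` binary on PATH (the helper name and paths below are illustrative, not kreuzberg's API):

```python
import subprocess
from html_to_markdown import convert_to_markdown  # same converter the diff imports

def ocr_image_to_markdown(image_path: str, output_base: str, language: str = "eng") -> str:
    # Appending the "hocr" config makes tesseract write <output_base>.hocr
    subprocess.run(
        ["tesseract", image_path, output_base, "-l", language, "--psm", "3", "--oem", "1", "hocr"],
        check=True,
    )
    with open(output_base + ".hocr", encoding="utf-8") as f:
        hocr = f.read()
    # heading_style="atx" mirrors the call added above
    return convert_to_markdown(hocr, heading_style="atx")
```

Note that the new code reads `enable_table_detection` out of `config_dict` but discards the value; presumably a placeholder for a table-detection path.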
kreuzberg/_types.py
CHANGED
```diff
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
 
 OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
 OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
+ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]
 
 
 class ConfigDict:
```
```diff
@@ -503,6 +504,17 @@ class SpacyEntityExtractionConfig(ConfigDict):
         return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
+class ProcessingErrorDict(TypedDict):
+    feature: str
+    """Name of the feature that failed (e.g., 'chunking', 'entity_extraction', 'keyword_extraction')."""
+    error_type: str
+    """Type of the exception that occurred (e.g., 'RuntimeError', 'ValidationError')."""
+    error_message: str
+    """Human-readable error message."""
+    traceback: str
+    """Full Python traceback for debugging."""
+
+
 class BoundingBox(TypedDict):
     left: int
     """X coordinate of the left edge."""
```
```diff
@@ -701,6 +713,10 @@ class Metadata(TypedDict, total=False):
     """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
     token_reduction: NotRequired[dict[str, float]]
     """Token reduction statistics including reduction ratios and counts."""
+    processing_errors: NotRequired[list[ProcessingErrorDict]]
+    """List of processing errors that occurred during extraction."""
+    extraction_error: NotRequired[dict[str, Any]]
+    """Error information for critical extraction failures."""
 
 
 _VALID_METADATA_KEYS = {
```
```diff
@@ -756,6 +772,8 @@ _VALID_METADATA_KEYS = {
     "message",
     "attributes",
     "token_reduction",
+    "processing_errors",
+    "extraction_error",
 }
 
 
```
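`ProcessingErrorDict` plus the two new `Metadata` keys give callers a structured view of partial failures. A sketch of how downstream code might inspect them (hypothetical consumer code; `report.pdf` is a placeholder):

```python
from kreuzberg import extract_file_sync

result = extract_file_sync("report.pdf")

# Optional features that failed are now recorded rather than raised
for err in result.metadata.get("processing_errors", []):
    print(f"{err['feature']} failed: {err['error_type']}: {err['error_message']}")

# Set by the batch recovery path when full extraction failed but a basic pass succeeded
if "extraction_error" in result.metadata:
    print("recovered from:", result.metadata["extraction_error"]["error_message"])
```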
kreuzberg/cli.py
CHANGED
```diff
@@ -168,31 +168,45 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
         input_text = sys.stdin.read()
         input_bytes = input_text.encode("utf-8")
 
-
-
-
-
-
-
-
-
-
-
-
-
-        except ImportError:  # pragma: no cover
-            content_str = input_bytes.decode("utf-8", errors="ignore").lower()
-            mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
+        # Detect MIME type from content
+        content_str = input_bytes.decode("utf-8", errors="ignore").lower()
+        if "<html" in content_str or "<!doctype html" in content_str or "<body" in content_str:
+            mime_type = "text/html"
+        elif (content_str.strip().startswith("{") and content_str.strip().endswith("}")) or (
+            content_str.strip().startswith("[") and content_str.strip().endswith("]")
+        ):
+            mime_type = "application/json"
+        elif content_str.strip().startswith("---") or ":" in content_str[:100]:
+            mime_type = "application/x-yaml"
+        else:
+            mime_type = "text/plain"
 
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task("Extracting text...", total=None)
+                return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
             return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
     else:
-
-
-
-
-
-
-
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task(f"Extracting text from {file.name}...", total=None)
+                return extract_file_sync(str(file), config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
             return extract_file_sync(str(file), config=extraction_config)
 
 
```
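The stdin branch now sniffs a MIME type instead of relying on an optional detector import. The decision order reduces to the sketch below (a standalone restatement of the same heuristic; note the YAML check is deliberately loose: any colon in the first 100 characters qualifies):

```python
def sniff_stdin_mime(raw: bytes) -> str:
    """Standalone restatement of the CLI's new content-sniffing order."""
    text = raw.decode("utf-8", errors="ignore").lower()
    stripped = text.strip()
    if "<html" in text or "<!doctype html" in text or "<body" in text:
        return "text/html"
    if (stripped.startswith("{") and stripped.endswith("}")) or (
        stripped.startswith("[") and stripped.endswith("]")
    ):
        return "application/json"
    if stripped.startswith("---") or ":" in text[:100]:
        return "application/x-yaml"
    return "text/plain"
```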
kreuzberg/extraction.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import multiprocessing as mp
+import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import TYPE_CHECKING, Final, cast
```
```diff
@@ -10,6 +11,7 @@ import anyio
 from kreuzberg._chunker import get_chunker
 from kreuzberg._document_classification import auto_detect_document_type
 from kreuzberg._entity_extraction import extract_entities, extract_keywords
+from kreuzberg._error_handling import safe_feature_execution, should_exception_bubble_up
 from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
```
```diff
@@ -21,7 +23,7 @@ from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
-from kreuzberg.exceptions import ValidationError
+from kreuzberg.exceptions import KreuzbergError, ValidationError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
```
```diff
@@ -50,69 +52,107 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult:
 def _validate_and_post_process_helper(
     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
 ) -> ExtractionResult:
+    if result.metadata is None:
+        result.metadata = {}
+
     if config.chunk_content:
-        result.chunks =
-
-
-
+        result.chunks = safe_feature_execution(
+            feature_name="chunking",
+            execution_func=lambda: _handle_chunk_content(
+                mime_type=result.mime_type,
+                config=config,
+                content=result.content,
+            ),
+            default_value=[],
+            result=result,
         )
 
     if config.extract_entities:
-
-
+        result.entities = safe_feature_execution(
+            feature_name="entity_extraction",
+            execution_func=lambda: extract_entities(
             result.content,
             custom_patterns=config.custom_entity_patterns,
-        )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )
 
     if config.extract_keywords:
-
-
+        result.keywords = safe_feature_execution(
+            feature_name="keyword_extraction",
+            execution_func=lambda: extract_keywords(
             result.content,
             keyword_count=config.keyword_count,
-        )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )
 
     if config.auto_detect_language:
-        lang_config = config.language_detection_config
-        if lang_config is None:
-            from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
 
-
+        def _detect_language() -> list[str]:
+            lang_config = config.language_detection_config
+            if lang_config is None:
+                from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
+
+                lang_config = LanguageDetectionConfig(model=config.language_detection_model)
 
-
-
-
+            return detect_languages(result.content, config=lang_config) or []
+
+        result.detected_languages = safe_feature_execution(
+            feature_name="language_detection",
+            execution_func=_detect_language,
+            default_value=[],
+            result=result,
         )
 
     if config.auto_detect_document_type:
-        result =
+        result = safe_feature_execution(
+            feature_name="document_type_detection",
+            execution_func=lambda: auto_detect_document_type(result, config, file_path=file_path),
+            default_value=result,
+            result=result,
+        )
 
     if config.token_reduction is not None and config.token_reduction.mode != "off":
-        original_content = result.content
 
-
-
-            language_hint = result.detected_languages[0]
+        def _apply_token_reduction() -> str:
+            original_content = result.content
 
-
-
-
-
+            language_hint = None
+            if result.detected_languages and len(result.detected_languages) > 0:
+                language_hint = result.detected_languages[0]
+
+            reduced_content = (
+                reduce_tokens(
+                    original_content,
+                    config=config.token_reduction,
+                    language=language_hint,
+                )
+                if config.token_reduction
+                else original_content
+            )
+            reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+            if result.metadata is not None:
+                result.metadata["token_reduction"] = {
+                    "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+                    "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+                    "original_characters": reduction_stats["original_characters"],
+                    "reduced_characters": reduction_stats["reduced_characters"],
+                    "original_tokens": reduction_stats["original_tokens"],
+                    "reduced_tokens": reduction_stats["reduced_tokens"],
+                }
+
+            return reduced_content
+
+        result.content = safe_feature_execution(
+            feature_name="token_reduction",
+            execution_func=_apply_token_reduction,
+            default_value=result.content,
+            result=result,
         )
-        reduction_stats = get_reduction_stats(original_content, reduced_content)
-
-        result.content = reduced_content
-        result.metadata["token_reduction"] = {
-            "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
-            "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
-            "original_characters": reduction_stats["original_characters"],
-            "reduced_characters": reduction_stats["reduced_characters"],
-            "original_tokens": reduction_stats["original_tokens"],
-            "reduced_tokens": reduction_stats["reduced_tokens"],
-        }
 
     return result
 
```
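`kreuzberg/_error_handling.py` is new in this release, but its body is not part of this excerpt. Judging purely from the call sites above, `safe_feature_execution` plausibly runs the feature, records a `ProcessingErrorDict` on failure, and returns the default; an inferred sketch, not the actual implementation:

```python
import traceback
from typing import Callable, TypeVar

T = TypeVar("T")

def safe_feature_execution(
    *, feature_name: str, execution_func: Callable[[], T], default_value: T, result
) -> T:
    """Inferred behavior: never let an optional feature take down the extraction."""
    try:
        return execution_func()
    except Exception as e:  # the real helper likely catches a narrower set
        if result.metadata is None:
            result.metadata = {}
        result.metadata.setdefault("processing_errors", []).append(
            {
                "feature": feature_name,
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
        )
        return default_value
```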
```diff
@@ -125,8 +165,22 @@ async def _validate_and_post_process_async(
 
     result = _validate_and_post_process_helper(result, config, file_path)
 
-    for post_processor in config.post_processing_hooks or []:
-
+    for i, post_processor in enumerate(config.post_processing_hooks or []):
+        try:
+            result = await run_maybe_sync(post_processor, result)
+        except (KreuzbergError, ValueError, RuntimeError, TypeError) as e:  # noqa: PERF203
+            if result.metadata is None:
+                result.metadata = {}
+            error_list = result.metadata.setdefault("processing_errors", [])
+            if isinstance(error_list, list):
+                error_list.append(
+                    {
+                        "feature": f"post_processing_hook_{i}",
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "traceback": traceback.format_exc(),
+                    }
+                )
 
     return result
 
```
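A failing hook no longer aborts post-processing: the error is appended to `metadata["processing_errors"]` under a `post_processing_hook_<i>` feature name and the remaining hooks still run. For example (hypothetical hook; assumes the public async `extract_file` accepts `config=`):

```python
import anyio
from kreuzberg import ExtractionConfig, extract_file

def flaky_hook(result):
    raise RuntimeError("boom")  # simulated hook failure; run_maybe_sync also accepts sync hooks

async def main() -> None:
    config = ExtractionConfig(post_processing_hooks=[flaky_hook])
    result = await extract_file("doc.txt", config=config)
    # Extraction still succeeds; the failure is recorded instead:
    print(result.metadata["processing_errors"][0]["feature"])  # post_processing_hook_0

anyio.run(main)
```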
```diff
@@ -260,22 +314,18 @@ async def batch_extract_file(
                 config,
             )
             results[index] = result
-        except Exception as e:
-
-
-
-
-
-
-
-
-
-                    index=index,
-                ),
-            },
-            chunks=[],
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(
+                None,
+                None,
+                e,
+                index,
+                file_path=str(path),
             )
-            results[index] = error_result
+            results[index] = basic_result
 
     async with anyio.create_task_group() as tg:
         for i, path in enumerate(file_paths):
```
```diff
@@ -309,23 +359,12 @@ async def batch_extract_bytes(
         try:
             result = await extract_bytes(content, mime_type, config)
             results[index] = result
-        except Exception as e:
-
-
-
-
-
-                "error_context": create_error_context(
-                    operation="batch_extract_bytes",
-                    error=e,
-                    index=index,
-                    mime_type=mime_type,
-                    content_size=len(content),
-                ),
-            },
-            chunks=[],
-            )
-            results[index] = error_result
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            results[index] = basic_result
 
     async with anyio.create_task_group() as tg:
         for i, (content, mime_type) in enumerate(contents):
```
```diff
@@ -334,6 +373,125 @@ async def batch_extract_bytes(
     return results
 
 
+def _attempt_basic_extraction(
+    content: bytes | None, mime_type: str | None, original_error: Exception, index: int, *, file_path: str | None = None
+) -> ExtractionResult:
+    """Attempt basic extraction when full extraction fails, preserving as much as possible.
+
+    This function tries to extract at least basic text content even when advanced
+    features like OCR, entity extraction, etc. fail.
+
+    Args:
+        content: The raw content bytes (None for file extractions)
+        mime_type: The MIME type of the content (None if unknown)
+        original_error: The exception that caused the main extraction to fail
+        index: Index of this content in the batch
+        file_path: Optional file path for file-based extractions
+
+    Returns:
+        A basic ExtractionResult with whatever could be extracted
+    """
+    if (
+        isinstance(original_error, (ValueError, TypeError, ValidationError))
+        or "mock" in str(type(original_error)).lower()
+    ):
+        return ExtractionResult(
+            content=f"Error: {type(original_error).__name__}: {original_error!s}",
+            mime_type="text/plain",
+            metadata={
+                "error": f"{type(original_error).__name__}: {original_error!s}",
+                "error_context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content) if content else 0,
+                    file_path=file_path,
+                ),
+            },
+            chunks=[],
+            entities=[],
+            keywords=[],
+            detected_languages=[],
+            tables=[],
+            images=[],
+            image_ocr_results=[],
+        )
+
+    try:
+        if content is None:
+            return ExtractionResult(
+                content=f"Error: {type(original_error).__name__}: {original_error!s}",
+                mime_type="text/plain",
+                metadata={
+                    "error": f"{type(original_error).__name__}: {original_error!s}",
+                    "error_context": create_error_context(
+                        operation="batch_extract_file",
+                        error=original_error,
+                        index=index,
+                        file_path=file_path,
+                    ),
+                },
+                chunks=[],
+                entities=[],
+                keywords=[],
+                detected_languages=[],
+                tables=[],
+                images=[],
+                image_ocr_results=[],
+            )
+
+        mime_type = validate_mime_type(mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=ExtractionConfig()):
+            basic_result = extractor.extract_bytes_sync(content)
+
+            if basic_result.metadata is None:
+                basic_result.metadata = {}
+
+            basic_result.metadata["extraction_error"] = {
+                "error_type": type(original_error).__name__,
+                "error_message": str(original_error),
+                "traceback": traceback.format_exc(),
+                "context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content),
+                    file_path=file_path,
+                ),
+                "recovery_mode": "basic_extraction",
+            }
+
+            return basic_result
+
+    except (KreuzbergError, ValueError, RuntimeError, TypeError):
+        pass
+
+    return ExtractionResult(
+        content=f"Error: {type(original_error).__name__}: {original_error!s}",
+        mime_type="text/plain",
+        metadata={
+            "error": f"{type(original_error).__name__}: {original_error!s}",
+            "error_context": create_error_context(
+                operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                error=original_error,
+                index=index,
+                mime_type=mime_type,
+                content_size=len(content) if content else 0,
+                file_path=file_path,
+            ),
+        },
+        chunks=[],
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
+
+
 def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
     """Synchronous version of extract_bytes.
 
```
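In practice the recovery path means a batch keeps its shape: a failed item comes back as a placeholder result at the same index, carrying `error` or `extraction_error` metadata, instead of raising or silently disappearing. A sketch, assuming the public batch API takes a list of `(bytes, mime_type)` pairs as the call sites below suggest:

```python
from kreuzberg import ExtractionConfig, batch_extract_bytes_sync

contents = [
    (b"not really a pdf", "application/pdf"),  # likely fails full extraction
    (b"plain text is fine", "text/plain"),
]
results = batch_extract_bytes_sync(contents, config=ExtractionConfig())

for i, r in enumerate(results):
    if "error" in r.metadata or "extraction_error" in r.metadata:
        print(f"item {i} degraded:", r.metadata.get("error", "recovered via basic extraction"))
    else:
        print(f"item {i} ok:", r.content[:40])
```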
```diff
@@ -444,21 +602,18 @@ def batch_extract_file_sync(
                 index,
                 extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
             )
-        except Exception as e:
-
-
-
-
-
-
-
-
-
-                ),
-            },
-            chunks=[],
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(
+                None,
+                None,
+                e,
+                index,
+                file_path=str(file_path),
             )
-            return (index, error_result)
+            return (index, basic_result)
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
```
```diff
@@ -494,23 +649,12 @@ def batch_extract_bytes_sync(
         """Extract single content with index for ordering."""
         try:
             return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
-        except Exception as e:
-
-
-
-
-
-                "error_context": create_error_context(
-                    operation="batch_extract_bytes_sync",
-                    error=e,
-                    index=index,
-                    mime_type=mime_type,
-                    content_size=len(content),
-                ),
-            },
-            chunks=[],
-            )
-            return (index, error_result)
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            return (index, basic_result)
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {
```