kreuzberg 3.18.0__py3-none-any.whl → 3.19.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in their public registries.
- kreuzberg/_api/main.py +4 -2
- kreuzberg/_entity_extraction.py +4 -8
- kreuzberg/_error_handling.py +182 -0
- kreuzberg/_extractors/_base.py +2 -2
- kreuzberg/_extractors/_html.py +2 -2
- kreuzberg/_extractors/_pdf.py +33 -54
- kreuzberg/_extractors/_structured.py +1 -1
- kreuzberg/_language_detection.py +2 -0
- kreuzberg/_ocr/_tesseract.py +28 -6
- kreuzberg/_types.py +18 -0
- kreuzberg/cli.py +36 -22
- kreuzberg/extraction.py +251 -107
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/METADATA +4 -1
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/RECORD +17 -16
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED

@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
     Environment Variables:
         KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
     """
-    default_size = 1024 * 1024 * 1024
+    default_size = 1024 * 1024 * 1024
     try:
         size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
-        # Return default if negative
         return size if size >= 0 else default_size
     except ValueError:
         return default_size
@@ -311,6 +310,9 @@ async def handle_files_upload(  # noqa: PLR0913
     """
     static_config = discover_config_cached()

+    if not data:
+        raise ValidationError("No files provided for extraction", context={"file_count": 0})
+
     min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
     max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
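The upload-size guard keeps its existing semantics: unset, negative, or non-numeric values of KREUZBERG_MAX_UPLOAD_SIZE all resolve to the 1 GiB default. A minimal standalone sketch of that behavior, assuming only what the hunk above shows (the demo values are illustrative):

```python
import os

def get_max_upload_size() -> int:
    # Mirrors the hunk above: default to 1 GiB, fall back on bad input.
    default_size = 1024 * 1024 * 1024
    try:
        size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
        return size if size >= 0 else default_size
    except ValueError:
        return default_size

os.environ["KREUZBERG_MAX_UPLOAD_SIZE"] = "-5"
assert get_max_upload_size() == 1024 * 1024 * 1024  # negative -> default

os.environ["KREUZBERG_MAX_UPLOAD_SIZE"] = "oops"
assert get_max_upload_size() == 1024 * 1024 * 1024  # unparsable -> default
```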
kreuzberg/_entity_extraction.py
CHANGED

@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
    try:
        nlp = spacy.load(model_name)
    except OSError:
-
+
        async def install_model() -> tuple[bool, str | None]:
            """Install model and return success status and error message."""
-            # First try spaCy's built-in download
            try:
                success = await install_spacy_model_with_spacy(model_name)
                if success:
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
            else:
                spacy_error = "spaCy download failed"

-            # If spaCy download failed and uv is available, try uv as fallback
            if is_uv_available():
                try:
                    result = await install_spacy_model_with_uv(model_name)
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)

            return False, spacy_error

-        # Run the async installation in a sync context
        try:
            success, error_details = anyio.run(install_model)
-        except
-            success, error_details = False,
+        except SystemExit as e:
+            success, error_details = False, f"spaCy CLI exit code: {e.code}"

        if not success:
-            # Generate appropriate error message based on available tools
            if is_uv_available():
                model_url = get_spacy_model_url(model_name)
                manual_install_cmd = f"uv pip install {model_url}"
@@ -234,7 +230,7 @@ def extract_keywords(
        kw_model = KeyBERT()
        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
        return [(kw, float(score)) for kw, score in keywords]
-    except
+    except ValueError:
        return []
    except ImportError as e:  # pragma: no cover
        raise MissingDependencyError.create_for_package(
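For context, the KeyBERT call being wrapped here has a small surface; a standalone sketch of the same extraction, assuming the keybert extra is installed (the sample text is illustrative):

```python
from keybert import KeyBERT

kw_model = KeyBERT()  # loads a sentence-transformers model on first use
keywords = kw_model.extract_keywords(
    "Kreuzberg extracts text, metadata, and tables from documents.",
    top_n=5,
)
# KeyBERT yields (keyword, score) pairs; as in the hunk above, scores are coerced to float.
print([(kw, float(score)) for kw, score in keywords])
```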
kreuzberg/_error_handling.py
ADDED

@@ -0,0 +1,182 @@
+"""Type-safe error handling utilities for extraction pipeline."""
+
+from __future__ import annotations
+
+import traceback
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
+from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
+
+
+def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
+    """Determine if an exception should bubble up or be handled gracefully.
+
+    Args:
+        exception: The exception to classify
+        context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
+
+    Returns:
+        True if the exception should bubble up, False if it should be handled gracefully
+    """
+    if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
+        return True
+
+    if isinstance(exception, MissingDependencyError):
+        return True
+
+    if isinstance(exception, ValidationError):
+        if context == "batch_processing":
+            return False
+
+        return context != "optional_feature"
+
+    if isinstance(exception, KreuzbergError) and context == "optional_feature":
+        return False
+
+    if context == "batch_processing":
+        return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
+
+    return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
+
+
+class FeatureProcessingError:
+    """Type-safe processing error for extraction features."""
+
+    def __init__(self, feature: str, error: Exception) -> None:
+        self._feature = feature
+        self._error = error
+        self._traceback = traceback.format_exc()
+
+    @property
+    def feature(self) -> str:
+        return self._feature
+
+    @property
+    def error_type(self) -> str:
+        return type(self._error).__name__
+
+    @property
+    def error_message(self) -> str:
+        return str(self._error)
+
+    @property
+    def traceback(self) -> str:
+        return self._traceback
+
+    def to_dict(self) -> ProcessingErrorDict:
+        return {
+            "feature": self.feature,
+            "error_type": self.error_type,
+            "error_message": self.error_message,
+            "traceback": self.traceback,
+        }
+
+
+def safe_feature_execution(
+    feature_name: str,
+    execution_func: Callable[[], Any],
+    default_value: Any,
+    result: ExtractionResult,
+    context: ErrorContextType = "optional_feature",
+) -> Any:
+    """Safely execute a feature extraction function with proper error handling.
+
+    Args:
+        feature_name: Name of the feature being executed
+        execution_func: Function to execute that may raise exceptions
+        default_value: Default value to return if execution fails
+        result: ExtractionResult to update with error information
+        context: The context for exception handling decisions
+
+    Returns:
+        Either the successful result or the default value
+    """
+    try:
+        return execution_func()
+    except Exception as e:
+        if should_exception_bubble_up(e, context):
+            raise
+
+        _add_processing_error(result, FeatureProcessingError(feature_name, e))
+        return default_value
+
+
+def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
+    """Add a processing error to the result metadata in a type-safe way."""
+    if result.metadata is None:
+        result.metadata = {}
+
+    if "processing_errors" not in result.metadata:
+        result.metadata["processing_errors"] = []
+
+    errors_list = result.metadata["processing_errors"]
+    if isinstance(errors_list, list):
+        errors_list.append(error.to_dict())
+    else:
+        result.metadata["processing_errors"] = [error.to_dict()]
+
+
+def preserve_result_with_errors(
+    result: ExtractionResult,
+    errors: list[FeatureProcessingError],
+) -> ExtractionResult:
+    """Preserve a successful extraction result while adding error information.
+
+    This is used when core extraction succeeds but optional features fail.
+
+    Args:
+        result: The successful extraction result
+        errors: List of errors that occurred during optional processing
+
+    Returns:
+        The result with error information added to metadata
+    """
+    for error in errors:
+        _add_processing_error(result, error)
+
+    return result
+
+
+def create_error_result(
+    content: str,
+    mime_type: str,
+    errors: list[FeatureProcessingError],
+    **metadata_kwargs: Any,
+) -> ExtractionResult:
+    """Create an error result with proper type safety.
+
+    Args:
+        content: Error content to include
+        mime_type: MIME type of the result
+        errors: List of errors that occurred
+        **metadata_kwargs: Additional metadata to include
+
+    Returns:
+        An ExtractionResult with error information
+    """
+    metadata: Metadata = {
+        "error": f"Multiple processing errors occurred: {len(errors)} errors",
+        "error_context": {
+            "error_count": len(errors),
+            "errors": [error.to_dict() for error in errors],
+            **metadata_kwargs,
+        },
+        "processing_errors": [error.to_dict() for error in errors],
+    }
+
+    return ExtractionResult(
+        content=content,
+        chunks=[],
+        mime_type=mime_type,
+        metadata=metadata,
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
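Taken together, the new module gives optional features one degrade-gracefully path: fatal errors (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError) and MissingDependencyError re-raise, while recoverable ones are recorded under result.metadata["processing_errors"]. A usage sketch under the API shown above; _StubResult is a hypothetical stand-in, since the helper only touches the result's metadata mapping:

```python
from kreuzberg._error_handling import safe_feature_execution

class _StubResult:
    """Hypothetical stand-in for ExtractionResult; only .metadata is used."""

    def __init__(self) -> None:
        self.metadata: dict = {}

result = _StubResult()

def flaky_feature() -> list[str]:
    raise ImportError("optional dependency missing")  # recoverable in this context

value = safe_feature_execution(
    feature_name="keyword_extraction",
    execution_func=flaky_feature,
    default_value=[],
    result=result,
    context="optional_feature",
)

assert value == []  # default returned instead of propagating
assert result.metadata["processing_errors"][0]["error_type"] == "ImportError"
```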
kreuzberg/_extractors/_base.py
CHANGED

@@ -230,13 +230,13 @@ class Extractor(ABC):
                confidence_score=None,
                processing_time=duration,
            )
-        except
+        except ValueError as e:  # pragma: no cover
            return ImageOCRResult(
                image=target,
                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
                skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
            )
-        except
+        except TypeError as e:  # pragma: no cover
            return ImageOCRResult(
                image=target,
                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
kreuzberg/_extractors/_html.py
CHANGED

@@ -75,7 +75,7 @@ class HTMLExtractor(Extractor):
        soup = BeautifulSoup(html_content, "xml")

        for img in soup.find_all("img"):
-            src_val = img.get("src")
+            src_val = img.get("src")
            if isinstance(src_val, str) and src_val.startswith("data:image/"):
                try:
                    header, data = src_val.split(",", 1)
@@ -105,7 +105,7 @@ class HTMLExtractor(Extractor):
            except (OSError, ValueError) as e:  # pragma: no cover
                logger.debug("Could not determine image dimensions for %s: %s", format_name, e)

-            alt_val = img.get("alt")
+            alt_val = img.get("alt")
            desc = alt_val if isinstance(alt_val, str) else None
            images.append(
                ExtractedImage(
kreuzberg/_extractors/_pdf.py
CHANGED

@@ -6,7 +6,6 @@ import logging
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import asdict
 from itertools import count
 from multiprocessing import cpu_count
 from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import (
-    EasyOCRConfig,
     ExtractedImage,
     ExtractionResult,
     ImageOCRResult,
     Metadata,
     OcrBackendType,
-    PaddleOCRConfig,
-    TesseractConfig,
 )
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        content_bytes = path.read_bytes()

+        result: ExtractionResult | None = None
+
        document: Document | None = None
        if self.config.extract_images or self.config.extract_tables:
            document = self._parse_with_password_attempts(content_bytes)

-
-
-
-
+        if not self.config.force_ocr:
+            try:
+                content = self._extract_pdf_searchable_text_sync(path)
+                if self._validate_extracted_text(content):
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+            except ParsingError:
+                pass

-        if
-
+        if not result and self.config.ocr_backend is not None:
+            result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
+
+        if not result:
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+
+        metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
+        result.metadata = metadata

-        tables = []
        if self.config.extract_tables:
            # GMFT is optional dependency ~keep
            try:
                from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415

                tables = extract_tables_sync(path)
+                result.tables = tables
            except ImportError:  # pragma: no cover
-                tables = []
-
-        if not self.config.force_ocr and self._validate_extracted_text(text):
-            text = self._extract_with_playa_sync(path, fallback_text=text)
-
-        text = normalize_spaces(text)
-
-        result = ExtractionResult(
-            content=text,
-            mime_type=PLAIN_TEXT_MIME_TYPE,
-            metadata={},
-            tables=list(tables),
-        )
+                result.tables = []

-
-
-
-
-
-
-
-
+        if result.tables:
+            table_summary = generate_table_summary(result.tables)
+            result.metadata = result.metadata | {
+                "table_count": table_summary["table_count"],
+                "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                f"across {table_summary['pages_with_tables']} pages with "
+                f"{table_summary['total_rows']} total rows",
+            }

        if self.config.extract_images and document:
            images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
        except Exception as e:
            raise ParsingError(f"Failed to extract PDF text: {e}") from e

-    def
+    def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
        temp_files: list[Path] = []
        try:
            with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
                    with pdf_resources_sync(bitmap, page):
                        pil_image.close()

-
+            content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
+            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

        except Exception as e:
            raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
            with contextlib.suppress(OSError):
                p.unlink()

-    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        backend = get_ocr_backend(
+    def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
+        backend = get_ocr_backend(ocr_backend)
        paths = [Path(p) for p in image_paths]

-
-        case "tesseract":
-            config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(config))
-        case "paddleocr":
-            paddle_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(paddle_config))
-        case "easyocr":
-            easy_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(easy_config))
-        case _:
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        results = backend.process_batch_sync(paths, **self.config.get_config_dict())

        return "\n\n".join(result.content for result in results)
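The rewritten extract_path_sync is now a first-success chain: searchable text if it validates, then OCR if a backend is configured, then an empty result, with metadata and tables attached to whichever result won. A minimal sketch of that control-flow pattern, with placeholder steps rather than the package's real extractors:

```python
from collections.abc import Callable

def first_success(steps: list[Callable[[], str | None]]) -> str:
    """Try each step in order; the first truthy result wins, else empty."""
    for step in steps:
        try:
            content = step()
        except Exception:
            content = None  # a failing step just falls through to the next
        if content:
            return content
    return ""

text = first_success([
    lambda: None,        # stands in for searchable-text extraction not validating
    lambda: "ocr text",  # stands in for the OCR pass
])
assert text == "ocr text"
```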
kreuzberg/_language_detection.py
CHANGED

@@ -31,5 +31,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
            langs = [result["lang"].lower() for result in results if result.get("lang")]
            return langs if langs else None
        return None
+    except (RuntimeError, OSError, MemoryError):
+        raise
    except Exception:  # noqa: BLE001
        return None
kreuzberg/_ocr/_tesseract.py
CHANGED

@@ -1113,6 +1113,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            **run_config["remaining_kwargs"],
            "language": run_config["language"],
            "psm": run_config["psm"],
+            "tesseract_format": run_config["tesseract_format"],
+            "ext": run_config["ext"],
+            "output_format": run_config["output_format"],
+            "enable_table_detection": run_config["enable_table_detection"],
        }

        optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
@@ -1222,13 +1226,21 @@ def _process_image_with_tesseract(
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    try:
-
-
+        tesseract_format = config_dict.get("tesseract_format", "text")
+        ext = config_dict.get("ext", ".txt")
+        output_format = config_dict.get("output_format", "text")
+        config_dict.get("enable_table_detection", False)
+
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
+            output_base = tmp_file.name.replace(ext, "")

        try:
            language = config_dict.get("language", "eng")
            psm = config_dict.get("psm", 3)

+            # Convert PSM enum to integer value if needed
+            psm_value = psm.value if hasattr(psm, "value") else psm
+
            command = [
                "tesseract",
                image_path,
@@ -1236,13 +1248,16 @@
                "-l",
                language,
                "--psm",
-                str(
+                str(psm_value),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]

+            if tesseract_format != "text":
+                command.append(tesseract_format)
+
            boolean_options = [
                "classify_use_pre_adapted_templates",
                "language_model_ngram_on",
@@ -1275,10 +1290,17 @@
            if result.returncode != 0:
                raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

-            output_file = output_base +
+            output_file = output_base + ext
            with Path(output_file).open(encoding="utf-8") as f:
                text = f.read()

+            # Process based on output format
+            if output_format == "markdown" and tesseract_format == "hocr":
+                # Import here to avoid circular dependency ~keep
+                from html_to_markdown import convert_to_markdown  # noqa: PLC0415
+
+                text = convert_to_markdown(text, heading_style="atx")
+
            text = normalize_spaces(text)

            return {
@@ -1289,8 +1311,8 @@
            }

        finally:
-            for
-                temp_file = output_base +
+            for possible_ext in [ext, ".txt", ".hocr", ".tsv"]:
+                temp_file = output_base + possible_ext
                temp_path = Path(temp_file)
                if temp_path.exists():
                    temp_path.unlink()
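The worker now accepts psm either as a plain integer or as an enum member, normalizing before building the command line. The same one-liner in isolation (PSMMode below is a hypothetical stand-in, not the package's enum):

```python
from enum import Enum

class PSMMode(Enum):  # hypothetical stand-in for whatever enum callers pass
    AUTO = 3
    SINGLE_BLOCK = 6

def normalize_psm(psm: PSMMode | int) -> int:
    # Same trick as the diff: unwrap .value if present, else use as-is.
    return psm.value if hasattr(psm, "value") else psm

assert normalize_psm(PSMMode.SINGLE_BLOCK) == 6
assert normalize_psm(3) == 3
```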
kreuzberg/_types.py
CHANGED

@@ -32,6 +32,7 @@ if TYPE_CHECKING:

OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
+ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]


class ConfigDict:
@@ -503,6 +504,17 @@ class SpacyEntityExtractionConfig(ConfigDict):
        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None


+class ProcessingErrorDict(TypedDict):
+    feature: str
+    """Name of the feature that failed (e.g., 'chunking', 'entity_extraction', 'keyword_extraction')."""
+    error_type: str
+    """Type of the exception that occurred (e.g., 'RuntimeError', 'ValidationError')."""
+    error_message: str
+    """Human-readable error message."""
+    traceback: str
+    """Full Python traceback for debugging."""
+
+
class BoundingBox(TypedDict):
    left: int
    """X coordinate of the left edge."""
@@ -701,6 +713,10 @@ class Metadata(TypedDict, total=False):
    """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
    token_reduction: NotRequired[dict[str, float]]
    """Token reduction statistics including reduction ratios and counts."""
+    processing_errors: NotRequired[list[ProcessingErrorDict]]
+    """List of processing errors that occurred during extraction."""
+    extraction_error: NotRequired[dict[str, Any]]
+    """Error information for critical extraction failures."""


_VALID_METADATA_KEYS = {
@@ -756,6 +772,8 @@
    "message",
    "attributes",
    "token_reduction",
+    "processing_errors",
+    "extraction_error",
}
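With processing_errors promoted to a typed metadata key, partial failures can be inspected directly instead of by string matching. A sketch of consuming it after an extraction (the file path is illustrative; the key is only present when something failed):

```python
from kreuzberg import extract_file_sync

result = extract_file_sync("report.pdf")  # illustrative path

for err in result.metadata.get("processing_errors", []):
    # Each entry is a ProcessingErrorDict as defined above.
    print(f"{err['feature']}: {err['error_type']}: {err['error_message']}")
```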
kreuzberg/cli.py
CHANGED

@@ -168,31 +168,45 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
        input_text = sys.stdin.read()
        input_bytes = input_text.encode("utf-8")

-
-
-
-
-
-
-
-
-
-
-
-
-        except ImportError:  # pragma: no cover
-            content_str = input_bytes.decode("utf-8", errors="ignore").lower()
-            mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
+        # Detect MIME type from content
+        content_str = input_bytes.decode("utf-8", errors="ignore").lower()
+        if "<html" in content_str or "<!doctype html" in content_str or "<body" in content_str:
+            mime_type = "text/html"
+        elif (content_str.strip().startswith("{") and content_str.strip().endswith("}")) or (
+            content_str.strip().startswith("[") and content_str.strip().endswith("]")
+        ):
+            mime_type = "application/json"
+        elif content_str.strip().startswith("---") or ":" in content_str[:100]:
+            mime_type = "application/x-yaml"
+        else:
+            mime_type = "text/plain"

+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task("Extracting text...", total=None)
+                return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
            return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
    else:
-
-
-
-
-
-
-
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task(f"Extracting text from {file.name}...", total=None)
+                return extract_file_sync(str(file), config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
            return extract_file_sync(str(file), config=extraction_config)
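The stdin path now sniffs a MIME type from the content itself, checking HTML first, then JSON, then YAML, and defaulting to plain text. The heuristic is easy to exercise on its own; this sketch mirrors the branch order of the hunk above:

```python
def sniff_mime(raw: bytes) -> str:
    content = raw.decode("utf-8", errors="ignore").lower()
    stripped = content.strip()
    if "<html" in content or "<!doctype html" in content or "<body" in content:
        return "text/html"
    if (stripped.startswith("{") and stripped.endswith("}")) or (
        stripped.startswith("[") and stripped.endswith("]")
    ):
        return "application/json"
    if stripped.startswith("---") or ":" in content[:100]:
        return "application/x-yaml"
    return "text/plain"

assert sniff_mime(b"<!DOCTYPE html><html></html>") == "text/html"
assert sniff_mime(b'{"key": "value"}') == "application/json"
assert sniff_mime(b"---\nkey: value\n") == "application/x-yaml"
assert sniff_mime(b"just some words") == "text/plain"
```

Note that the YAML branch is permissive: any colon in the first 100 characters routes non-HTML, non-JSON input to application/x-yaml.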
kreuzberg/extraction.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import multiprocessing as mp
+import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import TYPE_CHECKING, Final, cast
@@ -10,6 +11,7 @@ import anyio
 from kreuzberg._chunker import get_chunker
 from kreuzberg._document_classification import auto_detect_document_type
 from kreuzberg._entity_extraction import extract_entities, extract_keywords
+from kreuzberg._error_handling import safe_feature_execution, should_exception_bubble_up
 from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
@@ -21,7 +23,7 @@ from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
-from kreuzberg.exceptions import ValidationError
+from kreuzberg.exceptions import KreuzbergError, ValidationError

 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -50,69 +52,107 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
 def _validate_and_post_process_helper(
     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
 ) -> ExtractionResult:
+    if result.metadata is None:
+        result.metadata = {}
+
     if config.chunk_content:
-        result.chunks =
-
-
-
+        result.chunks = safe_feature_execution(
+            feature_name="chunking",
+            execution_func=lambda: _handle_chunk_content(
+                mime_type=result.mime_type,
+                config=config,
+                content=result.content,
+            ),
+            default_value=[],
+            result=result,
        )

     if config.extract_entities:
-
-
+        result.entities = safe_feature_execution(
+            feature_name="entity_extraction",
+            execution_func=lambda: extract_entities(
                result.content,
                custom_patterns=config.custom_entity_patterns,
-            )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )

     if config.extract_keywords:
-
-
+        result.keywords = safe_feature_execution(
+            feature_name="keyword_extraction",
+            execution_func=lambda: extract_keywords(
                result.content,
                keyword_count=config.keyword_count,
-            )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )

     if config.auto_detect_language:
-        lang_config = config.language_detection_config
-        if lang_config is None:
-            from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415

-
+        def _detect_language() -> list[str]:
+            lang_config = config.language_detection_config
+            if lang_config is None:
+                from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
+
+                lang_config = LanguageDetectionConfig(model=config.language_detection_model)

-
-
-
+            return detect_languages(result.content, config=lang_config) or []
+
+        result.detected_languages = safe_feature_execution(
+            feature_name="language_detection",
+            execution_func=_detect_language,
+            default_value=[],
+            result=result,
        )

     if config.auto_detect_document_type:
-        result =
+        result = safe_feature_execution(
+            feature_name="document_type_detection",
+            execution_func=lambda: auto_detect_document_type(result, config, file_path=file_path),
+            default_value=result,
+            result=result,
+        )

     if config.token_reduction is not None and config.token_reduction.mode != "off":
-        original_content = result.content

-
-
-        language_hint = result.detected_languages[0]
+        def _apply_token_reduction() -> str:
+            original_content = result.content

-
-
-
-
+            language_hint = None
+            if result.detected_languages and len(result.detected_languages) > 0:
+                language_hint = result.detected_languages[0]
+
+            reduced_content = (
+                reduce_tokens(
+                    original_content,
+                    config=config.token_reduction,
+                    language=language_hint,
+                )
+                if config.token_reduction
+                else original_content
+            )
+            reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+            if result.metadata is not None:
+                result.metadata["token_reduction"] = {
+                    "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+                    "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+                    "original_characters": reduction_stats["original_characters"],
+                    "reduced_characters": reduction_stats["reduced_characters"],
+                    "original_tokens": reduction_stats["original_tokens"],
+                    "reduced_tokens": reduction_stats["reduced_tokens"],
+                }
+
+            return reduced_content
+
+        result.content = safe_feature_execution(
+            feature_name="token_reduction",
+            execution_func=_apply_token_reduction,
+            default_value=result.content,
+            result=result,
        )
-        reduction_stats = get_reduction_stats(original_content, reduced_content)
-
-        result.content = reduced_content
-        result.metadata["token_reduction"] = {
-            "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
-            "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
-            "original_characters": reduction_stats["original_characters"],
-            "reduced_characters": reduction_stats["reduced_characters"],
-            "original_tokens": reduction_stats["original_tokens"],
-            "reduced_tokens": reduction_stats["reduced_tokens"],
-        }

     return result
@@ -125,8 +165,22 @@ async def _validate_and_post_process_async(

     result = _validate_and_post_process_helper(result, config, file_path)

-    for post_processor in config.post_processing_hooks or []:
-
+    for i, post_processor in enumerate(config.post_processing_hooks or []):
+        try:
+            result = await run_maybe_sync(post_processor, result)
+        except (KreuzbergError, ValueError, RuntimeError, TypeError) as e:  # noqa: PERF203
+            if result.metadata is None:
+                result.metadata = {}
+            error_list = result.metadata.setdefault("processing_errors", [])
+            if isinstance(error_list, list):
+                error_list.append(
+                    {
+                        "feature": f"post_processing_hook_{i}",
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "traceback": traceback.format_exc(),
+                    }
+                )

     return result
@@ -260,22 +314,18 @@ async def batch_extract_file(
                    config,
                )
                results[index] = result
-            except Exception as e:
-
-
-
-
-
-
-
-
-                        index=index,
-                    ),
-                },
-                chunks=[],
+            except Exception as e:
+                if should_exception_bubble_up(e, "batch_processing"):
+                    raise
+
+                basic_result = _attempt_basic_extraction(
+                    None,
+                    None,
+                    e,
+                    index,
+                    file_path=str(path),
                )
-                results[index] =
+                results[index] = basic_result

    async with anyio.create_task_group() as tg:
        for i, path in enumerate(file_paths):
@@ -309,23 +359,12 @@ async def batch_extract_bytes(
            try:
                result = await extract_bytes(content, mime_type, config)
                results[index] = result
-            except Exception as e:
-
-
-
-
-
-                    "error_context": create_error_context(
-                        operation="batch_extract_bytes",
-                        error=e,
-                        index=index,
-                        mime_type=mime_type,
-                        content_size=len(content),
-                    ),
-                },
-                chunks=[],
-                )
-                results[index] = error_result
+            except Exception as e:
+                if should_exception_bubble_up(e, "batch_processing"):
+                    raise
+
+                basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+                results[index] = basic_result

    async with anyio.create_task_group() as tg:
        for i, (content, mime_type) in enumerate(contents):
@@ -334,6 +373,125 @@ async def batch_extract_bytes(
    return results


+def _attempt_basic_extraction(
+    content: bytes | None, mime_type: str | None, original_error: Exception, index: int, *, file_path: str | None = None
+) -> ExtractionResult:
+    """Attempt basic extraction when full extraction fails, preserving as much as possible.
+
+    This function tries to extract at least basic text content even when advanced
+    features like OCR, entity extraction, etc. fail.
+
+    Args:
+        content: The raw content bytes (None for file extractions)
+        mime_type: The MIME type of the content (None if unknown)
+        original_error: The exception that caused the main extraction to fail
+        index: Index of this content in the batch
+        file_path: Optional file path for file-based extractions
+
+    Returns:
+        A basic ExtractionResult with whatever could be extracted
+    """
+    if (
+        isinstance(original_error, (ValueError, TypeError, ValidationError))
+        or "mock" in str(type(original_error)).lower()
+    ):
+        return ExtractionResult(
+            content=f"Error: {type(original_error).__name__}: {original_error!s}",
+            mime_type="text/plain",
+            metadata={
+                "error": f"{type(original_error).__name__}: {original_error!s}",
+                "error_context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content) if content else 0,
+                    file_path=file_path,
+                ),
+            },
+            chunks=[],
+            entities=[],
+            keywords=[],
+            detected_languages=[],
+            tables=[],
+            images=[],
+            image_ocr_results=[],
+        )
+
+    try:
+        if content is None:
+            return ExtractionResult(
+                content=f"Error: {type(original_error).__name__}: {original_error!s}",
+                mime_type="text/plain",
+                metadata={
+                    "error": f"{type(original_error).__name__}: {original_error!s}",
+                    "error_context": create_error_context(
+                        operation="batch_extract_file",
+                        error=original_error,
+                        index=index,
+                        file_path=file_path,
+                    ),
+                },
+                chunks=[],
+                entities=[],
+                keywords=[],
+                detected_languages=[],
+                tables=[],
+                images=[],
+                image_ocr_results=[],
+            )
+
+        mime_type = validate_mime_type(mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=ExtractionConfig()):
+            basic_result = extractor.extract_bytes_sync(content)
+
+            if basic_result.metadata is None:
+                basic_result.metadata = {}
+
+            basic_result.metadata["extraction_error"] = {
+                "error_type": type(original_error).__name__,
+                "error_message": str(original_error),
+                "traceback": traceback.format_exc(),
+                "context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content),
+                    file_path=file_path,
+                ),
+                "recovery_mode": "basic_extraction",
+            }
+
+            return basic_result
+
+    except (KreuzbergError, ValueError, RuntimeError, TypeError):
+        pass
+
+    return ExtractionResult(
+        content=f"Error: {type(original_error).__name__}: {original_error!s}",
+        mime_type="text/plain",
+        metadata={
+            "error": f"{type(original_error).__name__}: {original_error!s}",
+            "error_context": create_error_context(
+                operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                error=original_error,
+                index=index,
+                mime_type=mime_type,
+                content_size=len(content) if content else 0,
+                file_path=file_path,
+            ),
+        },
+        chunks=[],
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
+
+
 def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
     """Synchronous version of extract_bytes.

@@ -444,21 +602,18 @@ def batch_extract_file_sync(
                index,
                extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
            )
-        except Exception as e:
-
-
-
-
-
-
-
-
-
-                ),
-            },
-            chunks=[],
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(
+                None,
+                None,
+                e,
+                index,
+                file_path=str(file_path),
            )
-            return (index,
+            return (index, basic_result)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
@@ -494,23 +649,12 @@ def batch_extract_bytes_sync(
        """Extract single content with index for ordering."""
        try:
            return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
-        except Exception as e:
-
-
-
-
-
-                "error_context": create_error_context(
-                    operation="batch_extract_bytes_sync",
-                    error=e,
-                    index=index,
-                    mime_type=mime_type,
-                    content_size=len(content),
-                ),
-            },
-            chunks=[],
-            )
-            return (index, error_result)
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            return (index, basic_result)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
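The net effect on the batch APIs: fatal errors re-raise immediately, while any other per-item failure yields a best-effort result whose metadata carries error / extraction_error context instead of aborting the whole batch. A sketch, assuming the public sync API (the malformed PDF bytes are illustrative and merely likely to fail):

```python
from kreuzberg import batch_extract_bytes_sync

payloads = [
    (b"plain text that extracts fine", "text/plain"),
    (b"%PDF-1.7 truncated garbage", "application/pdf"),  # likely to fail
]

results = batch_extract_bytes_sync(payloads)  # one result per payload, order preserved

for result in results:
    degraded = "error" in result.metadata or "extraction_error" in result.metadata
    print("degraded" if degraded else "ok", "->", result.content[:60])
```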
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.18.0
+Version: 3.19.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -42,6 +42,7 @@ Requires-Dist: psutil>=7.1.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.5.3
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: transformers>=4.30.0
 Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
 Provides-Extra: additional-extensions
 Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
@@ -63,6 +64,7 @@ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
+Requires-Dist: transformers>=4.25.0; extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
 Provides-Extra: chunking
@@ -82,6 +84,7 @@ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
 Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Requires-Dist: transformers>=4.25.0; extra == 'gmft'
 Provides-Extra: langdetect
 Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
 Provides-Extra: paddleocr
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/RECORD
CHANGED

@@ -4,30 +4,31 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
 kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
 kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
 kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
-kreuzberg/_entity_extraction.py,sha256=
+kreuzberg/_entity_extraction.py,sha256=Ks-1gZIYDqgg2uJerd0FH_lYhjIwS0f0bMVhR9M59jA,7518
+kreuzberg/_error_handling.py,sha256=Isr9yrY4JRKOmUVaUOky_LZ7tGVZAm8jxRD3qGbkc1g,5604
 kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
-kreuzberg/_language_detection.py,sha256=
+kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD2Ks,1143
 kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
 kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
 kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
-kreuzberg/_types.py,sha256=
-kreuzberg/cli.py,sha256=
+kreuzberg/_types.py,sha256=6oBsmUUihVr4hJJrYeuWoUVzCP_-eciCrBVvGQHQTDI,49920
+kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
 kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
-kreuzberg/extraction.py,sha256=
+kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
-kreuzberg/_api/main.py,sha256=
+kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_extractors/_base.py,sha256=
+kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
 kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
-kreuzberg/_extractors/_html.py,sha256=
+kreuzberg/_extractors/_html.py,sha256=vNAgBrfok-16SOkhhsy10unqVwAczlTL_2KEn2X6S98,6315
 kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
 kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
-kreuzberg/_extractors/_pdf.py,sha256=
+kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
 kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
 kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
-kreuzberg/_extractors/_structured.py,sha256=
+kreuzberg/_extractors/_structured.py,sha256=thpXhsBnvaHzGQX4sy6eVHowFv0yaYxLGHwxx4DouCI,8947
 kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
 kreuzberg/_mcp/server.py,sha256=71MhjiFDwgFROdGejf0djgO1eG370qudWmZsN59CUeA,16743
 kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
@@ -35,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
 kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
 kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
 kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
-kreuzberg/_ocr/_tesseract.py,sha256=
+kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
 kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
 kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
 kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
@@ -121,8 +122,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
 kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
 kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
 kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
+kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
+kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.19.0.dist-info/RECORD,,
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL: file without changes
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt: file without changes
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE: file without changes