PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +212 -292
kreuzberg/_document_classification.py +20 -47
kreuzberg/_entity_extraction.py +1 -122
kreuzberg/_extractors/_base.py +4 -71
kreuzberg/_extractors/_email.py +1 -15
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -25
kreuzberg/_extractors/_pandoc.py +10 -147
kreuzberg/_extractors/_pdf.py +38 -94
kreuzberg/_extractors/_presentation.py +0 -99
kreuzberg/_extractors/_spread_sheet.py +13 -55
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -199
kreuzberg/_language_detection.py +1 -36
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -19
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +124 -186
kreuzberg/_ocr/_paddleocr.py +154 -224
kreuzberg/_ocr/_table_extractor.py +184 -0
kreuzberg/_ocr/_tesseract.py +797 -361
kreuzberg/_playa.py +5 -31
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +588 -93
kreuzberg/_utils/_cache.py +84 -138
kreuzberg/_utils/_device.py +0 -74
kreuzberg/_utils/_document_cache.py +0 -75
kreuzberg/_utils/_errors.py +0 -50
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -16
kreuzberg/_utils/_process_pool.py +17 -64
kreuzberg/_utils/_quality.py +0 -60
kreuzberg/_utils/_ref.py +32 -0
kreuzberg/_utils/_serialization.py +0 -30
kreuzberg/_utils/_string.py +9 -59
kreuzberg/_utils/_sync.py +0 -77
kreuzberg/_utils/_table.py +49 -101
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
kreuzberg-3.13.1.dist-info/RECORD +57 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_utils/_table.py CHANGED Viewed

@@ -1,8 +1,6 @@
-"""Table processing and export utilities."""
 from __future__ import annotations
-import csv
+import io
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
@@ -10,67 +8,38 @@ if TYPE_CHECKING:
 def export_table_to_csv(table: TableData, separator: str = ",") -> str:
-    r"""Export a TableData object to CSV/TSV format.
-    Args:
-        table: TableData object containing DataFrame
-        separator: Field separator ("," for CSV, "\t" for TSV)
-    Returns:
-        String representation in CSV/TSV format
-    """
     if "df" not in table or table["df"] is None:
         return ""
-    # Use pandas to_csv() direct string return instead of StringIO
-    csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
-    return str(csv_output).strip()
+    buffer = io.StringIO()
+    df = table["df"]
+    df.write_csv(buffer, separator=separator, include_header=True)
+    return buffer.getvalue().strip()
 def export_table_to_tsv(table: TableData) -> str:
-    """Export a TableData object to TSV format.
-    Args:
-        table: TableData object containing DataFrame
-    Returns:
-        String representation in TSV format
-    """
     return export_table_to_csv(table, separator="\t")
 def enhance_table_markdown(table: TableData) -> str:
-    """Generate enhanced markdown table with better formatting.
-    Args:
-        table: TableData object
-    Returns:
-        Enhanced markdown table string
-    """
     if "df" not in table or table["df"] is None:
         return table.get("text", "")
     df = table["df"]
-    if df.empty:
+    if df.is_empty():
         return table.get("text", "")
-    # Create enhanced markdown with proper alignment
     lines = []
-    # Header row
     headers = [str(col).strip() for col in df.columns]
     lines.append("| " + " | ".join(headers) + " |")
-    # Separator row with alignment hints
     lines.append(_generate_separator_row(df))
-    # Analyze float columns to determine formatting strategy
     float_col_formatting = _analyze_float_columns(df)
-    # Data rows with proper formatting
-    for _, row in df.iterrows():
+    for row in df.iter_rows(named=True):
         formatted_row = _format_table_row(row, df, float_col_formatting)
         lines.append("| " + " | ".join(formatted_row) + " |")
@@ -78,79 +47,77 @@ def enhance_table_markdown(table: TableData) -> str:
 def _generate_separator_row(df: Any) -> str:
-    """Generate separator row with proper alignment hints."""
     separators = []
     for col in df.columns:
-        # Check if column contains mostly numbers for right alignment
-        if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
-            separators.append("---:")  # Right align numbers
+        dtype_str = str(df[col].dtype)
+        if dtype_str in ["Int64", "Float64", "Int32", "Float32"] or _is_numeric_column(df[col]):
+            separators.append("---:")
         else:
-            separators.append("---")  # Left align text
+            separators.append("---")
     return "| " + " | ".join(separators) + " |"
 def _analyze_float_columns(df: Any) -> dict[str, str]:
-    """Analyze float columns to determine formatting strategy."""
     float_col_formatting = {}
     for col in df.columns:
-        if str(df[col].dtype) == "float64":
-            non_null_values = df[col].dropna()
+        dtype_str = str(df[col].dtype)
+        if dtype_str in ["Float64", "Float32"]:
+            non_null_values = df[col].drop_nulls()
             if len(non_null_values) > 0:
-                # If all non-null values are whole numbers, format as integers
-                all_integers = all(val.is_integer() for val in non_null_values)
-                float_col_formatting[col] = "int" if all_integers else "float"
+                try:
+                    values_list = non_null_values.to_list()
+                    all_integers = all(float(val).is_integer() for val in values_list if val is not None)
+                    float_col_formatting[col] = "int" if all_integers else "float"
+                except (ValueError, AttributeError):
+                    float_col_formatting[col] = "float"
             else:
                 float_col_formatting[col] = "int"
     return float_col_formatting
 def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
-    """Format a single table row with proper value formatting."""
     formatted_row = []
     for col_name, value in row.items():
-        if value is None or (isinstance(value, float) and str(value) == "nan"):
+        if value is None:
             formatted_row.append("")
-        elif str(df[col_name].dtype) in ["int64", "int32"]:
-            # For integer columns, format as integers
-            formatted_row.append(str(int(value)))
-        elif isinstance(value, float):
-            # For float columns, use the determined formatting strategy
-            if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+        else:
+            dtype_str = str(df[col_name].dtype)
+            if dtype_str in ["Int64", "Int32"]:
                 formatted_row.append(str(int(value)))
+            elif isinstance(value, float):
+                if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+                    formatted_row.append(str(int(value)))
+                else:
+                    formatted_row.append(f"{value:.2f}")
             else:
-                formatted_row.append(f"{value:.2f}")
-        else:
-            # Clean up text values
-            clean_value = str(value).strip().replace("|", "\\|")  # Escape pipes
-            formatted_row.append(clean_value)
+                clean_value = str(value).strip().replace("|", "\\|")
+                formatted_row.append(clean_value)
     return formatted_row
 def _is_numeric_column(series: Any) -> bool:
-    """Check if a pandas Series contains mostly numeric values."""
     if len(series) == 0:
         return False
     try:
-        # Check if already numeric dtype first (fastest path)
-        if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
+        dtype_str = str(series.dtype)
+        if dtype_str in {"Int64", "Float64", "Int32", "Float32"}:
             return True
-        # Sample-based approach for large series (>1000 rows)
         sample_size = min(100, len(series))
-        if len(series) > 1000:
-            sample_series = series.dropna().sample(n=sample_size, random_state=42)
-        else:
-            sample_series = series.dropna()
+        series_no_nulls = series.drop_nulls()
+        if len(series_no_nulls) == 0:
+            return False
+        sample_series = series_no_nulls.slice(0, sample_size) if len(series_no_nulls) > 1000 else series_no_nulls
         if len(sample_series) == 0:
             return False
-        # Optimized numeric conversion - avoid exception overhead
         numeric_count = 0
-        for val in sample_series:
+        for val in sample_series.to_list():
             val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
-            # Quick check: if it contains only digits, decimal point, minus, plus, or e
             if val_str and all(c in "0123456789.-+eE" for c in val_str):
                 try:
                     float(val_str)
@@ -158,7 +125,6 @@ def _is_numeric_column(series: Any) -> bool:
                 except (ValueError, TypeError):
                     pass
-        # Consider numeric if >70% of sampled values are numeric
         return (numeric_count / len(sample_series)) > 0.7
     except (ValueError, TypeError, ZeroDivisionError):
@@ -166,14 +132,6 @@ def _is_numeric_column(series: Any) -> bool:
 def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
-    """Generate summary statistics for extracted tables.
-    Args:
-        tables: List of TableData objects
-    Returns:
-        Dictionary with table statistics
-    """
     if not tables:
         return {
             "table_count": 0,
@@ -190,8 +148,8 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
     for table in tables:
         if "df" in table and table["df"] is not None:
             df = table["df"]
-            total_rows += len(df)
-            total_columns += len(df.columns)
+            total_rows += df.height
+            total_columns += df.width
         if "page_number" in table:
             page_num = table["page_number"]
@@ -213,14 +171,6 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
 def extract_table_structure_info(table: TableData) -> dict[str, Any]:
-    """Extract structural information from a table.
-    Args:
-        table: TableData object
-    Returns:
-        Dictionary with structural information
-    """
     info = {
         "has_headers": False,
         "row_count": 0,
@@ -236,25 +186,23 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
     df = table["df"]
-    if df.empty:
+    if df.is_empty():
         return info
-    info["row_count"] = len(df)
-    info["column_count"] = len(df.columns)
-    info["has_headers"] = len(df.columns) > 0
+    info["row_count"] = df.height
+    info["column_count"] = df.width
+    info["has_headers"] = df.width > 0
-    # Analyze column types
     for col in df.columns:
         if _is_numeric_column(df[col]):
             info["numeric_columns"] += 1
         else:
             info["text_columns"] += 1
-    # Calculate data density
-    total_cells = len(df) * len(df.columns)
+    total_cells = df.height * df.width
     if total_cells > 0:
-        empty_cells = df.isnull().sum().sum()
-        info["empty_cells"] = int(empty_cells)
+        empty_cells = df.null_count().sum().item()
+        info["empty_cells"] = empty_cells
         info["data_density"] = (total_cells - empty_cells) / total_cells
     return info

kreuzberg/_utils/_tmp.py CHANGED Viewed

@@ -16,15 +16,6 @@ if TYPE_CHECKING:  # pragma: no cover
 async def create_temp_file(
     extension: str, content: bytes | None = None
 ) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
-    """Create a temporary file that is closed.
-    Args:
-        extension: The file extension.
-        content: The content to write to the file.
-    Returns:
-        The temporary file path.
-    """
     file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
     if content:
         await AsyncPath(file.name).write_bytes(content)

kreuzberg/cli.py CHANGED Viewed

@@ -1,5 +1,3 @@
-"""Command-line interface for kreuzberg."""
 from __future__ import annotations
 import json
@@ -84,11 +82,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
     return "\n".join(output_parts)
-def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
+def _load_config(config_path: Path | None, verbose: bool) -> dict[str, Any]:
     """Load configuration from file or find default."""
     file_config = {}
-    if config:
-        file_config = load_config_from_file(config)
+    if config_path:
+        file_config = load_config_from_file(config_path)
     else:
         default_config = find_config_file()
         if default_config:
@@ -101,39 +99,38 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
     return file_config
-def _build_cli_args(
-    force_ocr: bool,
-    chunk_content: bool,
-    extract_tables: bool,
-    max_chars: int,
-    max_overlap: int,
-    ocr_backend: str | None,
-    tesseract_lang: str | None,
-    tesseract_psm: int | None,
-    easyocr_languages: str | None,
-    paddleocr_languages: str | None,
-) -> dict[str, Any]:
+def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
     """Build CLI arguments dictionary."""
     cli_args: dict[str, Any] = {
-        "force_ocr": force_ocr if force_ocr else None,
-        "chunk_content": chunk_content if chunk_content else None,
-        "extract_tables": extract_tables if extract_tables else None,
-        "max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
-        "max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
-        "ocr_backend": ocr_backend,
+        "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
+        "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
+        "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
+        "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
+        "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
+        "ocr_backend": params["ocr_backend"],
     }
-    if ocr_backend == "tesseract" and (tesseract_lang or tesseract_psm is not None):
+    ocr_backend = params["ocr_backend"]
+    if ocr_backend == "tesseract" and (
+        params["tesseract_lang"]
+        or params["tesseract_psm"] is not None
+        or params["tesseract_output_format"]
+        or params["enable_table_detection"]
+    ):
         tesseract_config = {}
-        if tesseract_lang:
-            tesseract_config["language"] = tesseract_lang
-        if tesseract_psm is not None:
-            tesseract_config["psm"] = tesseract_psm  # type: ignore[assignment]
+        if params["tesseract_lang"]:
+            tesseract_config["language"] = params["tesseract_lang"]
+        if params["tesseract_psm"] is not None:
+            tesseract_config["psm"] = params["tesseract_psm"]
+        if params["tesseract_output_format"]:
+            tesseract_config["output_format"] = params["tesseract_output_format"]
+        if params["enable_table_detection"]:
+            tesseract_config["enable_table_detection"] = True
         cli_args["tesseract_config"] = tesseract_config
-    elif ocr_backend == "easyocr" and easyocr_languages:
-        cli_args["easyocr_config"] = {"languages": easyocr_languages.split(",")}
-    elif ocr_backend == "paddleocr" and paddleocr_languages:
-        cli_args["paddleocr_config"] = {"languages": paddleocr_languages.split(",")}
+    elif ocr_backend == "easyocr" and params["easyocr_languages"]:
+        cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
+    elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
+        cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
     return cli_args
@@ -158,7 +155,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
             progress.add_task("Extracting text...", total=None)
             try:
-                import magic  # type: ignore[import-not-found]  # noqa: PLC0415
+                import magic  # type: ignore[import-not-found] # noqa: PLC0415
                 mime_type = magic.from_buffer(input_bytes, mime=True)
             except ImportError:  # pragma: no cover
@@ -188,7 +185,10 @@ def _write_output(
         if verbose:
             console.print(f"[green]✓[/green] Output written to: {output}")
     else:
-        click.echo(formatted_output)
+        try:
+            click.echo(formatted_output)
+        except UnicodeEncodeError:
+            sys.stdout.buffer.write(formatted_output.encode("utf-8"))
 def handle_error(error: Exception, verbose: bool) -> None:  # pragma: no cover
@@ -248,71 +248,51 @@ def cli(ctx: click.Context) -> None:
 @click.option(
     "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
 )
-@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
+@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
 @click.option("--show-metadata", is_flag=True, help="Include metadata in output")
 @click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
 @click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
 @click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
+@click.option(
+    "--tesseract-output-format",
+    type=click.Choice(["text", "markdown", "tsv", "hocr"]),
+    help="Tesseract OCR output format (default: markdown)",
+)
+@click.option(
+    "--enable-table-detection", is_flag=True, help="Enable table extraction from scanned documents (with TSV format)"
+)
 @click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
 @click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
 @click.pass_context
-def extract(  # noqa: PLR0913
-    _: click.Context,
-    file: Path | None,
-    output: Path | None,
-    force_ocr: bool,
-    chunk_content: bool,
-    extract_tables: bool,
-    max_chars: int,
-    max_overlap: int,
-    ocr_backend: str | None,
-    config: Path | None,
-    show_metadata: bool,
-    output_format: str,
-    verbose: bool,
-    tesseract_lang: str | None,
-    tesseract_psm: int | None,
-    easyocr_languages: str | None,
-    paddleocr_languages: str | None,
-) -> None:
+def extract(ctx: click.Context) -> None:
     """Extract text from a document.
     FILE can be a path to a document or '-' to read from stdin.
     If FILE is omitted, reads from stdin.
     """
+    params = ctx.params
     try:
-        file_config = _load_config(config, verbose)
-        cli_args = _build_cli_args(
-            force_ocr,
-            chunk_content,
-            extract_tables,
-            max_chars,
-            max_overlap,
-            ocr_backend,
-            tesseract_lang,
-            tesseract_psm,
-            easyocr_languages,
-            paddleocr_languages,
-        )
+        file_config = _load_config(params["config_file"], params["verbose"])
+        cli_args = _build_cli_args(params)
         extraction_config = build_extraction_config(file_config, cli_args)
-        result = _perform_extraction(file, extraction_config, verbose)
+        result = _perform_extraction(params["file"], extraction_config, params["verbose"])
-        _write_output(result, output, show_metadata, output_format, verbose)
+        _write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])
     except Exception as e:  # noqa: BLE001
-        handle_error(e, verbose)
+        handle_error(e, params["verbose"])
 @cli.command()
-@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
-def config(config: Path | None) -> None:
+@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
+def config(config_file: Path | None) -> None:
     """Show current configuration."""
     try:
-        config_path = config or find_config_file()
+        config_path = config_file or find_config_file()
         if config_path:
             file_config = load_config_from_file(config_path)

kreuzberg/extraction.py CHANGED Viewed

@@ -151,20 +151,22 @@ async def extract_file(
     """
     cache = get_document_cache()
     path = Path(file_path)
-    cached_result = cache.get(path, config)
-    if cached_result is not None:
-        return cached_result
-    if cache.is_processing(path, config):
-        event = cache.mark_processing(path, config)
-        await anyio.to_thread.run_sync(event.wait)  # pragma: no cover
-        # Try cache again after waiting for other process to complete  # ~keep
-        cached_result = cache.get(path, config)  # pragma: no cover
-        if cached_result is not None:  # pragma: no cover
+    if config.use_cache:
+        cached_result = cache.get(path, config)
+        if cached_result is not None:
             return cached_result
-    cache.mark_processing(path, config)
+        if cache.is_processing(path, config):
+            event = cache.mark_processing(path, config)
+            await anyio.to_thread.run_sync(event.wait)  # pragma: no cover
+            # Try cache again after waiting for other process to complete  # ~keep
+            cached_result = cache.get(path, config)  # pragma: no cover
+            if cached_result is not None:  # pragma: no cover
+                return cached_result
+        cache.mark_processing(path, config)
     try:
         if not path.exists():
@@ -183,11 +185,13 @@ async def extract_file(
         result = await _validate_and_post_process_async(result=result, config=config, file_path=path)
-        cache.set(path, config, result)
+        if config.use_cache:
+            cache.set(path, config, result)
         return result
     finally:
-        cache.mark_complete(path, config)
+        if config.use_cache:
+            cache.mark_complete(path, config)
 async def batch_extract_file(
@@ -224,7 +228,7 @@ async def batch_extract_file(
                     content=f"Error: {type(e).__name__}: {e!s}",
                     mime_type="text/plain",
                     metadata={  # type: ignore[typeddict-unknown-key]
-                        "error": True,
+                        "error": f"{type(e).__name__}: {e!s}",
                         "error_context": create_error_context(
                             operation="batch_extract_file",
                             file_path=path,
@@ -273,7 +277,7 @@ async def batch_extract_bytes(
                     content=f"Error: {type(e).__name__}: {e!s}",
                     mime_type="text/plain",
                     metadata={  # type: ignore[typeddict-unknown-key]
-                        "error": True,
+                        "error": f"{type(e).__name__}: {e!s}",
                         "error_context": create_error_context(
                             operation="batch_extract_bytes",
                             error=e,
@@ -336,20 +340,22 @@ def extract_file_sync(
     """
     cache = get_document_cache()
     path = Path(file_path)
-    cached_result = cache.get(path, config)
-    if cached_result is not None:
-        return cached_result
-    if cache.is_processing(path, config):
-        event = cache.mark_processing(path, config)
-        event.wait()  # pragma: no cover
-        # Try cache again after waiting for other process to complete  # ~keep
-        cached_result = cache.get(path, config)  # pragma: no cover
-        if cached_result is not None:  # pragma: no cover
+    if config.use_cache:
+        cached_result = cache.get(path, config)
+        if cached_result is not None:
             return cached_result
-    cache.mark_processing(path, config)
+        if cache.is_processing(path, config):
+            event = cache.mark_processing(path, config)
+            event.wait()  # pragma: no cover
+            # Try cache again after waiting for other process to complete  # ~keep
+            cached_result = cache.get(path, config)  # pragma: no cover
+            if cached_result is not None:  # pragma: no cover
+                return cached_result
+        cache.mark_processing(path, config)
     try:
         if not path.exists():
@@ -360,7 +366,7 @@ def extract_file_sync(
             result = extractor.extract_path_sync(Path(file_path))
         else:
             result = ExtractionResult(
-                content=Path(file_path).read_text(),
+                content=Path(file_path).read_text(encoding="utf-8"),
                 chunks=[],
                 mime_type=mime_type,
                 metadata={},
@@ -368,11 +374,13 @@ def extract_file_sync(
         result = _validate_and_post_process_sync(result=result, config=config, file_path=path)
-        cache.set(path, config, result)
+        if config.use_cache:
+            cache.set(path, config, result)
         return result
     finally:
-        cache.mark_complete(path, config)
+        if config.use_cache:
+            cache.mark_complete(path, config)
 def batch_extract_file_sync(
@@ -404,7 +412,7 @@ def batch_extract_file_sync(
                 content=f"Error: {type(e).__name__}: {e!s}",
                 mime_type="text/plain",
                 metadata={  # type: ignore[typeddict-unknown-key]
-                    "error": True,
+                    "error": f"{type(e).__name__}: {e!s}",
                     "error_context": create_error_context(
                         operation="batch_extract_file_sync",
                         file_path=file_path,
@@ -455,7 +463,7 @@ def batch_extract_bytes_sync(
                 content=f"Error: {type(e).__name__}: {e!s}",
                 mime_type="text/plain",
                 metadata={  # type: ignore[typeddict-unknown-key]
-                    "error": True,
+                    "error": f"{type(e).__name__}: {e!s}",
                     "error_context": create_error_context(
                         operation="batch_extract_bytes_sync",
                         error=e,
@@ -469,7 +477,6 @@ def batch_extract_bytes_sync(
             return (index, error_result)
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Avoid creating intermediate list, use enumerate directly
         future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
         results: list[ExtractionResult] = [None] * len(contents)  # type: ignore[list-item]

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl