PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_config.py +248 -204
kreuzberg/_document_classification.py +0 -8
kreuzberg/_entity_extraction.py +1 -93
kreuzberg/_extractors/_base.py +0 -5
kreuzberg/_extractors/_email.py +1 -11
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -23
kreuzberg/_extractors/_pandoc.py +10 -89
kreuzberg/_extractors/_pdf.py +39 -92
kreuzberg/_extractors/_presentation.py +0 -17
kreuzberg/_extractors/_spread_sheet.py +13 -53
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -138
kreuzberg/_language_detection.py +1 -22
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -2
kreuzberg/_ocr/_easyocr.py +21 -108
kreuzberg/_ocr/_paddleocr.py +16 -94
kreuzberg/_ocr/_table_extractor.py +260 -0
kreuzberg/_ocr/_tesseract.py +906 -264
kreuzberg/_playa.py +5 -4
kreuzberg/_types.py +638 -40
kreuzberg/_utils/_cache.py +88 -90
kreuzberg/_utils/_device.py +0 -18
kreuzberg/_utils/_document_cache.py +0 -2
kreuzberg/_utils/_errors.py +0 -3
kreuzberg/_utils/_pdf_lock.py +0 -2
kreuzberg/_utils/_process_pool.py +19 -19
kreuzberg/_utils/_quality.py +0 -43
kreuzberg/_utils/_ref.py +48 -0
kreuzberg/_utils/_serialization.py +0 -5
kreuzberg/_utils/_string.py +9 -39
kreuzberg/_utils/_sync.py +0 -1
kreuzberg/_utils/_table.py +50 -57
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
kreuzberg-3.13.0.dist-info/RECORD +56 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_utils/_ref.py ADDED Viewed

@@ -0,0 +1,48 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, cast
+if TYPE_CHECKING:
+    from collections.abc import Callable
+T = TypeVar("T")
+class Ref(Generic[T]):
+    """A reference container that manages singleton instances without global variables.
+    This provides a clean alternative to global variables by using a registry pattern
+    with type safety.
+    """
+    _instances: ClassVar[dict[str, Any]] = {}
+    def __init__(self, name: str, factory: Callable[[], T]) -> None:
+        """Initialize a reference container.
+        Args:
+            name: Unique name for this reference
+            factory: Factory function to create the instance when needed
+        """
+        self.name = name
+        self.factory = factory
+    def get(self) -> T:
+        """Get the singleton instance, creating it if it doesn't exist."""
+        if self.name not in self._instances:
+            self._instances[self.name] = self.factory()
+        return cast("T", self._instances[self.name])
+    def clear(self) -> None:
+        """Clear the singleton instance."""
+        if self.name in self._instances:
+            del self._instances[self.name]
+    def is_initialized(self) -> bool:
+        """Check if the singleton instance exists."""
+        return self.name in self._instances
+    @classmethod
+    def clear_all(cls) -> None:
+        """Clear all singleton instances."""
+        cls._instances.clear()

kreuzberg/_utils/_serialization.py CHANGED Viewed

@@ -1,5 +1,3 @@
-"""Fast serialization utilities using msgspec."""
 from __future__ import annotations
 from dataclasses import is_dataclass
@@ -12,7 +10,6 @@ from msgspec.msgpack import decode, encode
 T = TypeVar("T")
-# Define dict method names in priority order
 _DICT_METHOD_NAMES = (
     "to_dict",
     "as_dict",
@@ -32,14 +29,12 @@ def encode_hook(obj: Any) -> Any:
     if isinstance(obj, Exception):
         return {"message": str(obj), "type": type(obj).__name__}
-    # Check for dict-like methods more efficiently using any() with generator
     for attr_name in _DICT_METHOD_NAMES:
         method = getattr(obj, attr_name, None)
         if method is not None and callable(method):
             return method()
     if is_dataclass(obj) and not isinstance(obj, type):
-        # Use msgspec.to_builtins for more efficient conversion
         return msgspec.to_builtins(obj)
     if hasattr(obj, "save") and hasattr(obj, "format"):

kreuzberg/_utils/_string.py CHANGED Viewed

@@ -7,28 +7,21 @@ from functools import lru_cache
 import chardetng_py
-# Compile regex patterns once at module level for performance
 _WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
 _NEWLINES_PATTERN = re.compile(r"\n+")
 _MOJIBAKE_PATTERNS = {
-    # Hebrew as Cyrillic patterns
     "hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
-    # Control characters that shouldn't appear in text
     "control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
-    # Unicode replacement characters
     "replacement_chars": re.compile(r"\uFFFD+"),
-    # Isolated combining marks (likely encoding issues)
     "isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
 }
-# Simple cache for encoding detection (in-memory, session-scoped)
 _encoding_cache: dict[str, str] = {}
 @lru_cache(maxsize=128)
 def _get_encoding_cache_key(data_hash: str, size: int) -> str:
     """Generate cache key for encoding detection."""
-    # Use string interpolation which is faster than format strings for simple cases
     return f"{data_hash}:{size}"
@@ -45,14 +38,12 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     if not byte_data:
         return ""
-    # Try provided encoding first (fastest path)
     if encoding:
         with suppress(UnicodeDecodeError, LookupError):
             decoded = byte_data.decode(encoding)
             return _fix_mojibake(decoded)
-    # Check cache for similar content (performance optimization)
-    data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]  # Hash first 1KB
+    data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]
     cache_key = _get_encoding_cache_key(data_hash, len(byte_data))
     if cache_key in _encoding_cache:
@@ -61,25 +52,22 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
             decoded = byte_data.decode(cached_encoding)
             return _fix_mojibake(decoded)
-    # Use chardetng for better performance than charset-normalizer
     detected_encoding = chardetng_py.detect(byte_data)
     if detected_encoding:
         with suppress(UnicodeDecodeError, LookupError):
             decoded = byte_data.decode(detected_encoding)
-            # Cache successful encoding detection
-            if len(_encoding_cache) < 1000:  # Prevent unlimited growth
+            if len(_encoding_cache) < 1000:  # Prevent unlimited growth ~keep
                 _encoding_cache[cache_key] = detected_encoding
             return _fix_mojibake(decoded)
-    # Try multiple encodings with confidence scoring
     encodings_to_try = [
         "utf-8",
-        "windows-1255",  # Hebrew
-        "iso-8859-8",  # Hebrew
-        "windows-1256",  # Arabic
-        "iso-8859-6",  # Arabic
-        "windows-1252",  # Western European
-        "cp1251",  # Cyrillic
+        "windows-1255",  # Hebrew ~keep
+        "iso-8859-8",  # Hebrew ~keep
+        "windows-1256",  # Arabic ~keep
+        "iso-8859-6",  # Arabic ~keep
+        "windows-1252",  # Western European ~keep
+        "cp1251",  # Cyrillic ~keep
     ]
     best_result = None
@@ -96,7 +84,6 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     if best_result and best_confidence > 0.5:
         return _fix_mojibake(best_result)
-    # Final fallback
     return byte_data.decode("latin-1", errors="replace")
@@ -109,25 +96,19 @@ def _calculate_text_confidence(text: str) -> float:
     if total_chars == 0:
         return 0.0
-    # Check for common encoding problems - compile patterns once
     replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
     control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
-    # Penalize replacement and control characters
     penalty = (replacement_count + control_count * 2) / total_chars
-    # Bonus for readable character ranges - more efficient counting
-    # Use generator expression with early termination
     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
     readability_score = readable_chars / total_chars
-    # Check for suspicious Cyrillic that might be misencoded Hebrew
     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
     if cyrillic_matches:
-        # Calculate total length more efficiently
         cyrillic_length = sum(len(match) for match in cyrillic_matches)
         if cyrillic_length > total_chars * 0.1:
-            penalty += 0.3  # Heavy penalty for likely mojibake
+            penalty += 0.3
     return max(0.0, min(1.0, readability_score - penalty))
@@ -137,19 +118,13 @@ def _fix_mojibake(text: str) -> str:
     if not text:
         return text
-    # Remove control characters
     text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)
-    # Remove replacement characters
     text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)
-    # Remove isolated combining marks
     text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)
-    # Try to fix Hebrew encoded as Cyrillic (common Windows-1255 -> CP1251 confusion)
     if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
-        # This is a heuristic fix - in practice, you'd need actual character mapping
-        # For now, we flag it for manual review by keeping the text but adding a marker
         pass
     return text
@@ -167,19 +142,14 @@ def normalize_spaces(text: str) -> str:
     if not text or not text.strip():
         return ""
-    # Split by double newlines to preserve paragraph breaks
     paragraphs = text.split("\n\n")
     result_paragraphs = []
     for paragraph in paragraphs:
-        # Use pre-compiled patterns for better performance
-        # Replace multiple whitespace (except newlines) with single space
         cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
-        # Clean up multiple newlines within paragraph (keep single newlines)
         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
-        # Process lines efficiently - manual loop avoids double strip() calls
         lines = []
         for line in cleaned.split("\n"):
             stripped_line = line.strip()

kreuzberg/_utils/_sync.py CHANGED Viewed

@@ -28,7 +28,6 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
     Returns:
         The result of the synchronous function.
     """
-    # Optimize: only create partial if we have kwargs
     if kwargs:
         handler = partial(sync_fn, **kwargs)
         return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]

kreuzberg/_utils/_table.py CHANGED Viewed

@@ -1,8 +1,6 @@
-"""Table processing and export utilities."""
 from __future__ import annotations
-import csv
+import io
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
@@ -22,9 +20,10 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
     if "df" not in table or table["df"] is None:
         return ""
-    # Use pandas to_csv() direct string return instead of StringIO
-    csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
-    return str(csv_output).strip()
+    buffer = io.StringIO()
+    df = table["df"]
+    df.write_csv(buffer, separator=separator, include_header=True)
+    return buffer.getvalue().strip()
 def export_table_to_tsv(table: TableData) -> str:
@@ -53,24 +52,19 @@ def enhance_table_markdown(table: TableData) -> str:
     df = table["df"]
-    if df.empty:
+    if df.is_empty():
         return table.get("text", "")
-    # Create enhanced markdown with proper alignment
     lines = []
-    # Header row
     headers = [str(col).strip() for col in df.columns]
     lines.append("| " + " | ".join(headers) + " |")
-    # Separator row with alignment hints
     lines.append(_generate_separator_row(df))
-    # Analyze float columns to determine formatting strategy
     float_col_formatting = _analyze_float_columns(df)
-    # Data rows with proper formatting
-    for _, row in df.iterrows():
+    for row in df.iter_rows(named=True):
         formatted_row = _format_table_row(row, df, float_col_formatting)
         lines.append("| " + " | ".join(formatted_row) + " |")
@@ -81,11 +75,11 @@ def _generate_separator_row(df: Any) -> str:
     """Generate separator row with proper alignment hints."""
     separators = []
     for col in df.columns:
-        # Check if column contains mostly numbers for right alignment
-        if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
-            separators.append("---:")  # Right align numbers
+        dtype_str = str(df[col].dtype)
+        if dtype_str in ["Int64", "Float64", "Int32", "Float32"] or _is_numeric_column(df[col]):
+            separators.append("---:")
         else:
-            separators.append("---")  # Left align text
+            separators.append("---")
     return "| " + " | ".join(separators) + " |"
@@ -93,12 +87,16 @@ def _analyze_float_columns(df: Any) -> dict[str, str]:
     """Analyze float columns to determine formatting strategy."""
     float_col_formatting = {}
     for col in df.columns:
-        if str(df[col].dtype) == "float64":
-            non_null_values = df[col].dropna()
+        dtype_str = str(df[col].dtype)
+        if dtype_str in ["Float64", "Float32"]:
+            non_null_values = df[col].drop_nulls()
             if len(non_null_values) > 0:
-                # If all non-null values are whole numbers, format as integers
-                all_integers = all(val.is_integer() for val in non_null_values)
-                float_col_formatting[col] = "int" if all_integers else "float"
+                try:
+                    values_list = non_null_values.to_list()
+                    all_integers = all(float(val).is_integer() for val in values_list if val is not None)
+                    float_col_formatting[col] = "int" if all_integers else "float"
+                except (ValueError, AttributeError):
+                    float_col_formatting[col] = "float"
             else:
                 float_col_formatting[col] = "int"
     return float_col_formatting
@@ -108,49 +106,47 @@ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -
     """Format a single table row with proper value formatting."""
     formatted_row = []
     for col_name, value in row.items():
-        if value is None or (isinstance(value, float) and str(value) == "nan"):
+        if value is None:
             formatted_row.append("")
-        elif str(df[col_name].dtype) in ["int64", "int32"]:
-            # For integer columns, format as integers
-            formatted_row.append(str(int(value)))
-        elif isinstance(value, float):
-            # For float columns, use the determined formatting strategy
-            if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+        else:
+            dtype_str = str(df[col_name].dtype)
+            if dtype_str in ["Int64", "Int32"]:
                 formatted_row.append(str(int(value)))
+            elif isinstance(value, float):
+                if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+                    formatted_row.append(str(int(value)))
+                else:
+                    formatted_row.append(f"{value:.2f}")
             else:
-                formatted_row.append(f"{value:.2f}")
-        else:
-            # Clean up text values
-            clean_value = str(value).strip().replace("|", "\\|")  # Escape pipes
-            formatted_row.append(clean_value)
+                clean_value = str(value).strip().replace("|", "\\|")
+                formatted_row.append(clean_value)
     return formatted_row
 def _is_numeric_column(series: Any) -> bool:
-    """Check if a pandas Series contains mostly numeric values."""
+    """Check if a polars Series contains mostly numeric values."""
     if len(series) == 0:
         return False
     try:
-        # Check if already numeric dtype first (fastest path)
-        if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
+        dtype_str = str(series.dtype)
+        if dtype_str in {"Int64", "Float64", "Int32", "Float32"}:
             return True
-        # Sample-based approach for large series (>1000 rows)
         sample_size = min(100, len(series))
-        if len(series) > 1000:
-            sample_series = series.dropna().sample(n=sample_size, random_state=42)
-        else:
-            sample_series = series.dropna()
+        series_no_nulls = series.drop_nulls()
+        if len(series_no_nulls) == 0:
+            return False
+        sample_series = series_no_nulls.slice(0, sample_size) if len(series_no_nulls) > 1000 else series_no_nulls
         if len(sample_series) == 0:
             return False
-        # Optimized numeric conversion - avoid exception overhead
         numeric_count = 0
-        for val in sample_series:
+        for val in sample_series.to_list():
             val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
-            # Quick check: if it contains only digits, decimal point, minus, plus, or e
             if val_str and all(c in "0123456789.-+eE" for c in val_str):
                 try:
                     float(val_str)
@@ -158,7 +154,6 @@ def _is_numeric_column(series: Any) -> bool:
                 except (ValueError, TypeError):
                     pass
-        # Consider numeric if >70% of sampled values are numeric
         return (numeric_count / len(sample_series)) > 0.7
     except (ValueError, TypeError, ZeroDivisionError):
@@ -190,8 +185,8 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
     for table in tables:
         if "df" in table and table["df"] is not None:
             df = table["df"]
-            total_rows += len(df)
-            total_columns += len(df.columns)
+            total_rows += df.height
+            total_columns += df.width
         if "page_number" in table:
             page_num = table["page_number"]
@@ -236,25 +231,23 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
     df = table["df"]
-    if df.empty:
+    if df.is_empty():
         return info
-    info["row_count"] = len(df)
-    info["column_count"] = len(df.columns)
-    info["has_headers"] = len(df.columns) > 0
+    info["row_count"] = df.height
+    info["column_count"] = df.width
+    info["has_headers"] = df.width > 0
-    # Analyze column types
     for col in df.columns:
         if _is_numeric_column(df[col]):
             info["numeric_columns"] += 1
         else:
             info["text_columns"] += 1
-    # Calculate data density
-    total_cells = len(df) * len(df.columns)
+    total_cells = df.height * df.width
     if total_cells > 0:
-        empty_cells = df.isnull().sum().sum()
-        info["empty_cells"] = int(empty_cells)
+        empty_cells = df.null_count().sum().item()
+        info["empty_cells"] = empty_cells
         info["data_density"] = (total_cells - empty_cells) / total_cells
     return info

kreuzberg/cli.py CHANGED Viewed

@@ -1,5 +1,3 @@
-"""Command-line interface for kreuzberg."""
 from __future__ import annotations
 import json
@@ -84,11 +82,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
     return "\n".join(output_parts)
-def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
+def _load_config(config_path: Path | None, verbose: bool) -> dict[str, Any]:
     """Load configuration from file or find default."""
     file_config = {}
-    if config:
-        file_config = load_config_from_file(config)
+    if config_path:
+        file_config = load_config_from_file(config_path)
     else:
         default_config = find_config_file()
         if default_config:
@@ -101,39 +99,38 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
     return file_config
-def _build_cli_args(
-    force_ocr: bool,
-    chunk_content: bool,
-    extract_tables: bool,
-    max_chars: int,
-    max_overlap: int,
-    ocr_backend: str | None,
-    tesseract_lang: str | None,
-    tesseract_psm: int | None,
-    easyocr_languages: str | None,
-    paddleocr_languages: str | None,
-) -> dict[str, Any]:
+def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
     """Build CLI arguments dictionary."""
     cli_args: dict[str, Any] = {
-        "force_ocr": force_ocr if force_ocr else None,
-        "chunk_content": chunk_content if chunk_content else None,
-        "extract_tables": extract_tables if extract_tables else None,
-        "max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
-        "max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
-        "ocr_backend": ocr_backend,
+        "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
+        "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
+        "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
+        "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
+        "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
+        "ocr_backend": params["ocr_backend"],
     }
-    if ocr_backend == "tesseract" and (tesseract_lang or tesseract_psm is not None):
+    ocr_backend = params["ocr_backend"]
+    if ocr_backend == "tesseract" and (
+        params["tesseract_lang"]
+        or params["tesseract_psm"] is not None
+        or params["tesseract_output_format"]
+        or params["enable_table_detection"]
+    ):
         tesseract_config = {}
-        if tesseract_lang:
-            tesseract_config["language"] = tesseract_lang
-        if tesseract_psm is not None:
-            tesseract_config["psm"] = tesseract_psm  # type: ignore[assignment]
+        if params["tesseract_lang"]:
+            tesseract_config["language"] = params["tesseract_lang"]
+        if params["tesseract_psm"] is not None:
+            tesseract_config["psm"] = params["tesseract_psm"]
+        if params["tesseract_output_format"]:
+            tesseract_config["output_format"] = params["tesseract_output_format"]
+        if params["enable_table_detection"]:
+            tesseract_config["enable_table_detection"] = True
         cli_args["tesseract_config"] = tesseract_config
-    elif ocr_backend == "easyocr" and easyocr_languages:
-        cli_args["easyocr_config"] = {"languages": easyocr_languages.split(",")}
-    elif ocr_backend == "paddleocr" and paddleocr_languages:
-        cli_args["paddleocr_config"] = {"languages": paddleocr_languages.split(",")}
+    elif ocr_backend == "easyocr" and params["easyocr_languages"]:
+        cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
+    elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
+        cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
     return cli_args
@@ -158,7 +155,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
             progress.add_task("Extracting text...", total=None)
             try:
-                import magic  # type: ignore[import-not-found]  # noqa: PLC0415
+                import magic  # type: ignore[import-not-found] # noqa: PLC0415
                 mime_type = magic.from_buffer(input_bytes, mime=True)
             except ImportError:  # pragma: no cover
@@ -188,7 +185,10 @@ def _write_output(
         if verbose:
             console.print(f"[green]✓[/green] Output written to: {output}")
     else:
-        click.echo(formatted_output)
+        try:
+            click.echo(formatted_output)
+        except UnicodeEncodeError:
+            sys.stdout.buffer.write(formatted_output.encode("utf-8"))
 def handle_error(error: Exception, verbose: bool) -> None:  # pragma: no cover
@@ -248,71 +248,51 @@ def cli(ctx: click.Context) -> None:
 @click.option(
     "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
 )
-@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
+@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
 @click.option("--show-metadata", is_flag=True, help="Include metadata in output")
 @click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
 @click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
 @click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
+@click.option(
+    "--tesseract-output-format",
+    type=click.Choice(["text", "markdown", "tsv", "hocr"]),
+    help="Tesseract OCR output format (default: markdown)",
+)
+@click.option(
+    "--enable-table-detection", is_flag=True, help="Enable table extraction from scanned documents (with TSV format)"
+)
 @click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
 @click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
 @click.pass_context
-def extract(  # noqa: PLR0913
-    _: click.Context,
-    file: Path | None,
-    output: Path | None,
-    force_ocr: bool,
-    chunk_content: bool,
-    extract_tables: bool,
-    max_chars: int,
-    max_overlap: int,
-    ocr_backend: str | None,
-    config: Path | None,
-    show_metadata: bool,
-    output_format: str,
-    verbose: bool,
-    tesseract_lang: str | None,
-    tesseract_psm: int | None,
-    easyocr_languages: str | None,
-    paddleocr_languages: str | None,
-) -> None:
+def extract(ctx: click.Context) -> None:
     """Extract text from a document.
     FILE can be a path to a document or '-' to read from stdin.
     If FILE is omitted, reads from stdin.
     """
+    params = ctx.params
     try:
-        file_config = _load_config(config, verbose)
-        cli_args = _build_cli_args(
-            force_ocr,
-            chunk_content,
-            extract_tables,
-            max_chars,
-            max_overlap,
-            ocr_backend,
-            tesseract_lang,
-            tesseract_psm,
-            easyocr_languages,
-            paddleocr_languages,
-        )
+        file_config = _load_config(params["config_file"], params["verbose"])
+        cli_args = _build_cli_args(params)
         extraction_config = build_extraction_config(file_config, cli_args)
-        result = _perform_extraction(file, extraction_config, verbose)
+        result = _perform_extraction(params["file"], extraction_config, params["verbose"])
-        _write_output(result, output, show_metadata, output_format, verbose)
+        _write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])
     except Exception as e:  # noqa: BLE001
-        handle_error(e, verbose)
+        handle_error(e, params["verbose"])
 @cli.command()
-@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
-def config(config: Path | None) -> None:
+@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
+def config(config_file: Path | None) -> None:
     """Show current configuration."""
     try:
-        config_path = config or find_config_file()
+        config_path = config_file or find_config_file()
         if config_path:
             file_config = load_config_from_file(config_path)

kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl