kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_config.py +404 -0
- kreuzberg/_entity_extraction.py +4 -5
- kreuzberg/_extractors/_base.py +3 -5
- kreuzberg/_extractors/_image.py +18 -32
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +39 -57
- kreuzberg/_extractors/_spread_sheet.py +2 -3
- kreuzberg/_extractors/_structured.py +10 -7
- kreuzberg/_gmft.py +314 -10
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_ocr/__init__.py +1 -22
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +90 -1
- kreuzberg/_ocr/_tesseract.py +556 -5
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +46 -24
- kreuzberg/_utils/_cache.py +35 -4
- kreuzberg/_utils/_device.py +10 -20
- kreuzberg/_utils/_errors.py +44 -45
- kreuzberg/_utils/_process_pool.py +2 -6
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +4 -5
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +6 -24
- kreuzberg-3.8.2.dist-info/METADATA +265 -0
- kreuzberg-3.8.2.dist-info/RECORD +53 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg/_multiprocessing/__init__.py +0 -5
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_ocr/_pool.py +0 -357
- kreuzberg/_ocr/_sync.py +0 -566
- kreuzberg-3.8.0.dist-info/METADATA +0 -313
- kreuzberg-3.8.0.dist-info/RECORD +0 -57
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py
CHANGED
@@ -5,7 +5,10 @@ from collections.abc import Awaitable, Callable
 from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Literal, TypedDict
 
+import msgspec
+
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
+from kreuzberg._utils._table import export_table_to_csv, export_table_to_tsv, extract_table_structure_info
 from kreuzberg.exceptions import ValidationError
 
 if sys.version_info < (3, 11):  # pragma: no cover
@@ -191,7 +194,7 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
     return normalized
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class Entity:
     """Represents an extracted entity with type, text, and position."""
 
@@ -205,7 +208,7 @@ class Entity:
     """End character offset in the content"""
 
 
-@dataclass
+@dataclass(slots=True)
 class ExtractionResult:
     """The result of a file extraction."""
 
@@ -226,9 +229,29 @@ class ExtractionResult:
     detected_languages: list[str] | None = None
     """Languages detected in the extracted content, if language detection is enabled."""
 
-    def to_dict(self) -> dict[str, Any]:
-        """Converts the ExtractionResult to a dictionary.
-
+    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
+        """Converts the ExtractionResult to a dictionary.
+
+        Args:
+            include_none: If True, include fields with None values.
+                If False (default), exclude None values.
+
+        Returns:
+            Dictionary representation of the ExtractionResult.
+        """
+        # Use msgspec.to_builtins for efficient conversion
+        # The builtin_types parameter allows DataFrames to pass through
+        result = msgspec.to_builtins(
+            self,
+            builtin_types=(type(None),),  # Allow None to pass through
+            order="deterministic",  # Ensure consistent output
+        )
+
+        if include_none:
+            return result  # type: ignore[no-any-return]
+
+        # Remove None values to match expected behavior
+        return {k: v for k, v in result.items() if v is not None}
 
     def export_tables_to_csv(self) -> list[str]:
         """Export all tables to CSV format.
@@ -239,8 +262,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import export_table_to_csv
-
         return [export_table_to_csv(table) for table in self.tables]
 
     def export_tables_to_tsv(self) -> list[str]:
@@ -252,8 +273,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import export_table_to_tsv
-
         return [export_table_to_tsv(table) for table in self.tables]
 
     def get_table_summaries(self) -> list[dict[str, Any]]:
@@ -265,8 +284,6 @@ class ExtractionResult:
         if not self.tables:
            return []
 
-        from kreuzberg._utils._table import extract_table_structure_info
-
         return [extract_table_structure_info(table) for table in self.tables]
 
 
@@ -274,7 +291,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 
 
-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class ExtractionConfig:
     """Represents configuration settings for an extraction process.
 
@@ -355,18 +372,23 @@
        Returns:
            A dict of the OCR configuration or an empty dict if no backend is provided.
        """
-        if self.ocr_backend is
-
-            return asdict(self.ocr_config)
-        if self.ocr_backend == "tesseract":
-            from kreuzberg._ocr._tesseract import TesseractConfig
+        if self.ocr_backend is None:
+            return {}
 
-
-
-
+        if self.ocr_config is not None:
+            # Use asdict for OCR configs to preserve enum objects correctly
+            return asdict(self.ocr_config)
 
-
-
+        # Lazy load and cache default configs instead of creating new instances
+        if self.ocr_backend == "tesseract":
+            from kreuzberg._ocr._tesseract import TesseractConfig
 
-            return asdict(
-
+            return asdict(TesseractConfig())
+        if self.ocr_backend == "easyocr":
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            return asdict(EasyOCRConfig())
+        # paddleocr
+        from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+        return asdict(PaddleOCRConfig())
kreuzberg/_utils/_cache.py
CHANGED
@@ -64,11 +64,10 @@ class KreuzbergCache(Generic[T]):
        Returns:
            Unique cache key string
        """
-        # Use more efficient string building for cache key
        if not kwargs:
            return "empty"
 
-        # Build key
+        # Build cache key using list + join (faster than StringIO)
        parts = []
        for key in sorted(kwargs):
            value = kwargs[key]
@@ -81,6 +80,7 @@ class KreuzbergCache(Generic[T]):
                parts.append(f"{key}={type(value).__name__}:{value!s}")
 
        cache_str = "&".join(parts)
+        # SHA256 is secure and fast enough for cache keys
        return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
    def _get_cache_path(self, cache_key: str) -> Path:
@@ -102,15 +102,46 @@ class KreuzbergCache(Generic[T]):
 
    def _serialize_result(self, result: T) -> dict[str, Any]:
        """Serialize result for caching with metadata."""
+        # Handle TableData objects that contain DataFrames
+        if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
+            serialized_data = []
+            for item in result:
+                if isinstance(item, dict) and "df" in item:
+                    # Build new dict without unnecessary copy
+                    serialized_item = {k: v for k, v in item.items() if k != "df"}
+                    if hasattr(item["df"], "to_csv"):
+                        serialized_item["df_csv"] = item["df"].to_csv(index=False)
+                    else:
+                        # Fallback for non-DataFrame objects
+                        serialized_item["df_csv"] = str(item["df"])
+                    serialized_data.append(serialized_item)
+                else:
+                    serialized_data.append(item)
+            return {"type": "TableDataList", "data": serialized_data, "cached_at": time.time()}
+
        return {"type": type(result).__name__, "data": result, "cached_at": time.time()}
 
    def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
        """Deserialize cached result."""
        data = cached_data["data"]
 
-        if cached_data.get("type") == "
-            from
+        if cached_data.get("type") == "TableDataList" and isinstance(data, list):
+            from io import StringIO
+
+            import pandas as pd
 
+            deserialized_data = []
+            for item in data:
+                if isinstance(item, dict) and "df_csv" in item:
+                    # Build new dict without unnecessary copy
+                    deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
+                    deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
+                    deserialized_data.append(deserialized_item)
+                else:
+                    deserialized_data.append(item)
+            return deserialized_data  # type: ignore[return-value]
+
+        if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
            return ExtractionResult(**data)  # type: ignore[return-value]
 
        return data  # type: ignore[no-any-return]
kreuzberg/_utils/_device.py
CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
 
 import warnings
 from dataclasses import dataclass
+from itertools import chain
 from typing import Literal
 
 from kreuzberg.exceptions import ValidationError
@@ -12,7 +13,7 @@ from kreuzberg.exceptions import ValidationError
 DeviceType = Literal["cpu", "cuda", "mps", "auto"]
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class DeviceInfo:
     """Information about a compute device."""
 
@@ -34,28 +35,17 @@ def detect_available_devices() -> list[DeviceInfo]:
     Returns:
         List of available devices, with the most preferred device first.
     """
-
-
-    devices.append(
-        DeviceInfo(
-            device_type="cpu",
-            name="CPU",
-        )
-    )
-
-    if _is_cuda_available():
-        cuda_devices = _get_cuda_devices()
-        devices.extend(cuda_devices)
+    # Build device lists efficiently using generators
+    cpu_device = DeviceInfo(device_type="cpu", name="CPU")
 
-    if
-        mps_device = _get_mps_device()
-        if mps_device:
-            devices.append(mps_device)
+    cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
 
-
-
+    mps_device = _get_mps_device() if _is_mps_available() else None
+    mps_devices = [mps_device] if mps_device else []
 
-
+    # Return GPU devices first, then CPU using itertools.chain
+    gpu_devices = list(chain(cuda_devices, mps_devices))
+    return [*gpu_devices, cpu_device]
 
 
 def get_optimal_device() -> DeviceInfo:
kreuzberg/_utils/_errors.py
CHANGED
@@ -5,12 +5,48 @@ from __future__ import annotations
 import platform
 import traceback
 from datetime import datetime, timezone
-from
+from pathlib import Path
+from typing import Any
 
 import psutil
 
-
-
+from kreuzberg.exceptions import ValidationError
+
+# Define error keywords as frozensets for O(1) membership testing
+_SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
+_TRANSIENT_ERROR_PATTERNS = frozenset(
+    {
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    }
+)
+_RESOURCE_ERROR_PATTERNS = frozenset(
+    {
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    }
+)
 
 
 def create_error_context(
@@ -37,8 +73,6 @@ def create_error_context(
     }
 
     if file_path:
-        from pathlib import Path
-
         path = Path(file_path) if isinstance(file_path, str) else file_path
         context["file"] = {
             "path": str(path),
@@ -54,11 +88,7 @@ def create_error_context(
         "traceback": traceback.format_exception_only(type(error), error),
     }
 
-    if (
-        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
-        if error
-        else False
-    ):
+    if error and any(keyword in str(error).lower() for keyword in _SYSTEM_ERROR_KEYWORDS):
        try:
            mem = psutil.virtual_memory()
            context["system"] = {
@@ -96,25 +126,8 @@ def is_transient_error(error: Exception) -> bool:
     if isinstance(error, transient_types):
         return True
 
-    transient_patterns = [
-        "temporary",
-        "locked",
-        "in use",
-        "access denied",
-        "permission",
-        "timeout",
-        "connection",
-        "network",
-        "too many open files",
-        "cannot allocate memory",
-        "resource temporarily unavailable",
-        "broken pipe",
-        "subprocess",
-        "signal",
-    ]
-
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in
+    return any(pattern in error_str for pattern in _TRANSIENT_ERROR_PATTERNS)
 
 
 def is_resource_error(error: Exception) -> bool:
@@ -126,22 +139,8 @@ def is_resource_error(error: Exception) -> bool:
     Returns:
         True if the error is resource-related
     """
-    resource_patterns = [
-        "memory",
-        "out of memory",
-        "cannot allocate",
-        "too many open files",
-        "file descriptor",
-        "resource",
-        "exhausted",
-        "limit",
-        "cpu",
-        "thread",
-        "process",
-    ]
-
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in
+    return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
@@ -158,8 +157,6 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
     if attempt >= max_attempts:
         return False
 
-    from kreuzberg.exceptions import ValidationError
-
     if isinstance(error, ValidationError):
         return False
 
@@ -169,6 +166,8 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 class BatchExtractionResult:
     """Result container for batch operations with partial success support."""
 
+    __slots__ = ("failed", "successful", "total_count")
+
     def __init__(self) -> None:
         """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import io
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
 from contextlib import contextmanager
@@ -9,6 +10,7 @@ from typing import TYPE_CHECKING, Any, TypeVar
 
 import anyio
 import psutil
+import pypdfium2
 from typing_extensions import Self
 
 if TYPE_CHECKING:
@@ -59,8 +61,6 @@ def shutdown_process_pool() -> None:
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
     """Worker function for extracting PDF text in a separate process."""
-    import pypdfium2
-
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -81,10 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
     """Worker function for converting PDF to images in a separate process."""
-    import io
-
-    import pypdfium2
-
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
kreuzberg/_utils/_quality.py
CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import re
+from functools import reduce
 from typing import Any
 
 # Pre-compiled patterns for performance
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
     if not text:
         return text
 
-    # Remove script and style content
-
-        text = pattern.sub(" ", text)
+    # Remove script and style content using functools.reduce for single pass
+    text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
 
     # Clean OCR artifacts
     text = _clean_ocr_artifacts(text)
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
 
-
-    for pattern in _SCRIPT_PATTERNS.values()
-        matches = pattern.findall(text)
-        script_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, script_chars / total_chars)
 
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
 
-
-    for pattern in _NAVIGATION_PATTERNS.values()
-        matches = pattern.findall(text)
-        nav_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, nav_chars / total_chars)
 
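
Note on the _quality.py changes above: the explicit loops were folded into functools.reduce and generator-expression sums over the module's pre-compiled patterns. A self-contained sketch of the reduce step with simplified stand-in patterns (not the module's actual regexes):

import re
from functools import reduce

_SCRIPT_PATTERNS = {  # abbreviated stand-ins for illustration
    "script": re.compile(r"<script.*?</script>", re.DOTALL | re.IGNORECASE),
    "style": re.compile(r"<style.*?</style>", re.DOTALL | re.IGNORECASE),
}

text = "<script>x()</script>Hello <style>p{}</style>world"
# fold each pattern's substitution over the text, left to right
cleaned = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
print(cleaned)  # " Hello  world"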
kreuzberg/_utils/_serialization.py
CHANGED
@@ -2,16 +2,28 @@
 
 from __future__ import annotations
 
-from dataclasses import
-from enum import Enum
+from dataclasses import is_dataclass
 from typing import Any, TypeVar, cast
 
+import msgspec
 from msgspec import MsgspecError
 from msgspec.msgpack import decode, encode
 
 T = TypeVar("T")
 
 
+# Define dict method names in priority order
+_DICT_METHOD_NAMES = (
+    "to_dict",
+    "as_dict",
+    "dict",
+    "model_dump",
+    "json",
+    "to_list",
+    "tolist",
+)
+
+
 def encode_hook(obj: Any) -> Any:
     """Custom encoder for complex objects."""
     if callable(obj):
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
     if isinstance(obj, Exception):
         return {"message": str(obj), "type": type(obj).__name__}
 
-    for
-
-
-
-
-        "json",
-        "to_list",
-        "tolist",
-    ):
-        if hasattr(obj, key):
-            method = getattr(obj, key)  # Cache the attribute lookup
-            if callable(method):
-                return method()
+    # Check for dict-like methods more efficiently using any() with generator
+    for attr_name in _DICT_METHOD_NAMES:
+        method = getattr(obj, attr_name, None)
+        if method is not None and callable(method):
+            return method()
 
     if is_dataclass(obj) and not isinstance(obj, type):
-
+        # Use msgspec.to_builtins for more efficient conversion
+        return msgspec.to_builtins(obj)
 
     if hasattr(obj, "save") and hasattr(obj, "format"):
         return None
kreuzberg/_utils/_string.py
CHANGED
@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
 @lru_cache(maxsize=128)
 def _get_encoding_cache_key(data_hash: str, size: int) -> str:
     """Generate cache key for encoding detection."""
+    # Use string interpolation which is faster than format strings for simple cases
     return f"{data_hash}:{size}"
 
 
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
     if not text:
         return 0.0
 
-    # Check for common encoding problems
-    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
-    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
     total_chars = len(text)
-
     if total_chars == 0:
         return 0.0
 
+    # Check for common encoding problems - compile patterns once
+    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
+
     # Penalize replacement and control characters
     penalty = (replacement_count + control_count * 2) / total_chars
 
-    # Bonus for readable character ranges
+    # Bonus for readable character ranges - more efficient counting
+    # Use generator expression with early termination
     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
     readability_score = readable_chars / total_chars
 
     # Check for suspicious Cyrillic that might be misencoded Hebrew
     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
-    if cyrillic_matches
-
+    if cyrillic_matches:
+        # Calculate total length more efficiently
+        cyrillic_length = sum(len(match) for match in cyrillic_matches)
+        if cyrillic_length > total_chars * 0.1:
+            penalty += 0.3  # Heavy penalty for likely mojibake
 
     return max(0.0, min(1.0, readability_score - penalty))
 
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
 
     # Split by double newlines to preserve paragraph breaks
     paragraphs = text.split("\n\n")
-
+
+    result_paragraphs = []
 
     for paragraph in paragraphs:
         # Use pre-compiled patterns for better performance
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
         # Clean up multiple newlines within paragraph (keep single newlines)
         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
 
-        #
-        lines = [
+        # Process lines efficiently - manual loop avoids double strip() calls
+        lines = []
+        for line in cleaned.split("\n"):
+            stripped_line = line.strip()
+            if stripped_line:
+                lines.append(stripped_line)
 
         if lines:
-
+            result_paragraphs.append("\n".join(lines))
 
-    return "\n\n".join(
+    return "\n\n".join(result_paragraphs)
kreuzberg/_utils/_table.py
CHANGED
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import csv
-from io import StringIO
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
     if "df" not in table or table["df"] is None:
         return ""
 
-
-    table["df"].to_csv(
-    return
+    # Use pandas to_csv() direct string return instead of StringIO
+    csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
+    return str(csv_output).strip()
 
 
 def export_table_to_tsv(table: TableData) -> str:
kreuzberg/cli.py
CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import json
 import sys
+import traceback
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
@@ -17,7 +18,7 @@ except ImportError as e:
     ) from e
 
 from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
-from kreuzberg.
+from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
 from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
 
 DEFAULT_MAX_CHARACTERS = 4000
@@ -91,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
     if config:
         file_config = load_config_from_file(config)
     else:
-        default_config =
+        default_config = find_config_file()
         if default_config:
             try:
                 file_config = load_config_from_file(default_config)
@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
     else:
         console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
         if verbose:
-            import traceback
-
            console.print("\n[dim]Traceback:[/dim]")
            traceback.print_exc()
     sys.exit(1)
@@ -315,7 +314,7 @@ def extract(  # noqa: PLR0913
 def config(config: Path | None) -> None:
     """Show current configuration."""
     try:
-        config_path = config or
+        config_path = config or find_config_file()
 
         if config_path:
             file_config = load_config_from_file(config_path)