kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_table.py ADDED
@@ -0,0 +1,261 @@
+ """Table processing and export utilities."""
+
+ from __future__ import annotations
+
+ import csv
+ from io import StringIO
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from kreuzberg._types import TableData
+
+
+ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
+     r"""Export a TableData object to CSV/TSV format.
+
+     Args:
+         table: TableData object containing DataFrame
+         separator: Field separator ("," for CSV, "\t" for TSV)
+
+     Returns:
+         String representation in CSV/TSV format
+     """
+     if "df" not in table or table["df"] is None:
+         return ""
+
+     output = StringIO()
+     table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
+     return output.getvalue().strip()
+
+
+ def export_table_to_tsv(table: TableData) -> str:
+     """Export a TableData object to TSV format.
+
+     Args:
+         table: TableData object containing DataFrame
+
+     Returns:
+         String representation in TSV format
+     """
+     return export_table_to_csv(table, separator="\t")
+
+
+ def enhance_table_markdown(table: TableData) -> str:
+     """Generate enhanced markdown table with better formatting.
+
+     Args:
+         table: TableData object
+
+     Returns:
+         Enhanced markdown table string
+     """
+     if "df" not in table or table["df"] is None:
+         return table.get("text", "")
+
+     df = table["df"]
+
+     if df.empty:
+         return table.get("text", "")
+
+     # Create enhanced markdown with proper alignment
+     lines = []
+
+     # Header row
+     headers = [str(col).strip() for col in df.columns]
+     lines.append("| " + " | ".join(headers) + " |")
+
+     # Separator row with alignment hints
+     lines.append(_generate_separator_row(df))
+
+     # Analyze float columns to determine formatting strategy
+     float_col_formatting = _analyze_float_columns(df)
+
+     # Data rows with proper formatting
+     for _, row in df.iterrows():
+         formatted_row = _format_table_row(row, df, float_col_formatting)
+         lines.append("| " + " | ".join(formatted_row) + " |")
+
+     return "\n".join(lines)
+
+
+ def _generate_separator_row(df: Any) -> str:
+     """Generate separator row with proper alignment hints."""
+     separators = []
+     for col in df.columns:
+         # Check if column contains mostly numbers for right alignment
+         if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
+             separators.append("---:")  # Right align numbers
+         else:
+             separators.append("---")  # Left align text
+     return "| " + " | ".join(separators) + " |"
+
+
+ def _analyze_float_columns(df: Any) -> dict[str, str]:
+     """Analyze float columns to determine formatting strategy."""
+     float_col_formatting = {}
+     for col in df.columns:
+         if str(df[col].dtype) == "float64":
+             non_null_values = df[col].dropna()
+             if len(non_null_values) > 0:
+                 # If all non-null values are whole numbers, format as integers
+                 all_integers = all(val.is_integer() for val in non_null_values)
+                 float_col_formatting[col] = "int" if all_integers else "float"
+             else:
+                 float_col_formatting[col] = "int"
+     return float_col_formatting
+
+
+ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
+     """Format a single table row with proper value formatting."""
+     formatted_row = []
+     for col_name, value in row.items():
+         if value is None or (isinstance(value, float) and str(value) == "nan"):
+             formatted_row.append("")
+         elif str(df[col_name].dtype) in ["int64", "int32"]:
+             # For integer columns, format as integers
+             formatted_row.append(str(int(value)))
+         elif isinstance(value, float):
+             # For float columns, use the determined formatting strategy
+             if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+                 formatted_row.append(str(int(value)))
+             else:
+                 formatted_row.append(f"{value:.2f}")
+         else:
+             # Clean up text values
+             clean_value = str(value).strip().replace("|", "\\|")  # Escape pipes
+             formatted_row.append(clean_value)
+     return formatted_row
+
+
+ def _is_numeric_column(series: Any) -> bool:
+     """Check if a pandas Series contains mostly numeric values."""
+     if len(series) == 0:
+         return False
+
+     try:
+         # Check if already numeric dtype first (fastest path)
+         if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
+             return True
+
+         # Sample-based approach for large series (>1000 rows)
+         sample_size = min(100, len(series))
+         if len(series) > 1000:
+             sample_series = series.dropna().sample(n=sample_size, random_state=42)
+         else:
+             sample_series = series.dropna()
+
+         if len(sample_series) == 0:
+             return False
+
+         # Optimized numeric conversion - avoid exception overhead
+         numeric_count = 0
+         for val in sample_series:
+             val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
+             # Quick check: if it contains only digits, decimal point, minus, plus, or e
+             if val_str and all(c in "0123456789.-+eE" for c in val_str):
+                 try:
+                     float(val_str)
+                     numeric_count += 1
+                 except (ValueError, TypeError):
+                     pass
+
+         # Consider numeric if >70% of sampled values are numeric
+         return (numeric_count / len(sample_series)) > 0.7
+
+     except (ValueError, TypeError, ZeroDivisionError):
+         return False
+
+
+ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
+     """Generate summary statistics for extracted tables.
+
+     Args:
+         tables: List of TableData objects
+
+     Returns:
+         Dictionary with table statistics
+     """
+     if not tables:
+         return {
+             "table_count": 0,
+             "total_rows": 0,
+             "total_columns": 0,
+             "pages_with_tables": 0,
+         }
+
+     total_rows = 0
+     total_columns = 0
+     pages_with_tables = set()
+     tables_by_page = {}
+
+     for table in tables:
+         if "df" in table and table["df"] is not None:
+             df = table["df"]
+             total_rows += len(df)
+             total_columns += len(df.columns)
+
+         if "page_number" in table:
+             page_num = table["page_number"]
+             pages_with_tables.add(page_num)
+
+             if page_num not in tables_by_page:
+                 tables_by_page[page_num] = 0
+             tables_by_page[page_num] += 1
+
+     return {
+         "table_count": len(tables),
+         "total_rows": total_rows,
+         "total_columns": total_columns,
+         "pages_with_tables": len(pages_with_tables),
+         "avg_rows_per_table": total_rows / len(tables) if tables else 0,
+         "avg_columns_per_table": total_columns / len(tables) if tables else 0,
+         "tables_by_page": dict(tables_by_page),
+     }
+
+
+ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
+     """Extract structural information from a table.
+
+     Args:
+         table: TableData object
+
+     Returns:
+         Dictionary with structural information
+     """
+     info = {
+         "has_headers": False,
+         "row_count": 0,
+         "column_count": 0,
+         "numeric_columns": 0,
+         "text_columns": 0,
+         "empty_cells": 0,
+         "data_density": 0.0,
+     }
+
+     if "df" not in table or table["df"] is None:
+         return info
+
+     df = table["df"]
+
+     if df.empty:
+         return info
+
+     info["row_count"] = len(df)
+     info["column_count"] = len(df.columns)
+     info["has_headers"] = len(df.columns) > 0
+
+     # Analyze column types
+     for col in df.columns:
+         if _is_numeric_column(df[col]):
+             info["numeric_columns"] += 1
+         else:
+             info["text_columns"] += 1
+
+     # Calculate data density
+     total_cells = len(df) * len(df.columns)
+     if total_cells > 0:
+         empty_cells = df.isnull().sum().sum()
+         info["empty_cells"] = int(empty_cells)
+         info["data_density"] = (total_cells - empty_cells) / total_cells
+
+     return info
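
For orientation, a minimal usage sketch of the new module. It assumes pandas is installed and that TableData is a dict-like mapping with "df", "text", and "page_number" keys, which is how the helpers above read it; the sample DataFrame is illustrative, not from the package.

import pandas as pd

from kreuzberg._utils._table import (
    enhance_table_markdown,
    export_table_to_csv,
    generate_table_summary,
)

# Build a TableData-shaped mapping the way the helpers above consume it.
df = pd.DataFrame({"item": ["widget", "gadget"], "price": [9.99, 19.0]})
table = {"df": df, "text": df.to_string(), "page_number": 1}

print(export_table_to_csv(table))       # minimal-quoting CSV
print(enhance_table_markdown(table))    # markdown; numeric columns right-aligned
print(generate_table_summary([table]))  # {"table_count": 1, "total_rows": 2, ...}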
kreuzberg/_utils/_tmp.py CHANGED
@@ -3,14 +3,14 @@ from __future__ import annotations
  from contextlib import suppress
  from pathlib import Path
  from tempfile import NamedTemporaryFile
- from typing import TYPE_CHECKING, Callable
+ from typing import TYPE_CHECKING

  from anyio import Path as AsyncPath

  from kreuzberg._utils._sync import run_sync

  if TYPE_CHECKING:  # pragma: no cover
-     from collections.abc import Coroutine
+     from collections.abc import Callable, Coroutine


  async def create_temp_file(
kreuzberg/cli.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations

  import json
  import sys
+ import traceback

  from pathlib import Path
  from typing import TYPE_CHECKING, Any

@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
      else:
          console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
      if verbose:
-         import traceback
-
          console.print("\n[dim]Traceback:[/dim]")
          traceback.print_exc()
      sys.exit(1)
kreuzberg/extraction.py CHANGED
@@ -1,17 +1,23 @@
  from __future__ import annotations

+ import multiprocessing as mp
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
- from typing import TYPE_CHECKING, Final, cast
+ from typing import TYPE_CHECKING, Any, Final, cast

  import anyio

  from kreuzberg import ExtractionResult
  from kreuzberg._chunker import get_chunker
+ from kreuzberg._entity_extraction import extract_entities, extract_keywords
+ from kreuzberg._language_detection import detect_languages
  from kreuzberg._mime_types import (
      validate_mime_type,
  )
  from kreuzberg._registry import ExtractorRegistry
  from kreuzberg._types import ExtractionConfig
+ from kreuzberg._utils._document_cache import get_document_cache
+ from kreuzberg._utils._errors import create_error_context
  from kreuzberg._utils._string import safe_decode
  from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
  from kreuzberg.exceptions import ValidationError
@@ -24,10 +30,7 @@ if TYPE_CHECKING:
  DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()


- async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
-     for validator in config.validators or []:
-         await run_maybe_sync(validator, result)
-
+ def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
      if config.chunk_content:
          result.chunks = _handle_chunk_content(
              mime_type=result.mime_type,
@@ -35,6 +38,39 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
              content=result.content,
          )

+     if config.extract_entities:
+         try:
+             result.entities = extract_entities(
+                 result.content,
+                 custom_patterns=config.custom_entity_patterns,
+             )
+         except RuntimeError:
+             result.entities = None
+
+     if config.extract_keywords:
+         try:
+             result.keywords = extract_keywords(
+                 result.content,
+                 keyword_count=config.keyword_count,
+             )
+         except RuntimeError:
+             result.keywords = None
+
+     if config.auto_detect_language:
+         result.detected_languages = detect_languages(
+             result.content,
+             config=config.language_detection_config,
+         )
+
+     return result
+
+
+ async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+     for validator in config.validators or []:
+         await run_maybe_sync(validator, result)
+
+     result = _validate_and_post_process_helper(result, config)
+
      for post_processor in config.post_processing_hooks or []:
          result = await run_maybe_sync(post_processor, result)

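The net effect of this refactor is that the async and sync paths now share one post-processing body, driven by ExtractionConfig flags. A hedged sketch of opting in (the field names come from the hunk above; their defaults and exact types live in kreuzberg._types, and the file path is illustrative):

from kreuzberg import ExtractionConfig
from kreuzberg.extraction import extract_file_sync

config = ExtractionConfig(
    chunk_content=True,         # fills result.chunks
    extract_entities=True,      # fills result.entities; set to None on RuntimeError
    extract_keywords=True,      # fills result.keywords; keyword_count bounds the list
    keyword_count=10,
    auto_detect_language=True,  # fills result.detected_languages
)
result = extract_file_sync(file_path="report.pdf", mime_type=None, config=config)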
@@ -45,12 +81,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
      for validator in config.validators or []:
          run_sync_only(validator, result)

-     if config.chunk_content:
-         result.chunks = _handle_chunk_content(
-             mime_type=result.mime_type,
-             config=config,
-             content=result.content,
-         )
+     result = _validate_and_post_process_helper(result, config)

      for post_processor in config.post_processing_hooks or []:
          result = run_sync_only(post_processor, result)
@@ -62,7 +93,7 @@ def _handle_chunk_content(
      mime_type: str,
      config: ExtractionConfig,
      content: str,
- ) -> list[str]:
+ ) -> Any:
      chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
      return chunker.chunks(content)

@@ -109,8 +140,6 @@ async def extract_file(
      Raises:
          ValidationError: If the file path or configuration is invalid.
      """
-     from kreuzberg._utils._document_cache import get_document_cache
-
      cache = get_document_cache()
      path = Path(file_path)
      cached_result = cache.get(path, config)
@@ -167,8 +196,6 @@ async def batch_extract_file(
      if not file_paths:
          return []

-     import multiprocessing as mp
-
      max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
      semaphore = anyio.Semaphore(max_concurrency)

@@ -184,8 +211,6 @@
                  )
                  results[index] = result
              except Exception as e:  # noqa: BLE001
-                 from kreuzberg._utils._errors import create_error_context
-
                  error_result = ExtractionResult(
                      content=f"Error: {type(e).__name__}: {e!s}",
                      mime_type="text/plain",
@@ -224,8 +249,6 @@ async def batch_extract_bytes(
      if not contents:
          return []

-     import multiprocessing as mp
-
      max_concurrency = min(len(contents), mp.cpu_count() * 2)
      semaphore = anyio.Semaphore(max_concurrency)

@@ -237,8 +260,6 @@
                  result = await extract_bytes(content, mime_type, config)
                  results[index] = result
              except Exception as e:  # noqa: BLE001
-                 from kreuzberg._utils._errors import create_error_context
-
                  error_result = ExtractionResult(
                      content=f"Error: {type(e).__name__}: {e!s}",
                      mime_type="text/plain",
@@ -304,8 +325,6 @@ def extract_file_sync(
      Raises:
          ValidationError: If the file path or configuration is invalid.
      """
-     from kreuzberg._utils._document_cache import get_document_cache
-
      cache = get_document_cache()
      path = Path(file_path)
      cached_result = cache.get(path, config)
@@ -362,9 +381,6 @@ def batch_extract_file_sync(
      if len(file_paths) <= 1:
          return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]

-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
      max_workers = min(len(file_paths), mp.cpu_count())

      def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
@@ -375,8 +391,6 @@
                  extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
              )
          except Exception as e:  # noqa: BLE001
-             from kreuzberg._utils._errors import create_error_context
-
              error_result = ExtractionResult(
                  content=f"Error: {type(e).__name__}: {e!s}",
                  mime_type="text/plain",
@@ -420,9 +434,6 @@ def batch_extract_bytes_sync(
          extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
      ]

-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
      max_workers = min(len(contents), mp.cpu_count())

      def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
@@ -431,8 +442,6 @@
          try:
              return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
          except Exception as e:  # noqa: BLE001
-             from kreuzberg._utils._errors import create_error_context
-
              error_result = ExtractionResult(
                  content=f"Error: {type(e).__name__}: {e!s}",
                  mime_type="text/plain",
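
Taken together, these hunks hoist the multiprocessing and ThreadPoolExecutor/as_completed imports to module level, cap concurrency at mp.cpu_count() (doubled for the async paths), and convert per-item failures into error-carrying ExtractionResult objects instead of raising. A brief sketch of the sync batch path under those semantics; the paths are illustrative and the config keyword is inferred from the inner calls shown above:

from kreuzberg import ExtractionConfig
from kreuzberg.extraction import batch_extract_file_sync

# Results come back in input order; a failed file yields a text/plain
# ExtractionResult whose content starts with "Error: ...".
results = batch_extract_file_sync(["a.pdf", "b.docx"], config=ExtractionConfig())
for result in results:
    print(result.mime_type, result.content[:80])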