kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -2,17 +2,15 @@ from __future__ import annotations
 
 import platform
 import warnings
-from dataclasses import dataclass
 from importlib.util import find_spec
-from
-from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Final
 
 from PIL import Image
 
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg._utils._device import DeviceInfo,
+from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
+from kreuzberg._utils._device import DeviceInfo, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
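Note: `PaddleOCRConfig` moves out of this module and into `kreuzberg._types`, so downstream imports change accordingly. A minimal sketch, assuming the relocated class keeps the fields and defaults of the definition removed in the next hunk:

    # 3.11.4 (removed): the config class was defined in the backend module.
    # from kreuzberg._ocr._paddleocr import PaddleOCRConfig
    # 3.13.0: import it from the shared types module instead.
    from kreuzberg._types import PaddleOCRConfig

    # language and use_angle_cls are fields of the removed 3.11.4 class;
    # this assumes the relocated class keeps them.
    config = PaddleOCRConfig(language="en", use_angle_cls=True)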
@@ -20,91 +18,23 @@ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
 if TYPE_CHECKING:
     from pathlib import Path
 
-
 try:  # pragma: no cover
     from typing import Unpack  # type: ignore[attr-defined]
 except ImportError:  # pragma: no cover
     from typing_extensions import Unpack
 
+try:
+    import numpy as np
+    from paddleocr import PaddleOCR
 
-
+    HAS_PADDLEOCR = True
+except ImportError:
+    HAS_PADDLEOCR = False
+    np = None  # type: ignore[assignment]
+    PaddleOCR = None
 
 
-
-class PaddleOCRConfig:
-    """Configuration options for PaddleOCR.
-
-    This TypedDict provides type hints and documentation for all PaddleOCR parameters.
-    """
-
-    cls_image_shape: str = "3,48,192"
-    """Image shape for classification algorithm in format 'channels,height,width'."""
-    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
-    """Detection algorithm."""
-    det_db_box_thresh: float = 0.5
-    """Score threshold for detected boxes. Boxes below this value are discarded."""
-    det_db_thresh: float = 0.3
-    """Binarization threshold for DB output map."""
-    det_db_unclip_ratio: float = 2.0
-    """Expansion ratio for detected text boxes."""
-    det_east_cover_thresh: float = 0.1
-    """Score threshold for EAST output boxes."""
-    det_east_nms_thresh: float = 0.2
-    """NMS threshold for EAST model output boxes."""
-    det_east_score_thresh: float = 0.8
-    """Binarization threshold for EAST output map."""
-    det_max_side_len: int = 960
-    """Maximum size of image long side. Images exceeding this will be proportionally resized."""
-    det_model_dir: str | None = None
-    """Directory for detection model. If None, uses default model location."""
-    drop_score: float = 0.5
-    """Filter recognition results by confidence score. Results below this are discarded."""
-    enable_mkldnn: bool = False
-    """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
-    gpu_mem: int = 8000
-    """GPU memory size (in MB) to use for initialization."""
-    language: str = "en"
-    """Language to use for OCR."""
-    max_text_length: int = 25
-    """Maximum text length that the recognition algorithm can recognize."""
-    rec: bool = True
-    """Enable text recognition when using the ocr() function."""
-    rec_algorithm: Literal[
-        "CRNN",
-        "SRN",
-        "NRTR",
-        "SAR",
-        "SEED",
-        "SVTR",
-        "SVTR_LCNet",
-        "ViTSTR",
-        "ABINet",
-        "VisionLAN",
-        "SPIN",
-        "RobustScanner",
-        "RFL",
-    ] = "CRNN"
-    """Recognition algorithm."""
-    rec_image_shape: str = "3,32,320"
-    """Image shape for recognition algorithm in format 'channels,height,width'."""
-    rec_model_dir: str | None = None
-    """Directory for recognition model. If None, uses default model location."""
-    table: bool = True
-    """Whether to enable table recognition."""
-    use_angle_cls: bool = True
-    """Whether to use text orientation classification model."""
-    use_gpu: bool = False
-    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
-    device: DeviceType = "auto"
-    """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
-    gpu_memory_limit: float | None = None
-    """Maximum GPU memory to use in GB. None for no limit."""
-    fallback_to_cpu: bool = True
-    """Whether to fallback to CPU if requested device is unavailable."""
-    use_space_char: bool = True
-    """Whether to recognize spaces."""
-    use_zero_copy_run: bool = False
-    """Whether to enable zero_copy_run for inference optimization."""
+PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
 
 
 class PaddleBackend(OCRBackend[PaddleOCRConfig]):
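Note: the per-call `from paddleocr import PaddleOCR` imports are replaced by a single module-level probe. A standalone sketch of the guard pattern this hunk introduces (the `require_paddleocr` helper is illustrative, not part of kreuzberg):

    # Probe the optional dependency once at import time and record availability.
    try:
        import numpy as np
        from paddleocr import PaddleOCR

        HAS_PADDLEOCR = True
    except ImportError:
        HAS_PADDLEOCR = False
        np = None  # type: ignore[assignment]
        PaddleOCR = None

    # Fail lazily, only when the backend is actually used.
    def require_paddleocr() -> None:  # illustrative helper
        if not HAS_PADDLEOCR or PaddleOCR is None:
            raise ImportError("paddleocr is required for the PaddleOCR backend")

This keeps `import kreuzberg` working when paddleocr is not installed, while still raising a clear error (in kreuzberg, a `MissingDependencyError`) on first use.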
@@ -123,8 +53,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
-        import numpy as np  # noqa: PLC0415
-
         await self._init_paddle_ocr(**kwargs)
 
         if image.mode != "RGB":
@@ -258,12 +186,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         if cls._paddle_ocr is not None:
             return
 
-        try:
-            from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_PADDLEOCR or PaddleOCR is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-            ) from e
+            )
 
         language = cls._validate_language_code(kwargs.pop("language", "en"))
 
@@ -379,8 +305,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
-        import numpy as np  # noqa: PLC0415
-
         self._init_paddle_ocr_sync(**kwargs)
 
         if image.mode != "RGB":
@@ -427,12 +351,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         if cls._paddle_ocr is not None:
             return
 
-        try:
-            from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_PADDLEOCR or PaddleOCR is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-            ) from e
+            )
 
         language = cls._validate_language_code(kwargs.pop("language", "en"))
 
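Note: both init paths still normalize the `language` kwarg through `cls._validate_language_code`, which can now check membership in the module-level `PADDLEOCR_SUPPORTED_LANGUAGE_CODES` set. A sketch of the implied check; the standalone function below is illustrative, and kreuzberg raises its own `ValidationError` rather than `ValueError`:

    PADDLEOCR_SUPPORTED_LANGUAGE_CODES = {"ch", "en", "french", "german", "japan", "korean"}

    def validate_language_code(language: str) -> str:  # illustrative stand-in
        normalized = language.lower()
        if normalized not in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
            raise ValueError(f"unsupported PaddleOCR language code: {language!r}")
        return normalized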
kreuzberg/_ocr/_table_extractor.py
ADDED
@@ -0,0 +1,260 @@
+from __future__ import annotations
+
+import csv
+from io import StringIO
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from kreuzberg.exceptions import ParsingError
+
+if TYPE_CHECKING:
+    from kreuzberg._types import TSVWord
+
+
+def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
+    """Parse TSV output into structured word data.
+
+    Args:
+        tsv_data: Raw TSV output from Tesseract.
+        min_confidence: Minimum confidence score to include a word.
+
+    Returns:
+        List of word dictionaries with position and text data.
+
+    Raises:
+        ParsingError: If TSV data cannot be parsed.
+    """
+    try:
+        reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
+        words: list[TSVWord] = []
+
+        for row in reader:
+            if row.get("level") == "5" and row.get("text", "").strip():
+                try:
+                    conf = float(row["conf"])
+                    if conf < min_confidence:
+                        continue
+
+                    words.append(
+                        {
+                            "level": int(row["level"]),
+                            "page_num": int(row["page_num"]),
+                            "block_num": int(row["block_num"]),
+                            "par_num": int(row["par_num"]),
+                            "line_num": int(row["line_num"]),
+                            "word_num": int(row["word_num"]),
+                            "left": int(row["left"]),
+                            "top": int(row["top"]),
+                            "width": int(row["width"]),
+                            "height": int(row["height"]),
+                            "conf": conf,
+                            "text": row["text"],
+                        }
+                    )
+                except (ValueError, KeyError):
+                    continue
+
+        return words
+
+    except Exception as e:
+        raise ParsingError("Failed to parse TSV data", context={"error": str(e)}) from e
+
+
+def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
+    """Detect columns using X position clustering.
+
+    Args:
+        words: List of word dictionaries from TSV.
+        column_threshold: Pixel threshold for column clustering.
+
+    Returns:
+        Sorted list of column X positions.
+    """
+    if not words:
+        return []
+
+    x_positions = sorted({w["left"] for w in words})
+
+    if len(x_positions) == 1:
+        return x_positions
+
+    columns = []
+    current_group = [x_positions[0]]
+
+    for x in x_positions[1:]:
+        if x - current_group[-1] <= column_threshold:
+            current_group.append(x)
+        else:
+            columns.append(int(np.median(current_group)))
+            current_group = [x]
+
+    columns.append(int(np.median(current_group)))
+    return columns
+
+
+def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
+    """Detect rows using Y position clustering.
+
+    Args:
+        words: List of word dictionaries from TSV.
+        row_threshold_ratio: Row threshold as ratio of mean text height.
+
+    Returns:
+        Sorted list of row Y positions.
+    """
+    if not words:
+        return []
+
+    y_centers = sorted(w["top"] + w["height"] / 2 for w in words)
+
+    if len(y_centers) == 1:
+        return [int(y_centers[0])]
+
+    mean_height = np.mean([w["height"] for w in words])
+    threshold = mean_height * row_threshold_ratio
+
+    rows = []
+    current_group = [y_centers[0]]
+
+    for y in y_centers[1:]:
+        if y - np.mean(current_group) <= threshold:
+            current_group.append(y)
+        else:
+            rows.append(int(np.median(current_group)))
+            current_group = [y]
+
+    rows.append(int(np.median(current_group)))
+    return rows
+
+
+def _find_closest_index(value: float, positions: list[int]) -> int:
+    """Find index of closest position.
+
+    Args:
+        value: The value to match.
+        positions: List of positions to search.
+
+    Returns:
+        Index of the closest position.
+    """
+    if not positions:
+        return 0
+
+    distances = [abs(value - pos) for pos in positions]
+    return distances.index(min(distances))
+
+
+def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
+    """Remove completely empty rows and columns.
+
+    Args:
+        table: 2D table array.
+
+    Returns:
+        Cleaned table with empty rows/columns removed.
+    """
+    if not table:
+        return table
+
+    table = [row for row in table if any(cell.strip() for cell in row)]
+
+    if not table:
+        return []
+
+    non_empty_cols = [
+        col_idx for col_idx in range(len(table[0])) if any(row[col_idx].strip() for row in table if col_idx < len(row))
+    ]
+
+    if not non_empty_cols:
+        return []
+
+    return [[row[col_idx] if col_idx < len(row) else "" for col_idx in non_empty_cols] for row in table]
+
+
+def reconstruct_table(
+    words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
+) -> list[list[str]]:
+    """Reconstruct table from words and detected structure.
+
+    Args:
+        words: List of word dictionaries from TSV.
+        column_threshold: Pixel threshold for column clustering.
+        row_threshold_ratio: Row threshold as ratio of mean text height.
+
+    Returns:
+        2D list representing the table structure.
+    """
+    if not words:
+        return []
+
+    col_positions = detect_columns(words, column_threshold=column_threshold)
+    row_positions = detect_rows(words, row_threshold_ratio=row_threshold_ratio)
+
+    if not col_positions or not row_positions:
+        return []
+
+    table: list[list[str]] = [[""] * len(col_positions) for _ in range(len(row_positions))]
+
+    for word in words:
+        col_idx = _find_closest_index(word["left"], col_positions)
+
+        y_center = word["top"] + word["height"] / 2
+        row_idx = _find_closest_index(y_center, row_positions)
+
+        if table[row_idx][col_idx]:
+            table[row_idx][col_idx] += " " + word["text"]
+        else:
+            table[row_idx][col_idx] = word["text"]
+
+    return _remove_empty_rows_cols(table)
+
+
+def to_markdown(table: list[list[str]]) -> str:
+    """Convert table to markdown format.
+
+    Args:
+        table: 2D list representing the table.
+
+    Returns:
+        Markdown-formatted table string.
+    """
+    if not table or not table[0]:
+        return ""
+
+    lines = []
+
+    lines.append("| " + " | ".join(str(cell) for cell in table[0]) + " |")
+
+    lines.append("| " + " | ".join(["---"] * len(table[0])) + " |")
+
+    for row in table[1:]:
+        padded_row = list(row) + [""] * (len(table[0]) - len(row))
+        lines.append("| " + " | ".join(str(cell) for cell in padded_row[: len(table[0])]) + " |")
+
+    return "\n".join(lines)
+
+
+def extract_table_from_tsv(
+    tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
+) -> str:
+    """Extract table from TSV data and convert to markdown.
+
+    Args:
+        tsv_data: Raw TSV output from Tesseract.
+        column_threshold: Pixel threshold for column clustering.
+        row_threshold_ratio: Row threshold as ratio of mean text height.
+        min_confidence: Minimum confidence score to include a word.
+
+    Returns:
+        Markdown-formatted table string, or empty string if no table detected.
+    """
+    words = extract_words(tsv_data, min_confidence=min_confidence)
+    if not words:
+        return ""
+
+    table = reconstruct_table(words, column_threshold=column_threshold, row_threshold_ratio=row_threshold_ratio)
+    if not table:
+        return ""
+
+    return to_markdown(table)
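Note: end to end, the new module turns Tesseract's word-level TSV output into a markdown table: `extract_words` keeps level-5 rows above the confidence cutoff, `detect_columns` and `detect_rows` cluster X and Y positions, `reconstruct_table` snaps each word to its nearest cell, and `to_markdown` renders the grid. A usage sketch against a hand-made two-by-two TSV sample (illustrative, not real Tesseract output):

    from kreuzberg._ocr._table_extractor import extract_table_from_tsv

    # Tesseract TSV columns, in order.
    fields = "level page_num block_num par_num line_num word_num left top width height conf text".split()
    rows = [
        ["5", "1", "1", "1", "1", "1", "10", "10", "40", "20", "96.0", "Name"],
        ["5", "1", "1", "1", "1", "2", "200", "10", "40", "20", "95.0", "Age"],
        ["5", "1", "1", "1", "2", "1", "10", "60", "40", "20", "93.0", "Ada"],
        ["5", "1", "1", "1", "2", "2", "200", "60", "40", "20", "91.0", "36"],
    ]
    tsv = "\n".join(["\t".join(fields)] + ["\t".join(r) for r in rows])

    print(extract_table_from_tsv(tsv))
    # | Name | Age |
    # | --- | --- |
    # | Ada | 36 |

With the default column_threshold=20, the left positions 10 and 200 cluster into two columns; with row_threshold_ratio=0.5 and a mean word height of 20, the Y centers 20 and 70 cluster into two rows.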