kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import csv
 import hashlib
 import io
 import os
@@ -7,26 +8,33 @@ import re
 import subprocess
 import sys
 import tempfile
-from
-from enum import Enum
+from io import StringIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final
 
 import anyio
+import html_to_markdown
+import polars as pl
 from anyio import Path as AsyncPath
 from anyio import run_process
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 from PIL import Image
+from PIL.Image import Image as PILImage
 from typing_extensions import Self
 
-from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg.
+from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
+from kreuzberg._utils._cache import get_ocr_cache
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
 
 if TYPE_CHECKING:
+    from bs4.element import Tag
     from PIL.Image import Image as PILImage
 
 try: # pragma: no cover
@@ -168,68 +176,6 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
 MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
 
 
-class PSMMode(Enum):
-    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
-
-    OSD_ONLY = 0
-    """Orientation and script detection only."""
-    AUTO_OSD = 1
-    """Automatic page segmentation with orientation and script detection."""
-    AUTO_ONLY = 2
-    """Automatic page segmentation without OSD."""
-    AUTO = 3
-    """Fully automatic page segmentation (default)."""
-    SINGLE_COLUMN = 4
-    """Assume a single column of text."""
-    SINGLE_BLOCK_VERTICAL = 5
-    """Assume a single uniform block of vertically aligned text."""
-    SINGLE_BLOCK = 6
-    """Assume a single uniform block of text."""
-    SINGLE_LINE = 7
-    """Treat the image as a single text line."""
-    SINGLE_WORD = 8
-    """Treat the image as a single word."""
-    CIRCLE_WORD = 9
-    """Treat the image as a single word in a circle."""
-    SINGLE_CHAR = 10
-    """Treat the image as a single character."""
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class TesseractConfig:
-    """Configuration options for Tesseract OCR engine."""
-
-    classify_use_pre_adapted_templates: bool = True
-    """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
-    language: str = "eng"
-    """Language code to use for OCR.
-    Examples:
-        - 'eng' for English
-        - 'deu' for German
-        - multiple languages combined with '+', e.g. 'eng+deu')
-    """
-    language_model_ngram_on: bool = False
-    """Enable or disable the use of n-gram-based language models for improved text recognition.
-
-    Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
-    psm: PSMMode = PSMMode.AUTO
-    """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
-    tessedit_dont_blkrej_good_wds: bool = True
-    """If True, prevents block rejection of words identified as good, improving text output quality."""
-    tessedit_dont_rowrej_good_wds: bool = True
-    """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
-    tessedit_enable_dict_correction: bool = True
-    """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
-    tessedit_char_whitelist: str = ""
-    """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
-    tessedit_use_primary_params_model: bool = True
-    """If True, forces the use of the primary parameters model for text recognition."""
-    textord_space_size_is_variable: bool = True
-    """Allow variable spacing between words, useful for text with irregular spacing."""
-    thresholding_method: bool = False
-    """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
-
-
 class TesseractBackend(OCRBackend[TesseractConfig]):
     _version_checked: ClassVar[bool] = False
 
@@ -238,10 +184,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         image: PILImage,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-
+        use_cache = kwargs.pop("use_cache", True)
+
+        save_image = image
+        if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+            save_image = image.convert("RGB")
 
         image_buffer = io.BytesIO()
-        await run_sync(
+        await run_sync(save_image.save, image_buffer, format="PNG")
         image_content = image_buffer.getvalue()
 
         cache_kwargs = {
@@ -250,7 +200,40 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             "ocr_config": str(sorted(kwargs.items())),
         }
 
+        if use_cache:
+            cached_result = await self._handle_cache_lookup(cache_kwargs)
+            if cached_result:
+                return cached_result
+
+        ocr_cache = get_ocr_cache()
+        try:
+            await self._validate_tesseract_version()
+            image_path, unlink = await create_temp_file(".png")
+
+            try:
+                await run_sync(save_image.save, str(image_path), format="PNG")
+            except OSError as e:
+                if "cannot write mode" not in str(e):
+                    raise
+                save_image = image.convert("RGB")
+                await run_sync(save_image.save, str(image_path), format="PNG")
+            try:
+                result = await self.process_file(image_path, **kwargs)
+
+                if use_cache:
+                    await ocr_cache.aset(result, **cache_kwargs)
+
+                return result
+            finally:
+                await unlink()
+        finally:
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
+
+    async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+        """Handle cache lookup before processing."""
         ocr_cache = get_ocr_cache()
+
         cached_result = await ocr_cache.aget(**cache_kwargs)
         if cached_result is not None:
             return cached_result
@@ -258,49 +241,123 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
        if ocr_cache.is_processing(**cache_kwargs):
             event = ocr_cache.mark_processing(**cache_kwargs)
             await anyio.to_thread.run_sync(event.wait)
-
-            # Try cache again after waiting for other process to complete # ~keep
             cached_result = await ocr_cache.aget(**cache_kwargs)
             if cached_result is not None:
                 return cached_result
 
         ocr_cache.mark_processing(**cache_kwargs)
+        return None
+
+    def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
+        """Prepare configuration for a Tesseract run."""
+        language = self._validate_language_code(kwargs.pop("language", "eng"))
+        psm = kwargs.pop("psm", PSMMode.AUTO)
+        output_format = kwargs.pop("output_format", "markdown")
+        enable_table_detection = kwargs.pop("enable_table_detection", False)
+
+        if enable_table_detection and output_format == "text":
+            output_format = "tsv"
+
+        if output_format == "markdown":
+            tesseract_format = "hocr"
+            ext = ".hocr"
+        elif output_format == "tsv":
+            tesseract_format = "tsv"
+            ext = ".tsv"
+        elif output_format == "hocr":
+            tesseract_format = "hocr"
+            ext = ".hocr"
+        else:
+            tesseract_format = "text"
+            ext = ".txt"
+
+        return {
+            "language": language,
+            "psm": psm,
+            "output_format": output_format,
+            "enable_table_detection": enable_table_detection,
+            "tesseract_format": tesseract_format,
+            "ext": ext,
+            "remaining_kwargs": kwargs,
+        }
+
+    async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
+        """Build and execute the Tesseract command."""
+        command = [
+            "tesseract",
+            str(path),
+            output_base,
+            "-l",
+            run_config["language"],
+            "--psm",
+            str(run_config["psm"].value),
+            "--oem",
+            "1",
+            "--loglevel",
+            "OFF",
+        ]
+
+        if run_config["tesseract_format"] != "text":
+            command.append(run_config["tesseract_format"])
+
+        for kwarg, value in run_config["remaining_kwargs"].items():
+            if kwarg.startswith("table_"):
+                continue
+            if isinstance(value, bool):
+                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+            else:
+                command.extend(["-c", f"{kwarg}={value}"])
+
+        env: dict[str, Any] | None = None
+        if sys.platform.startswith("linux"):
+            env = {"OMP_THREAD_LIMIT": "1"}
 
         try:
-            await
-
-
-
-
+            result = await run_process(command, env=env)
+            if not result.returncode == 0:
+                raise OCRError(
+                    "OCR failed with a non-0 return code.",
+                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+                )
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.decode("utf-8") if e.stderr else str(e)
+            raise OCRError(
+                f"Failed to OCR using tesseract: {error_msg}",
+                context={"command": command, "returncode": e.returncode, "error": error_msg},
+            ) from e
 
-
+    async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
+        """Process the raw output from Tesseract based on the requested format."""
+        output_format = run_config["output_format"]
+        enable_table_detection = run_config["enable_table_detection"]
+        kwargs = run_config["remaining_kwargs"]
+
+        if output_format == "markdown":
+            return await self._process_hocr_to_markdown(output, enable_table_detection=enable_table_detection, **kwargs)
+        if output_format == "tsv" and enable_table_detection:
+            return await self._process_tsv_output(
+                output,
+                table_column_threshold=kwargs.get("table_column_threshold", 20),
+                table_row_threshold_ratio=kwargs.get("table_row_threshold_ratio", 0.5),
+                table_min_confidence=kwargs.get("table_min_confidence", 30.0),
+            )
+        if output_format == "tsv":
+            return self._extract_text_from_tsv(output)
+        if output_format == "hocr":
+            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
 
-
-
-
-        finally:
-            ocr_cache.mark_complete(**cache_kwargs)
+        return ExtractionResult(
+            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+        )
 
-    async def process_file(
-
-        path: Path,
-        **kwargs: Unpack[TesseractConfig],
-    ) -> ExtractionResult:
-        from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
+    async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
+        use_cache = kwargs.pop("use_cache", True)
 
         try:
             stat = path.stat()
-            file_info = {
-                "path": str(path.resolve()),
-                "size": stat.st_size,
-                "mtime": stat.st_mtime,
-            }
+            file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
         except OSError:
-            file_info = {
-                "path": str(path),
-                "size": 0,
-                "mtime": 0,
-            }
+            file_info = {"path": str(path), "size": 0, "mtime": 0}
 
         cache_kwargs = {
            "file_info": str(sorted(file_info.items())),
@@ -308,71 +365,37 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             "ocr_config": str(sorted(kwargs.items())),
         }
 
-
-
-
-            return cached_result
-
-        if ocr_cache.is_processing(**cache_kwargs):
-            event = ocr_cache.mark_processing(**cache_kwargs)
-            await anyio.to_thread.run_sync(event.wait)
-
-            # Try cache again after waiting for other process to complete # ~keep
-            cached_result = await ocr_cache.aget(**cache_kwargs)
-            if cached_result is not None:
+        if use_cache:
+            cached_result = await self._handle_cache_lookup(cache_kwargs)
+            if cached_result:
                 return cached_result
 
-        ocr_cache
-
+        ocr_cache = get_ocr_cache()
        try:
             await self._validate_tesseract_version()
-            output_path, unlink = await create_temp_file(".txt")
-            language = self._validate_language_code(kwargs.pop("language", "eng"))
-            psm = kwargs.pop("psm", PSMMode.AUTO)
-            try:
-                output_base = str(output_path).replace(".txt", "")
-                command = [
-                    "tesseract",
-                    str(path),
-                    output_base,
-                    "-l",
-                    language,
-                    "--psm",
-                    str(psm.value),
-                    "--oem",
-                    "1",
-                    "--loglevel",
-                    "OFF",
-                ]
-                for kwarg, value in kwargs.items():
-                    if isinstance(value, bool):
-                        command.extend(["-c", f"{kwarg}={1 if value else 0}"])
-                    else:
-                        # Handle string parameters (like tessedit_char_whitelist)
-                        command.extend(["-c", f"{kwarg}={value}"])
-
-                env: dict[str, Any] | None = None
-                if sys.platform.startswith("linux"):
-                    env = {"OMP_THREAD_LIMIT": "1"}
 
-
+            run_config = self._prepare_tesseract_run_config(**kwargs)
+            output_path, unlink = await create_temp_file(run_config["ext"])
 
-
-
-
-                        context={
-                            "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
-                        },
-                    )
+            try:
+                output_base = str(output_path).replace(run_config["ext"], "")
+                await self._execute_tesseract(path, output_base, run_config)
 
                output = await AsyncPath(output_path).read_text("utf-8")
-                extraction_result =
-
-
-
-
-
-
+                extraction_result = await self._process_tesseract_output(output, run_config)
+
+                if use_cache:
+                    final_cache_kwargs = cache_kwargs.copy()
+                    final_cache_kwargs["ocr_config"] = str(
+                        sorted(
+                            {
+                                **run_config["remaining_kwargs"],
+                                "language": run_config["language"],
+                                "psm": run_config["psm"],
+                            }.items()
+                        )
+                    )
+                    await ocr_cache.aset(extraction_result, **final_cache_kwargs)
 
                 return extraction_result
            except (RuntimeError, OSError) as e:
@@ -380,7 +403,562 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            finally:
                 await unlink()
         finally:
-
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
+
+    async def _process_tsv_output(
+        self,
+        tsv_content: str,
+        table_column_threshold: int = 20,
+        table_row_threshold_ratio: float = 0.5,
+        table_min_confidence: float = 30.0,
+    ) -> ExtractionResult:
+        """Process TSV output and extract tables if detected.
+
+        Args:
+            tsv_content: Raw TSV output from Tesseract.
+            table_column_threshold: Pixel threshold for column clustering.
+            table_row_threshold_ratio: Row threshold as ratio of mean text height.
+            table_min_confidence: Minimum confidence score to include a word.
+
+        Returns:
+            ExtractionResult with extracted content and tables.
+        """
+        text_result = self._extract_text_from_tsv(tsv_content)
+
+        try:
+            if (
+                (words := extract_words(tsv_content, min_confidence=table_min_confidence))
+                and (
+                    table_data := reconstruct_table(
+                        words,
+                        column_threshold=table_column_threshold,
+                        row_threshold_ratio=table_row_threshold_ratio,
+                    )
+                )
+                and len(table_data) > 1
+            ):
+                markdown = to_markdown(table_data)
+
+                try:
+                    df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
+                except (ImportError, IndexError):
+                    df = None
+
+                table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
+
+                return ExtractionResult(
+                    content=text_result.content,
+                    mime_type=text_result.mime_type,
+                    metadata=text_result.metadata,
+                    tables=[table],
+                    chunks=text_result.chunks,
+                )
+        except (ValueError, KeyError, ImportError):
+            pass
+
+        return text_result
+
+    def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
+        """Extract plain text from TSV output.
+
+        Args:
+            tsv_content: Raw TSV output from Tesseract.
+
+        Returns:
+            ExtractionResult with extracted text.
+        """
+        try:
+            reader = csv.DictReader(StringIO(tsv_content), delimiter="\t")
+
+            lines: dict[tuple[int, int, int, int], list[tuple[int, str]]] = {}
+
+            for row in reader:
+                if row.get("level") == "5" and row.get("text", "").strip():
+                    line_key = (int(row["page_num"]), int(row["block_num"]), int(row["par_num"]), int(row["line_num"]))
+
+                    if line_key not in lines:
+                        lines[line_key] = []
+
+                    lines[line_key].append((int(row["left"]), row["text"]))
+
+            text_parts: list[str] = []
+            last_block = -1
+            last_para = -1
+
+            for line_key in sorted(lines.keys()):
+                page_num, block_num, par_num, line_num = line_key
+
+                if block_num != last_block:
+                    if text_parts: # ~keep
+                        text_parts.append("\n\n")
+                    last_block = block_num
+                    last_para = par_num
+                elif par_num != last_para:
+                    text_parts.append("\n\n")
+                    last_para = par_num
+
+                words = sorted(lines[line_key], key=lambda x: x[0])
+                line_text = " ".join(word[1] for word in words)
+                text_parts.append(line_text)
+                text_parts.append("\n")
+
+            content = "".join(text_parts).strip()
+
+        except (ValueError, KeyError):
+            content = ""
+            for line in tsv_content.split("\n")[1:]: # ~keep skip header
+                parts = line.split("\t")
+                if len(parts) > 11 and parts[11].strip(): # ~keep text is in column 11
+                    content += parts[11] + " "
+            content = content.strip()
+
+        return ExtractionResult(
+            content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+        )
+
+    async def _process_hocr_to_markdown(
+        self,
+        hocr_content: str,
+        enable_table_detection: bool = False,
+        html_to_markdown_config: HTMLToMarkdownConfig | None = None,
+        table_column_threshold: int = 20,
+        table_row_threshold_ratio: float = 0.5,
+        table_min_confidence: float = 30.0,
+        **_kwargs: Any,
+    ) -> ExtractionResult:
+        """Convert hOCR content to Markdown with table detection.
+
+        Args:
+            hocr_content: Raw hOCR HTML/XML content from Tesseract.
+            enable_table_detection: Whether to detect and format tables.
+            html_to_markdown_config: Configuration for HTML to Markdown conversion.
+            table_column_threshold: Pixel threshold for column clustering.
+            table_row_threshold_ratio: Row threshold as ratio of mean text height.
+            table_min_confidence: Minimum confidence score to include a word.
+            **kwargs: Additional configuration options.
+
+        Returns:
+            ExtractionResult with Markdown content and detected tables.
+        """
+        config = html_to_markdown_config or HTMLToMarkdownConfig(
+            escape_asterisks=False,
+            escape_underscores=False,
+            extract_metadata=False,
+            strip="meta title",
+        )
+
+        tables: list[TableData] = []
+        if enable_table_detection:
+            soup = BeautifulSoup(hocr_content, "lxml")
+            tables = await self._extract_tables_from_hocr(
+                soup,
+                table_column_threshold,
+                table_row_threshold_ratio,
+                table_min_confidence,
+            )
+
+        hocr_converters = self._create_hocr_converters(tables)
+
+        all_converters = dict(hocr_converters)
+        if config.custom_converters:
+            all_converters.update(config.custom_converters)
+
+        config_dict = config.to_dict()
+        config_dict["custom_converters"] = all_converters
+
+        try:
+            markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
+            markdown_content = normalize_spaces(markdown_content)
+        except (ValueError, TypeError, AttributeError):
+            try:
+                soup = BeautifulSoup(hocr_content, "lxml")
+                words = soup.find_all("span", class_="ocrx_word")
+                text_parts = []
+                for word in words:
+                    text = word.get_text().strip()
+                    if text:
+                        text_parts.append(text)
+
+                if text_parts:
+                    markdown_content = " ".join(text_parts)
+                else:
+                    markdown_content = soup.get_text().strip() or "[No text detected]"
+
+                markdown_content = normalize_spaces(markdown_content)
+            except (ValueError, TypeError, AttributeError):
+                markdown_content = "[OCR processing failed]"
+
+        if tables:
+            table_sections = []
+            for i, table in enumerate(tables):
+                table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
+
+            if markdown_content.strip():
+                final_content = f"{markdown_content}\n{''.join(table_sections)}"
+            else:
+                final_content = "".join(table_sections).strip()
+        else:
+            final_content = markdown_content
+
+        return ExtractionResult(
+            content=final_content,
+            mime_type=MARKDOWN_MIME_TYPE,
+            metadata={"source_format": "hocr", "tables_detected": len(tables)},
+            chunks=[],
+            tables=tables,
+        )
+
+    def _create_basic_converters(self) -> dict[str, Any]:
+        """Create basic converters for individual hOCR elements."""
+
+        def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR word elements - adds spaces between words."""
+            del tag
+            return f"{text.strip()} "
+
+        def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR line elements - handles line breaks."""
+            del tag
+            return f"{text.strip()}\n"
+
+        def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR paragraph elements - handles paragraph breaks."""
+            del tag
+            content = text.strip()
+            if not content:
+                return ""
+            return f"{content}\n\n"
+
+        def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR content area elements."""
+            del tag
+            content = text.strip()
+            if not content:
+                return ""
+            return f"{content}\n\n"
+
+        def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR page elements."""
+            del tag
+            return text.strip()
+
+        def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR separator elements - convert to horizontal rules."""
+            del tag, text
+            return "---\n"
+
+        def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR photo/image elements - indicate image presence."""
+            del text
+            title = tag.get("title", "")
+            if isinstance(title, str):
+                bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
+                if bbox_match:
+                    x0, y0, x1, y1 = bbox_match.groups()
+                    width = int(x1) - int(x0)
+                    height = int(y1) - int(y0)
+                    return f"*[Image region: {width}x{height} pixels]*\n\n"
+            return "*[Image detected]*\n\n"
+
+        return {
+            "ocrx_word": ocrx_word_converter,
+            "ocr_line": ocr_line_converter,
+            "ocr_par": ocr_par_converter,
+            "ocr_carea": ocr_carea_converter,
+            "ocr_page": ocr_page_converter,
+            "ocr_separator": ocr_separator_converter,
+            "ocr_photo": ocr_photo_converter,
+        }
+
+    def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
+        """Create custom converters for hOCR elements that preserve spacing.
+
+        Args:
+            tables: List of detected tables (not used for filtering, tables added separately).
+
+        Returns:
+            Dictionary mapping HTML tags to converter functions.
+        """
+        basic_converters = self._create_basic_converters()
+
+        def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Generic converter for div elements based on class."""
+            class_attr = tag.get("class", "")
+            if isinstance(class_attr, list):
+                class_attr = " ".join(class_attr)
+            elif not isinstance(class_attr, str):
+                class_attr = ""
+
+            for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
+                if class_name in class_attr:
+                    converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
+                    return str(converter_result)
+            return text
+
+        def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Generic converter for span elements based on class."""
+            class_attr = tag.get("class", "")
+            if isinstance(class_attr, list):
+                class_attr = " ".join(class_attr)
+            elif not isinstance(class_attr, str):
+                class_attr = ""
+
+            for class_name in ["ocrx_word", "ocr_line"]:
+                if class_name in class_attr:
+                    converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
+                    return str(converter_result)
+            return f"{text.strip()} "
+
+        return {
+            "span": generic_span_converter,
+            "div": generic_div_converter,
+            "p": basic_converters["ocr_par"],
+        }
+
+    def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
+        """Synchronously process hOCR content to markdown format.
+
+        Args:
+            hocr_content: Raw hOCR content as string
+            config: Tesseract configuration object
+
+        Returns:
+            ExtractionResult with markdown content
+        """
+        tables: list[TableData] = []
+
+        if config.enable_table_detection:
+            pass
+
+        try:
+            converters = self._create_hocr_converters(tables)
+
+            html_config = HTMLToMarkdownConfig(
+                custom_converters=converters,
+                escape_asterisks=False,
+                escape_underscores=False,
+                extract_metadata=False,
+                strip="meta title",
+            )
+
+            markdown_content = html_to_markdown.convert_to_markdown(
+                hocr_content,
+                **html_config.to_dict(),
+            )
+
+            markdown_content = normalize_spaces(markdown_content)
+
+        except (ValueError, TypeError, AttributeError):
+            try:
+                soup = BeautifulSoup(hocr_content, "lxml")
+                words = soup.find_all("span", class_="ocrx_word")
+                text_parts = []
+                for word in words:
+                    text = word.get_text().strip()
+                    if text:
+                        text_parts.append(text)
+
+                if text_parts:
+                    markdown_content = " ".join(text_parts)
+                else:
+                    markdown_content = soup.get_text().strip() or "[No text detected]"
+
+                markdown_content = normalize_spaces(markdown_content)
+            except (ValueError, TypeError, AttributeError):
+                markdown_content = "[OCR processing failed]"
+
+        if tables:
+            table_sections = []
+            for i, table in enumerate(tables):
+                table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
+
+            if markdown_content.strip():
+                final_content = f"{markdown_content}\n{''.join(table_sections)}"
+            else:
+                final_content = "".join(table_sections).strip()
+        else:
+            final_content = markdown_content
+
+        return ExtractionResult(
+            content=final_content,
+            mime_type=MARKDOWN_MIME_TYPE,
+            metadata={"source_format": "hocr", "tables_detected": len(tables)},
+            chunks=[],
+            tables=tables,
+        )
+
+    def _process_tsv_output_sync(
+        self,
+        tsv_content: str,
+        table_column_threshold: int = 20,
+        table_row_threshold_ratio: float = 0.5,
+        table_min_confidence: float = 30.0,
+    ) -> ExtractionResult:
+        """Synchronously process TSV output and extract tables if detected.
+
+        Args:
+            tsv_content: Raw TSV output from Tesseract.
+            table_column_threshold: Pixel threshold for column clustering.
+            table_row_threshold_ratio: Row threshold as ratio of mean text height.
+            table_min_confidence: Minimum confidence score to include a word.
+
+        Returns:
+            ExtractionResult with extracted content and tables.
+        """
+        text_result = self._extract_text_from_tsv(tsv_content)
+
+        try:
+            if (
+                (words := extract_words(tsv_content, min_confidence=table_min_confidence))
+                and (
+                    table_data := reconstruct_table(
+                        words,
+                        column_threshold=table_column_threshold,
+                        row_threshold_ratio=table_row_threshold_ratio,
+                    )
+                )
+                and len(table_data) > 1
+            ):
+                markdown = to_markdown(table_data)
+
+                try:
+                    df = pl.DataFrame(table_data[1:], schema=table_data[0])
+                except (ImportError, IndexError):
+                    df = None
+
+                table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
+
+                return ExtractionResult(
+                    content=text_result.content,
+                    mime_type=text_result.mime_type,
+                    metadata=text_result.metadata,
+                    tables=[table],
+                    chunks=text_result.chunks,
+                )
+        except (ValueError, KeyError, ImportError):
+            pass
+
+        return text_result
+
+    async def _extract_tables_from_hocr(
+        self,
+        soup: Any,
+        column_threshold: int = 20,
+        row_threshold_ratio: float = 0.5,
+        min_confidence: float = 30.0,
+    ) -> list[TableData]:
+        """Extract tables from hOCR structure using coordinate analysis.
+
+        Args:
+            soup: Parsed hOCR BeautifulSoup object.
+            column_threshold: Pixel threshold for column clustering.
+            row_threshold_ratio: Row threshold as ratio of mean text height.
+            min_confidence: Minimum confidence score to include a word.
+
+        Returns:
+            List of detected tables as TableData objects.
+        """
+        tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
+
+        if not tsv_data:
+            return []
+
+        if not (words := extract_words(tsv_data, min_confidence=min_confidence)):
+            return []
+
+        tables: list[TableData] = []
+        try:
+            table_data = reconstruct_table(
+                words,
+                column_threshold=column_threshold,
+                row_threshold_ratio=row_threshold_ratio,
+            )
+            if table_data and len(table_data) > 1: # ~keep At least header + one data row
+                markdown = to_markdown(table_data)
+
+                min_x = min(w["left"] for w in words)
+                max_x = max(w["left"] + w["width"] for w in words)
+                min_y = min(w["top"] for w in words)
+                max_y = max(w["top"] + w["height"] for w in words)
+
+                try:
+                    df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
+                except (ImportError, IndexError):
+                    df = None
+
+                dummy_image = Image.new("RGB", (1, 1), "white")
+
+                table: TableData = {
+                    "text": markdown,
+                    "df": df,
+                    "page_number": 1,
+                    "cropped_image": dummy_image,
+                    "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
+                } # type: ignore[typeddict-unknown-key]
+                tables.append(table)
+        except (ValueError, KeyError, ImportError):
+            pass
+
+        return tables
+
+    async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
+        """Convert hOCR structure to TSV format for table extraction.
+
+        Args:
+            soup: Parsed hOCR BeautifulSoup object.
+            min_confidence: Minimum confidence score to include.
+
+        Returns:
+            TSV formatted string compatible with table extractor.
+        """
+        tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
+
+        words = soup.find_all("span", class_="ocrx_word")
+        word_num = 1
+
+        for word in words:
+            title = word.get("title", "")
+            text = word.get_text().strip()
+
+            if not text:
+                continue
+
+            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
+            if not bbox_match:
+                continue
+
+            x0, y0, x1, y1 = map(int, bbox_match.groups())
+
+            conf_match = re.search(r"x_wconf (\d+)", title)
+            confidence = float(conf_match.group(1)) if conf_match else 100.0
+
+            if confidence < min_confidence:
+                continue
+
+            line = word.find_parent(class_="ocr_line")
+            par = word.find_parent(class_="ocr_par")
+            block = word.find_parent(class_="ocr_carea")
+
+            tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
+            tsv_lines.append(tsv_line)
+            word_num += 1
+
+        return "\n".join(tsv_lines)
+
+    def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
+        """Identify potential table regions from word coordinates.
+
+        Args:
+            words: List of word dictionaries with coordinates.
+
+        Returns:
+            List of word groups representing potential tables.
+        """
+        if not words:
+            return []
+
+        return [words]
 
     @classmethod
     async def _validate_tesseract_version(cls) -> None:
@@ -394,8 +972,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            return
 
         command = ["tesseract", "--version"]
-
-
+        env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
+        try:
+            result = await run_process(command, env=env)
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            raise MissingDependencyError(
+                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
+            ) from e
+        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode("utf-8"))
         if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
@@ -407,33 +991,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
 
-    def
-
-        image: PILImage,
-        **kwargs: Unpack[TesseractConfig],
-    ) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata.
-
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
-
-        image_buffer = io.BytesIO()
-        image.save(image_buffer, format="PNG")
-        image_content = image_buffer.getvalue()
-
-        cache_kwargs = {
-            "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
-            "ocr_backend": "tesseract",
-            "ocr_config": str(sorted(kwargs.items())),
-        }
-
+    def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+        """Handle cache lookup before processing (sync)."""
        ocr_cache = get_ocr_cache()
+
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result
@@ -441,46 +1002,113 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
        if ocr_cache.is_processing(**cache_kwargs):
            event = ocr_cache.mark_processing(**cache_kwargs)
            event.wait()
-
-            # Try cache again after waiting for other process to complete
            cached_result = ocr_cache.get(**cache_kwargs)
            if cached_result is not None:
                return cached_result
 
        ocr_cache.mark_processing(**cache_kwargs)
+        return None
+
+    def _execute_tesseract_sync(self, command: list[str]) -> None:
+        """Run tesseract command synchronously."""
+        env = os.environ.copy()
+        if sys.platform.startswith("linux"):
+            env["OMP_THREAD_LIMIT"] = "1"
+
+        try:
+            subprocess.run(
+                command,
+                check=True,
+                env=env,
+                capture_output=True,
+                text=True,
+                timeout=30,
+                encoding="utf-8",
+            )
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr if e.stderr else str(e)
+            raise OCRError(
+                f"Failed to OCR using tesseract: {error_msg}",
+                context={"command": command, "returncode": e.returncode, "error": error_msg},
+            ) from e
+        except subprocess.TimeoutExpired as e:
+            raise OCRError(
+                "Tesseract timed out during processing.",
+                context={"command": command, "timeout": 30},
+            ) from e
+
+    def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
+        """Process the raw output from Tesseract based on the requested format (sync)."""
+        output_format = run_config["output_format"]
+        enable_table_detection = run_config["enable_table_detection"]
+        kwargs = run_config["remaining_kwargs"]
+        config = TesseractConfig(**kwargs)
+
+        if output_format == "markdown":
+            return self._process_hocr_to_markdown_sync(output, config)
+        if output_format == "tsv" and enable_table_detection:
+            return self._process_tsv_output_sync(
+                output,
+                table_column_threshold=config.table_column_threshold,
+                table_row_threshold_ratio=config.table_row_threshold_ratio,
+                table_min_confidence=config.table_min_confidence,
+            )
+        if output_format == "tsv":
+            return self._extract_text_from_tsv(output)
+        if output_format == "hocr":
+            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
 
+        return ExtractionResult(
+            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+        )
+
+    def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata."""
+        use_cache = kwargs.pop("use_cache", True)
+
+        save_image = image
+        if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+            save_image = image.convert("RGB")
+
+        image_buffer = io.BytesIO()
+        save_image.save(image_buffer, format="PNG")
+        image_content = image_buffer.getvalue()
+
+        cache_kwargs = {
+            "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
+            "ocr_backend": "tesseract",
+            "ocr_config": str(sorted(kwargs.items())),
+        }
+
+        if use_cache:
+            cached_result = self._handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
+
+        ocr_cache = get_ocr_cache()
        try:
            self._validate_tesseract_version_sync()
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
                image_path = Path(tmp_file.name)
-
+                save_image.save(str(image_path), format="PNG")
            try:
-
+                kwargs_with_cache = {**kwargs, "use_cache": use_cache}
+                result = self.process_file_sync(image_path, **kwargs_with_cache)
 
-
+                if use_cache:
+                    ocr_cache.set(result, **cache_kwargs)
 
                return result
            finally:
                if image_path.exists():
                    image_path.unlink()
        finally:
-
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
 
-    def process_file_sync(
-
-
-        **kwargs: Unpack[TesseractConfig],
-    ) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata.
-
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
+    def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata."""
+        use_cache = kwargs.pop("use_cache", True)
 
        file_info = self._get_file_info(path)
 
@@ -490,53 +1118,74 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            "ocr_config": str(sorted(kwargs.items())),
        }
 
-
-
-
-            return cached_result
-
-        if ocr_cache.is_processing(**cache_kwargs):
-            event = ocr_cache.mark_processing(**cache_kwargs)
-            event.wait()
-
-            # Try cache again after waiting for other process to complete
-            cached_result = ocr_cache.get(**cache_kwargs)
-            if cached_result is not None:
+        if use_cache:
+            cached_result = self._handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
                return cached_result
 
-        ocr_cache
-
+        ocr_cache = get_ocr_cache()
        try:
            self._validate_tesseract_version_sync()
-
-
-
-
+
+            run_config = self._prepare_tesseract_run_config(**kwargs)
+
+            temp_fd, temp_path = tempfile.mkstemp(suffix=run_config["ext"])
+            os.close(temp_fd)
+            Path(temp_path).unlink()
+            output_base = temp_path.replace(run_config["ext"], "")
+
            try:
-                command = self._build_tesseract_command(
-
+                command = self._build_tesseract_command(
+                    path,
+                    output_base,
+                    run_config["language"],
+                    run_config["psm"],
+                    run_config["tesseract_format"],
+                    **run_config["remaining_kwargs"],
+                )
+                self._execute_tesseract_sync(command)
+
+                output_path = Path(f"{output_base}{run_config['ext']}")
+                if not output_path.exists():
+                    return ExtractionResult(
+                        content="[OCR processing failed]",
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={
+                            "source_format": run_config["tesseract_format"],
+                            "error": f"{run_config['ext']} file not generated",
+                        },
+                        chunks=[],
+                        tables=[],
+                    )
 
-                output_path = Path(output_base + ".txt")
                with output_path.open(encoding="utf-8") as f:
                    output = f.read()
-                extraction_result = ExtractionResult(
-                    content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-                )
 
-
-
-
+                extraction_result = self._process_tesseract_output_sync(output, run_config)
+
+                if use_cache:
+                    final_cache_kwargs = cache_kwargs.copy()
+                    final_cache_kwargs["ocr_config"] = str(
+                        sorted(
+                            {
+                                **run_config["remaining_kwargs"],
+                                "language": run_config["language"],
+                                "psm": run_config["psm"],
+                            }.items()
+                        )
+                    )
+                    ocr_cache.set(extraction_result, **final_cache_kwargs)
 
                return extraction_result
-            except (RuntimeError, OSError) as e:
-                raise OCRError(f"Failed to OCR using tesseract: {e}") from e
            finally:
-                for
-
-
-
+                for cleanup_ext in [".txt", ".hocr", ".tsv"]:
+                    cleanup_path = Path(f"{output_base}{cleanup_ext}")
+                    cleanup_path.unlink(missing_ok=True)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
-
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
 
    def _get_file_info(self, path: Path) -> dict[str, Any]:
        """Get file information for caching."""
@@ -555,7 +1204,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
        }
 
    def _build_tesseract_command(
-        self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
+        self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
    ) -> list[str]:
        """Build tesseract command with all parameters."""
        command = [
@@ -571,34 +1220,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            "--loglevel",
            "OFF",
        ]
+
+        if output_format != "text":
+            command.append(output_format)
+
        for kwarg, value in kwargs.items():
+            if kwarg.startswith("table_"):
+                continue
            if isinstance(value, bool):
                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
            else:
                command.extend(["-c", f"{kwarg}={value}"])
        return command
 
-    def _run_tesseract_sync(self, command: list[str]) -> None:
-        """Run tesseract command synchronously."""
-        env = os.environ.copy()
-        if sys.platform.startswith("linux"):
-            env["OMP_THREAD_LIMIT"] = "1"
-
-        result = subprocess.run(
-            command,
-            check=False,
-            env=env,
-            capture_output=True,
-            text=True,
-            timeout=30,
-        )
-
-        if result.returncode != 0:
-            raise OCRError(
-                "OCR failed with a non-0 return code.",
-                context={"error": result.stderr},
-            )
-
    @classmethod
    def _validate_tesseract_version_sync(cls) -> None:
        """Synchronously validate that Tesseract is installed and is version 5 or above.
@@ -611,7 +1245,12 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            return
 
        command = ["tesseract", "--version"]
-
+        try:
+            result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            raise MissingDependencyError(
+                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
+            ) from e
        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout)
        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
            raise MissingDependencyError(
@@ -721,6 +1360,7 @@ def _process_image_with_tesseract(
        capture_output=True,
        text=True,
        timeout=30,
+        encoding="utf-8",
    )
 
    if result.returncode != 0:
@@ -769,9 +1409,11 @@ def _process_image_bytes_with_tesseract(
        OCR result as dictionary.
    """
    try:
-        with
-
-
+        with (
+            tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
+            Image.open(io.BytesIO(image_bytes)) as image,
+        ):
+            image.save(tmp_image.name, format="PNG")
            image_path = tmp_image.name
 
            try: