kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import csv
3
4
  import hashlib
4
5
  import io
5
6
  import os
@@ -7,26 +8,33 @@ import re
7
8
  import subprocess
8
9
  import sys
9
10
  import tempfile
10
- from dataclasses import dataclass
11
- from enum import Enum
11
+ from io import StringIO
12
12
  from pathlib import Path
13
13
  from typing import TYPE_CHECKING, Any, ClassVar, Final
14
14
 
15
15
  import anyio
16
+ import html_to_markdown
17
+ import polars as pl
16
18
  from anyio import Path as AsyncPath
17
19
  from anyio import run_process
20
+ from bs4 import BeautifulSoup
21
+ from bs4.element import Tag
18
22
  from PIL import Image
23
+ from PIL.Image import Image as PILImage
19
24
  from typing_extensions import Self
20
25
 
21
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
26
+ from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
22
27
  from kreuzberg._ocr._base import OCRBackend
23
- from kreuzberg._types import ExtractionResult
28
+ from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
29
+ from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
30
+ from kreuzberg._utils._cache import get_ocr_cache
24
31
  from kreuzberg._utils._string import normalize_spaces
25
32
  from kreuzberg._utils._sync import run_sync
26
33
  from kreuzberg._utils._tmp import create_temp_file
27
34
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
28
35
 
29
36
  if TYPE_CHECKING:
37
+ from bs4.element import Tag
30
38
  from PIL.Image import Image as PILImage
31
39
 
32
40
  try: # pragma: no cover
@@ -168,68 +176,6 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
168
176
  MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
169
177
 
170
178
 
171
- class PSMMode(Enum):
172
- """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
173
-
174
- OSD_ONLY = 0
175
- """Orientation and script detection only."""
176
- AUTO_OSD = 1
177
- """Automatic page segmentation with orientation and script detection."""
178
- AUTO_ONLY = 2
179
- """Automatic page segmentation without OSD."""
180
- AUTO = 3
181
- """Fully automatic page segmentation (default)."""
182
- SINGLE_COLUMN = 4
183
- """Assume a single column of text."""
184
- SINGLE_BLOCK_VERTICAL = 5
185
- """Assume a single uniform block of vertically aligned text."""
186
- SINGLE_BLOCK = 6
187
- """Assume a single uniform block of text."""
188
- SINGLE_LINE = 7
189
- """Treat the image as a single text line."""
190
- SINGLE_WORD = 8
191
- """Treat the image as a single word."""
192
- CIRCLE_WORD = 9
193
- """Treat the image as a single word in a circle."""
194
- SINGLE_CHAR = 10
195
- """Treat the image as a single character."""
196
-
197
-
198
- @dataclass(unsafe_hash=True, frozen=True, slots=True)
199
- class TesseractConfig:
200
- """Configuration options for Tesseract OCR engine."""
201
-
202
- classify_use_pre_adapted_templates: bool = True
203
- """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
204
- language: str = "eng"
205
- """Language code to use for OCR.
206
- Examples:
207
- - 'eng' for English
208
- - 'deu' for German
209
- - multiple languages combined with '+', e.g. 'eng+deu')
210
- """
211
- language_model_ngram_on: bool = False
212
- """Enable or disable the use of n-gram-based language models for improved text recognition.
213
-
214
- Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
215
- psm: PSMMode = PSMMode.AUTO
216
- """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
217
- tessedit_dont_blkrej_good_wds: bool = True
218
- """If True, prevents block rejection of words identified as good, improving text output quality."""
219
- tessedit_dont_rowrej_good_wds: bool = True
220
- """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
221
- tessedit_enable_dict_correction: bool = True
222
- """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
223
- tessedit_char_whitelist: str = ""
224
- """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
225
- tessedit_use_primary_params_model: bool = True
226
- """If True, forces the use of the primary parameters model for text recognition."""
227
- textord_space_size_is_variable: bool = True
228
- """Allow variable spacing between words, useful for text with irregular spacing."""
229
- thresholding_method: bool = False
230
- """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
231
-
232
-
233
179
  class TesseractBackend(OCRBackend[TesseractConfig]):
234
180
  _version_checked: ClassVar[bool] = False
235
181
 
@@ -238,10 +184,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
238
184
  image: PILImage,
239
185
  **kwargs: Unpack[TesseractConfig],
240
186
  ) -> ExtractionResult:
241
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
187
+ use_cache = kwargs.pop("use_cache", True)
188
+
189
+ save_image = image
190
+ if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
191
+ save_image = image.convert("RGB")
242
192
 
243
193
  image_buffer = io.BytesIO()
244
- await run_sync(image.save, image_buffer, format="PNG")
194
+ await run_sync(save_image.save, image_buffer, format="PNG")
245
195
  image_content = image_buffer.getvalue()
246
196
 
247
197
  cache_kwargs = {
@@ -250,7 +200,40 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
250
200
  "ocr_config": str(sorted(kwargs.items())),
251
201
  }
252
202
 
203
+ if use_cache:
204
+ cached_result = await self._handle_cache_lookup(cache_kwargs)
205
+ if cached_result:
206
+ return cached_result
207
+
208
+ ocr_cache = get_ocr_cache()
209
+ try:
210
+ await self._validate_tesseract_version()
211
+ image_path, unlink = await create_temp_file(".png")
212
+
213
+ try:
214
+ await run_sync(save_image.save, str(image_path), format="PNG")
215
+ except OSError as e:
216
+ if "cannot write mode" not in str(e):
217
+ raise
218
+ save_image = image.convert("RGB")
219
+ await run_sync(save_image.save, str(image_path), format="PNG")
220
+ try:
221
+ result = await self.process_file(image_path, **kwargs)
222
+
223
+ if use_cache:
224
+ await ocr_cache.aset(result, **cache_kwargs)
225
+
226
+ return result
227
+ finally:
228
+ await unlink()
229
+ finally:
230
+ if use_cache:
231
+ ocr_cache.mark_complete(**cache_kwargs)
232
+
233
+ async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
234
+ """Handle cache lookup before processing."""
253
235
  ocr_cache = get_ocr_cache()
236
+
254
237
  cached_result = await ocr_cache.aget(**cache_kwargs)
255
238
  if cached_result is not None:
256
239
  return cached_result
@@ -258,49 +241,123 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
258
241
  if ocr_cache.is_processing(**cache_kwargs):
259
242
  event = ocr_cache.mark_processing(**cache_kwargs)
260
243
  await anyio.to_thread.run_sync(event.wait)
261
-
262
- # Try cache again after waiting for other process to complete # ~keep
263
244
  cached_result = await ocr_cache.aget(**cache_kwargs)
264
245
  if cached_result is not None:
265
246
  return cached_result
266
247
 
267
248
  ocr_cache.mark_processing(**cache_kwargs)
249
+ return None
250
+
251
def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
    """Resolve the OCR options into the settings for a single Tesseract run.

    ``language``, ``psm``, ``output_format`` and ``enable_table_detection`` are
    taken out of the options; whatever is left is passed through untouched
    under ``remaining_kwargs``.

    Args:
        **kwargs: OCR options for this run.

    Returns:
        A dict with the keys ``language``, ``psm``, ``output_format``,
        ``enable_table_detection``, ``tesseract_format``, ``ext`` and
        ``remaining_kwargs``.
    """
    language = self._validate_language_code(kwargs.pop("language", "eng"))
    psm = kwargs.pop("psm", PSMMode.AUTO)
    output_format = kwargs.pop("output_format", "markdown")
    enable_table_detection = kwargs.pop("enable_table_detection", False)

    # Table detection with plain-text output is switched to TSV output.
    if enable_table_detection and output_format == "text":
        output_format = "tsv"

    # Requested output format -> (format handed to tesseract, temp-file suffix).
    # Markdown is produced from hOCR; anything unlisted falls back to text.
    format_table = {
        "markdown": ("hocr", ".hocr"),
        "tsv": ("tsv", ".tsv"),
        "hocr": ("hocr", ".hocr"),
    }
    tesseract_format, ext = format_table.get(output_format, ("text", ".txt"))

    return {
        "language": language,
        "psm": psm,
        "output_format": output_format,
        "enable_table_detection": enable_table_detection,
        "tesseract_format": tesseract_format,
        "ext": ext,
        "remaining_kwargs": kwargs,
    }
283
+
284
async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
    """Build the Tesseract command line for one run and execute it.

    Args:
        path: Image file to run OCR on.
        output_base: Output path without extension, passed to tesseract as
            its output base.
        run_config: Run settings as returned by
            ``_prepare_tesseract_run_config``.

    Raises:
        OCRError: If the process exits with a non-zero return code.
    """
    command = [
        "tesseract",
        str(path),
        output_base,
        "-l",
        run_config["language"],
        "--psm",
        str(run_config["psm"].value),
        "--oem",
        "1",
        "--loglevel",
        "OFF",
    ]

    if run_config["tesseract_format"] != "text":
        command.append(run_config["tesseract_format"])

    for kwarg, value in run_config["remaining_kwargs"].items():
        # ``table_*`` thresholds and ``html_to_markdown_config`` travel in the
        # same dict but are consumed by the output post-processing step; they
        # are not tesseract variables and must not be sent as ``-c`` options.
        if kwarg.startswith("table_") or kwarg == "html_to_markdown_config":
            continue
        rendered = int(value) if isinstance(value, bool) else value
        command.extend(["-c", f"{kwarg}={rendered}"])

    env: dict[str, Any] | None = None
    if sys.platform.startswith("linux"):
        # An explicit env replaces the child's environment instead of
        # extending it, so start from the current one; otherwise PATH,
        # TESSDATA_PREFIX and friends are lost.
        env = {**os.environ, "OMP_THREAD_LIMIT": "1"}

    try:
        result = await run_process(command, env=env)
        if result.returncode != 0:
            raise OCRError(
                "OCR failed with a non-0 return code.",
                context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
            )
    except subprocess.CalledProcessError as e:
        if isinstance(e.stderr, bytes):
            # Never let an undecodable byte hide the real failure.
            error_msg = e.stderr.decode("utf-8", errors="replace")
        else:
            error_msg = e.stderr or str(e)
        raise OCRError(
            f"Failed to OCR using tesseract: {error_msg}",
            context={"command": command, "returncode": e.returncode, "error": error_msg},
        ) from e
275
328
 
276
- await ocr_cache.aset(result, **cache_kwargs)
329
async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
    """Convert raw Tesseract output into an ExtractionResult for the requested format.

    Args:
        output: Text read from the Tesseract output file.
        run_config: Run settings as returned by
            ``_prepare_tesseract_run_config``.

    Returns:
        The extraction result for the configured output format.
    """
    fmt = run_config["output_format"]
    detect_tables = run_config["enable_table_detection"]
    options = run_config["remaining_kwargs"]

    if fmt == "markdown":
        return await self._process_hocr_to_markdown(output, enable_table_detection=detect_tables, **options)

    if fmt == "tsv":
        if not detect_tables:
            return self._extract_text_from_tsv(output)
        return await self._process_tsv_output(
            output,
            table_column_threshold=options.get("table_column_threshold", 20),
            table_row_threshold_ratio=options.get("table_row_threshold_ratio", 0.5),
            table_min_confidence=options.get("table_min_confidence", 30.0),
        )

    # hOCR is returned verbatim; every other format is treated as plain text.
    if fmt == "hocr":
        content, mime_type = output, HTML_MIME_TYPE
    else:
        content, mime_type = normalize_spaces(output), PLAIN_TEXT_MIME_TYPE

    return ExtractionResult(content=content, mime_type=mime_type, metadata={}, chunks=[])
283
352
 
284
- async def process_file(
285
- self,
286
- path: Path,
287
- **kwargs: Unpack[TesseractConfig],
288
- ) -> ExtractionResult:
289
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
353
+ async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
354
+ use_cache = kwargs.pop("use_cache", True)
290
355
 
291
356
  try:
292
357
  stat = path.stat()
293
- file_info = {
294
- "path": str(path.resolve()),
295
- "size": stat.st_size,
296
- "mtime": stat.st_mtime,
297
- }
358
+ file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
298
359
  except OSError:
299
- file_info = {
300
- "path": str(path),
301
- "size": 0,
302
- "mtime": 0,
303
- }
360
+ file_info = {"path": str(path), "size": 0, "mtime": 0}
304
361
 
305
362
  cache_kwargs = {
306
363
  "file_info": str(sorted(file_info.items())),
@@ -308,71 +365,37 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
308
365
  "ocr_config": str(sorted(kwargs.items())),
309
366
  }
310
367
 
311
- ocr_cache = get_ocr_cache()
312
- cached_result = await ocr_cache.aget(**cache_kwargs)
313
- if cached_result is not None:
314
- return cached_result
315
-
316
- if ocr_cache.is_processing(**cache_kwargs):
317
- event = ocr_cache.mark_processing(**cache_kwargs)
318
- await anyio.to_thread.run_sync(event.wait)
319
-
320
- # Try cache again after waiting for other process to complete # ~keep
321
- cached_result = await ocr_cache.aget(**cache_kwargs)
322
- if cached_result is not None:
368
+ if use_cache:
369
+ cached_result = await self._handle_cache_lookup(cache_kwargs)
370
+ if cached_result:
323
371
  return cached_result
324
372
 
325
- ocr_cache.mark_processing(**cache_kwargs)
326
-
373
+ ocr_cache = get_ocr_cache()
327
374
  try:
328
375
  await self._validate_tesseract_version()
329
- output_path, unlink = await create_temp_file(".txt")
330
- language = self._validate_language_code(kwargs.pop("language", "eng"))
331
- psm = kwargs.pop("psm", PSMMode.AUTO)
332
- try:
333
- output_base = str(output_path).replace(".txt", "")
334
- command = [
335
- "tesseract",
336
- str(path),
337
- output_base,
338
- "-l",
339
- language,
340
- "--psm",
341
- str(psm.value),
342
- "--oem",
343
- "1",
344
- "--loglevel",
345
- "OFF",
346
- ]
347
- for kwarg, value in kwargs.items():
348
- if isinstance(value, bool):
349
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
350
- else:
351
- # Handle string parameters (like tessedit_char_whitelist)
352
- command.extend(["-c", f"{kwarg}={value}"])
353
-
354
- env: dict[str, Any] | None = None
355
- if sys.platform.startswith("linux"):
356
- env = {"OMP_THREAD_LIMIT": "1"}
357
376
 
358
- result = await run_process(command, env=env)
377
+ run_config = self._prepare_tesseract_run_config(**kwargs)
378
+ output_path, unlink = await create_temp_file(run_config["ext"])
359
379
 
360
- if not result.returncode == 0:
361
- raise OCRError(
362
- "OCR failed with a non-0 return code.",
363
- context={
364
- "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
365
- },
366
- )
380
+ try:
381
+ output_base = str(output_path).replace(run_config["ext"], "")
382
+ await self._execute_tesseract(path, output_base, run_config)
367
383
 
368
384
  output = await AsyncPath(output_path).read_text("utf-8")
369
- extraction_result = ExtractionResult(
370
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
371
- )
372
-
373
- final_cache_kwargs = cache_kwargs.copy()
374
- final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
375
- await ocr_cache.aset(extraction_result, **final_cache_kwargs)
385
+ extraction_result = await self._process_tesseract_output(output, run_config)
386
+
387
+ if use_cache:
388
+ final_cache_kwargs = cache_kwargs.copy()
389
+ final_cache_kwargs["ocr_config"] = str(
390
+ sorted(
391
+ {
392
+ **run_config["remaining_kwargs"],
393
+ "language": run_config["language"],
394
+ "psm": run_config["psm"],
395
+ }.items()
396
+ )
397
+ )
398
+ await ocr_cache.aset(extraction_result, **final_cache_kwargs)
376
399
 
377
400
  return extraction_result
378
401
  except (RuntimeError, OSError) as e:
@@ -380,7 +403,562 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
380
403
  finally:
381
404
  await unlink()
382
405
  finally:
383
- ocr_cache.mark_complete(**cache_kwargs)
406
+ if use_cache:
407
+ ocr_cache.mark_complete(**cache_kwargs)
408
+
409
async def _process_tsv_output(
    self,
    tsv_content: str,
    table_column_threshold: int = 20,
    table_row_threshold_ratio: float = 0.5,
    table_min_confidence: float = 30.0,
) -> ExtractionResult:
    """Process TSV output and extract tables if detected.

    Delegates to ``_process_tsv_output_sync`` through ``run_sync`` so that the
    async and sync paths share one implementation, and so that TSV parsing and
    table reconstruction do not run on the event loop.

    Args:
        tsv_content: Raw TSV output from Tesseract.
        table_column_threshold: Pixel threshold for column clustering.
        table_row_threshold_ratio: Row threshold as ratio of mean text height.
        table_min_confidence: Minimum confidence score to include a word.

    Returns:
        ExtractionResult with extracted content and tables.
    """
    return await run_sync(
        self._process_tsv_output_sync,
        tsv_content,
        table_column_threshold=table_column_threshold,
        table_row_threshold_ratio=table_row_threshold_ratio,
        table_min_confidence=table_min_confidence,
    )
461
+
462
def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
    """Extract plain text from TSV output.

    Word rows (``level == "5"``) are grouped by page, block, paragraph and
    line, ordered left to right within each line, and joined with a blank
    line between blocks and paragraphs. If the structured parse fails, the
    text column of every data row is joined with spaces instead.

    Args:
        tsv_content: Raw TSV output from Tesseract.

    Returns:
        ExtractionResult with extracted text.
    """
    try:
        # QUOTE_NONE: with the default dialect a word that starts with '"' is
        # read as an opening quote and swallows following tabs and newlines.
        reader = csv.DictReader(StringIO(tsv_content), delimiter="\t", quoting=csv.QUOTE_NONE)

        lines: dict[tuple[int, int, int, int], list[tuple[int, str]]] = {}

        for row in reader:
            # Short rows get None for missing columns, so guard before .strip().
            text = row.get("text") or ""
            if row.get("level") != "5" or not text.strip():
                continue
            line_key = (int(row["page_num"]), int(row["block_num"]), int(row["par_num"]), int(row["line_num"]))
            lines.setdefault(line_key, []).append((int(row["left"]), text))

        text_parts: list[str] = []
        # Block numbers are compared together with the page number so that a
        # block on a new page is never mistaken for the previous page's block.
        last_block: tuple[int, int] | None = None
        last_para = -1

        for line_key in sorted(lines):
            page_num, block_num, par_num, _line_num = line_key

            if (page_num, block_num) != last_block:
                if text_parts:  # ~keep
                    text_parts.append("\n\n")
                last_block = (page_num, block_num)
                last_para = par_num
            elif par_num != last_para:
                text_parts.append("\n\n")
                last_para = par_num

            words = sorted(lines[line_key], key=lambda x: x[0])
            text_parts.append(" ".join(word[1] for word in words))
            text_parts.append("\n")

        content = "".join(text_parts).strip()

    except (ValueError, KeyError, csv.Error):
        # Best-effort fallback: skip the header and take column 11 (text).
        split_rows = (line.split("\t") for line in tsv_content.split("\n")[1:])
        content = " ".join(parts[11] for parts in split_rows if len(parts) > 11 and parts[11].strip()).strip()

    return ExtractionResult(
        content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
    )
519
+
520
async def _process_hocr_to_markdown(
    self,
    hocr_content: str,
    enable_table_detection: bool = False,
    html_to_markdown_config: HTMLToMarkdownConfig | None = None,
    table_column_threshold: int = 20,
    table_row_threshold_ratio: float = 0.5,
    table_min_confidence: float = 30.0,
    **_kwargs: Any,
) -> ExtractionResult:
    """Render hOCR output as Markdown, optionally appending detected tables.

    Args:
        hocr_content: Raw hOCR HTML/XML content from Tesseract.
        enable_table_detection: Whether to detect and format tables.
        html_to_markdown_config: Configuration for HTML to Markdown conversion.
        table_column_threshold: Pixel threshold for column clustering.
        table_row_threshold_ratio: Row threshold as ratio of mean text height.
        table_min_confidence: Minimum confidence score to include a word.
        **_kwargs: Additional options; accepted and ignored.

    Returns:
        ExtractionResult with Markdown content and detected tables.
    """
    config = html_to_markdown_config or HTMLToMarkdownConfig(
        escape_asterisks=False,
        escape_underscores=False,
        extract_metadata=False,
        strip="meta title",
    )

    tables: list[TableData] = []
    if enable_table_detection:
        tables = await self._extract_tables_from_hocr(
            BeautifulSoup(hocr_content, "lxml"),
            table_column_threshold,
            table_row_threshold_ratio,
            table_min_confidence,
        )

    # Caller-supplied converters take precedence over the built-in hOCR ones.
    converters = dict(self._create_hocr_converters(tables))
    if config.custom_converters:
        converters.update(config.custom_converters)

    conversion_options = config.to_dict()
    conversion_options["custom_converters"] = converters

    try:
        markdown_content = normalize_spaces(html_to_markdown.convert_to_markdown(hocr_content, **conversion_options))
    except (ValueError, TypeError, AttributeError):
        # Conversion failed: fall back to the bare word texts, then to the
        # document's full text, then to a placeholder.
        try:
            soup = BeautifulSoup(hocr_content, "lxml")
            stripped_words = (node.get_text().strip() for node in soup.find_all("span", class_="ocrx_word"))
            word_texts = [word for word in stripped_words if word]

            if word_texts:
                fallback_text = " ".join(word_texts)
            else:
                fallback_text = soup.get_text().strip() or "[No text detected]"

            markdown_content = normalize_spaces(fallback_text)
        except (ValueError, TypeError, AttributeError):
            markdown_content = "[OCR processing failed]"

    if not tables:
        final_content = markdown_content
    else:
        table_block = "".join(
            f"\n## Table {number}\n\n{table['text']}\n" for number, table in enumerate(tables, start=1)
        )
        final_content = f"{markdown_content}\n{table_block}" if markdown_content.strip() else table_block.strip()

    return ExtractionResult(
        content=final_content,
        mime_type=MARKDOWN_MIME_TYPE,
        metadata={"source_format": "hocr", "tables_detected": len(tables)},
        chunks=[],
        tables=tables,
    )
611
+
612
+ def _create_basic_converters(self) -> dict[str, Any]:
613
+ """Create basic converters for individual hOCR elements."""
614
+
615
+ def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
616
+ """Custom converter for hOCR word elements - adds spaces between words."""
617
+ del tag
618
+ return f"{text.strip()} "
619
+
620
+ def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
621
+ """Custom converter for hOCR line elements - handles line breaks."""
622
+ del tag
623
+ return f"{text.strip()}\n"
624
+
625
+ def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
626
+ """Custom converter for hOCR paragraph elements - handles paragraph breaks."""
627
+ del tag
628
+ content = text.strip()
629
+ if not content:
630
+ return ""
631
+ return f"{content}\n\n"
632
+
633
+ def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
634
+ """Custom converter for hOCR content area elements."""
635
+ del tag
636
+ content = text.strip()
637
+ if not content:
638
+ return ""
639
+ return f"{content}\n\n"
640
+
641
+ def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
642
+ """Custom converter for hOCR page elements."""
643
+ del tag
644
+ return text.strip()
645
+
646
+ def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
647
+ """Custom converter for hOCR separator elements - convert to horizontal rules."""
648
+ del tag, text
649
+ return "---\n"
650
+
651
+ def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
652
+ """Custom converter for hOCR photo/image elements - indicate image presence."""
653
+ del text
654
+ title = tag.get("title", "")
655
+ if isinstance(title, str):
656
+ bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
657
+ if bbox_match:
658
+ x0, y0, x1, y1 = bbox_match.groups()
659
+ width = int(x1) - int(x0)
660
+ height = int(y1) - int(y0)
661
+ return f"*[Image region: {width}x{height} pixels]*\n\n"
662
+ return "*[Image detected]*\n\n"
663
+
664
+ return {
665
+ "ocrx_word": ocrx_word_converter,
666
+ "ocr_line": ocr_line_converter,
667
+ "ocr_par": ocr_par_converter,
668
+ "ocr_carea": ocr_carea_converter,
669
+ "ocr_page": ocr_page_converter,
670
+ "ocr_separator": ocr_separator_converter,
671
+ "ocr_photo": ocr_photo_converter,
672
+ }
673
+
674
def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
    """Build the tag-level converters that route hOCR elements by CSS class.

    Args:
        _tables: List of detected tables; unused here.

    Returns:
        Dictionary mapping the HTML tags ``span``, ``div`` and ``p`` to
        converter functions.
    """
    basic_converters = self._create_basic_converters()

    def class_string(tag: Tag) -> str:
        """Return the tag's class attribute as one space-joined string."""
        raw = tag.get("class", "")
        if isinstance(raw, list):
            return " ".join(raw)
        return raw if isinstance(raw, str) else ""

    def convert_by_class(
        candidates: tuple[str, ...], tag: Tag, text: str, conv_kwargs: dict[str, Any]
    ) -> str | None:
        """Apply the first candidate converter whose name occurs in the class string."""
        classes = class_string(tag)
        for class_name in candidates:
            if class_name in classes:
                return str(basic_converters[class_name](tag=tag, text=text, **conv_kwargs))
        return None

    def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
        """Route a div by class; unrecognised divs keep their text unchanged."""
        converted = convert_by_class(("ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"), tag, text, _conv_kwargs)
        return text if converted is None else converted

    def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
        """Route a span by class; unrecognised spans become trimmed text plus a space."""
        converted = convert_by_class(("ocrx_word", "ocr_line"), tag, text, _conv_kwargs)
        return f"{text.strip()} " if converted is None else converted

    return {
        "span": generic_span_converter,
        "div": generic_div_converter,
        "p": basic_converters["ocr_par"],
    }
718
+
719
def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
    """Synchronously process hOCR content to markdown format.

    This path performs no table detection: ``config.enable_table_detection``
    is not honored here, so the result never carries tables and
    ``tables_detected`` is always 0.

    Args:
        hocr_content: Raw hOCR content as string
        config: Tesseract configuration object; accepted for interface
            compatibility and not consulted.

    Returns:
        ExtractionResult with markdown content
    """
    del config
    tables: list[TableData] = []

    try:
        html_config = HTMLToMarkdownConfig(
            custom_converters=self._create_hocr_converters(tables),
            escape_asterisks=False,
            escape_underscores=False,
            extract_metadata=False,
            strip="meta title",
        )
        markdown_content = normalize_spaces(
            html_to_markdown.convert_to_markdown(hocr_content, **html_config.to_dict())
        )
    except (ValueError, TypeError, AttributeError):
        # Conversion failed: fall back to the bare word texts, then to the
        # document's full text, then to a placeholder.
        try:
            soup = BeautifulSoup(hocr_content, "lxml")
            stripped_words = (node.get_text().strip() for node in soup.find_all("span", class_="ocrx_word"))
            word_texts = [word for word in stripped_words if word]

            if word_texts:
                fallback_text = " ".join(word_texts)
            else:
                fallback_text = soup.get_text().strip() or "[No text detected]"

            markdown_content = normalize_spaces(fallback_text)
        except (ValueError, TypeError, AttributeError):
            markdown_content = "[OCR processing failed]"

    return ExtractionResult(
        content=markdown_content,
        mime_type=MARKDOWN_MIME_TYPE,
        metadata={"source_format": "hocr", "tables_detected": len(tables)},
        chunks=[],
        tables=tables,
    )
790
+
791
def _process_tsv_output_sync(
    self,
    tsv_content: str,
    table_column_threshold: int = 20,
    table_row_threshold_ratio: float = 0.5,
    table_min_confidence: float = 30.0,
) -> ExtractionResult:
    """Synchronously turn TSV output into text plus, when one is found, a table.

    The plain-text result is always computed first and is returned unchanged
    whenever no table with more than one row can be reconstructed or the
    reconstruction fails.

    Args:
        tsv_content: Raw TSV output from Tesseract.
        table_column_threshold: Pixel threshold for column clustering.
        table_row_threshold_ratio: Row threshold as ratio of mean text height.
        table_min_confidence: Minimum confidence score to include a word.

    Returns:
        ExtractionResult with extracted content and tables.
    """
    text_result = self._extract_text_from_tsv(tsv_content)

    try:
        words = extract_words(tsv_content, min_confidence=table_min_confidence)
        if not words:
            return text_result

        table_data = reconstruct_table(
            words,
            column_threshold=table_column_threshold,
            row_threshold_ratio=table_row_threshold_ratio,
        )
        # A header row alone is not a table.
        if not table_data or len(table_data) <= 1:
            return text_result

        markdown = to_markdown(table_data)

        try:
            # First row supplies the column names, the rest are data rows.
            df = pl.DataFrame(table_data[1:], schema=table_data[0])
        except (ImportError, IndexError):
            df = None

        table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]

        return ExtractionResult(
            content=text_result.content,
            mime_type=text_result.mime_type,
            metadata=text_result.metadata,
            tables=[table],
            chunks=text_result.chunks,
        )
    except (ValueError, KeyError, ImportError):
        return text_result
843
+
844
async def _extract_tables_from_hocr(
    self,
    soup: Any,
    column_threshold: int = 20,
    row_threshold_ratio: float = 0.5,
    min_confidence: float = 30.0,
) -> list[TableData]:
    """Detect a table in parsed hOCR by analysing word coordinates.

    The hOCR words are first converted to TSV rows, then fed through the same
    word extraction and table reconstruction used for TSV output.

    Args:
        soup: Parsed hOCR BeautifulSoup object.
        column_threshold: Pixel threshold for column clustering.
        row_threshold_ratio: Row threshold as ratio of mean text height.
        min_confidence: Minimum confidence score to include a word.

    Returns:
        List of detected tables as TableData objects (empty when no words or
        no table with a header plus one data row could be reconstructed).
    """
    tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
    words = extract_words(tsv_data, min_confidence=min_confidence) if tsv_data else None
    if not words:
        return []

    detected: list[TableData] = []
    try:
        rows = reconstruct_table(
            words,
            column_threshold=column_threshold,
            row_threshold_ratio=row_threshold_ratio,
        )
        # At least header + one data row
        if rows and len(rows) > 1:
            markdown = to_markdown(rows)

            # Bounding box enclosing every word: (min_x, min_y, max_x, max_y).
            bbox = (
                min(w["left"] for w in words),
                min(w["top"] for w in words),
                max(w["left"] + w["width"] for w in words),
                max(w["top"] + w["height"] for w in words),
            )

            try:
                frame = await run_sync(pl.DataFrame, rows[1:], schema=rows[0])
            except (ImportError, IndexError):
                frame = None

            # No crop is produced here; a 1x1 white placeholder image is stored instead.
            placeholder = Image.new("RGB", (1, 1), "white")

            entry: TableData = {
                "text": markdown,
                "df": frame,
                "page_number": 1,
                "cropped_image": placeholder,
                "metadata": {"bbox": bbox},
            }  # type: ignore[typeddict-unknown-key]
            detected.append(entry)
    except (ValueError, KeyError, ImportError):
        # Best-effort detection: on failure return whatever was collected.
        pass

    return detected
904
+
905
+ async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
906
+ """Convert hOCR structure to TSV format for table extraction.
907
+
908
+ Args:
909
+ soup: Parsed hOCR BeautifulSoup object.
910
+ min_confidence: Minimum confidence score to include.
911
+
912
+ Returns:
913
+ TSV formatted string compatible with table extractor.
914
+ """
915
+ tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
916
+
917
+ words = soup.find_all("span", class_="ocrx_word")
918
+ word_num = 1
919
+
920
+ for word in words:
921
+ title = word.get("title", "")
922
+ text = word.get_text().strip()
923
+
924
+ if not text:
925
+ continue
926
+
927
+ bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
928
+ if not bbox_match:
929
+ continue
930
+
931
+ x0, y0, x1, y1 = map(int, bbox_match.groups())
932
+
933
+ conf_match = re.search(r"x_wconf (\d+)", title)
934
+ confidence = float(conf_match.group(1)) if conf_match else 100.0
935
+
936
+ if confidence < min_confidence:
937
+ continue
938
+
939
+ line = word.find_parent(class_="ocr_line")
940
+ par = word.find_parent(class_="ocr_par")
941
+ block = word.find_parent(class_="ocr_carea")
942
+
943
+ tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
944
+ tsv_lines.append(tsv_line)
945
+ word_num += 1
946
+
947
+ return "\n".join(tsv_lines)
948
+
949
+ def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
950
+ """Identify potential table regions from word coordinates.
951
+
952
+ Args:
953
+ words: List of word dictionaries with coordinates.
954
+
955
+ Returns:
956
+ List of word groups representing potential tables.
957
+ """
958
+ if not words:
959
+ return []
960
+
961
+ return [words]
384
962
 
385
963
  @classmethod
386
964
  async def _validate_tesseract_version(cls) -> None:
@@ -394,8 +972,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
394
972
  return
395
973
 
396
974
  command = ["tesseract", "--version"]
397
- result = await run_process(command)
398
- version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
975
+ env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
976
+ try:
977
+ result = await run_process(command, env=env)
978
+ except (subprocess.CalledProcessError, FileNotFoundError) as e:
979
+ raise MissingDependencyError(
980
+ "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
981
+ ) from e
982
+ version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode("utf-8"))
399
983
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
400
984
  raise MissingDependencyError(
401
985
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
@@ -407,33 +991,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
407
991
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
408
992
  ) from e
409
993
 
410
- def process_image_sync(
411
- self,
412
- image: PILImage,
413
- **kwargs: Unpack[TesseractConfig],
414
- ) -> ExtractionResult:
415
- """Synchronously process an image and extract its text and metadata.
416
-
417
- Args:
418
- image: An instance of PIL.Image representing the input image.
419
- **kwargs: Any kwargs related to the given backend
420
-
421
- Returns:
422
- The extraction result object
423
- """
424
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
425
-
426
- image_buffer = io.BytesIO()
427
- image.save(image_buffer, format="PNG")
428
- image_content = image_buffer.getvalue()
429
-
430
- cache_kwargs = {
431
- "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
432
- "ocr_backend": "tesseract",
433
- "ocr_config": str(sorted(kwargs.items())),
434
- }
435
-
994
+ def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
995
+ """Handle cache lookup before processing (sync)."""
436
996
  ocr_cache = get_ocr_cache()
997
+
437
998
  cached_result = ocr_cache.get(**cache_kwargs)
438
999
  if cached_result is not None:
439
1000
  return cached_result
@@ -441,46 +1002,113 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
441
1002
  if ocr_cache.is_processing(**cache_kwargs):
442
1003
  event = ocr_cache.mark_processing(**cache_kwargs)
443
1004
  event.wait()
444
-
445
- # Try cache again after waiting for other process to complete
446
1005
  cached_result = ocr_cache.get(**cache_kwargs)
447
1006
  if cached_result is not None:
448
1007
  return cached_result
449
1008
 
450
1009
  ocr_cache.mark_processing(**cache_kwargs)
1010
+ return None
1011
+
1012
+ def _execute_tesseract_sync(self, command: list[str]) -> None:
1013
+ """Run tesseract command synchronously."""
1014
+ env = os.environ.copy()
1015
+ if sys.platform.startswith("linux"):
1016
+ env["OMP_THREAD_LIMIT"] = "1"
1017
+
1018
+ try:
1019
+ subprocess.run(
1020
+ command,
1021
+ check=True,
1022
+ env=env,
1023
+ capture_output=True,
1024
+ text=True,
1025
+ timeout=30,
1026
+ encoding="utf-8",
1027
+ )
1028
+ except subprocess.CalledProcessError as e:
1029
+ error_msg = e.stderr if e.stderr else str(e)
1030
+ raise OCRError(
1031
+ f"Failed to OCR using tesseract: {error_msg}",
1032
+ context={"command": command, "returncode": e.returncode, "error": error_msg},
1033
+ ) from e
1034
+ except subprocess.TimeoutExpired as e:
1035
+ raise OCRError(
1036
+ "Tesseract timed out during processing.",
1037
+ context={"command": command, "timeout": 30},
1038
+ ) from e
1039
+
1040
def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
    """Convert raw Tesseract output into an ExtractionResult (sync).

    Args:
        output: The text read from the file Tesseract produced.
        run_config: Run settings; the keys ``output_format``,
            ``enable_table_detection`` and ``remaining_kwargs`` are read here.

    Returns:
        Markdown converted from hOCR for ``"markdown"``; the raw output as
        HTML for ``"hocr"``; text (with table detection when enabled) for
        ``"tsv"``; otherwise whitespace-normalized plain text.
    """
    requested_format = run_config["output_format"]
    detect_tables = run_config["enable_table_detection"]
    config = TesseractConfig(**run_config["remaining_kwargs"])

    if requested_format == "markdown":
        return self._process_hocr_to_markdown_sync(output, config)

    if requested_format == "hocr":
        return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])

    if requested_format != "tsv":
        return ExtractionResult(
            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
        )

    # TSV output: only attempt table reconstruction when it was requested.
    if not detect_tables:
        return self._extract_text_from_tsv(output)

    return self._process_tsv_output_sync(
        output,
        table_column_threshold=config.table_column_threshold,
        table_row_threshold_ratio=config.table_row_threshold_ratio,
        table_min_confidence=config.table_min_confidence,
    )
1064
+
1065
def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
    """Synchronously process an image and extract its text and metadata.

    The image is PNG-encoded once; those bytes are hashed for the cache key
    and also written to the temporary file handed to ``process_file_sync``.

    Args:
        image: An instance of PIL.Image representing the input image. Images
            whose mode is not one of RGB, RGBA, L, LA, P or 1 are converted
            to RGB before encoding.
        **kwargs: Any kwargs related to the given backend. ``use_cache``
            (default ``True``) is consumed here and forwarded to
            ``process_file_sync``.

    Returns:
        The extraction result object, taken from the OCR cache when caching
        is enabled and an entry exists.
    """
    use_cache = kwargs.pop("use_cache", True)

    save_image = image
    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
        save_image = image.convert("RGB")

    image_buffer = io.BytesIO()
    save_image.save(image_buffer, format="PNG")
    image_content = image_buffer.getvalue()

    cache_kwargs = {
        "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    if use_cache:
        cached_result = self._handle_cache_lookup_sync(cache_kwargs)
        # The lookup helper signals "no hit" with None, so test for that explicitly.
        if cached_result is not None:
            return cached_result

    ocr_cache = get_ocr_cache()
    try:
        self._validate_tesseract_version_sync()

        image_path: Path | None = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
                image_path = Path(tmp_file.name)
                # Reuse the PNG bytes that were hashed instead of encoding again.
                tmp_file.write(image_content)

            kwargs_with_cache = {**kwargs, "use_cache": use_cache}
            result = self.process_file_sync(image_path, **kwargs_with_cache)

            if use_cache:
                ocr_cache.set(result, **cache_kwargs)

            return result
        finally:
            # Runs even if writing the temp file failed, so it is never left behind.
            if image_path is not None:
                image_path.unlink(missing_ok=True)
    finally:
        if use_cache:
            ocr_cache.mark_complete(**cache_kwargs)
468
1108
 
469
- def process_file_sync(
470
- self,
471
- path: Path,
472
- **kwargs: Unpack[TesseractConfig],
473
- ) -> ExtractionResult:
474
- """Synchronously process a file and extract its text and metadata.
475
-
476
- Args:
477
- path: A Path object representing the file to be processed.
478
- **kwargs: Any kwargs related to the given backend
479
-
480
- Returns:
481
- The extraction result object
482
- """
483
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
1109
+ def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
1110
+ """Synchronously process a file and extract its text and metadata."""
1111
+ use_cache = kwargs.pop("use_cache", True)
484
1112
 
485
1113
  file_info = self._get_file_info(path)
486
1114
 
@@ -490,53 +1118,74 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
490
1118
  "ocr_config": str(sorted(kwargs.items())),
491
1119
  }
492
1120
 
493
- ocr_cache = get_ocr_cache()
494
- cached_result = ocr_cache.get(**cache_kwargs)
495
- if cached_result is not None:
496
- return cached_result
497
-
498
- if ocr_cache.is_processing(**cache_kwargs):
499
- event = ocr_cache.mark_processing(**cache_kwargs)
500
- event.wait()
501
-
502
- # Try cache again after waiting for other process to complete
503
- cached_result = ocr_cache.get(**cache_kwargs)
504
- if cached_result is not None:
1121
+ if use_cache:
1122
+ cached_result = self._handle_cache_lookup_sync(cache_kwargs)
1123
+ if cached_result:
505
1124
  return cached_result
506
1125
 
507
- ocr_cache.mark_processing(**cache_kwargs)
508
-
1126
+ ocr_cache = get_ocr_cache()
509
1127
  try:
510
1128
  self._validate_tesseract_version_sync()
511
- with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
512
- output_base = tmp_file.name.replace(".txt", "")
513
- language = self._validate_language_code(kwargs.pop("language", "eng"))
514
- psm = kwargs.pop("psm", PSMMode.AUTO)
1129
+
1130
+ run_config = self._prepare_tesseract_run_config(**kwargs)
1131
+
1132
+ temp_fd, temp_path = tempfile.mkstemp(suffix=run_config["ext"])
1133
+ os.close(temp_fd)
1134
+ Path(temp_path).unlink()
1135
+ output_base = temp_path.replace(run_config["ext"], "")
1136
+
515
1137
  try:
516
- command = self._build_tesseract_command(path, output_base, language, psm, **kwargs)
517
- self._run_tesseract_sync(command)
1138
+ command = self._build_tesseract_command(
1139
+ path,
1140
+ output_base,
1141
+ run_config["language"],
1142
+ run_config["psm"],
1143
+ run_config["tesseract_format"],
1144
+ **run_config["remaining_kwargs"],
1145
+ )
1146
+ self._execute_tesseract_sync(command)
1147
+
1148
+ output_path = Path(f"{output_base}{run_config['ext']}")
1149
+ if not output_path.exists():
1150
+ return ExtractionResult(
1151
+ content="[OCR processing failed]",
1152
+ mime_type=PLAIN_TEXT_MIME_TYPE,
1153
+ metadata={
1154
+ "source_format": run_config["tesseract_format"],
1155
+ "error": f"{run_config['ext']} file not generated",
1156
+ },
1157
+ chunks=[],
1158
+ tables=[],
1159
+ )
518
1160
 
519
- output_path = Path(output_base + ".txt")
520
1161
  with output_path.open(encoding="utf-8") as f:
521
1162
  output = f.read()
522
- extraction_result = ExtractionResult(
523
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
524
- )
525
1163
 
526
- final_cache_kwargs = cache_kwargs.copy()
527
- final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
528
- ocr_cache.set(extraction_result, **final_cache_kwargs)
1164
+ extraction_result = self._process_tesseract_output_sync(output, run_config)
1165
+
1166
+ if use_cache:
1167
+ final_cache_kwargs = cache_kwargs.copy()
1168
+ final_cache_kwargs["ocr_config"] = str(
1169
+ sorted(
1170
+ {
1171
+ **run_config["remaining_kwargs"],
1172
+ "language": run_config["language"],
1173
+ "psm": run_config["psm"],
1174
+ }.items()
1175
+ )
1176
+ )
1177
+ ocr_cache.set(extraction_result, **final_cache_kwargs)
529
1178
 
530
1179
  return extraction_result
531
- except (RuntimeError, OSError) as e:
532
- raise OCRError(f"Failed to OCR using tesseract: {e}") from e
533
1180
  finally:
534
- for ext in [".txt"]:
535
- temp_file = Path(output_base + ext)
536
- if temp_file.exists():
537
- temp_file.unlink()
1181
+ for cleanup_ext in [".txt", ".hocr", ".tsv"]:
1182
+ cleanup_path = Path(f"{output_base}{cleanup_ext}")
1183
+ cleanup_path.unlink(missing_ok=True)
1184
+ except Exception as e:
1185
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
538
1186
  finally:
539
- ocr_cache.mark_complete(**cache_kwargs)
1187
+ if use_cache:
1188
+ ocr_cache.mark_complete(**cache_kwargs)
540
1189
 
541
1190
  def _get_file_info(self, path: Path) -> dict[str, Any]:
542
1191
  """Get file information for caching."""
@@ -555,7 +1204,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
555
1204
  }
556
1205
 
557
1206
  def _build_tesseract_command(
558
- self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
1207
+ self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
559
1208
  ) -> list[str]:
560
1209
  """Build tesseract command with all parameters."""
561
1210
  command = [
@@ -571,34 +1220,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
571
1220
  "--loglevel",
572
1221
  "OFF",
573
1222
  ]
1223
+
1224
+ if output_format != "text":
1225
+ command.append(output_format)
1226
+
574
1227
  for kwarg, value in kwargs.items():
1228
+ if kwarg.startswith("table_"):
1229
+ continue
575
1230
  if isinstance(value, bool):
576
1231
  command.extend(["-c", f"{kwarg}={1 if value else 0}"])
577
1232
  else:
578
1233
  command.extend(["-c", f"{kwarg}={value}"])
579
1234
  return command
580
1235
 
581
- def _run_tesseract_sync(self, command: list[str]) -> None:
582
- """Run tesseract command synchronously."""
583
- env = os.environ.copy()
584
- if sys.platform.startswith("linux"):
585
- env["OMP_THREAD_LIMIT"] = "1"
586
-
587
- result = subprocess.run(
588
- command,
589
- check=False,
590
- env=env,
591
- capture_output=True,
592
- text=True,
593
- timeout=30,
594
- )
595
-
596
- if result.returncode != 0:
597
- raise OCRError(
598
- "OCR failed with a non-0 return code.",
599
- context={"error": result.stderr},
600
- )
601
-
602
1236
  @classmethod
603
1237
  def _validate_tesseract_version_sync(cls) -> None:
604
1238
  """Synchronously validate that Tesseract is installed and is version 5 or above.
@@ -611,7 +1245,12 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
611
1245
  return
612
1246
 
613
1247
  command = ["tesseract", "--version"]
614
- result = subprocess.run(command, capture_output=True, text=True, check=False)
1248
+ try:
1249
+ result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
1250
+ except (subprocess.CalledProcessError, FileNotFoundError) as e:
1251
+ raise MissingDependencyError(
1252
+ "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
1253
+ ) from e
615
1254
  version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout)
616
1255
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
617
1256
  raise MissingDependencyError(
@@ -721,6 +1360,7 @@ def _process_image_with_tesseract(
721
1360
  capture_output=True,
722
1361
  text=True,
723
1362
  timeout=30,
1363
+ encoding="utf-8",
724
1364
  )
725
1365
 
726
1366
  if result.returncode != 0:
@@ -769,9 +1409,11 @@ def _process_image_bytes_with_tesseract(
769
1409
  OCR result as dictionary.
770
1410
  """
771
1411
  try:
772
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
773
- with Image.open(io.BytesIO(image_bytes)) as image:
774
- image.save(tmp_image.name, format="PNG")
1412
+ with (
1413
+ tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
1414
+ Image.open(io.BytesIO(image_bytes)) as image,
1415
+ ):
1416
+ image.save(tmp_image.name, format="PNG")
775
1417
  image_path = tmp_image.name
776
1418
 
777
1419
  try: