kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import csv
3
4
  import hashlib
4
5
  import io
5
6
  import os
@@ -7,26 +8,33 @@ import re
7
8
  import subprocess
8
9
  import sys
9
10
  import tempfile
10
- from dataclasses import dataclass
11
- from enum import Enum
11
+ from io import StringIO
12
12
  from pathlib import Path
13
13
  from typing import TYPE_CHECKING, Any, ClassVar, Final
14
14
 
15
15
  import anyio
16
+ import html_to_markdown
17
+ import polars as pl
16
18
  from anyio import Path as AsyncPath
17
19
  from anyio import run_process
20
+ from bs4 import BeautifulSoup
21
+ from bs4.element import Tag
18
22
  from PIL import Image
23
+ from PIL.Image import Image as PILImage
19
24
  from typing_extensions import Self
20
25
 
21
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
26
+ from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
22
27
  from kreuzberg._ocr._base import OCRBackend
23
- from kreuzberg._types import ExtractionResult
28
+ from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
29
+ from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
30
+ from kreuzberg._utils._cache import get_ocr_cache
24
31
  from kreuzberg._utils._string import normalize_spaces
25
32
  from kreuzberg._utils._sync import run_sync
26
33
  from kreuzberg._utils._tmp import create_temp_file
27
34
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
28
35
 
29
36
  if TYPE_CHECKING:
37
+ from bs4.element import Tag
30
38
  from PIL.Image import Image as PILImage
31
39
 
32
40
  try: # pragma: no cover
@@ -168,68 +176,6 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
168
176
  MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
169
177
 
170
178
 
171
- class PSMMode(Enum):
172
- """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
173
-
174
- OSD_ONLY = 0
175
- """Orientation and script detection only."""
176
- AUTO_OSD = 1
177
- """Automatic page segmentation with orientation and script detection."""
178
- AUTO_ONLY = 2
179
- """Automatic page segmentation without OSD."""
180
- AUTO = 3
181
- """Fully automatic page segmentation (default)."""
182
- SINGLE_COLUMN = 4
183
- """Assume a single column of text."""
184
- SINGLE_BLOCK_VERTICAL = 5
185
- """Assume a single uniform block of vertically aligned text."""
186
- SINGLE_BLOCK = 6
187
- """Assume a single uniform block of text."""
188
- SINGLE_LINE = 7
189
- """Treat the image as a single text line."""
190
- SINGLE_WORD = 8
191
- """Treat the image as a single word."""
192
- CIRCLE_WORD = 9
193
- """Treat the image as a single word in a circle."""
194
- SINGLE_CHAR = 10
195
- """Treat the image as a single character."""
196
-
197
-
198
- @dataclass(unsafe_hash=True, frozen=True, slots=True)
199
- class TesseractConfig:
200
- """Configuration options for Tesseract OCR engine."""
201
-
202
- classify_use_pre_adapted_templates: bool = True
203
- """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
204
- language: str = "eng"
205
- """Language code to use for OCR.
206
- Examples:
207
- - 'eng' for English
208
- - 'deu' for German
209
- - multiple languages combined with '+', e.g. 'eng+deu')
210
- """
211
- language_model_ngram_on: bool = False
212
- """Enable or disable the use of n-gram-based language models for improved text recognition.
213
-
214
- Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
215
- psm: PSMMode = PSMMode.AUTO
216
- """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
217
- tessedit_dont_blkrej_good_wds: bool = True
218
- """If True, prevents block rejection of words identified as good, improving text output quality."""
219
- tessedit_dont_rowrej_good_wds: bool = True
220
- """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
221
- tessedit_enable_dict_correction: bool = True
222
- """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
223
- tessedit_char_whitelist: str = ""
224
- """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
225
- tessedit_use_primary_params_model: bool = True
226
- """If True, forces the use of the primary parameters model for text recognition."""
227
- textord_space_size_is_variable: bool = True
228
- """Allow variable spacing between words, useful for text with irregular spacing."""
229
- thresholding_method: bool = False
230
- """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
231
-
232
-
233
179
  class TesseractBackend(OCRBackend[TesseractConfig]):
234
180
  _version_checked: ClassVar[bool] = False
235
181
 
@@ -238,10 +184,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
238
184
  image: PILImage,
239
185
  **kwargs: Unpack[TesseractConfig],
240
186
  ) -> ExtractionResult:
241
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
187
+ use_cache = kwargs.pop("use_cache", True)
188
+
189
+ save_image = image
190
+ if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
191
+ save_image = image.convert("RGB")
242
192
 
243
193
  image_buffer = io.BytesIO()
244
- await run_sync(image.save, image_buffer, format="PNG")
194
+ await run_sync(save_image.save, image_buffer, format="PNG")
245
195
  image_content = image_buffer.getvalue()
246
196
 
247
197
  cache_kwargs = {
@@ -250,7 +200,39 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
250
200
  "ocr_config": str(sorted(kwargs.items())),
251
201
  }
252
202
 
203
+ if use_cache:
204
+ cached_result = await self._handle_cache_lookup(cache_kwargs)
205
+ if cached_result:
206
+ return cached_result
207
+
208
+ ocr_cache = get_ocr_cache()
209
+ try:
210
+ await self._validate_tesseract_version()
211
+ image_path, unlink = await create_temp_file(".png")
212
+
213
+ try:
214
+ await run_sync(save_image.save, str(image_path), format="PNG")
215
+ except OSError as e:
216
+ if "cannot write mode" not in str(e):
217
+ raise
218
+ save_image = image.convert("RGB")
219
+ await run_sync(save_image.save, str(image_path), format="PNG")
220
+ try:
221
+ result = await self.process_file(image_path, **kwargs)
222
+
223
+ if use_cache:
224
+ await ocr_cache.aset(result, **cache_kwargs)
225
+
226
+ return result
227
+ finally:
228
+ await unlink()
229
+ finally:
230
+ if use_cache:
231
+ ocr_cache.mark_complete(**cache_kwargs)
232
+
233
+ async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
253
234
  ocr_cache = get_ocr_cache()
235
+
254
236
  cached_result = await ocr_cache.aget(**cache_kwargs)
255
237
  if cached_result is not None:
256
238
  return cached_result
@@ -258,49 +240,120 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
258
240
  if ocr_cache.is_processing(**cache_kwargs):
259
241
  event = ocr_cache.mark_processing(**cache_kwargs)
260
242
  await anyio.to_thread.run_sync(event.wait)
261
-
262
- # Try cache again after waiting for other process to complete # ~keep
263
243
  cached_result = await ocr_cache.aget(**cache_kwargs)
264
244
  if cached_result is not None:
265
245
  return cached_result
266
246
 
267
247
  ocr_cache.mark_processing(**cache_kwargs)
248
+ return None
249
+
250
+ def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
251
+ language = self._validate_language_code(kwargs.pop("language", "eng"))
252
+ psm = kwargs.pop("psm", PSMMode.AUTO)
253
+ output_format = kwargs.pop("output_format", "markdown")
254
+ enable_table_detection = kwargs.pop("enable_table_detection", False)
255
+
256
+ if enable_table_detection and output_format == "text":
257
+ output_format = "tsv"
258
+
259
+ if output_format == "markdown":
260
+ tesseract_format = "hocr"
261
+ ext = ".hocr"
262
+ elif output_format == "tsv":
263
+ tesseract_format = "tsv"
264
+ ext = ".tsv"
265
+ elif output_format == "hocr":
266
+ tesseract_format = "hocr"
267
+ ext = ".hocr"
268
+ else:
269
+ tesseract_format = "text"
270
+ ext = ".txt"
271
+
272
+ return {
273
+ "language": language,
274
+ "psm": psm,
275
+ "output_format": output_format,
276
+ "enable_table_detection": enable_table_detection,
277
+ "tesseract_format": tesseract_format,
278
+ "ext": ext,
279
+ "remaining_kwargs": kwargs,
280
+ }
281
+
282
+ async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
283
+ command = [
284
+ "tesseract",
285
+ str(path),
286
+ output_base,
287
+ "-l",
288
+ run_config["language"],
289
+ "--psm",
290
+ str(run_config["psm"].value),
291
+ "--oem",
292
+ "1",
293
+ "--loglevel",
294
+ "OFF",
295
+ ]
296
+
297
+ if run_config["tesseract_format"] != "text":
298
+ command.append(run_config["tesseract_format"])
299
+
300
+ for kwarg, value in run_config["remaining_kwargs"].items():
301
+ if kwarg.startswith("table_"):
302
+ continue
303
+ if isinstance(value, bool):
304
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
305
+ else:
306
+ command.extend(["-c", f"{kwarg}={value}"])
307
+
308
+ env: dict[str, Any] | None = None
309
+ if sys.platform.startswith("linux"):
310
+ env = {"OMP_THREAD_LIMIT": "1"}
268
311
 
269
312
  try:
270
- await self._validate_tesseract_version()
271
- image_path, unlink = await create_temp_file(".png")
272
- await run_sync(image.save, str(image_path), format="PNG")
273
- try:
274
- result = await self.process_file(image_path, **kwargs)
313
+ result = await run_process(command, env=env)
314
+ if not result.returncode == 0:
315
+ raise OCRError(
316
+ "OCR failed with a non-0 return code.",
317
+ context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
318
+ )
319
+ except subprocess.CalledProcessError as e:
320
+ error_msg = e.stderr.decode("utf-8") if e.stderr else str(e)
321
+ raise OCRError(
322
+ f"Failed to OCR using tesseract: {error_msg}",
323
+ context={"command": command, "returncode": e.returncode, "error": error_msg},
324
+ ) from e
275
325
 
276
- await ocr_cache.aset(result, **cache_kwargs)
326
+ async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
327
+ output_format = run_config["output_format"]
328
+ enable_table_detection = run_config["enable_table_detection"]
329
+ kwargs = run_config["remaining_kwargs"]
330
+
331
+ if output_format == "markdown":
332
+ return await self._process_hocr_to_markdown(output, enable_table_detection=enable_table_detection, **kwargs)
333
+ if output_format == "tsv" and enable_table_detection:
334
+ return await self._process_tsv_output(
335
+ output,
336
+ table_column_threshold=kwargs.get("table_column_threshold", 20),
337
+ table_row_threshold_ratio=kwargs.get("table_row_threshold_ratio", 0.5),
338
+ table_min_confidence=kwargs.get("table_min_confidence", 30.0),
339
+ )
340
+ if output_format == "tsv":
341
+ return self._extract_text_from_tsv(output)
342
+ if output_format == "hocr":
343
+ return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
277
344
 
278
- return result
279
- finally:
280
- await unlink()
281
- finally:
282
- ocr_cache.mark_complete(**cache_kwargs)
345
+ return ExtractionResult(
346
+ content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
347
+ )
283
348
 
284
- async def process_file(
285
- self,
286
- path: Path,
287
- **kwargs: Unpack[TesseractConfig],
288
- ) -> ExtractionResult:
289
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
349
+ async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
350
+ use_cache = kwargs.pop("use_cache", True)
290
351
 
291
352
  try:
292
353
  stat = path.stat()
293
- file_info = {
294
- "path": str(path.resolve()),
295
- "size": stat.st_size,
296
- "mtime": stat.st_mtime,
297
- }
354
+ file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
298
355
  except OSError:
299
- file_info = {
300
- "path": str(path),
301
- "size": 0,
302
- "mtime": 0,
303
- }
356
+ file_info = {"path": str(path), "size": 0, "mtime": 0}
304
357
 
305
358
  cache_kwargs = {
306
359
  "file_info": str(sorted(file_info.items())),
@@ -308,71 +361,37 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
308
361
  "ocr_config": str(sorted(kwargs.items())),
309
362
  }
310
363
 
311
- ocr_cache = get_ocr_cache()
312
- cached_result = await ocr_cache.aget(**cache_kwargs)
313
- if cached_result is not None:
314
- return cached_result
315
-
316
- if ocr_cache.is_processing(**cache_kwargs):
317
- event = ocr_cache.mark_processing(**cache_kwargs)
318
- await anyio.to_thread.run_sync(event.wait)
319
-
320
- # Try cache again after waiting for other process to complete # ~keep
321
- cached_result = await ocr_cache.aget(**cache_kwargs)
322
- if cached_result is not None:
364
+ if use_cache:
365
+ cached_result = await self._handle_cache_lookup(cache_kwargs)
366
+ if cached_result:
323
367
  return cached_result
324
368
 
325
- ocr_cache.mark_processing(**cache_kwargs)
326
-
369
+ ocr_cache = get_ocr_cache()
327
370
  try:
328
371
  await self._validate_tesseract_version()
329
- output_path, unlink = await create_temp_file(".txt")
330
- language = self._validate_language_code(kwargs.pop("language", "eng"))
331
- psm = kwargs.pop("psm", PSMMode.AUTO)
332
- try:
333
- output_base = str(output_path).replace(".txt", "")
334
- command = [
335
- "tesseract",
336
- str(path),
337
- output_base,
338
- "-l",
339
- language,
340
- "--psm",
341
- str(psm.value),
342
- "--oem",
343
- "1",
344
- "--loglevel",
345
- "OFF",
346
- ]
347
- for kwarg, value in kwargs.items():
348
- if isinstance(value, bool):
349
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
350
- else:
351
- # Handle string parameters (like tessedit_char_whitelist)
352
- command.extend(["-c", f"{kwarg}={value}"])
353
-
354
- env: dict[str, Any] | None = None
355
- if sys.platform.startswith("linux"):
356
- env = {"OMP_THREAD_LIMIT": "1"}
357
372
 
358
- result = await run_process(command, env=env)
373
+ run_config = self._prepare_tesseract_run_config(**kwargs)
374
+ output_path, unlink = await create_temp_file(run_config["ext"])
359
375
 
360
- if not result.returncode == 0:
361
- raise OCRError(
362
- "OCR failed with a non-0 return code.",
363
- context={
364
- "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
365
- },
366
- )
376
+ try:
377
+ output_base = str(output_path).replace(run_config["ext"], "")
378
+ await self._execute_tesseract(path, output_base, run_config)
367
379
 
368
380
  output = await AsyncPath(output_path).read_text("utf-8")
369
- extraction_result = ExtractionResult(
370
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
371
- )
372
-
373
- final_cache_kwargs = cache_kwargs.copy()
374
- final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
375
- await ocr_cache.aset(extraction_result, **final_cache_kwargs)
381
+ extraction_result = await self._process_tesseract_output(output, run_config)
382
+
383
+ if use_cache:
384
+ final_cache_kwargs = cache_kwargs.copy()
385
+ final_cache_kwargs["ocr_config"] = str(
386
+ sorted(
387
+ {
388
+ **run_config["remaining_kwargs"],
389
+ "language": run_config["language"],
390
+ "psm": run_config["psm"],
391
+ }.items()
392
+ )
393
+ )
394
+ await ocr_cache.aset(extraction_result, **final_cache_kwargs)
376
395
 
377
396
  return extraction_result
378
397
  except (RuntimeError, OSError) as e:
@@ -380,22 +399,478 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
380
399
  finally:
381
400
  await unlink()
382
401
  finally:
383
- ocr_cache.mark_complete(**cache_kwargs)
402
+ if use_cache:
403
+ ocr_cache.mark_complete(**cache_kwargs)
404
+
405
+ async def _process_tsv_output(
406
+ self,
407
+ tsv_content: str,
408
+ table_column_threshold: int = 20,
409
+ table_row_threshold_ratio: float = 0.5,
410
+ table_min_confidence: float = 30.0,
411
+ ) -> ExtractionResult:
412
+ text_result = self._extract_text_from_tsv(tsv_content)
413
+
414
+ try:
415
+ if (
416
+ (words := extract_words(tsv_content, min_confidence=table_min_confidence))
417
+ and (
418
+ table_data := reconstruct_table(
419
+ words,
420
+ column_threshold=table_column_threshold,
421
+ row_threshold_ratio=table_row_threshold_ratio,
422
+ )
423
+ )
424
+ and len(table_data) > 1
425
+ ):
426
+ markdown = to_markdown(table_data)
427
+
428
+ try:
429
+ df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
430
+ except (ImportError, IndexError):
431
+ df = None
432
+
433
+ table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
434
+
435
+ return ExtractionResult(
436
+ content=text_result.content,
437
+ mime_type=text_result.mime_type,
438
+ metadata=text_result.metadata,
439
+ tables=[table],
440
+ chunks=text_result.chunks,
441
+ )
442
+ except (ValueError, KeyError, ImportError):
443
+ pass
444
+
445
+ return text_result
446
+
447
+ def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
448
+ try:
449
+ reader = csv.DictReader(StringIO(tsv_content), delimiter="\t")
450
+
451
+ lines: dict[tuple[int, int, int, int], list[tuple[int, str]]] = {}
452
+
453
+ for row in reader:
454
+ if row.get("level") == "5" and row.get("text", "").strip():
455
+ line_key = (int(row["page_num"]), int(row["block_num"]), int(row["par_num"]), int(row["line_num"]))
456
+
457
+ if line_key not in lines:
458
+ lines[line_key] = []
459
+
460
+ lines[line_key].append((int(row["left"]), row["text"]))
461
+
462
+ text_parts: list[str] = []
463
+ last_block = -1
464
+ last_para = -1
465
+
466
+ for line_key in sorted(lines.keys()):
467
+ page_num, block_num, par_num, line_num = line_key
468
+
469
+ if block_num != last_block:
470
+ if text_parts: # ~keep
471
+ text_parts.append("\n\n")
472
+ last_block = block_num
473
+ last_para = par_num
474
+ elif par_num != last_para:
475
+ text_parts.append("\n\n")
476
+ last_para = par_num
477
+
478
+ words = sorted(lines[line_key], key=lambda x: x[0])
479
+ line_text = " ".join(word[1] for word in words)
480
+ text_parts.append(line_text)
481
+ text_parts.append("\n")
482
+
483
+ content = "".join(text_parts).strip()
484
+
485
+ except (ValueError, KeyError):
486
+ content = ""
487
+ for line in tsv_content.split("\n")[1:]: # ~keep skip header
488
+ parts = line.split("\t")
489
+ if len(parts) > 11 and parts[11].strip(): # ~keep text is in column 11
490
+ content += parts[11] + " "
491
+ content = content.strip()
492
+
493
+ return ExtractionResult(
494
+ content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
495
+ )
496
+
497
+ async def _process_hocr_to_markdown(
498
+ self,
499
+ hocr_content: str,
500
+ enable_table_detection: bool = False,
501
+ html_to_markdown_config: HTMLToMarkdownConfig | None = None,
502
+ table_column_threshold: int = 20,
503
+ table_row_threshold_ratio: float = 0.5,
504
+ table_min_confidence: float = 30.0,
505
+ **_kwargs: Any,
506
+ ) -> ExtractionResult:
507
+ config = html_to_markdown_config or HTMLToMarkdownConfig(
508
+ escape_asterisks=False,
509
+ escape_underscores=False,
510
+ extract_metadata=False,
511
+ strip="meta title",
512
+ )
513
+
514
+ tables: list[TableData] = []
515
+ if enable_table_detection:
516
+ soup = BeautifulSoup(hocr_content, "lxml")
517
+ tables = await self._extract_tables_from_hocr(
518
+ soup,
519
+ table_column_threshold,
520
+ table_row_threshold_ratio,
521
+ table_min_confidence,
522
+ )
523
+
524
+ hocr_converters = self._create_hocr_converters(tables)
525
+
526
+ all_converters = dict(hocr_converters)
527
+ if config.custom_converters:
528
+ all_converters.update(config.custom_converters)
529
+
530
+ config_dict = config.to_dict()
531
+ config_dict["custom_converters"] = all_converters
532
+
533
+ try:
534
+ markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
535
+ markdown_content = normalize_spaces(markdown_content)
536
+ except (ValueError, TypeError, AttributeError):
537
+ try:
538
+ soup = BeautifulSoup(hocr_content, "lxml")
539
+ words = soup.find_all("span", class_="ocrx_word")
540
+ text_parts = []
541
+ for word in words:
542
+ text = word.get_text().strip()
543
+ if text:
544
+ text_parts.append(text)
545
+
546
+ if text_parts:
547
+ markdown_content = " ".join(text_parts)
548
+ else:
549
+ markdown_content = soup.get_text().strip() or "[No text detected]"
550
+
551
+ markdown_content = normalize_spaces(markdown_content)
552
+ except (ValueError, TypeError, AttributeError):
553
+ markdown_content = "[OCR processing failed]"
554
+
555
+ if tables:
556
+ table_sections = []
557
+ for i, table in enumerate(tables):
558
+ table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
559
+
560
+ if markdown_content.strip():
561
+ final_content = f"{markdown_content}\n{''.join(table_sections)}"
562
+ else:
563
+ final_content = "".join(table_sections).strip()
564
+ else:
565
+ final_content = markdown_content
566
+
567
+ return ExtractionResult(
568
+ content=final_content,
569
+ mime_type=MARKDOWN_MIME_TYPE,
570
+ metadata={"source_format": "hocr", "tables_detected": len(tables)},
571
+ chunks=[],
572
+ tables=tables,
573
+ )
574
+
575
+ def _create_basic_converters(self) -> dict[str, Any]:
576
+ def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
577
+ del tag
578
+ return f"{text.strip()} "
579
+
580
+ def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
581
+ del tag
582
+ return f"{text.strip()}\n"
583
+
584
+ def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
585
+ del tag
586
+ content = text.strip()
587
+ if not content:
588
+ return ""
589
+ return f"{content}\n\n"
590
+
591
+ def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
592
+ del tag
593
+ content = text.strip()
594
+ if not content:
595
+ return ""
596
+ return f"{content}\n\n"
597
+
598
+ def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
599
+ del tag
600
+ return text.strip()
601
+
602
+ def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
603
+ del tag, text
604
+ return "---\n"
605
+
606
+ def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
607
+ del text
608
+ title = tag.get("title", "")
609
+ if isinstance(title, str):
610
+ bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
611
+ if bbox_match:
612
+ x0, y0, x1, y1 = bbox_match.groups()
613
+ width = int(x1) - int(x0)
614
+ height = int(y1) - int(y0)
615
+ return f"*[Image region: {width}x{height} pixels]*\n\n"
616
+ return "*[Image detected]*\n\n"
617
+
618
+ return {
619
+ "ocrx_word": ocrx_word_converter,
620
+ "ocr_line": ocr_line_converter,
621
+ "ocr_par": ocr_par_converter,
622
+ "ocr_carea": ocr_carea_converter,
623
+ "ocr_page": ocr_page_converter,
624
+ "ocr_separator": ocr_separator_converter,
625
+ "ocr_photo": ocr_photo_converter,
626
+ }
627
+
628
+ def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
629
+ basic_converters = self._create_basic_converters()
630
+
631
+ def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
632
+ class_attr = tag.get("class", "")
633
+ if isinstance(class_attr, list):
634
+ class_attr = " ".join(class_attr)
635
+ elif not isinstance(class_attr, str):
636
+ class_attr = ""
637
+
638
+ for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
639
+ if class_name in class_attr:
640
+ converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
641
+ return str(converter_result)
642
+ return text
643
+
644
+ def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
645
+ class_attr = tag.get("class", "")
646
+ if isinstance(class_attr, list):
647
+ class_attr = " ".join(class_attr)
648
+ elif not isinstance(class_attr, str):
649
+ class_attr = ""
650
+
651
+ for class_name in ["ocrx_word", "ocr_line"]:
652
+ if class_name in class_attr:
653
+ converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
654
+ return str(converter_result)
655
+ return f"{text.strip()} "
656
+
657
+ return {
658
+ "span": generic_span_converter,
659
+ "div": generic_div_converter,
660
+ "p": basic_converters["ocr_par"],
661
+ }
662
+
663
+ def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
664
+ tables: list[TableData] = []
665
+
666
+ if config.enable_table_detection:
667
+ pass
668
+
669
+ try:
670
+ converters = self._create_hocr_converters(tables)
671
+
672
+ html_config = HTMLToMarkdownConfig(
673
+ custom_converters=converters,
674
+ escape_asterisks=False,
675
+ escape_underscores=False,
676
+ extract_metadata=False,
677
+ strip="meta title",
678
+ )
679
+
680
+ markdown_content = html_to_markdown.convert_to_markdown(
681
+ hocr_content,
682
+ **html_config.to_dict(),
683
+ )
684
+
685
+ markdown_content = normalize_spaces(markdown_content)
686
+
687
+ except (ValueError, TypeError, AttributeError):
688
+ try:
689
+ soup = BeautifulSoup(hocr_content, "lxml")
690
+ words = soup.find_all("span", class_="ocrx_word")
691
+ text_parts = []
692
+ for word in words:
693
+ text = word.get_text().strip()
694
+ if text:
695
+ text_parts.append(text)
696
+
697
+ if text_parts:
698
+ markdown_content = " ".join(text_parts)
699
+ else:
700
+ markdown_content = soup.get_text().strip() or "[No text detected]"
701
+
702
+ markdown_content = normalize_spaces(markdown_content)
703
+ except (ValueError, TypeError, AttributeError):
704
+ markdown_content = "[OCR processing failed]"
705
+
706
+ if tables:
707
+ table_sections = []
708
+ for i, table in enumerate(tables):
709
+ table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
710
+
711
+ if markdown_content.strip():
712
+ final_content = f"{markdown_content}\n{''.join(table_sections)}"
713
+ else:
714
+ final_content = "".join(table_sections).strip()
715
+ else:
716
+ final_content = markdown_content
717
+
718
+ return ExtractionResult(
719
+ content=final_content,
720
+ mime_type=MARKDOWN_MIME_TYPE,
721
+ metadata={"source_format": "hocr", "tables_detected": len(tables)},
722
+ chunks=[],
723
+ tables=tables,
724
+ )
725
+
726
+ def _process_tsv_output_sync(
727
+ self,
728
+ tsv_content: str,
729
+ table_column_threshold: int = 20,
730
+ table_row_threshold_ratio: float = 0.5,
731
+ table_min_confidence: float = 30.0,
732
+ ) -> ExtractionResult:
733
+ text_result = self._extract_text_from_tsv(tsv_content)
734
+
735
+ try:
736
+ if (
737
+ (words := extract_words(tsv_content, min_confidence=table_min_confidence))
738
+ and (
739
+ table_data := reconstruct_table(
740
+ words,
741
+ column_threshold=table_column_threshold,
742
+ row_threshold_ratio=table_row_threshold_ratio,
743
+ )
744
+ )
745
+ and len(table_data) > 1
746
+ ):
747
+ markdown = to_markdown(table_data)
748
+
749
+ try:
750
+ df = pl.DataFrame(table_data[1:], schema=table_data[0])
751
+ except (ImportError, IndexError):
752
+ df = None
753
+
754
+ table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
755
+
756
+ return ExtractionResult(
757
+ content=text_result.content,
758
+ mime_type=text_result.mime_type,
759
+ metadata=text_result.metadata,
760
+ tables=[table],
761
+ chunks=text_result.chunks,
762
+ )
763
+ except (ValueError, KeyError, ImportError):
764
+ pass
765
+
766
+ return text_result
767
+
768
+ async def _extract_tables_from_hocr(
769
+ self,
770
+ soup: Any,
771
+ column_threshold: int = 20,
772
+ row_threshold_ratio: float = 0.5,
773
+ min_confidence: float = 30.0,
774
+ ) -> list[TableData]:
775
+ tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
776
+
777
+ if not tsv_data:
778
+ return []
779
+
780
+ if not (words := extract_words(tsv_data, min_confidence=min_confidence)):
781
+ return []
782
+
783
+ tables: list[TableData] = []
784
+ try:
785
+ table_data = reconstruct_table(
786
+ words,
787
+ column_threshold=column_threshold,
788
+ row_threshold_ratio=row_threshold_ratio,
789
+ )
790
+ if table_data and len(table_data) > 1: # ~keep At least header + one data row
791
+ markdown = to_markdown(table_data)
792
+
793
+ min_x = min(w["left"] for w in words)
794
+ max_x = max(w["left"] + w["width"] for w in words)
795
+ min_y = min(w["top"] for w in words)
796
+ max_y = max(w["top"] + w["height"] for w in words)
797
+
798
+ try:
799
+ df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
800
+ except (ImportError, IndexError):
801
+ df = None
802
+
803
+ dummy_image = Image.new("RGB", (1, 1), "white")
804
+
805
+ table: TableData = {
806
+ "text": markdown,
807
+ "df": df,
808
+ "page_number": 1,
809
+ "cropped_image": dummy_image,
810
+ "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
811
+ } # type: ignore[typeddict-unknown-key]
812
+ tables.append(table)
813
+ except (ValueError, KeyError, ImportError):
814
+ pass
815
+
816
+ return tables
817
+
818
+ async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
819
+ tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
820
+
821
+ words = soup.find_all("span", class_="ocrx_word")
822
+ word_num = 1
823
+
824
+ for word in words:
825
+ title = word.get("title", "")
826
+ text = word.get_text().strip()
827
+
828
+ if not text:
829
+ continue
830
+
831
+ bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
832
+ if not bbox_match:
833
+ continue
834
+
835
+ x0, y0, x1, y1 = map(int, bbox_match.groups())
836
+
837
+ conf_match = re.search(r"x_wconf (\d+)", title)
838
+ confidence = float(conf_match.group(1)) if conf_match else 100.0
839
+
840
+ if confidence < min_confidence:
841
+ continue
842
+
843
+ line = word.find_parent(class_="ocr_line")
844
+ par = word.find_parent(class_="ocr_par")
845
+ block = word.find_parent(class_="ocr_carea")
846
+
847
+ tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
848
+ tsv_lines.append(tsv_line)
849
+ word_num += 1
850
+
851
+ return "\n".join(tsv_lines)
852
+
853
+ def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
854
+ if not words:
855
+ return []
856
+
857
+ return [words]
384
858
 
385
859
  @classmethod
386
860
  async def _validate_tesseract_version(cls) -> None:
387
- """Validate that Tesseract is installed and is version 5 or above.
388
-
389
- Raises:
390
- MissingDependencyError: If Tesseract is not installed or is below version 5.
391
- """
392
861
  try:
393
862
  if cls._version_checked:
394
863
  return
395
864
 
396
865
  command = ["tesseract", "--version"]
397
- result = await run_process(command)
398
- version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
866
+ env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
867
+ try:
868
+ result = await run_process(command, env=env)
869
+ except (subprocess.CalledProcessError, FileNotFoundError) as e:
870
+ raise MissingDependencyError(
871
+ "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
872
+ ) from e
873
+ version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode("utf-8"))
399
874
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
400
875
  raise MissingDependencyError(
401
876
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
@@ -407,33 +882,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
407
882
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
408
883
  ) from e
409
884
 
410
- def process_image_sync(
411
- self,
412
- image: PILImage,
413
- **kwargs: Unpack[TesseractConfig],
414
- ) -> ExtractionResult:
415
- """Synchronously process an image and extract its text and metadata.
416
-
417
- Args:
418
- image: An instance of PIL.Image representing the input image.
419
- **kwargs: Any kwargs related to the given backend
420
-
421
- Returns:
422
- The extraction result object
423
- """
424
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
425
-
426
- image_buffer = io.BytesIO()
427
- image.save(image_buffer, format="PNG")
428
- image_content = image_buffer.getvalue()
429
-
430
- cache_kwargs = {
431
- "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
432
- "ocr_backend": "tesseract",
433
- "ocr_config": str(sorted(kwargs.items())),
434
- }
435
-
885
+ def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
436
886
  ocr_cache = get_ocr_cache()
887
+
437
888
  cached_result = ocr_cache.get(**cache_kwargs)
438
889
  if cached_result is not None:
439
890
  return cached_result
@@ -441,46 +892,109 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
441
892
  if ocr_cache.is_processing(**cache_kwargs):
442
893
  event = ocr_cache.mark_processing(**cache_kwargs)
443
894
  event.wait()
444
-
445
- # Try cache again after waiting for other process to complete
446
895
  cached_result = ocr_cache.get(**cache_kwargs)
447
896
  if cached_result is not None:
448
897
  return cached_result
449
898
 
450
899
  ocr_cache.mark_processing(**cache_kwargs)
900
+ return None
901
+
902
+ def _execute_tesseract_sync(self, command: list[str]) -> None:
903
+ env = os.environ.copy()
904
+ if sys.platform.startswith("linux"):
905
+ env["OMP_THREAD_LIMIT"] = "1"
451
906
 
907
+ try:
908
+ subprocess.run(
909
+ command,
910
+ check=True,
911
+ env=env,
912
+ capture_output=True,
913
+ text=True,
914
+ timeout=30,
915
+ encoding="utf-8",
916
+ )
917
+ except subprocess.CalledProcessError as e:
918
+ error_msg = e.stderr if e.stderr else str(e)
919
+ raise OCRError(
920
+ f"Failed to OCR using tesseract: {error_msg}",
921
+ context={"command": command, "returncode": e.returncode, "error": error_msg},
922
+ ) from e
923
+ except subprocess.TimeoutExpired as e:
924
+ raise OCRError(
925
+ "Tesseract timed out during processing.",
926
+ context={"command": command, "timeout": 30},
927
+ ) from e
928
+
929
+ def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
930
+ output_format = run_config["output_format"]
931
+ enable_table_detection = run_config["enable_table_detection"]
932
+ kwargs = run_config["remaining_kwargs"]
933
+ config = TesseractConfig(**kwargs)
934
+
935
+ if output_format == "markdown":
936
+ return self._process_hocr_to_markdown_sync(output, config)
937
+ if output_format == "tsv" and enable_table_detection:
938
+ return self._process_tsv_output_sync(
939
+ output,
940
+ table_column_threshold=config.table_column_threshold,
941
+ table_row_threshold_ratio=config.table_row_threshold_ratio,
942
+ table_min_confidence=config.table_min_confidence,
943
+ )
944
+ if output_format == "tsv":
945
+ return self._extract_text_from_tsv(output)
946
+ if output_format == "hocr":
947
+ return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
948
+
949
+ return ExtractionResult(
950
+ content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
951
+ )
952
+
953
+ def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
954
+ use_cache = kwargs.pop("use_cache", True)
955
+
956
+ save_image = image
957
+ if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
958
+ save_image = image.convert("RGB")
959
+
960
+ image_buffer = io.BytesIO()
961
+ save_image.save(image_buffer, format="PNG")
962
+ image_content = image_buffer.getvalue()
963
+
964
+ cache_kwargs = {
965
+ "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
966
+ "ocr_backend": "tesseract",
967
+ "ocr_config": str(sorted(kwargs.items())),
968
+ }
969
+
970
+ if use_cache:
971
+ cached_result = self._handle_cache_lookup_sync(cache_kwargs)
972
+ if cached_result:
973
+ return cached_result
974
+
975
+ ocr_cache = get_ocr_cache()
452
976
  try:
453
977
  self._validate_tesseract_version_sync()
454
978
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
455
979
  image_path = Path(tmp_file.name)
456
- image.save(str(image_path), format="PNG")
980
+ save_image.save(str(image_path), format="PNG")
457
981
  try:
458
- result = self.process_file_sync(image_path, **kwargs)
982
+ kwargs_with_cache = {**kwargs, "use_cache": use_cache}
983
+ result = self.process_file_sync(image_path, **kwargs_with_cache)
459
984
 
460
- ocr_cache.set(result, **cache_kwargs)
985
+ if use_cache:
986
+ ocr_cache.set(result, **cache_kwargs)
461
987
 
462
988
  return result
463
989
  finally:
464
990
  if image_path.exists():
465
991
  image_path.unlink()
466
992
  finally:
467
- ocr_cache.mark_complete(**cache_kwargs)
468
-
469
- def process_file_sync(
470
- self,
471
- path: Path,
472
- **kwargs: Unpack[TesseractConfig],
473
- ) -> ExtractionResult:
474
- """Synchronously process a file and extract its text and metadata.
475
-
476
- Args:
477
- path: A Path object representing the file to be processed.
478
- **kwargs: Any kwargs related to the given backend
993
+ if use_cache:
994
+ ocr_cache.mark_complete(**cache_kwargs)
479
995
 
480
- Returns:
481
- The extraction result object
482
- """
483
- from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
996
+ def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
997
+ use_cache = kwargs.pop("use_cache", True)
484
998
 
485
999
  file_info = self._get_file_info(path)
486
1000
 
@@ -490,56 +1004,76 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
490
1004
  "ocr_config": str(sorted(kwargs.items())),
491
1005
  }
492
1006
 
493
- ocr_cache = get_ocr_cache()
494
- cached_result = ocr_cache.get(**cache_kwargs)
495
- if cached_result is not None:
496
- return cached_result
497
-
498
- if ocr_cache.is_processing(**cache_kwargs):
499
- event = ocr_cache.mark_processing(**cache_kwargs)
500
- event.wait()
501
-
502
- # Try cache again after waiting for other process to complete
503
- cached_result = ocr_cache.get(**cache_kwargs)
504
- if cached_result is not None:
1007
+ if use_cache:
1008
+ cached_result = self._handle_cache_lookup_sync(cache_kwargs)
1009
+ if cached_result:
505
1010
  return cached_result
506
1011
 
507
- ocr_cache.mark_processing(**cache_kwargs)
508
-
1012
+ ocr_cache = get_ocr_cache()
509
1013
  try:
510
1014
  self._validate_tesseract_version_sync()
511
- with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
512
- output_base = tmp_file.name.replace(".txt", "")
513
- language = self._validate_language_code(kwargs.pop("language", "eng"))
514
- psm = kwargs.pop("psm", PSMMode.AUTO)
1015
+
1016
+ run_config = self._prepare_tesseract_run_config(**kwargs)
1017
+
1018
+ temp_fd, temp_path = tempfile.mkstemp(suffix=run_config["ext"])
1019
+ os.close(temp_fd)
1020
+ Path(temp_path).unlink()
1021
+ output_base = temp_path.replace(run_config["ext"], "")
1022
+
515
1023
  try:
516
- command = self._build_tesseract_command(path, output_base, language, psm, **kwargs)
517
- self._run_tesseract_sync(command)
1024
+ command = self._build_tesseract_command(
1025
+ path,
1026
+ output_base,
1027
+ run_config["language"],
1028
+ run_config["psm"],
1029
+ run_config["tesseract_format"],
1030
+ **run_config["remaining_kwargs"],
1031
+ )
1032
+ self._execute_tesseract_sync(command)
1033
+
1034
+ output_path = Path(f"{output_base}{run_config['ext']}")
1035
+ if not output_path.exists():
1036
+ return ExtractionResult(
1037
+ content="[OCR processing failed]",
1038
+ mime_type=PLAIN_TEXT_MIME_TYPE,
1039
+ metadata={
1040
+ "source_format": run_config["tesseract_format"],
1041
+ "error": f"{run_config['ext']} file not generated",
1042
+ },
1043
+ chunks=[],
1044
+ tables=[],
1045
+ )
518
1046
 
519
- output_path = Path(output_base + ".txt")
520
1047
  with output_path.open(encoding="utf-8") as f:
521
1048
  output = f.read()
522
- extraction_result = ExtractionResult(
523
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
524
- )
525
1049
 
526
- final_cache_kwargs = cache_kwargs.copy()
527
- final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
528
- ocr_cache.set(extraction_result, **final_cache_kwargs)
1050
+ extraction_result = self._process_tesseract_output_sync(output, run_config)
1051
+
1052
+ if use_cache:
1053
+ final_cache_kwargs = cache_kwargs.copy()
1054
+ final_cache_kwargs["ocr_config"] = str(
1055
+ sorted(
1056
+ {
1057
+ **run_config["remaining_kwargs"],
1058
+ "language": run_config["language"],
1059
+ "psm": run_config["psm"],
1060
+ }.items()
1061
+ )
1062
+ )
1063
+ ocr_cache.set(extraction_result, **final_cache_kwargs)
529
1064
 
530
1065
  return extraction_result
531
- except (RuntimeError, OSError) as e:
532
- raise OCRError(f"Failed to OCR using tesseract: {e}") from e
533
1066
  finally:
534
- for ext in [".txt"]:
535
- temp_file = Path(output_base + ext)
536
- if temp_file.exists():
537
- temp_file.unlink()
1067
+ for cleanup_ext in [".txt", ".hocr", ".tsv"]:
1068
+ cleanup_path = Path(f"{output_base}{cleanup_ext}")
1069
+ cleanup_path.unlink(missing_ok=True)
1070
+ except Exception as e:
1071
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
538
1072
  finally:
539
- ocr_cache.mark_complete(**cache_kwargs)
1073
+ if use_cache:
1074
+ ocr_cache.mark_complete(**cache_kwargs)
540
1075
 
541
1076
  def _get_file_info(self, path: Path) -> dict[str, Any]:
542
- """Get file information for caching."""
543
1077
  try:
544
1078
  stat = path.stat()
545
1079
  return {
@@ -555,9 +1089,8 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
555
1089
  }
556
1090
 
557
1091
  def _build_tesseract_command(
558
- self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
1092
+ self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
559
1093
  ) -> list[str]:
560
- """Build tesseract command with all parameters."""
561
1094
  command = [
562
1095
  "tesseract",
563
1096
  str(path),
@@ -571,47 +1104,32 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
571
1104
  "--loglevel",
572
1105
  "OFF",
573
1106
  ]
1107
+
1108
+ if output_format != "text":
1109
+ command.append(output_format)
1110
+
574
1111
  for kwarg, value in kwargs.items():
1112
+ if kwarg.startswith("table_"):
1113
+ continue
575
1114
  if isinstance(value, bool):
576
1115
  command.extend(["-c", f"{kwarg}={1 if value else 0}"])
577
1116
  else:
578
1117
  command.extend(["-c", f"{kwarg}={value}"])
579
1118
  return command
580
1119
 
581
- def _run_tesseract_sync(self, command: list[str]) -> None:
582
- """Run tesseract command synchronously."""
583
- env = os.environ.copy()
584
- if sys.platform.startswith("linux"):
585
- env["OMP_THREAD_LIMIT"] = "1"
586
-
587
- result = subprocess.run(
588
- command,
589
- check=False,
590
- env=env,
591
- capture_output=True,
592
- text=True,
593
- timeout=30,
594
- )
595
-
596
- if result.returncode != 0:
597
- raise OCRError(
598
- "OCR failed with a non-0 return code.",
599
- context={"error": result.stderr},
600
- )
601
-
602
1120
  @classmethod
603
1121
  def _validate_tesseract_version_sync(cls) -> None:
604
- """Synchronously validate that Tesseract is installed and is version 5 or above.
605
-
606
- Raises:
607
- MissingDependencyError: If Tesseract is not installed or is below version 5.
608
- """
609
1122
  try:
610
1123
  if cls._version_checked:
611
1124
  return
612
1125
 
613
1126
  command = ["tesseract", "--version"]
614
- result = subprocess.run(command, capture_output=True, text=True, check=False)
1127
+ try:
1128
+ result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
1129
+ except (subprocess.CalledProcessError, FileNotFoundError) as e:
1130
+ raise MissingDependencyError(
1131
+ "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
1132
+ ) from e
615
1133
  version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout)
616
1134
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
617
1135
  raise MissingDependencyError(
@@ -626,17 +1144,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
626
1144
 
627
1145
  @staticmethod
628
1146
  def _validate_language_code(language_code: str) -> str:
629
- """Convert a language code to Tesseract format.
630
-
631
- Args:
632
- language_code: Tesseract supported language code or multiple language codes connected with '+'
633
-
634
- Raises:
635
- ValidationError: If the language is not supported by Tesseract
636
-
637
- Returns:
638
- Language code compatible with Tesseract
639
- """
640
1147
  normalized = language_code.lower()
641
1148
  if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
642
1149
  return normalized
@@ -661,18 +1168,6 @@ def _process_image_with_tesseract(
661
1168
  image_path: str,
662
1169
  config_dict: dict[str, Any],
663
1170
  ) -> dict[str, Any]:
664
- """Process a single image with Tesseract in a separate process.
665
-
666
- This function is designed to be executed in a subprocess.
667
- It uses direct tesseract command execution to avoid async complications.
668
-
669
- Args:
670
- image_path: Path to the image file.
671
- config_dict: Tesseract configuration as dictionary.
672
-
673
- Returns:
674
- OCR result as dictionary.
675
- """
676
1171
  try:
677
1172
  with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
678
1173
  output_base = tmp_file.name.replace(".txt", "")
@@ -721,6 +1216,7 @@ def _process_image_with_tesseract(
721
1216
  capture_output=True,
722
1217
  text=True,
723
1218
  timeout=30,
1219
+ encoding="utf-8",
724
1220
  )
725
1221
 
726
1222
  if result.returncode != 0:
@@ -759,19 +1255,12 @@ def _process_image_bytes_with_tesseract(
759
1255
  image_bytes: bytes,
760
1256
  config_dict: dict[str, Any],
761
1257
  ) -> dict[str, Any]:
762
- """Process image bytes with Tesseract in a separate process.
763
-
764
- Args:
765
- image_bytes: Image data as bytes.
766
- config_dict: Tesseract configuration as dictionary.
767
-
768
- Returns:
769
- OCR result as dictionary.
770
- """
771
1258
  try:
772
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
773
- with Image.open(io.BytesIO(image_bytes)) as image:
774
- image.save(tmp_image.name, format="PNG")
1259
+ with (
1260
+ tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
1261
+ Image.open(io.BytesIO(image_bytes)) as image,
1262
+ ):
1263
+ image.save(tmp_image.name, format="PNG")
775
1264
  image_path = tmp_image.name
776
1265
 
777
1266
  try:
@@ -791,21 +1280,12 @@ def _process_image_bytes_with_tesseract(
791
1280
 
792
1281
 
793
1282
  class TesseractProcessPool:
794
- """Process pool for parallel Tesseract OCR processing."""
795
-
796
1283
  def __init__(
797
1284
  self,
798
1285
  config: TesseractConfig | None = None,
799
1286
  max_processes: int | None = None,
800
1287
  memory_limit_gb: float | None = None,
801
1288
  ) -> None:
802
- """Initialize the Tesseract process pool.
803
-
804
- Args:
805
- config: Default Tesseract configuration.
806
- max_processes: Maximum number of processes.
807
- memory_limit_gb: Memory limit in GB.
808
- """
809
1289
  from kreuzberg._utils._process_pool import ProcessPoolManager # noqa: PLC0415
810
1290
 
811
1291
  self.config = config or TesseractConfig()
@@ -815,7 +1295,6 @@ class TesseractProcessPool:
815
1295
  )
816
1296
 
817
1297
  def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
818
- """Convert TesseractConfig to dictionary for pickling."""
819
1298
  cfg = config or self.config
820
1299
 
821
1300
  config_dict = {}
@@ -830,7 +1309,6 @@ class TesseractProcessPool:
830
1309
  return config_dict
831
1310
 
832
1311
  def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
833
- """Convert result dictionary back to OCRResult."""
834
1312
  if not result_dict["success"]:
835
1313
  raise OCRError(f"Tesseract processing failed: {result_dict['error']}")
836
1314
 
@@ -846,15 +1324,6 @@ class TesseractProcessPool:
846
1324
  image_path: str | Path,
847
1325
  config: TesseractConfig | None = None,
848
1326
  ) -> ExtractionResult:
849
- """Process a single image file with Tesseract.
850
-
851
- Args:
852
- image_path: Path to the image file.
853
- config: Tesseract configuration (uses default if None).
854
-
855
- Returns:
856
- OCR result.
857
- """
858
1327
  config_dict = self._config_to_dict(config)
859
1328
 
860
1329
  task_memory_mb = 80
@@ -873,15 +1342,6 @@ class TesseractProcessPool:
873
1342
  image_bytes: bytes,
874
1343
  config: TesseractConfig | None = None,
875
1344
  ) -> ExtractionResult:
876
- """Process image bytes with Tesseract.
877
-
878
- Args:
879
- image_bytes: Image data as bytes.
880
- config: Tesseract configuration (uses default if None).
881
-
882
- Returns:
883
- OCR result.
884
- """
885
1345
  config_dict = self._config_to_dict(config)
886
1346
 
887
1347
  image_size_mb = len(image_bytes) / 1024 / 1024
@@ -902,16 +1362,6 @@ class TesseractProcessPool:
902
1362
  config: TesseractConfig | None = None,
903
1363
  max_concurrent: int | None = None,
904
1364
  ) -> list[ExtractionResult]:
905
- """Process a batch of images in parallel.
906
-
907
- Args:
908
- image_paths: List of image file paths.
909
- config: Tesseract configuration (uses default if None).
910
- max_concurrent: Maximum concurrent processes.
911
-
912
- Returns:
913
- List of OCR results in the same order as input.
914
- """
915
1365
  if not image_paths:
916
1366
  return []
917
1367
 
@@ -936,16 +1386,6 @@ class TesseractProcessPool:
936
1386
  config: TesseractConfig | None = None,
937
1387
  max_concurrent: int | None = None,
938
1388
  ) -> list[ExtractionResult]:
939
- """Process a batch of image bytes in parallel.
940
-
941
- Args:
942
- image_bytes_list: List of image data as bytes.
943
- config: Tesseract configuration (uses default if None).
944
- max_concurrent: Maximum concurrent processes.
945
-
946
- Returns:
947
- List of OCR results in the same order as input.
948
- """
949
1389
  if not image_bytes_list:
950
1390
  return []
951
1391
 
@@ -966,15 +1406,12 @@ class TesseractProcessPool:
966
1406
  return [self._result_from_dict(result_dict) for result_dict in result_dicts]
967
1407
 
968
1408
  def get_system_info(self) -> dict[str, Any]:
969
- """Get system information from the process manager."""
970
1409
  return self.process_manager.get_system_info()
971
1410
 
972
1411
  def shutdown(self, wait: bool = True) -> None:
973
- """Shutdown the process pool."""
974
1412
  self.process_manager.shutdown(wait=wait)
975
1413
 
976
1414
  async def __aenter__(self) -> Self:
977
- """Async context manager entry."""
978
1415
  return self
979
1416
 
980
1417
  async def __aexit__(
@@ -983,5 +1420,4 @@ class TesseractProcessPool:
983
1420
  exc_val: BaseException | None,
984
1421
  exc_tb: object,
985
1422
  ) -> None:
986
- """Async context manager exit."""
987
1423
  self.shutdown()