kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import csv
 import hashlib
 import io
 import os
@@ -7,26 +8,33 @@ import re
 import subprocess
 import sys
 import tempfile
-from
-from enum import Enum
+from io import StringIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final
 
 import anyio
+import html_to_markdown
+import polars as pl
 from anyio import Path as AsyncPath
 from anyio import run_process
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 from PIL import Image
+from PIL.Image import Image as PILImage
 from typing_extensions import Self
 
-from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg.
+from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
+from kreuzberg._utils._cache import get_ocr_cache
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
 
 if TYPE_CHECKING:
+    from bs4.element import Tag
     from PIL.Image import Image as PILImage
 
 try: # pragma: no cover
@@ -168,68 +176,6 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
 MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
 
 
-class PSMMode(Enum):
-    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
-
-    OSD_ONLY = 0
-    """Orientation and script detection only."""
-    AUTO_OSD = 1
-    """Automatic page segmentation with orientation and script detection."""
-    AUTO_ONLY = 2
-    """Automatic page segmentation without OSD."""
-    AUTO = 3
-    """Fully automatic page segmentation (default)."""
-    SINGLE_COLUMN = 4
-    """Assume a single column of text."""
-    SINGLE_BLOCK_VERTICAL = 5
-    """Assume a single uniform block of vertically aligned text."""
-    SINGLE_BLOCK = 6
-    """Assume a single uniform block of text."""
-    SINGLE_LINE = 7
-    """Treat the image as a single text line."""
-    SINGLE_WORD = 8
-    """Treat the image as a single word."""
-    CIRCLE_WORD = 9
-    """Treat the image as a single word in a circle."""
-    SINGLE_CHAR = 10
-    """Treat the image as a single character."""
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class TesseractConfig:
-    """Configuration options for Tesseract OCR engine."""
-
-    classify_use_pre_adapted_templates: bool = True
-    """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
-    language: str = "eng"
-    """Language code to use for OCR.
-    Examples:
-        - 'eng' for English
-        - 'deu' for German
-        - multiple languages combined with '+', e.g. 'eng+deu')
-    """
-    language_model_ngram_on: bool = False
-    """Enable or disable the use of n-gram-based language models for improved text recognition.
-
-    Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
-    psm: PSMMode = PSMMode.AUTO
-    """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
-    tessedit_dont_blkrej_good_wds: bool = True
-    """If True, prevents block rejection of words identified as good, improving text output quality."""
-    tessedit_dont_rowrej_good_wds: bool = True
-    """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
-    tessedit_enable_dict_correction: bool = True
-    """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
-    tessedit_char_whitelist: str = ""
-    """Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
-    tessedit_use_primary_params_model: bool = True
-    """If True, forces the use of the primary parameters model for text recognition."""
-    textord_space_size_is_variable: bool = True
-    """Allow variable spacing between words, useful for text with irregular spacing."""
-    thresholding_method: bool = False
-    """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
-
-
 class TesseractBackend(OCRBackend[TesseractConfig]):
     _version_checked: ClassVar[bool] = False
 
@@ -238,10 +184,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         image: PILImage,
         **kwargs: Unpack[TesseractConfig],
     ) -> ExtractionResult:
-
+        use_cache = kwargs.pop("use_cache", True)
+
+        save_image = image
+        if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+            save_image = image.convert("RGB")
 
         image_buffer = io.BytesIO()
-        await run_sync(
+        await run_sync(save_image.save, image_buffer, format="PNG")
         image_content = image_buffer.getvalue()
 
         cache_kwargs = {
@@ -250,7 +200,40 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             "ocr_config": str(sorted(kwargs.items())),
         }
 
+        if use_cache:
+            cached_result = await self._handle_cache_lookup(cache_kwargs)
+            if cached_result:
+                return cached_result
+
+        ocr_cache = get_ocr_cache()
+        try:
+            await self._validate_tesseract_version()
+            image_path, unlink = await create_temp_file(".png")
+
+            try:
+                await run_sync(save_image.save, str(image_path), format="PNG")
+            except OSError as e:
+                if "cannot write mode" not in str(e):
+                    raise
+                save_image = image.convert("RGB")
+                await run_sync(save_image.save, str(image_path), format="PNG")
+            try:
+                result = await self.process_file(image_path, **kwargs)
+
+                if use_cache:
+                    await ocr_cache.aset(result, **cache_kwargs)
+
+                return result
+            finally:
+                await unlink()
+        finally:
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
+
+    async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+        """Handle cache lookup before processing."""
         ocr_cache = get_ocr_cache()
+
         cached_result = await ocr_cache.aget(**cache_kwargs)
         if cached_result is not None:
             return cached_result
@@ -258,49 +241,123 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
        if ocr_cache.is_processing(**cache_kwargs):
             event = ocr_cache.mark_processing(**cache_kwargs)
             await anyio.to_thread.run_sync(event.wait)
-
-            # Try cache again after waiting for other process to complete # ~keep
             cached_result = await ocr_cache.aget(**cache_kwargs)
             if cached_result is not None:
                 return cached_result
 
         ocr_cache.mark_processing(**cache_kwargs)
+        return None
+
+    def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
+        """Prepare configuration for a Tesseract run."""
+        language = self._validate_language_code(kwargs.pop("language", "eng"))
+        psm = kwargs.pop("psm", PSMMode.AUTO)
+        output_format = kwargs.pop("output_format", "markdown")
+        enable_table_detection = kwargs.pop("enable_table_detection", False)
+
+        if enable_table_detection and output_format == "text":
+            output_format = "tsv"
+
+        if output_format == "markdown":
+            tesseract_format = "hocr"
+            ext = ".hocr"
+        elif output_format == "tsv":
+            tesseract_format = "tsv"
+            ext = ".tsv"
+        elif output_format == "hocr":
+            tesseract_format = "hocr"
+            ext = ".hocr"
+        else:
+            tesseract_format = "text"
+            ext = ".txt"
+
+        return {
+            "language": language,
+            "psm": psm,
+            "output_format": output_format,
+            "enable_table_detection": enable_table_detection,
+            "tesseract_format": tesseract_format,
+            "ext": ext,
+            "remaining_kwargs": kwargs,
+        }
+
+    async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
+        """Build and execute the Tesseract command."""
+        command = [
+            "tesseract",
+            str(path),
+            output_base,
+            "-l",
+            run_config["language"],
+            "--psm",
+            str(run_config["psm"].value),
+            "--oem",
+            "1",
+            "--loglevel",
+            "OFF",
+        ]
+
+        if run_config["tesseract_format"] != "text":
+            command.append(run_config["tesseract_format"])
+
+        for kwarg, value in run_config["remaining_kwargs"].items():
+            if kwarg.startswith("table_"):
+                continue
+            if isinstance(value, bool):
+                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+            else:
+                command.extend(["-c", f"{kwarg}={value}"])
+
+        env: dict[str, Any] | None = None
+        if sys.platform.startswith("linux"):
+            env = {"OMP_THREAD_LIMIT": "1"}
 
         try:
-            await
-
-
-
-
+            result = await run_process(command, env=env)
+            if not result.returncode == 0:
+                raise OCRError(
+                    "OCR failed with a non-0 return code.",
+                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+                )
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.decode("utf-8") if e.stderr else str(e)
+            raise OCRError(
+                f"Failed to OCR using tesseract: {error_msg}",
+                context={"command": command, "returncode": e.returncode, "error": error_msg},
+            ) from e
 
-
+    async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
+        """Process the raw output from Tesseract based on the requested format."""
+        output_format = run_config["output_format"]
+        enable_table_detection = run_config["enable_table_detection"]
+        kwargs = run_config["remaining_kwargs"]
+
+        if output_format == "markdown":
+            return await self._process_hocr_to_markdown(output, enable_table_detection=enable_table_detection, **kwargs)
+        if output_format == "tsv" and enable_table_detection:
+            return await self._process_tsv_output(
+                output,
+                table_column_threshold=kwargs.get("table_column_threshold", 20),
+                table_row_threshold_ratio=kwargs.get("table_row_threshold_ratio", 0.5),
+                table_min_confidence=kwargs.get("table_min_confidence", 30.0),
+            )
+        if output_format == "tsv":
+            return self._extract_text_from_tsv(output)
+        if output_format == "hocr":
+            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
 
-
-
-
-        finally:
-            ocr_cache.mark_complete(**cache_kwargs)
+        return ExtractionResult(
+            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+        )
 
-    async def process_file(
-
-        path: Path,
-        **kwargs: Unpack[TesseractConfig],
-    ) -> ExtractionResult:
-        from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
+    async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
+        use_cache = kwargs.pop("use_cache", True)
 
         try:
             stat = path.stat()
-            file_info = {
-                "path": str(path.resolve()),
-                "size": stat.st_size,
-                "mtime": stat.st_mtime,
-            }
+            file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
         except OSError:
-            file_info = {
-                "path": str(path),
-                "size": 0,
-                "mtime": 0,
-            }
+            file_info = {"path": str(path), "size": 0, "mtime": 0}
 
         cache_kwargs = {
            "file_info": str(sorted(file_info.items())),
@@ -308,71 +365,37 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             "ocr_config": str(sorted(kwargs.items())),
         }
 
-
-
-
-            return cached_result
-
-        if ocr_cache.is_processing(**cache_kwargs):
-            event = ocr_cache.mark_processing(**cache_kwargs)
-            await anyio.to_thread.run_sync(event.wait)
-
-            # Try cache again after waiting for other process to complete # ~keep
-            cached_result = await ocr_cache.aget(**cache_kwargs)
-            if cached_result is not None:
+        if use_cache:
+            cached_result = await self._handle_cache_lookup(cache_kwargs)
+            if cached_result:
                 return cached_result
 
-        ocr_cache
-
+        ocr_cache = get_ocr_cache()
        try:
             await self._validate_tesseract_version()
-            output_path, unlink = await create_temp_file(".txt")
-            language = self._validate_language_code(kwargs.pop("language", "eng"))
-            psm = kwargs.pop("psm", PSMMode.AUTO)
-            try:
-                output_base = str(output_path).replace(".txt", "")
-                command = [
-                    "tesseract",
-                    str(path),
-                    output_base,
-                    "-l",
-                    language,
-                    "--psm",
-                    str(psm.value),
-                    "--oem",
-                    "1",
-                    "--loglevel",
-                    "OFF",
-                ]
-                for kwarg, value in kwargs.items():
-                    if isinstance(value, bool):
-                        command.extend(["-c", f"{kwarg}={1 if value else 0}"])
-                    else:
-                        # Handle string parameters (like tessedit_char_whitelist)
-                        command.extend(["-c", f"{kwarg}={value}"])
-
-                env: dict[str, Any] | None = None
-                if sys.platform.startswith("linux"):
-                    env = {"OMP_THREAD_LIMIT": "1"}
 
-
+            run_config = self._prepare_tesseract_run_config(**kwargs)
+            output_path, unlink = await create_temp_file(run_config["ext"])
 
-
-
-
-                        context={
-                            "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
-                        },
-                    )
+            try:
+                output_base = str(output_path).replace(run_config["ext"], "")
+                await self._execute_tesseract(path, output_base, run_config)
 
                output = await AsyncPath(output_path).read_text("utf-8")
-                extraction_result =
-
-
-
-
-
-
+                extraction_result = await self._process_tesseract_output(output, run_config)
+
+                if use_cache:
+                    final_cache_kwargs = cache_kwargs.copy()
+                    final_cache_kwargs["ocr_config"] = str(
+                        sorted(
+                            {
+                                **run_config["remaining_kwargs"],
+                                "language": run_config["language"],
+                                "psm": run_config["psm"],
+                            }.items()
+                        )
+                    )
+                    await ocr_cache.aset(extraction_result, **final_cache_kwargs)
 
                 return extraction_result
            except (RuntimeError, OSError) as e:
@@ -380,7 +403,562 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            finally:
                 await unlink()
         finally:
-
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
+
+    async def _process_tsv_output(
+        self,
+        tsv_content: str,
+        table_column_threshold: int = 20,
+        table_row_threshold_ratio: float = 0.5,
+        table_min_confidence: float = 30.0,
+    ) -> ExtractionResult:
+        """Process TSV output and extract tables if detected.
+
+        Args:
+            tsv_content: Raw TSV output from Tesseract.
+            table_column_threshold: Pixel threshold for column clustering.
+            table_row_threshold_ratio: Row threshold as ratio of mean text height.
+            table_min_confidence: Minimum confidence score to include a word.
+
+        Returns:
+            ExtractionResult with extracted content and tables.
+        """
+        text_result = self._extract_text_from_tsv(tsv_content)
+
+        try:
+            if (
+                (words := extract_words(tsv_content, min_confidence=table_min_confidence))
+                and (
+                    table_data := reconstruct_table(
+                        words,
+                        column_threshold=table_column_threshold,
+                        row_threshold_ratio=table_row_threshold_ratio,
+                    )
+                )
+                and len(table_data) > 1
+            ):
+                markdown = to_markdown(table_data)
+
+                try:
+                    df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
+                except (ImportError, IndexError):
+                    df = None
+
+                table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
+
+                return ExtractionResult(
+                    content=text_result.content,
+                    mime_type=text_result.mime_type,
+                    metadata=text_result.metadata,
+                    tables=[table],
+                    chunks=text_result.chunks,
+                )
+        except (ValueError, KeyError, ImportError):
+            pass
+
+        return text_result
+
+    def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
+        """Extract plain text from TSV output.
+
+        Args:
+            tsv_content: Raw TSV output from Tesseract.
+
+        Returns:
+            ExtractionResult with extracted text.
+        """
+        try:
+            reader = csv.DictReader(StringIO(tsv_content), delimiter="\t")
+
+            lines: dict[tuple[int, int, int, int], list[tuple[int, str]]] = {}
+
+            for row in reader:
+                if row.get("level") == "5" and row.get("text", "").strip():
+                    line_key = (int(row["page_num"]), int(row["block_num"]), int(row["par_num"]), int(row["line_num"]))
+
+                    if line_key not in lines:
+                        lines[line_key] = []
+
+                    lines[line_key].append((int(row["left"]), row["text"]))
+
+            text_parts: list[str] = []
+            last_block = -1
+            last_para = -1
+
+            for line_key in sorted(lines.keys()):
+                page_num, block_num, par_num, line_num = line_key
+
+                if block_num != last_block:
+                    if text_parts: # ~keep
+                        text_parts.append("\n\n")
+                    last_block = block_num
+                    last_para = par_num
+                elif par_num != last_para:
+                    text_parts.append("\n\n")
+                    last_para = par_num
+
+                words = sorted(lines[line_key], key=lambda x: x[0])
+                line_text = " ".join(word[1] for word in words)
+                text_parts.append(line_text)
+                text_parts.append("\n")
+
+            content = "".join(text_parts).strip()
+
+        except (ValueError, KeyError):
+            content = ""
+            for line in tsv_content.split("\n")[1:]: # ~keep skip header
+                parts = line.split("\t")
+                if len(parts) > 11 and parts[11].strip(): # ~keep text is in column 11
+                    content += parts[11] + " "
+            content = content.strip()
+
+        return ExtractionResult(
+            content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+        )
+
+    async def _process_hocr_to_markdown(
+        self,
+        hocr_content: str,
+        enable_table_detection: bool = False,
+        html_to_markdown_config: HTMLToMarkdownConfig | None = None,
+        table_column_threshold: int = 20,
+        table_row_threshold_ratio: float = 0.5,
+        table_min_confidence: float = 30.0,
+        **_kwargs: Any,
+    ) -> ExtractionResult:
+        """Convert hOCR content to Markdown with table detection.
+
+        Args:
+            hocr_content: Raw hOCR HTML/XML content from Tesseract.
+            enable_table_detection: Whether to detect and format tables.
+            html_to_markdown_config: Configuration for HTML to Markdown conversion.
+            table_column_threshold: Pixel threshold for column clustering.
+            table_row_threshold_ratio: Row threshold as ratio of mean text height.
+            table_min_confidence: Minimum confidence score to include a word.
+            **kwargs: Additional configuration options.
+
+        Returns:
+            ExtractionResult with Markdown content and detected tables.
+        """
+        config = html_to_markdown_config or HTMLToMarkdownConfig(
+            escape_asterisks=False,
+            escape_underscores=False,
+            extract_metadata=False,
+            strip="meta title",
+        )
+
+        tables: list[TableData] = []
+        if enable_table_detection:
+            soup = BeautifulSoup(hocr_content, "lxml")
+            tables = await self._extract_tables_from_hocr(
+                soup,
+                table_column_threshold,
+                table_row_threshold_ratio,
+                table_min_confidence,
+            )
+
+        hocr_converters = self._create_hocr_converters(tables)
+
+        all_converters = dict(hocr_converters)
+        if config.custom_converters:
+            all_converters.update(config.custom_converters)
+
+        config_dict = config.to_dict()
+        config_dict["custom_converters"] = all_converters
+
+        try:
+            markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
+            markdown_content = normalize_spaces(markdown_content)
+        except (ValueError, TypeError, AttributeError):
+            try:
+                soup = BeautifulSoup(hocr_content, "lxml")
+                words = soup.find_all("span", class_="ocrx_word")
+                text_parts = []
+                for word in words:
+                    text = word.get_text().strip()
+                    if text:
+                        text_parts.append(text)
+
+                if text_parts:
+                    markdown_content = " ".join(text_parts)
+                else:
+                    markdown_content = soup.get_text().strip() or "[No text detected]"
+
+                markdown_content = normalize_spaces(markdown_content)
+            except (ValueError, TypeError, AttributeError):
+                markdown_content = "[OCR processing failed]"
+
+        if tables:
+            table_sections = []
+            for i, table in enumerate(tables):
+                table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
+
+            if markdown_content.strip():
+                final_content = f"{markdown_content}\n{''.join(table_sections)}"
+            else:
+                final_content = "".join(table_sections).strip()
+        else:
+            final_content = markdown_content
+
+        return ExtractionResult(
+            content=final_content,
+            mime_type=MARKDOWN_MIME_TYPE,
+            metadata={"source_format": "hocr", "tables_detected": len(tables)},
+            chunks=[],
+            tables=tables,
+        )
+
+    def _create_basic_converters(self) -> dict[str, Any]:
+        """Create basic converters for individual hOCR elements."""
+
+        def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR word elements - adds spaces between words."""
+            del tag
+            return f"{text.strip()} "
+
+        def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR line elements - handles line breaks."""
+            del tag
+            return f"{text.strip()}\n"
+
+        def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR paragraph elements - handles paragraph breaks."""
+            del tag
+            content = text.strip()
+            if not content:
+                return ""
+            return f"{content}\n\n"
+
+        def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR content area elements."""
+            del tag
+            content = text.strip()
+            if not content:
+                return ""
+            return f"{content}\n\n"
+
+        def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR page elements."""
+            del tag
+            return text.strip()
+
+        def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR separator elements - convert to horizontal rules."""
+            del tag, text
+            return "---\n"
+
+        def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Custom converter for hOCR photo/image elements - indicate image presence."""
+            del text
+            title = tag.get("title", "")
+            if isinstance(title, str):
+                bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
+                if bbox_match:
+                    x0, y0, x1, y1 = bbox_match.groups()
+                    width = int(x1) - int(x0)
+                    height = int(y1) - int(y0)
+                    return f"*[Image region: {width}x{height} pixels]*\n\n"
+            return "*[Image detected]*\n\n"
+
+        return {
+            "ocrx_word": ocrx_word_converter,
+            "ocr_line": ocr_line_converter,
+            "ocr_par": ocr_par_converter,
+            "ocr_carea": ocr_carea_converter,
+            "ocr_page": ocr_page_converter,
+            "ocr_separator": ocr_separator_converter,
+            "ocr_photo": ocr_photo_converter,
+        }
+
+    def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
+        """Create custom converters for hOCR elements that preserve spacing.
+
+        Args:
+            tables: List of detected tables (not used for filtering, tables added separately).
+
+        Returns:
+            Dictionary mapping HTML tags to converter functions.
+        """
+        basic_converters = self._create_basic_converters()
+
+        def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Generic converter for div elements based on class."""
+            class_attr = tag.get("class", "")
+            if isinstance(class_attr, list):
+                class_attr = " ".join(class_attr)
+            elif not isinstance(class_attr, str):
+                class_attr = ""
+
+            for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
+                if class_name in class_attr:
+                    converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
+                    return str(converter_result)
+            return text
+
+        def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
+            """Generic converter for span elements based on class."""
+            class_attr = tag.get("class", "")
+            if isinstance(class_attr, list):
+                class_attr = " ".join(class_attr)
+            elif not isinstance(class_attr, str):
+                class_attr = ""
+
+            for class_name in ["ocrx_word", "ocr_line"]:
+                if class_name in class_attr:
+                    converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
+                    return str(converter_result)
+            return f"{text.strip()} "
+
+        return {
+            "span": generic_span_converter,
+            "div": generic_div_converter,
+            "p": basic_converters["ocr_par"],
+        }
+
+    def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
+        """Synchronously process hOCR content to markdown format.
+
+        Args:
+            hocr_content: Raw hOCR content as string
+            config: Tesseract configuration object
+
+        Returns:
+            ExtractionResult with markdown content
+        """
+        tables: list[TableData] = []
+
+        if config.enable_table_detection:
+            pass
+
+        try:
+            converters = self._create_hocr_converters(tables)
+
+            html_config = HTMLToMarkdownConfig(
+                custom_converters=converters,
+                escape_asterisks=False,
+                escape_underscores=False,
+                extract_metadata=False,
+                strip="meta title",
+            )
+
+            markdown_content = html_to_markdown.convert_to_markdown(
+                hocr_content,
+                **html_config.to_dict(),
+            )
+
+            markdown_content = normalize_spaces(markdown_content)
+
+        except (ValueError, TypeError, AttributeError):
+            try:
+                soup = BeautifulSoup(hocr_content, "lxml")
+                words = soup.find_all("span", class_="ocrx_word")
+                text_parts = []
+                for word in words:
+                    text = word.get_text().strip()
+                    if text:
+                        text_parts.append(text)
+
+                if text_parts:
+                    markdown_content = " ".join(text_parts)
+                else:
+                    markdown_content = soup.get_text().strip() or "[No text detected]"
+
+                markdown_content = normalize_spaces(markdown_content)
+            except (ValueError, TypeError, AttributeError):
+                markdown_content = "[OCR processing failed]"
+
+        if tables:
+            table_sections = []
+            for i, table in enumerate(tables):
+                table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
+
+            if markdown_content.strip():
+                final_content = f"{markdown_content}\n{''.join(table_sections)}"
+            else:
+                final_content = "".join(table_sections).strip()
+        else:
+            final_content = markdown_content
+
+        return ExtractionResult(
+            content=final_content,
+            mime_type=MARKDOWN_MIME_TYPE,
+            metadata={"source_format": "hocr", "tables_detected": len(tables)},
+            chunks=[],
+            tables=tables,
+        )
+
+    def _process_tsv_output_sync(
+        self,
+        tsv_content: str,
+        table_column_threshold: int = 20,
+        table_row_threshold_ratio: float = 0.5,
+        table_min_confidence: float = 30.0,
+    ) -> ExtractionResult:
+        """Synchronously process TSV output and extract tables if detected.
+
+        Args:
+            tsv_content: Raw TSV output from Tesseract.
+            table_column_threshold: Pixel threshold for column clustering.
+            table_row_threshold_ratio: Row threshold as ratio of mean text height.
+            table_min_confidence: Minimum confidence score to include a word.
+
+        Returns:
+            ExtractionResult with extracted content and tables.
+        """
+        text_result = self._extract_text_from_tsv(tsv_content)
+
+        try:
+            if (
+                (words := extract_words(tsv_content, min_confidence=table_min_confidence))
+                and (
+                    table_data := reconstruct_table(
+                        words,
+                        column_threshold=table_column_threshold,
+                        row_threshold_ratio=table_row_threshold_ratio,
+                    )
+                )
+                and len(table_data) > 1
+            ):
+                markdown = to_markdown(table_data)
+
+                try:
+                    df = pl.DataFrame(table_data[1:], schema=table_data[0])
+                except (ImportError, IndexError):
+                    df = None
+
+                table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
+
+                return ExtractionResult(
+                    content=text_result.content,
+                    mime_type=text_result.mime_type,
+                    metadata=text_result.metadata,
+                    tables=[table],
+                    chunks=text_result.chunks,
+                )
+        except (ValueError, KeyError, ImportError):
+            pass
+
+        return text_result
+
+    async def _extract_tables_from_hocr(
+        self,
+        soup: Any,
+        column_threshold: int = 20,
+        row_threshold_ratio: float = 0.5,
+        min_confidence: float = 30.0,
+    ) -> list[TableData]:
+        """Extract tables from hOCR structure using coordinate analysis.
+
+        Args:
+            soup: Parsed hOCR BeautifulSoup object.
+            column_threshold: Pixel threshold for column clustering.
+            row_threshold_ratio: Row threshold as ratio of mean text height.
+            min_confidence: Minimum confidence score to include a word.
+
+        Returns:
+            List of detected tables as TableData objects.
+        """
+        tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
+
+        if not tsv_data:
+            return []
+
+        if not (words := extract_words(tsv_data, min_confidence=min_confidence)):
+            return []
+
+        tables: list[TableData] = []
+        try:
+            table_data = reconstruct_table(
+                words,
+                column_threshold=column_threshold,
+                row_threshold_ratio=row_threshold_ratio,
+            )
+            if table_data and len(table_data) > 1: # ~keep At least header + one data row
+                markdown = to_markdown(table_data)
+
+                min_x = min(w["left"] for w in words)
+                max_x = max(w["left"] + w["width"] for w in words)
+                min_y = min(w["top"] for w in words)
+                max_y = max(w["top"] + w["height"] for w in words)
+
+                try:
+                    df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
+                except (ImportError, IndexError):
+                    df = None
+
+                dummy_image = Image.new("RGB", (1, 1), "white")
+
+                table: TableData = {
+                    "text": markdown,
+                    "df": df,
+                    "page_number": 1,
+                    "cropped_image": dummy_image,
+                    "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
+                } # type: ignore[typeddict-unknown-key]
+                tables.append(table)
+        except (ValueError, KeyError, ImportError):
+            pass
+
+        return tables
+
+    async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
+        """Convert hOCR structure to TSV format for table extraction.
+
+        Args:
+            soup: Parsed hOCR BeautifulSoup object.
+            min_confidence: Minimum confidence score to include.
+
+        Returns:
+            TSV formatted string compatible with table extractor.
+        """
+        tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
+
+        words = soup.find_all("span", class_="ocrx_word")
+        word_num = 1
+
+        for word in words:
+            title = word.get("title", "")
+            text = word.get_text().strip()
+
+            if not text:
+                continue
+
+            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
+            if not bbox_match:
+                continue
+
+            x0, y0, x1, y1 = map(int, bbox_match.groups())
+
+            conf_match = re.search(r"x_wconf (\d+)", title)
+            confidence = float(conf_match.group(1)) if conf_match else 100.0
+
+            if confidence < min_confidence:
+                continue
+
+            line = word.find_parent(class_="ocr_line")
+            par = word.find_parent(class_="ocr_par")
+            block = word.find_parent(class_="ocr_carea")
+
+            tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
+            tsv_lines.append(tsv_line)
+            word_num += 1
+
+        return "\n".join(tsv_lines)
+
+    def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
+        """Identify potential table regions from word coordinates.
+
+        Args:
+            words: List of word dictionaries with coordinates.
+
+        Returns:
+            List of word groups representing potential tables.
+        """
+        if not words:
+            return []
+
+        return [words]
 
     @classmethod
     async def _validate_tesseract_version(cls) -> None:
@@ -394,8 +972,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            return
 
         command = ["tesseract", "--version"]
-
-
+        env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
+        try:
+            result = await run_process(command, env=env)
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            raise MissingDependencyError(
+                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
+            ) from e
+        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode("utf-8"))
         if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
@@ -407,33 +991,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
 
-    def
-
-        image: PILImage,
-        **kwargs: Unpack[TesseractConfig],
-    ) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata.
-
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
-
-        image_buffer = io.BytesIO()
-        image.save(image_buffer, format="PNG")
-        image_content = image_buffer.getvalue()
-
-        cache_kwargs = {
-            "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
-            "ocr_backend": "tesseract",
-            "ocr_config": str(sorted(kwargs.items())),
-        }
-
+    def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+        """Handle cache lookup before processing (sync)."""
        ocr_cache = get_ocr_cache()
+
        cached_result = ocr_cache.get(**cache_kwargs)
        if cached_result is not None:
            return cached_result
@@ -441,46 +1002,113 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
        if ocr_cache.is_processing(**cache_kwargs):
            event = ocr_cache.mark_processing(**cache_kwargs)
            event.wait()
-
-            # Try cache again after waiting for other process to complete
            cached_result = ocr_cache.get(**cache_kwargs)
            if cached_result is not None:
                return cached_result
 
        ocr_cache.mark_processing(**cache_kwargs)
+        return None
+
+    def _execute_tesseract_sync(self, command: list[str]) -> None:
+        """Run tesseract command synchronously."""
+        env = os.environ.copy()
+        if sys.platform.startswith("linux"):
+            env["OMP_THREAD_LIMIT"] = "1"
+
+        try:
+            subprocess.run(
+                command,
+                check=True,
+                env=env,
+                capture_output=True,
+                text=True,
+                timeout=30,
+                encoding="utf-8",
+            )
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr if e.stderr else str(e)
+            raise OCRError(
+                f"Failed to OCR using tesseract: {error_msg}",
+                context={"command": command, "returncode": e.returncode, "error": error_msg},
+            ) from e
+        except subprocess.TimeoutExpired as e:
+            raise OCRError(
+                "Tesseract timed out during processing.",
+                context={"command": command, "timeout": 30},
+            ) from e
+
+    def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
+        """Process the raw output from Tesseract based on the requested format (sync)."""
+        output_format = run_config["output_format"]
+        enable_table_detection = run_config["enable_table_detection"]
+        kwargs = run_config["remaining_kwargs"]
+        config = TesseractConfig(**kwargs)
+
+        if output_format == "markdown":
+            return self._process_hocr_to_markdown_sync(output, config)
+        if output_format == "tsv" and enable_table_detection:
+            return self._process_tsv_output_sync(
+                output,
+                table_column_threshold=config.table_column_threshold,
+                table_row_threshold_ratio=config.table_row_threshold_ratio,
+                table_min_confidence=config.table_min_confidence,
+            )
+        if output_format == "tsv":
+            return self._extract_text_from_tsv(output)
+        if output_format == "hocr":
+            return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
 
+        return ExtractionResult(
+            content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+        )
+
+    def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
+        """Synchronously process an image and extract its text and metadata."""
+        use_cache = kwargs.pop("use_cache", True)
+
+        save_image = image
+        if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+            save_image = image.convert("RGB")
+
+        image_buffer = io.BytesIO()
+        save_image.save(image_buffer, format="PNG")
+        image_content = image_buffer.getvalue()
+
+        cache_kwargs = {
+            "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
+            "ocr_backend": "tesseract",
+            "ocr_config": str(sorted(kwargs.items())),
+        }
+
+        if use_cache:
+            cached_result = self._handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
+                return cached_result
+
+        ocr_cache = get_ocr_cache()
        try:
            self._validate_tesseract_version_sync()
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
                image_path = Path(tmp_file.name)
-
+                save_image.save(str(image_path), format="PNG")
            try:
-
+                kwargs_with_cache = {**kwargs, "use_cache": use_cache}
+                result = self.process_file_sync(image_path, **kwargs_with_cache)
 
-
+                if use_cache:
+                    ocr_cache.set(result, **cache_kwargs)
 
                return result
            finally:
                if image_path.exists():
                    image_path.unlink()
        finally:
-
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
 
-    def process_file_sync(
-
-
-        **kwargs: Unpack[TesseractConfig],
-    ) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata.
-
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
+    def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
+        """Synchronously process a file and extract its text and metadata."""
+        use_cache = kwargs.pop("use_cache", True)
 
        file_info = self._get_file_info(path)
 
@@ -490,53 +1118,74 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            "ocr_config": str(sorted(kwargs.items())),
        }
 
-
-
-
-            return cached_result
-
-        if ocr_cache.is_processing(**cache_kwargs):
-            event = ocr_cache.mark_processing(**cache_kwargs)
-            event.wait()
-
-            # Try cache again after waiting for other process to complete
-            cached_result = ocr_cache.get(**cache_kwargs)
-            if cached_result is not None:
+        if use_cache:
+            cached_result = self._handle_cache_lookup_sync(cache_kwargs)
+            if cached_result:
                return cached_result
 
-        ocr_cache
-
+        ocr_cache = get_ocr_cache()
        try:
            self._validate_tesseract_version_sync()
-
-
-
-
+
+            run_config = self._prepare_tesseract_run_config(**kwargs)
+
+            temp_fd, temp_path = tempfile.mkstemp(suffix=run_config["ext"])
+            os.close(temp_fd)
+            Path(temp_path).unlink()
+            output_base = temp_path.replace(run_config["ext"], "")
+
            try:
-                command = self._build_tesseract_command(
-
+                command = self._build_tesseract_command(
+                    path,
+                    output_base,
+                    run_config["language"],
+                    run_config["psm"],
+                    run_config["tesseract_format"],
+                    **run_config["remaining_kwargs"],
+                )
+                self._execute_tesseract_sync(command)
+
+                output_path = Path(f"{output_base}{run_config['ext']}")
+                if not output_path.exists():
+                    return ExtractionResult(
+                        content="[OCR processing failed]",
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={
+                            "source_format": run_config["tesseract_format"],
+                            "error": f"{run_config['ext']} file not generated",
+                        },
+                        chunks=[],
+                        tables=[],
+                    )
 
-                output_path = Path(output_base + ".txt")
                with output_path.open(encoding="utf-8") as f:
                    output = f.read()
-                extraction_result = ExtractionResult(
-                    content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-                )
 
-
-
-
+                extraction_result = self._process_tesseract_output_sync(output, run_config)
+
+                if use_cache:
+                    final_cache_kwargs = cache_kwargs.copy()
+                    final_cache_kwargs["ocr_config"] = str(
+                        sorted(
+                            {
+                                **run_config["remaining_kwargs"],
+                                "language": run_config["language"],
+                                "psm": run_config["psm"],
+                            }.items()
+                        )
+                    )
+                    ocr_cache.set(extraction_result, **final_cache_kwargs)
 
                return extraction_result
-            except (RuntimeError, OSError) as e:
-                raise OCRError(f"Failed to OCR using tesseract: {e}") from e
            finally:
-                for
-
-
-
+                for cleanup_ext in [".txt", ".hocr", ".tsv"]:
+                    cleanup_path = Path(f"{output_base}{cleanup_ext}")
+                    cleanup_path.unlink(missing_ok=True)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
-
+            if use_cache:
+                ocr_cache.mark_complete(**cache_kwargs)
 
    def _get_file_info(self, path: Path) -> dict[str, Any]:
        """Get file information for caching."""
@@ -555,7 +1204,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
        }
 
    def _build_tesseract_command(
-        self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
+        self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
    ) -> list[str]:
        """Build tesseract command with all parameters."""
        command = [
@@ -571,34 +1220,19 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            "--loglevel",
            "OFF",
        ]
+
+        if output_format != "text":
+            command.append(output_format)
+
        for kwarg, value in kwargs.items():
+            if kwarg.startswith("table_"):
+                continue
            if isinstance(value, bool):
                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
            else:
                command.extend(["-c", f"{kwarg}={value}"])
        return command
 
-    def _run_tesseract_sync(self, command: list[str]) -> None:
-        """Run tesseract command synchronously."""
-        env = os.environ.copy()
-        if sys.platform.startswith("linux"):
-            env["OMP_THREAD_LIMIT"] = "1"
-
-        result = subprocess.run(
-            command,
-            check=False,
-            env=env,
-            capture_output=True,
-            text=True,
-            timeout=30,
-        )
-
-        if result.returncode != 0:
-            raise OCRError(
-                "OCR failed with a non-0 return code.",
-                context={"error": result.stderr},
-            )
-
    @classmethod
    def _validate_tesseract_version_sync(cls) -> None:
        """Synchronously validate that Tesseract is installed and is version 5 or above.
@@ -611,7 +1245,12 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            return
 
        command = ["tesseract", "--version"]
-
+        try:
+            result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            raise MissingDependencyError(
+                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
+            ) from e
        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout)
        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
            raise MissingDependencyError(
@@ -721,6 +1360,7 @@ def _process_image_with_tesseract(
        capture_output=True,
        text=True,
        timeout=30,
+        encoding="utf-8",
    )
 
    if result.returncode != 0:
@@ -769,9 +1409,11 @@ def _process_image_bytes_with_tesseract(
        OCR result as dictionary.
    """
    try:
-        with
-
-
+        with (
+            tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
+            Image.open(io.BytesIO(image_bytes)) as image,
+        ):
+            image.save(tmp_image.name, format="PNG")
            image_path = tmp_image.name
 
            try: