kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import csv
|
3
4
|
import hashlib
|
4
5
|
import io
|
5
6
|
import os
|
@@ -7,26 +8,33 @@ import re
|
|
7
8
|
import subprocess
|
8
9
|
import sys
|
9
10
|
import tempfile
|
10
|
-
from
|
11
|
-
from enum import Enum
|
11
|
+
from io import StringIO
|
12
12
|
from pathlib import Path
|
13
13
|
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
14
14
|
|
15
15
|
import anyio
|
16
|
+
import html_to_markdown
|
17
|
+
import polars as pl
|
16
18
|
from anyio import Path as AsyncPath
|
17
19
|
from anyio import run_process
|
20
|
+
from bs4 import BeautifulSoup
|
21
|
+
from bs4.element import Tag
|
18
22
|
from PIL import Image
|
23
|
+
from PIL.Image import Image as PILImage
|
19
24
|
from typing_extensions import Self
|
20
25
|
|
21
|
-
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
26
|
+
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
22
27
|
from kreuzberg._ocr._base import OCRBackend
|
23
|
-
from kreuzberg.
|
28
|
+
from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
|
29
|
+
from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
|
30
|
+
from kreuzberg._utils._cache import get_ocr_cache
|
24
31
|
from kreuzberg._utils._string import normalize_spaces
|
25
32
|
from kreuzberg._utils._sync import run_sync
|
26
33
|
from kreuzberg._utils._tmp import create_temp_file
|
27
34
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
28
35
|
|
29
36
|
if TYPE_CHECKING:
|
37
|
+
from bs4.element import Tag
|
30
38
|
from PIL.Image import Image as PILImage
|
31
39
|
|
32
40
|
try: # pragma: no cover
|
@@ -168,68 +176,6 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
168
176
|
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
169
177
|
|
170
178
|
|
171
|
-
class PSMMode(Enum):
|
172
|
-
"""Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
|
173
|
-
|
174
|
-
OSD_ONLY = 0
|
175
|
-
"""Orientation and script detection only."""
|
176
|
-
AUTO_OSD = 1
|
177
|
-
"""Automatic page segmentation with orientation and script detection."""
|
178
|
-
AUTO_ONLY = 2
|
179
|
-
"""Automatic page segmentation without OSD."""
|
180
|
-
AUTO = 3
|
181
|
-
"""Fully automatic page segmentation (default)."""
|
182
|
-
SINGLE_COLUMN = 4
|
183
|
-
"""Assume a single column of text."""
|
184
|
-
SINGLE_BLOCK_VERTICAL = 5
|
185
|
-
"""Assume a single uniform block of vertically aligned text."""
|
186
|
-
SINGLE_BLOCK = 6
|
187
|
-
"""Assume a single uniform block of text."""
|
188
|
-
SINGLE_LINE = 7
|
189
|
-
"""Treat the image as a single text line."""
|
190
|
-
SINGLE_WORD = 8
|
191
|
-
"""Treat the image as a single word."""
|
192
|
-
CIRCLE_WORD = 9
|
193
|
-
"""Treat the image as a single word in a circle."""
|
194
|
-
SINGLE_CHAR = 10
|
195
|
-
"""Treat the image as a single character."""
|
196
|
-
|
197
|
-
|
198
|
-
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
199
|
-
class TesseractConfig:
|
200
|
-
"""Configuration options for Tesseract OCR engine."""
|
201
|
-
|
202
|
-
classify_use_pre_adapted_templates: bool = True
|
203
|
-
"""Whether to use pre-adapted templates during classification to improve recognition accuracy."""
|
204
|
-
language: str = "eng"
|
205
|
-
"""Language code to use for OCR.
|
206
|
-
Examples:
|
207
|
-
- 'eng' for English
|
208
|
-
- 'deu' for German
|
209
|
-
- multiple languages combined with '+', e.g. 'eng+deu')
|
210
|
-
"""
|
211
|
-
language_model_ngram_on: bool = False
|
212
|
-
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
213
|
-
|
214
|
-
Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
|
215
|
-
psm: PSMMode = PSMMode.AUTO
|
216
|
-
"""Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
|
217
|
-
tessedit_dont_blkrej_good_wds: bool = True
|
218
|
-
"""If True, prevents block rejection of words identified as good, improving text output quality."""
|
219
|
-
tessedit_dont_rowrej_good_wds: bool = True
|
220
|
-
"""If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
|
221
|
-
tessedit_enable_dict_correction: bool = True
|
222
|
-
"""Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
|
223
|
-
tessedit_char_whitelist: str = ""
|
224
|
-
"""Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
|
225
|
-
tessedit_use_primary_params_model: bool = True
|
226
|
-
"""If True, forces the use of the primary parameters model for text recognition."""
|
227
|
-
textord_space_size_is_variable: bool = True
|
228
|
-
"""Allow variable spacing between words, useful for text with irregular spacing."""
|
229
|
-
thresholding_method: bool = False
|
230
|
-
"""Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
|
231
|
-
|
232
|
-
|
233
179
|
class TesseractBackend(OCRBackend[TesseractConfig]):
|
234
180
|
_version_checked: ClassVar[bool] = False
|
235
181
|
|
@@ -238,10 +184,14 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
238
184
|
image: PILImage,
|
239
185
|
**kwargs: Unpack[TesseractConfig],
|
240
186
|
) -> ExtractionResult:
|
241
|
-
|
187
|
+
use_cache = kwargs.pop("use_cache", True)
|
188
|
+
|
189
|
+
save_image = image
|
190
|
+
if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
|
191
|
+
save_image = image.convert("RGB")
|
242
192
|
|
243
193
|
image_buffer = io.BytesIO()
|
244
|
-
await run_sync(
|
194
|
+
await run_sync(save_image.save, image_buffer, format="PNG")
|
245
195
|
image_content = image_buffer.getvalue()
|
246
196
|
|
247
197
|
cache_kwargs = {
|
@@ -250,7 +200,39 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
250
200
|
"ocr_config": str(sorted(kwargs.items())),
|
251
201
|
}
|
252
202
|
|
203
|
+
if use_cache:
|
204
|
+
cached_result = await self._handle_cache_lookup(cache_kwargs)
|
205
|
+
if cached_result:
|
206
|
+
return cached_result
|
207
|
+
|
208
|
+
ocr_cache = get_ocr_cache()
|
209
|
+
try:
|
210
|
+
await self._validate_tesseract_version()
|
211
|
+
image_path, unlink = await create_temp_file(".png")
|
212
|
+
|
213
|
+
try:
|
214
|
+
await run_sync(save_image.save, str(image_path), format="PNG")
|
215
|
+
except OSError as e:
|
216
|
+
if "cannot write mode" not in str(e):
|
217
|
+
raise
|
218
|
+
save_image = image.convert("RGB")
|
219
|
+
await run_sync(save_image.save, str(image_path), format="PNG")
|
220
|
+
try:
|
221
|
+
result = await self.process_file(image_path, **kwargs)
|
222
|
+
|
223
|
+
if use_cache:
|
224
|
+
await ocr_cache.aset(result, **cache_kwargs)
|
225
|
+
|
226
|
+
return result
|
227
|
+
finally:
|
228
|
+
await unlink()
|
229
|
+
finally:
|
230
|
+
if use_cache:
|
231
|
+
ocr_cache.mark_complete(**cache_kwargs)
|
232
|
+
|
233
|
+
async def _handle_cache_lookup(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
|
253
234
|
ocr_cache = get_ocr_cache()
|
235
|
+
|
254
236
|
cached_result = await ocr_cache.aget(**cache_kwargs)
|
255
237
|
if cached_result is not None:
|
256
238
|
return cached_result
|
@@ -258,49 +240,120 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
258
240
|
if ocr_cache.is_processing(**cache_kwargs):
|
259
241
|
event = ocr_cache.mark_processing(**cache_kwargs)
|
260
242
|
await anyio.to_thread.run_sync(event.wait)
|
261
|
-
|
262
|
-
# Try cache again after waiting for other process to complete # ~keep
|
263
243
|
cached_result = await ocr_cache.aget(**cache_kwargs)
|
264
244
|
if cached_result is not None:
|
265
245
|
return cached_result
|
266
246
|
|
267
247
|
ocr_cache.mark_processing(**cache_kwargs)
|
248
|
+
return None
|
249
|
+
|
250
|
+
def _prepare_tesseract_run_config(self, **kwargs: Any) -> dict[str, Any]:
|
251
|
+
language = self._validate_language_code(kwargs.pop("language", "eng"))
|
252
|
+
psm = kwargs.pop("psm", PSMMode.AUTO)
|
253
|
+
output_format = kwargs.pop("output_format", "markdown")
|
254
|
+
enable_table_detection = kwargs.pop("enable_table_detection", False)
|
255
|
+
|
256
|
+
if enable_table_detection and output_format == "text":
|
257
|
+
output_format = "tsv"
|
258
|
+
|
259
|
+
if output_format == "markdown":
|
260
|
+
tesseract_format = "hocr"
|
261
|
+
ext = ".hocr"
|
262
|
+
elif output_format == "tsv":
|
263
|
+
tesseract_format = "tsv"
|
264
|
+
ext = ".tsv"
|
265
|
+
elif output_format == "hocr":
|
266
|
+
tesseract_format = "hocr"
|
267
|
+
ext = ".hocr"
|
268
|
+
else:
|
269
|
+
tesseract_format = "text"
|
270
|
+
ext = ".txt"
|
271
|
+
|
272
|
+
return {
|
273
|
+
"language": language,
|
274
|
+
"psm": psm,
|
275
|
+
"output_format": output_format,
|
276
|
+
"enable_table_detection": enable_table_detection,
|
277
|
+
"tesseract_format": tesseract_format,
|
278
|
+
"ext": ext,
|
279
|
+
"remaining_kwargs": kwargs,
|
280
|
+
}
|
281
|
+
|
282
|
+
async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
|
283
|
+
command = [
|
284
|
+
"tesseract",
|
285
|
+
str(path),
|
286
|
+
output_base,
|
287
|
+
"-l",
|
288
|
+
run_config["language"],
|
289
|
+
"--psm",
|
290
|
+
str(run_config["psm"].value),
|
291
|
+
"--oem",
|
292
|
+
"1",
|
293
|
+
"--loglevel",
|
294
|
+
"OFF",
|
295
|
+
]
|
296
|
+
|
297
|
+
if run_config["tesseract_format"] != "text":
|
298
|
+
command.append(run_config["tesseract_format"])
|
299
|
+
|
300
|
+
for kwarg, value in run_config["remaining_kwargs"].items():
|
301
|
+
if kwarg.startswith("table_"):
|
302
|
+
continue
|
303
|
+
if isinstance(value, bool):
|
304
|
+
command.extend(["-c", f"{kwarg}={1 if value else 0}"])
|
305
|
+
else:
|
306
|
+
command.extend(["-c", f"{kwarg}={value}"])
|
307
|
+
|
308
|
+
env: dict[str, Any] | None = None
|
309
|
+
if sys.platform.startswith("linux"):
|
310
|
+
env = {"OMP_THREAD_LIMIT": "1"}
|
268
311
|
|
269
312
|
try:
|
270
|
-
await
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
313
|
+
result = await run_process(command, env=env)
|
314
|
+
if not result.returncode == 0:
|
315
|
+
raise OCRError(
|
316
|
+
"OCR failed with a non-0 return code.",
|
317
|
+
context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
|
318
|
+
)
|
319
|
+
except subprocess.CalledProcessError as e:
|
320
|
+
error_msg = e.stderr.decode("utf-8") if e.stderr else str(e)
|
321
|
+
raise OCRError(
|
322
|
+
f"Failed to OCR using tesseract: {error_msg}",
|
323
|
+
context={"command": command, "returncode": e.returncode, "error": error_msg},
|
324
|
+
) from e
|
275
325
|
|
276
|
-
|
326
|
+
async def _process_tesseract_output(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
|
327
|
+
output_format = run_config["output_format"]
|
328
|
+
enable_table_detection = run_config["enable_table_detection"]
|
329
|
+
kwargs = run_config["remaining_kwargs"]
|
330
|
+
|
331
|
+
if output_format == "markdown":
|
332
|
+
return await self._process_hocr_to_markdown(output, enable_table_detection=enable_table_detection, **kwargs)
|
333
|
+
if output_format == "tsv" and enable_table_detection:
|
334
|
+
return await self._process_tsv_output(
|
335
|
+
output,
|
336
|
+
table_column_threshold=kwargs.get("table_column_threshold", 20),
|
337
|
+
table_row_threshold_ratio=kwargs.get("table_row_threshold_ratio", 0.5),
|
338
|
+
table_min_confidence=kwargs.get("table_min_confidence", 30.0),
|
339
|
+
)
|
340
|
+
if output_format == "tsv":
|
341
|
+
return self._extract_text_from_tsv(output)
|
342
|
+
if output_format == "hocr":
|
343
|
+
return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])
|
277
344
|
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
finally:
|
282
|
-
ocr_cache.mark_complete(**cache_kwargs)
|
345
|
+
return ExtractionResult(
|
346
|
+
content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
|
347
|
+
)
|
283
348
|
|
284
|
-
async def process_file(
|
285
|
-
|
286
|
-
path: Path,
|
287
|
-
**kwargs: Unpack[TesseractConfig],
|
288
|
-
) -> ExtractionResult:
|
289
|
-
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
349
|
+
async def process_file(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
350
|
+
use_cache = kwargs.pop("use_cache", True)
|
290
351
|
|
291
352
|
try:
|
292
353
|
stat = path.stat()
|
293
|
-
file_info = {
|
294
|
-
"path": str(path.resolve()),
|
295
|
-
"size": stat.st_size,
|
296
|
-
"mtime": stat.st_mtime,
|
297
|
-
}
|
354
|
+
file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
|
298
355
|
except OSError:
|
299
|
-
file_info = {
|
300
|
-
"path": str(path),
|
301
|
-
"size": 0,
|
302
|
-
"mtime": 0,
|
303
|
-
}
|
356
|
+
file_info = {"path": str(path), "size": 0, "mtime": 0}
|
304
357
|
|
305
358
|
cache_kwargs = {
|
306
359
|
"file_info": str(sorted(file_info.items())),
|
@@ -308,71 +361,37 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
308
361
|
"ocr_config": str(sorted(kwargs.items())),
|
309
362
|
}
|
310
363
|
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
return cached_result
|
315
|
-
|
316
|
-
if ocr_cache.is_processing(**cache_kwargs):
|
317
|
-
event = ocr_cache.mark_processing(**cache_kwargs)
|
318
|
-
await anyio.to_thread.run_sync(event.wait)
|
319
|
-
|
320
|
-
# Try cache again after waiting for other process to complete # ~keep
|
321
|
-
cached_result = await ocr_cache.aget(**cache_kwargs)
|
322
|
-
if cached_result is not None:
|
364
|
+
if use_cache:
|
365
|
+
cached_result = await self._handle_cache_lookup(cache_kwargs)
|
366
|
+
if cached_result:
|
323
367
|
return cached_result
|
324
368
|
|
325
|
-
ocr_cache
|
326
|
-
|
369
|
+
ocr_cache = get_ocr_cache()
|
327
370
|
try:
|
328
371
|
await self._validate_tesseract_version()
|
329
|
-
output_path, unlink = await create_temp_file(".txt")
|
330
|
-
language = self._validate_language_code(kwargs.pop("language", "eng"))
|
331
|
-
psm = kwargs.pop("psm", PSMMode.AUTO)
|
332
|
-
try:
|
333
|
-
output_base = str(output_path).replace(".txt", "")
|
334
|
-
command = [
|
335
|
-
"tesseract",
|
336
|
-
str(path),
|
337
|
-
output_base,
|
338
|
-
"-l",
|
339
|
-
language,
|
340
|
-
"--psm",
|
341
|
-
str(psm.value),
|
342
|
-
"--oem",
|
343
|
-
"1",
|
344
|
-
"--loglevel",
|
345
|
-
"OFF",
|
346
|
-
]
|
347
|
-
for kwarg, value in kwargs.items():
|
348
|
-
if isinstance(value, bool):
|
349
|
-
command.extend(["-c", f"{kwarg}={1 if value else 0}"])
|
350
|
-
else:
|
351
|
-
# Handle string parameters (like tessedit_char_whitelist)
|
352
|
-
command.extend(["-c", f"{kwarg}={value}"])
|
353
|
-
|
354
|
-
env: dict[str, Any] | None = None
|
355
|
-
if sys.platform.startswith("linux"):
|
356
|
-
env = {"OMP_THREAD_LIMIT": "1"}
|
357
372
|
|
358
|
-
|
373
|
+
run_config = self._prepare_tesseract_run_config(**kwargs)
|
374
|
+
output_path, unlink = await create_temp_file(run_config["ext"])
|
359
375
|
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
context={
|
364
|
-
"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
|
365
|
-
},
|
366
|
-
)
|
376
|
+
try:
|
377
|
+
output_base = str(output_path).replace(run_config["ext"], "")
|
378
|
+
await self._execute_tesseract(path, output_base, run_config)
|
367
379
|
|
368
380
|
output = await AsyncPath(output_path).read_text("utf-8")
|
369
|
-
extraction_result =
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
381
|
+
extraction_result = await self._process_tesseract_output(output, run_config)
|
382
|
+
|
383
|
+
if use_cache:
|
384
|
+
final_cache_kwargs = cache_kwargs.copy()
|
385
|
+
final_cache_kwargs["ocr_config"] = str(
|
386
|
+
sorted(
|
387
|
+
{
|
388
|
+
**run_config["remaining_kwargs"],
|
389
|
+
"language": run_config["language"],
|
390
|
+
"psm": run_config["psm"],
|
391
|
+
}.items()
|
392
|
+
)
|
393
|
+
)
|
394
|
+
await ocr_cache.aset(extraction_result, **final_cache_kwargs)
|
376
395
|
|
377
396
|
return extraction_result
|
378
397
|
except (RuntimeError, OSError) as e:
|
@@ -380,22 +399,478 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
380
399
|
finally:
|
381
400
|
await unlink()
|
382
401
|
finally:
|
383
|
-
|
402
|
+
if use_cache:
|
403
|
+
ocr_cache.mark_complete(**cache_kwargs)
|
404
|
+
|
405
|
+
async def _process_tsv_output(
|
406
|
+
self,
|
407
|
+
tsv_content: str,
|
408
|
+
table_column_threshold: int = 20,
|
409
|
+
table_row_threshold_ratio: float = 0.5,
|
410
|
+
table_min_confidence: float = 30.0,
|
411
|
+
) -> ExtractionResult:
|
412
|
+
text_result = self._extract_text_from_tsv(tsv_content)
|
413
|
+
|
414
|
+
try:
|
415
|
+
if (
|
416
|
+
(words := extract_words(tsv_content, min_confidence=table_min_confidence))
|
417
|
+
and (
|
418
|
+
table_data := reconstruct_table(
|
419
|
+
words,
|
420
|
+
column_threshold=table_column_threshold,
|
421
|
+
row_threshold_ratio=table_row_threshold_ratio,
|
422
|
+
)
|
423
|
+
)
|
424
|
+
and len(table_data) > 1
|
425
|
+
):
|
426
|
+
markdown = to_markdown(table_data)
|
427
|
+
|
428
|
+
try:
|
429
|
+
df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
|
430
|
+
except (ImportError, IndexError):
|
431
|
+
df = None
|
432
|
+
|
433
|
+
table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
|
434
|
+
|
435
|
+
return ExtractionResult(
|
436
|
+
content=text_result.content,
|
437
|
+
mime_type=text_result.mime_type,
|
438
|
+
metadata=text_result.metadata,
|
439
|
+
tables=[table],
|
440
|
+
chunks=text_result.chunks,
|
441
|
+
)
|
442
|
+
except (ValueError, KeyError, ImportError):
|
443
|
+
pass
|
444
|
+
|
445
|
+
return text_result
|
446
|
+
|
447
|
+
def _extract_text_from_tsv(self, tsv_content: str) -> ExtractionResult:
|
448
|
+
try:
|
449
|
+
reader = csv.DictReader(StringIO(tsv_content), delimiter="\t")
|
450
|
+
|
451
|
+
lines: dict[tuple[int, int, int, int], list[tuple[int, str]]] = {}
|
452
|
+
|
453
|
+
for row in reader:
|
454
|
+
if row.get("level") == "5" and row.get("text", "").strip():
|
455
|
+
line_key = (int(row["page_num"]), int(row["block_num"]), int(row["par_num"]), int(row["line_num"]))
|
456
|
+
|
457
|
+
if line_key not in lines:
|
458
|
+
lines[line_key] = []
|
459
|
+
|
460
|
+
lines[line_key].append((int(row["left"]), row["text"]))
|
461
|
+
|
462
|
+
text_parts: list[str] = []
|
463
|
+
last_block = -1
|
464
|
+
last_para = -1
|
465
|
+
|
466
|
+
for line_key in sorted(lines.keys()):
|
467
|
+
page_num, block_num, par_num, line_num = line_key
|
468
|
+
|
469
|
+
if block_num != last_block:
|
470
|
+
if text_parts: # ~keep
|
471
|
+
text_parts.append("\n\n")
|
472
|
+
last_block = block_num
|
473
|
+
last_para = par_num
|
474
|
+
elif par_num != last_para:
|
475
|
+
text_parts.append("\n\n")
|
476
|
+
last_para = par_num
|
477
|
+
|
478
|
+
words = sorted(lines[line_key], key=lambda x: x[0])
|
479
|
+
line_text = " ".join(word[1] for word in words)
|
480
|
+
text_parts.append(line_text)
|
481
|
+
text_parts.append("\n")
|
482
|
+
|
483
|
+
content = "".join(text_parts).strip()
|
484
|
+
|
485
|
+
except (ValueError, KeyError):
|
486
|
+
content = ""
|
487
|
+
for line in tsv_content.split("\n")[1:]: # ~keep skip header
|
488
|
+
parts = line.split("\t")
|
489
|
+
if len(parts) > 11 and parts[11].strip(): # ~keep text is in column 11
|
490
|
+
content += parts[11] + " "
|
491
|
+
content = content.strip()
|
492
|
+
|
493
|
+
return ExtractionResult(
|
494
|
+
content=normalize_spaces(content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
|
495
|
+
)
|
496
|
+
|
497
|
+
async def _process_hocr_to_markdown(
|
498
|
+
self,
|
499
|
+
hocr_content: str,
|
500
|
+
enable_table_detection: bool = False,
|
501
|
+
html_to_markdown_config: HTMLToMarkdownConfig | None = None,
|
502
|
+
table_column_threshold: int = 20,
|
503
|
+
table_row_threshold_ratio: float = 0.5,
|
504
|
+
table_min_confidence: float = 30.0,
|
505
|
+
**_kwargs: Any,
|
506
|
+
) -> ExtractionResult:
|
507
|
+
config = html_to_markdown_config or HTMLToMarkdownConfig(
|
508
|
+
escape_asterisks=False,
|
509
|
+
escape_underscores=False,
|
510
|
+
extract_metadata=False,
|
511
|
+
strip="meta title",
|
512
|
+
)
|
513
|
+
|
514
|
+
tables: list[TableData] = []
|
515
|
+
if enable_table_detection:
|
516
|
+
soup = BeautifulSoup(hocr_content, "lxml")
|
517
|
+
tables = await self._extract_tables_from_hocr(
|
518
|
+
soup,
|
519
|
+
table_column_threshold,
|
520
|
+
table_row_threshold_ratio,
|
521
|
+
table_min_confidence,
|
522
|
+
)
|
523
|
+
|
524
|
+
hocr_converters = self._create_hocr_converters(tables)
|
525
|
+
|
526
|
+
all_converters = dict(hocr_converters)
|
527
|
+
if config.custom_converters:
|
528
|
+
all_converters.update(config.custom_converters)
|
529
|
+
|
530
|
+
config_dict = config.to_dict()
|
531
|
+
config_dict["custom_converters"] = all_converters
|
532
|
+
|
533
|
+
try:
|
534
|
+
markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
|
535
|
+
markdown_content = normalize_spaces(markdown_content)
|
536
|
+
except (ValueError, TypeError, AttributeError):
|
537
|
+
try:
|
538
|
+
soup = BeautifulSoup(hocr_content, "lxml")
|
539
|
+
words = soup.find_all("span", class_="ocrx_word")
|
540
|
+
text_parts = []
|
541
|
+
for word in words:
|
542
|
+
text = word.get_text().strip()
|
543
|
+
if text:
|
544
|
+
text_parts.append(text)
|
545
|
+
|
546
|
+
if text_parts:
|
547
|
+
markdown_content = " ".join(text_parts)
|
548
|
+
else:
|
549
|
+
markdown_content = soup.get_text().strip() or "[No text detected]"
|
550
|
+
|
551
|
+
markdown_content = normalize_spaces(markdown_content)
|
552
|
+
except (ValueError, TypeError, AttributeError):
|
553
|
+
markdown_content = "[OCR processing failed]"
|
554
|
+
|
555
|
+
if tables:
|
556
|
+
table_sections = []
|
557
|
+
for i, table in enumerate(tables):
|
558
|
+
table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
|
559
|
+
|
560
|
+
if markdown_content.strip():
|
561
|
+
final_content = f"{markdown_content}\n{''.join(table_sections)}"
|
562
|
+
else:
|
563
|
+
final_content = "".join(table_sections).strip()
|
564
|
+
else:
|
565
|
+
final_content = markdown_content
|
566
|
+
|
567
|
+
return ExtractionResult(
|
568
|
+
content=final_content,
|
569
|
+
mime_type=MARKDOWN_MIME_TYPE,
|
570
|
+
metadata={"source_format": "hocr", "tables_detected": len(tables)},
|
571
|
+
chunks=[],
|
572
|
+
tables=tables,
|
573
|
+
)
|
574
|
+
|
575
|
+
def _create_basic_converters(self) -> dict[str, Any]:
|
576
|
+
def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
577
|
+
del tag
|
578
|
+
return f"{text.strip()} "
|
579
|
+
|
580
|
+
def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
581
|
+
del tag
|
582
|
+
return f"{text.strip()}\n"
|
583
|
+
|
584
|
+
def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
585
|
+
del tag
|
586
|
+
content = text.strip()
|
587
|
+
if not content:
|
588
|
+
return ""
|
589
|
+
return f"{content}\n\n"
|
590
|
+
|
591
|
+
def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
592
|
+
del tag
|
593
|
+
content = text.strip()
|
594
|
+
if not content:
|
595
|
+
return ""
|
596
|
+
return f"{content}\n\n"
|
597
|
+
|
598
|
+
def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
599
|
+
del tag
|
600
|
+
return text.strip()
|
601
|
+
|
602
|
+
def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
603
|
+
del tag, text
|
604
|
+
return "---\n"
|
605
|
+
|
606
|
+
def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
607
|
+
del text
|
608
|
+
title = tag.get("title", "")
|
609
|
+
if isinstance(title, str):
|
610
|
+
bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
|
611
|
+
if bbox_match:
|
612
|
+
x0, y0, x1, y1 = bbox_match.groups()
|
613
|
+
width = int(x1) - int(x0)
|
614
|
+
height = int(y1) - int(y0)
|
615
|
+
return f"*[Image region: {width}x{height} pixels]*\n\n"
|
616
|
+
return "*[Image detected]*\n\n"
|
617
|
+
|
618
|
+
return {
|
619
|
+
"ocrx_word": ocrx_word_converter,
|
620
|
+
"ocr_line": ocr_line_converter,
|
621
|
+
"ocr_par": ocr_par_converter,
|
622
|
+
"ocr_carea": ocr_carea_converter,
|
623
|
+
"ocr_page": ocr_page_converter,
|
624
|
+
"ocr_separator": ocr_separator_converter,
|
625
|
+
"ocr_photo": ocr_photo_converter,
|
626
|
+
}
|
627
|
+
|
628
|
+
def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
|
629
|
+
basic_converters = self._create_basic_converters()
|
630
|
+
|
631
|
+
def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
632
|
+
class_attr = tag.get("class", "")
|
633
|
+
if isinstance(class_attr, list):
|
634
|
+
class_attr = " ".join(class_attr)
|
635
|
+
elif not isinstance(class_attr, str):
|
636
|
+
class_attr = ""
|
637
|
+
|
638
|
+
for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
|
639
|
+
if class_name in class_attr:
|
640
|
+
converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
|
641
|
+
return str(converter_result)
|
642
|
+
return text
|
643
|
+
|
644
|
+
def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
645
|
+
class_attr = tag.get("class", "")
|
646
|
+
if isinstance(class_attr, list):
|
647
|
+
class_attr = " ".join(class_attr)
|
648
|
+
elif not isinstance(class_attr, str):
|
649
|
+
class_attr = ""
|
650
|
+
|
651
|
+
for class_name in ["ocrx_word", "ocr_line"]:
|
652
|
+
if class_name in class_attr:
|
653
|
+
converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
|
654
|
+
return str(converter_result)
|
655
|
+
return f"{text.strip()} "
|
656
|
+
|
657
|
+
return {
|
658
|
+
"span": generic_span_converter,
|
659
|
+
"div": generic_div_converter,
|
660
|
+
"p": basic_converters["ocr_par"],
|
661
|
+
}
|
662
|
+
|
663
|
+
def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
|
664
|
+
tables: list[TableData] = []
|
665
|
+
|
666
|
+
if config.enable_table_detection:
|
667
|
+
pass
|
668
|
+
|
669
|
+
try:
|
670
|
+
converters = self._create_hocr_converters(tables)
|
671
|
+
|
672
|
+
html_config = HTMLToMarkdownConfig(
|
673
|
+
custom_converters=converters,
|
674
|
+
escape_asterisks=False,
|
675
|
+
escape_underscores=False,
|
676
|
+
extract_metadata=False,
|
677
|
+
strip="meta title",
|
678
|
+
)
|
679
|
+
|
680
|
+
markdown_content = html_to_markdown.convert_to_markdown(
|
681
|
+
hocr_content,
|
682
|
+
**html_config.to_dict(),
|
683
|
+
)
|
684
|
+
|
685
|
+
markdown_content = normalize_spaces(markdown_content)
|
686
|
+
|
687
|
+
except (ValueError, TypeError, AttributeError):
|
688
|
+
try:
|
689
|
+
soup = BeautifulSoup(hocr_content, "lxml")
|
690
|
+
words = soup.find_all("span", class_="ocrx_word")
|
691
|
+
text_parts = []
|
692
|
+
for word in words:
|
693
|
+
text = word.get_text().strip()
|
694
|
+
if text:
|
695
|
+
text_parts.append(text)
|
696
|
+
|
697
|
+
if text_parts:
|
698
|
+
markdown_content = " ".join(text_parts)
|
699
|
+
else:
|
700
|
+
markdown_content = soup.get_text().strip() or "[No text detected]"
|
701
|
+
|
702
|
+
markdown_content = normalize_spaces(markdown_content)
|
703
|
+
except (ValueError, TypeError, AttributeError):
|
704
|
+
markdown_content = "[OCR processing failed]"
|
705
|
+
|
706
|
+
if tables:
|
707
|
+
table_sections = []
|
708
|
+
for i, table in enumerate(tables):
|
709
|
+
table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
|
710
|
+
|
711
|
+
if markdown_content.strip():
|
712
|
+
final_content = f"{markdown_content}\n{''.join(table_sections)}"
|
713
|
+
else:
|
714
|
+
final_content = "".join(table_sections).strip()
|
715
|
+
else:
|
716
|
+
final_content = markdown_content
|
717
|
+
|
718
|
+
return ExtractionResult(
|
719
|
+
content=final_content,
|
720
|
+
mime_type=MARKDOWN_MIME_TYPE,
|
721
|
+
metadata={"source_format": "hocr", "tables_detected": len(tables)},
|
722
|
+
chunks=[],
|
723
|
+
tables=tables,
|
724
|
+
)
|
725
|
+
|
726
|
+
def _process_tsv_output_sync(
|
727
|
+
self,
|
728
|
+
tsv_content: str,
|
729
|
+
table_column_threshold: int = 20,
|
730
|
+
table_row_threshold_ratio: float = 0.5,
|
731
|
+
table_min_confidence: float = 30.0,
|
732
|
+
) -> ExtractionResult:
|
733
|
+
text_result = self._extract_text_from_tsv(tsv_content)
|
734
|
+
|
735
|
+
try:
|
736
|
+
if (
|
737
|
+
(words := extract_words(tsv_content, min_confidence=table_min_confidence))
|
738
|
+
and (
|
739
|
+
table_data := reconstruct_table(
|
740
|
+
words,
|
741
|
+
column_threshold=table_column_threshold,
|
742
|
+
row_threshold_ratio=table_row_threshold_ratio,
|
743
|
+
)
|
744
|
+
)
|
745
|
+
and len(table_data) > 1
|
746
|
+
):
|
747
|
+
markdown = to_markdown(table_data)
|
748
|
+
|
749
|
+
try:
|
750
|
+
df = pl.DataFrame(table_data[1:], schema=table_data[0])
|
751
|
+
except (ImportError, IndexError):
|
752
|
+
df = None
|
753
|
+
|
754
|
+
table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
|
755
|
+
|
756
|
+
return ExtractionResult(
|
757
|
+
content=text_result.content,
|
758
|
+
mime_type=text_result.mime_type,
|
759
|
+
metadata=text_result.metadata,
|
760
|
+
tables=[table],
|
761
|
+
chunks=text_result.chunks,
|
762
|
+
)
|
763
|
+
except (ValueError, KeyError, ImportError):
|
764
|
+
pass
|
765
|
+
|
766
|
+
return text_result
|
767
|
+
|
768
|
+
async def _extract_tables_from_hocr(
    self,
    soup: Any,
    column_threshold: int = 20,
    row_threshold_ratio: float = 0.5,
    min_confidence: float = 30.0,
) -> list[TableData]:
    """Reconstruct at most one table from the words of a parsed hOCR document.

    The document is converted to TSV text with ``self._hocr_to_tsv_data``, the
    words are read back with ``extract_words`` and arranged into rows by
    ``reconstruct_table``. ``ValueError``, ``KeyError`` and ``ImportError``
    raised during reconstruction are swallowed and yield an empty list.

    Args:
        soup: Parsed hOCR document, handed through to ``self._hocr_to_tsv_data``.
        column_threshold: Passed to ``reconstruct_table``.
        row_threshold_ratio: Passed to ``reconstruct_table``.
        min_confidence: Passed to both the TSV conversion and ``extract_words``.

    Returns:
        An empty list, or a single table entry whose ``metadata["bbox"]`` spans
        every extracted word and whose ``cropped_image`` is a 1x1 white placeholder.
    """
    tsv_text = await self._hocr_to_tsv_data(soup, min_confidence)
    if not tsv_text:
        return []

    hocr_words = extract_words(tsv_text, min_confidence=min_confidence)
    if not hocr_words:
        return []

    found: list[TableData] = []
    try:
        rows = reconstruct_table(
            hocr_words,
            column_threshold=column_threshold,
            row_threshold_ratio=row_threshold_ratio,
        )
        if not rows or len(rows) <= 1:  # ~keep At least header + one data row
            return found

        table_markdown = to_markdown(rows)

        # Bounding box enclosing all extracted words, as (left, top, right, bottom).
        left_edge = min(w["left"] for w in hocr_words)
        right_edge = max(w["left"] + w["width"] for w in hocr_words)
        top_edge = min(w["top"] for w in hocr_words)
        bottom_edge = max(w["top"] + w["height"] for w in hocr_words)

        try:
            frame = await run_sync(pl.DataFrame, rows[1:], schema=rows[0])
        except (ImportError, IndexError):
            frame = None

        # No crop of the source image is made here; a 1x1 white image stands in.
        placeholder_image = Image.new("RGB", (1, 1), "white")

        table: TableData = {
            "text": table_markdown,
            "df": frame,
            "page_number": 1,
            "cropped_image": placeholder_image,
            "metadata": {"bbox": (left_edge, top_edge, right_edge, bottom_edge)},
        }  # type: ignore[typeddict-unknown-key]
        found.append(table)
    except (ValueError, KeyError, ImportError):
        # Best-effort detection: a failure means "no tables", not an error.
        pass

    return found
|
817
|
+
|
818
|
+
async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
|
819
|
+
tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
|
820
|
+
|
821
|
+
words = soup.find_all("span", class_="ocrx_word")
|
822
|
+
word_num = 1
|
823
|
+
|
824
|
+
for word in words:
|
825
|
+
title = word.get("title", "")
|
826
|
+
text = word.get_text().strip()
|
827
|
+
|
828
|
+
if not text:
|
829
|
+
continue
|
830
|
+
|
831
|
+
bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
|
832
|
+
if not bbox_match:
|
833
|
+
continue
|
834
|
+
|
835
|
+
x0, y0, x1, y1 = map(int, bbox_match.groups())
|
836
|
+
|
837
|
+
conf_match = re.search(r"x_wconf (\d+)", title)
|
838
|
+
confidence = float(conf_match.group(1)) if conf_match else 100.0
|
839
|
+
|
840
|
+
if confidence < min_confidence:
|
841
|
+
continue
|
842
|
+
|
843
|
+
line = word.find_parent(class_="ocr_line")
|
844
|
+
par = word.find_parent(class_="ocr_par")
|
845
|
+
block = word.find_parent(class_="ocr_carea")
|
846
|
+
|
847
|
+
tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
|
848
|
+
tsv_lines.append(tsv_line)
|
849
|
+
word_num += 1
|
850
|
+
|
851
|
+
return "\n".join(tsv_lines)
|
852
|
+
|
853
|
+
def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
|
854
|
+
if not words:
|
855
|
+
return []
|
856
|
+
|
857
|
+
return [words]
|
384
858
|
|
385
859
|
@classmethod
|
386
860
|
async def _validate_tesseract_version(cls) -> None:
|
387
|
-
"""Validate that Tesseract is installed and is version 5 or above.
|
388
|
-
|
389
|
-
Raises:
|
390
|
-
MissingDependencyError: If Tesseract is not installed or is below version 5.
|
391
|
-
"""
|
392
861
|
try:
|
393
862
|
if cls._version_checked:
|
394
863
|
return
|
395
864
|
|
396
865
|
command = ["tesseract", "--version"]
|
397
|
-
|
398
|
-
|
866
|
+
env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
|
867
|
+
try:
|
868
|
+
result = await run_process(command, env=env)
|
869
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
870
|
+
raise MissingDependencyError(
|
871
|
+
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
872
|
+
) from e
|
873
|
+
version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode("utf-8"))
|
399
874
|
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
|
400
875
|
raise MissingDependencyError(
|
401
876
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
@@ -407,33 +882,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
407
882
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
408
883
|
) from e
|
409
884
|
|
410
|
-
def
|
411
|
-
self,
|
412
|
-
image: PILImage,
|
413
|
-
**kwargs: Unpack[TesseractConfig],
|
414
|
-
) -> ExtractionResult:
|
415
|
-
"""Synchronously process an image and extract its text and metadata.
|
416
|
-
|
417
|
-
Args:
|
418
|
-
image: An instance of PIL.Image representing the input image.
|
419
|
-
**kwargs: Any kwargs related to the given backend
|
420
|
-
|
421
|
-
Returns:
|
422
|
-
The extraction result object
|
423
|
-
"""
|
424
|
-
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
425
|
-
|
426
|
-
image_buffer = io.BytesIO()
|
427
|
-
image.save(image_buffer, format="PNG")
|
428
|
-
image_content = image_buffer.getvalue()
|
429
|
-
|
430
|
-
cache_kwargs = {
|
431
|
-
"image_hash": hashlib.sha256(image_content).hexdigest()[:16],
|
432
|
-
"ocr_backend": "tesseract",
|
433
|
-
"ocr_config": str(sorted(kwargs.items())),
|
434
|
-
}
|
435
|
-
|
885
|
+
def _handle_cache_lookup_sync(self, cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
|
436
886
|
ocr_cache = get_ocr_cache()
|
887
|
+
|
437
888
|
cached_result = ocr_cache.get(**cache_kwargs)
|
438
889
|
if cached_result is not None:
|
439
890
|
return cached_result
|
@@ -441,46 +892,109 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
441
892
|
if ocr_cache.is_processing(**cache_kwargs):
|
442
893
|
event = ocr_cache.mark_processing(**cache_kwargs)
|
443
894
|
event.wait()
|
444
|
-
|
445
|
-
# Try cache again after waiting for other process to complete
|
446
895
|
cached_result = ocr_cache.get(**cache_kwargs)
|
447
896
|
if cached_result is not None:
|
448
897
|
return cached_result
|
449
898
|
|
450
899
|
ocr_cache.mark_processing(**cache_kwargs)
|
900
|
+
return None
|
901
|
+
|
902
|
+
def _execute_tesseract_sync(self, command: list[str]) -> None:
|
903
|
+
env = os.environ.copy()
|
904
|
+
if sys.platform.startswith("linux"):
|
905
|
+
env["OMP_THREAD_LIMIT"] = "1"
|
451
906
|
|
907
|
+
try:
|
908
|
+
subprocess.run(
|
909
|
+
command,
|
910
|
+
check=True,
|
911
|
+
env=env,
|
912
|
+
capture_output=True,
|
913
|
+
text=True,
|
914
|
+
timeout=30,
|
915
|
+
encoding="utf-8",
|
916
|
+
)
|
917
|
+
except subprocess.CalledProcessError as e:
|
918
|
+
error_msg = e.stderr if e.stderr else str(e)
|
919
|
+
raise OCRError(
|
920
|
+
f"Failed to OCR using tesseract: {error_msg}",
|
921
|
+
context={"command": command, "returncode": e.returncode, "error": error_msg},
|
922
|
+
) from e
|
923
|
+
except subprocess.TimeoutExpired as e:
|
924
|
+
raise OCRError(
|
925
|
+
"Tesseract timed out during processing.",
|
926
|
+
context={"command": command, "timeout": 30},
|
927
|
+
) from e
|
928
|
+
|
929
|
+
def _process_tesseract_output_sync(self, output: str, run_config: dict[str, Any]) -> ExtractionResult:
    """Convert raw Tesseract output into an ``ExtractionResult`` for the requested format.

    Dispatch on ``run_config["output_format"]``:

    * ``"markdown"``: ``output`` is passed to ``self._process_hocr_to_markdown_sync``.
    * ``"tsv"``: table-aware processing when ``run_config["enable_table_detection"]``
      is truthy, plain text extraction from the TSV otherwise.
    * ``"hocr"``: ``output`` is returned unchanged with the HTML mime type.
    * anything else: ``output`` is space-normalized and returned as plain text.

    Args:
        output: Text read from the file Tesseract produced.
        run_config: Run settings; the keys ``output_format``,
            ``enable_table_detection`` and ``remaining_kwargs`` are read.

    Returns:
        The extraction result for the requested format.
    """
    fmt = run_config["output_format"]
    detect_tables = run_config["enable_table_detection"]
    config = TesseractConfig(**run_config["remaining_kwargs"])

    if fmt == "markdown":
        return self._process_hocr_to_markdown_sync(output, config)

    if fmt == "tsv":
        if not detect_tables:
            return self._extract_text_from_tsv(output)
        return self._process_tsv_output_sync(
            output,
            table_column_threshold=config.table_column_threshold,
            table_row_threshold_ratio=config.table_row_threshold_ratio,
            table_min_confidence=config.table_min_confidence,
        )

    if fmt == "hocr":
        return ExtractionResult(content=output, mime_type=HTML_MIME_TYPE, metadata={}, chunks=[])

    return ExtractionResult(
        content=normalize_spaces(output),
        mime_type=PLAIN_TEXT_MIME_TYPE,
        metadata={},
        chunks=[],
    )
|
952
|
+
|
953
|
+
def process_image_sync(self, image: PILImage, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
954
|
+
use_cache = kwargs.pop("use_cache", True)
|
955
|
+
|
956
|
+
save_image = image
|
957
|
+
if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
|
958
|
+
save_image = image.convert("RGB")
|
959
|
+
|
960
|
+
image_buffer = io.BytesIO()
|
961
|
+
save_image.save(image_buffer, format="PNG")
|
962
|
+
image_content = image_buffer.getvalue()
|
963
|
+
|
964
|
+
cache_kwargs = {
|
965
|
+
"image_hash": hashlib.sha256(image_content).hexdigest()[:16],
|
966
|
+
"ocr_backend": "tesseract",
|
967
|
+
"ocr_config": str(sorted(kwargs.items())),
|
968
|
+
}
|
969
|
+
|
970
|
+
if use_cache:
|
971
|
+
cached_result = self._handle_cache_lookup_sync(cache_kwargs)
|
972
|
+
if cached_result:
|
973
|
+
return cached_result
|
974
|
+
|
975
|
+
ocr_cache = get_ocr_cache()
|
452
976
|
try:
|
453
977
|
self._validate_tesseract_version_sync()
|
454
978
|
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
|
455
979
|
image_path = Path(tmp_file.name)
|
456
|
-
|
980
|
+
save_image.save(str(image_path), format="PNG")
|
457
981
|
try:
|
458
|
-
|
982
|
+
kwargs_with_cache = {**kwargs, "use_cache": use_cache}
|
983
|
+
result = self.process_file_sync(image_path, **kwargs_with_cache)
|
459
984
|
|
460
|
-
|
985
|
+
if use_cache:
|
986
|
+
ocr_cache.set(result, **cache_kwargs)
|
461
987
|
|
462
988
|
return result
|
463
989
|
finally:
|
464
990
|
if image_path.exists():
|
465
991
|
image_path.unlink()
|
466
992
|
finally:
|
467
|
-
|
468
|
-
|
469
|
-
def process_file_sync(
|
470
|
-
self,
|
471
|
-
path: Path,
|
472
|
-
**kwargs: Unpack[TesseractConfig],
|
473
|
-
) -> ExtractionResult:
|
474
|
-
"""Synchronously process a file and extract its text and metadata.
|
475
|
-
|
476
|
-
Args:
|
477
|
-
path: A Path object representing the file to be processed.
|
478
|
-
**kwargs: Any kwargs related to the given backend
|
993
|
+
if use_cache:
|
994
|
+
ocr_cache.mark_complete(**cache_kwargs)
|
479
995
|
|
480
|
-
|
481
|
-
|
482
|
-
"""
|
483
|
-
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
996
|
+
def process_file_sync(self, path: Path, **kwargs: Unpack[TesseractConfig]) -> ExtractionResult:
|
997
|
+
use_cache = kwargs.pop("use_cache", True)
|
484
998
|
|
485
999
|
file_info = self._get_file_info(path)
|
486
1000
|
|
@@ -490,56 +1004,76 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
490
1004
|
"ocr_config": str(sorted(kwargs.items())),
|
491
1005
|
}
|
492
1006
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
return cached_result
|
497
|
-
|
498
|
-
if ocr_cache.is_processing(**cache_kwargs):
|
499
|
-
event = ocr_cache.mark_processing(**cache_kwargs)
|
500
|
-
event.wait()
|
501
|
-
|
502
|
-
# Try cache again after waiting for other process to complete
|
503
|
-
cached_result = ocr_cache.get(**cache_kwargs)
|
504
|
-
if cached_result is not None:
|
1007
|
+
if use_cache:
|
1008
|
+
cached_result = self._handle_cache_lookup_sync(cache_kwargs)
|
1009
|
+
if cached_result:
|
505
1010
|
return cached_result
|
506
1011
|
|
507
|
-
ocr_cache
|
508
|
-
|
1012
|
+
ocr_cache = get_ocr_cache()
|
509
1013
|
try:
|
510
1014
|
self._validate_tesseract_version_sync()
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
1015
|
+
|
1016
|
+
run_config = self._prepare_tesseract_run_config(**kwargs)
|
1017
|
+
|
1018
|
+
temp_fd, temp_path = tempfile.mkstemp(suffix=run_config["ext"])
|
1019
|
+
os.close(temp_fd)
|
1020
|
+
Path(temp_path).unlink()
|
1021
|
+
output_base = temp_path.replace(run_config["ext"], "")
|
1022
|
+
|
515
1023
|
try:
|
516
|
-
command = self._build_tesseract_command(
|
517
|
-
|
1024
|
+
command = self._build_tesseract_command(
|
1025
|
+
path,
|
1026
|
+
output_base,
|
1027
|
+
run_config["language"],
|
1028
|
+
run_config["psm"],
|
1029
|
+
run_config["tesseract_format"],
|
1030
|
+
**run_config["remaining_kwargs"],
|
1031
|
+
)
|
1032
|
+
self._execute_tesseract_sync(command)
|
1033
|
+
|
1034
|
+
output_path = Path(f"{output_base}{run_config['ext']}")
|
1035
|
+
if not output_path.exists():
|
1036
|
+
return ExtractionResult(
|
1037
|
+
content="[OCR processing failed]",
|
1038
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
1039
|
+
metadata={
|
1040
|
+
"source_format": run_config["tesseract_format"],
|
1041
|
+
"error": f"{run_config['ext']} file not generated",
|
1042
|
+
},
|
1043
|
+
chunks=[],
|
1044
|
+
tables=[],
|
1045
|
+
)
|
518
1046
|
|
519
|
-
output_path = Path(output_base + ".txt")
|
520
1047
|
with output_path.open(encoding="utf-8") as f:
|
521
1048
|
output = f.read()
|
522
|
-
extraction_result = ExtractionResult(
|
523
|
-
content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
|
524
|
-
)
|
525
1049
|
|
526
|
-
|
527
|
-
|
528
|
-
|
1050
|
+
extraction_result = self._process_tesseract_output_sync(output, run_config)
|
1051
|
+
|
1052
|
+
if use_cache:
|
1053
|
+
final_cache_kwargs = cache_kwargs.copy()
|
1054
|
+
final_cache_kwargs["ocr_config"] = str(
|
1055
|
+
sorted(
|
1056
|
+
{
|
1057
|
+
**run_config["remaining_kwargs"],
|
1058
|
+
"language": run_config["language"],
|
1059
|
+
"psm": run_config["psm"],
|
1060
|
+
}.items()
|
1061
|
+
)
|
1062
|
+
)
|
1063
|
+
ocr_cache.set(extraction_result, **final_cache_kwargs)
|
529
1064
|
|
530
1065
|
return extraction_result
|
531
|
-
except (RuntimeError, OSError) as e:
|
532
|
-
raise OCRError(f"Failed to OCR using tesseract: {e}") from e
|
533
1066
|
finally:
|
534
|
-
for
|
535
|
-
|
536
|
-
|
537
|
-
|
1067
|
+
for cleanup_ext in [".txt", ".hocr", ".tsv"]:
|
1068
|
+
cleanup_path = Path(f"{output_base}{cleanup_ext}")
|
1069
|
+
cleanup_path.unlink(missing_ok=True)
|
1070
|
+
except Exception as e:
|
1071
|
+
raise OCRError(f"Failed to OCR using tesseract: {e}") from e
|
538
1072
|
finally:
|
539
|
-
|
1073
|
+
if use_cache:
|
1074
|
+
ocr_cache.mark_complete(**cache_kwargs)
|
540
1075
|
|
541
1076
|
def _get_file_info(self, path: Path) -> dict[str, Any]:
|
542
|
-
"""Get file information for caching."""
|
543
1077
|
try:
|
544
1078
|
stat = path.stat()
|
545
1079
|
return {
|
@@ -555,9 +1089,8 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
555
1089
|
}
|
556
1090
|
|
557
1091
|
def _build_tesseract_command(
|
558
|
-
self, path: Path, output_base: str, language: str, psm: PSMMode, **kwargs: Any
|
1092
|
+
self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
|
559
1093
|
) -> list[str]:
|
560
|
-
"""Build tesseract command with all parameters."""
|
561
1094
|
command = [
|
562
1095
|
"tesseract",
|
563
1096
|
str(path),
|
@@ -571,47 +1104,32 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
571
1104
|
"--loglevel",
|
572
1105
|
"OFF",
|
573
1106
|
]
|
1107
|
+
|
1108
|
+
if output_format != "text":
|
1109
|
+
command.append(output_format)
|
1110
|
+
|
574
1111
|
for kwarg, value in kwargs.items():
|
1112
|
+
if kwarg.startswith("table_"):
|
1113
|
+
continue
|
575
1114
|
if isinstance(value, bool):
|
576
1115
|
command.extend(["-c", f"{kwarg}={1 if value else 0}"])
|
577
1116
|
else:
|
578
1117
|
command.extend(["-c", f"{kwarg}={value}"])
|
579
1118
|
return command
|
580
1119
|
|
581
|
-
def _run_tesseract_sync(self, command: list[str]) -> None:
|
582
|
-
"""Run tesseract command synchronously."""
|
583
|
-
env = os.environ.copy()
|
584
|
-
if sys.platform.startswith("linux"):
|
585
|
-
env["OMP_THREAD_LIMIT"] = "1"
|
586
|
-
|
587
|
-
result = subprocess.run(
|
588
|
-
command,
|
589
|
-
check=False,
|
590
|
-
env=env,
|
591
|
-
capture_output=True,
|
592
|
-
text=True,
|
593
|
-
timeout=30,
|
594
|
-
)
|
595
|
-
|
596
|
-
if result.returncode != 0:
|
597
|
-
raise OCRError(
|
598
|
-
"OCR failed with a non-0 return code.",
|
599
|
-
context={"error": result.stderr},
|
600
|
-
)
|
601
|
-
|
602
1120
|
@classmethod
|
603
1121
|
def _validate_tesseract_version_sync(cls) -> None:
|
604
|
-
"""Synchronously validate that Tesseract is installed and is version 5 or above.
|
605
|
-
|
606
|
-
Raises:
|
607
|
-
MissingDependencyError: If Tesseract is not installed or is below version 5.
|
608
|
-
"""
|
609
1122
|
try:
|
610
1123
|
if cls._version_checked:
|
611
1124
|
return
|
612
1125
|
|
613
1126
|
command = ["tesseract", "--version"]
|
614
|
-
|
1127
|
+
try:
|
1128
|
+
result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
|
1129
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
1130
|
+
raise MissingDependencyError(
|
1131
|
+
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
1132
|
+
) from e
|
615
1133
|
version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout)
|
616
1134
|
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
|
617
1135
|
raise MissingDependencyError(
|
@@ -626,17 +1144,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
626
1144
|
|
627
1145
|
@staticmethod
|
628
1146
|
def _validate_language_code(language_code: str) -> str:
|
629
|
-
"""Convert a language code to Tesseract format.
|
630
|
-
|
631
|
-
Args:
|
632
|
-
language_code: Tesseract supported language code or multiple language codes connected with '+'
|
633
|
-
|
634
|
-
Raises:
|
635
|
-
ValidationError: If the language is not supported by Tesseract
|
636
|
-
|
637
|
-
Returns:
|
638
|
-
Language code compatible with Tesseract
|
639
|
-
"""
|
640
1147
|
normalized = language_code.lower()
|
641
1148
|
if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
|
642
1149
|
return normalized
|
@@ -661,18 +1168,6 @@ def _process_image_with_tesseract(
|
|
661
1168
|
image_path: str,
|
662
1169
|
config_dict: dict[str, Any],
|
663
1170
|
) -> dict[str, Any]:
|
664
|
-
"""Process a single image with Tesseract in a separate process.
|
665
|
-
|
666
|
-
This function is designed to be executed in a subprocess.
|
667
|
-
It uses direct tesseract command execution to avoid async complications.
|
668
|
-
|
669
|
-
Args:
|
670
|
-
image_path: Path to the image file.
|
671
|
-
config_dict: Tesseract configuration as dictionary.
|
672
|
-
|
673
|
-
Returns:
|
674
|
-
OCR result as dictionary.
|
675
|
-
"""
|
676
1171
|
try:
|
677
1172
|
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
|
678
1173
|
output_base = tmp_file.name.replace(".txt", "")
|
@@ -721,6 +1216,7 @@ def _process_image_with_tesseract(
|
|
721
1216
|
capture_output=True,
|
722
1217
|
text=True,
|
723
1218
|
timeout=30,
|
1219
|
+
encoding="utf-8",
|
724
1220
|
)
|
725
1221
|
|
726
1222
|
if result.returncode != 0:
|
@@ -759,19 +1255,12 @@ def _process_image_bytes_with_tesseract(
|
|
759
1255
|
image_bytes: bytes,
|
760
1256
|
config_dict: dict[str, Any],
|
761
1257
|
) -> dict[str, Any]:
|
762
|
-
"""Process image bytes with Tesseract in a separate process.
|
763
|
-
|
764
|
-
Args:
|
765
|
-
image_bytes: Image data as bytes.
|
766
|
-
config_dict: Tesseract configuration as dictionary.
|
767
|
-
|
768
|
-
Returns:
|
769
|
-
OCR result as dictionary.
|
770
|
-
"""
|
771
1258
|
try:
|
772
|
-
with
|
773
|
-
|
774
|
-
|
1259
|
+
with (
|
1260
|
+
tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image,
|
1261
|
+
Image.open(io.BytesIO(image_bytes)) as image,
|
1262
|
+
):
|
1263
|
+
image.save(tmp_image.name, format="PNG")
|
775
1264
|
image_path = tmp_image.name
|
776
1265
|
|
777
1266
|
try:
|
@@ -791,21 +1280,12 @@ def _process_image_bytes_with_tesseract(
|
|
791
1280
|
|
792
1281
|
|
793
1282
|
class TesseractProcessPool:
|
794
|
-
"""Process pool for parallel Tesseract OCR processing."""
|
795
|
-
|
796
1283
|
def __init__(
|
797
1284
|
self,
|
798
1285
|
config: TesseractConfig | None = None,
|
799
1286
|
max_processes: int | None = None,
|
800
1287
|
memory_limit_gb: float | None = None,
|
801
1288
|
) -> None:
|
802
|
-
"""Initialize the Tesseract process pool.
|
803
|
-
|
804
|
-
Args:
|
805
|
-
config: Default Tesseract configuration.
|
806
|
-
max_processes: Maximum number of processes.
|
807
|
-
memory_limit_gb: Memory limit in GB.
|
808
|
-
"""
|
809
1289
|
from kreuzberg._utils._process_pool import ProcessPoolManager # noqa: PLC0415
|
810
1290
|
|
811
1291
|
self.config = config or TesseractConfig()
|
@@ -815,7 +1295,6 @@ class TesseractProcessPool:
|
|
815
1295
|
)
|
816
1296
|
|
817
1297
|
def _config_to_dict(self, config: TesseractConfig | None = None) -> dict[str, Any]:
|
818
|
-
"""Convert TesseractConfig to dictionary for pickling."""
|
819
1298
|
cfg = config or self.config
|
820
1299
|
|
821
1300
|
config_dict = {}
|
@@ -830,7 +1309,6 @@ class TesseractProcessPool:
|
|
830
1309
|
return config_dict
|
831
1310
|
|
832
1311
|
def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
|
833
|
-
"""Convert result dictionary back to OCRResult."""
|
834
1312
|
if not result_dict["success"]:
|
835
1313
|
raise OCRError(f"Tesseract processing failed: {result_dict['error']}")
|
836
1314
|
|
@@ -846,15 +1324,6 @@ class TesseractProcessPool:
|
|
846
1324
|
image_path: str | Path,
|
847
1325
|
config: TesseractConfig | None = None,
|
848
1326
|
) -> ExtractionResult:
|
849
|
-
"""Process a single image file with Tesseract.
|
850
|
-
|
851
|
-
Args:
|
852
|
-
image_path: Path to the image file.
|
853
|
-
config: Tesseract configuration (uses default if None).
|
854
|
-
|
855
|
-
Returns:
|
856
|
-
OCR result.
|
857
|
-
"""
|
858
1327
|
config_dict = self._config_to_dict(config)
|
859
1328
|
|
860
1329
|
task_memory_mb = 80
|
@@ -873,15 +1342,6 @@ class TesseractProcessPool:
|
|
873
1342
|
image_bytes: bytes,
|
874
1343
|
config: TesseractConfig | None = None,
|
875
1344
|
) -> ExtractionResult:
|
876
|
-
"""Process image bytes with Tesseract.
|
877
|
-
|
878
|
-
Args:
|
879
|
-
image_bytes: Image data as bytes.
|
880
|
-
config: Tesseract configuration (uses default if None).
|
881
|
-
|
882
|
-
Returns:
|
883
|
-
OCR result.
|
884
|
-
"""
|
885
1345
|
config_dict = self._config_to_dict(config)
|
886
1346
|
|
887
1347
|
image_size_mb = len(image_bytes) / 1024 / 1024
|
@@ -902,16 +1362,6 @@ class TesseractProcessPool:
|
|
902
1362
|
config: TesseractConfig | None = None,
|
903
1363
|
max_concurrent: int | None = None,
|
904
1364
|
) -> list[ExtractionResult]:
|
905
|
-
"""Process a batch of images in parallel.
|
906
|
-
|
907
|
-
Args:
|
908
|
-
image_paths: List of image file paths.
|
909
|
-
config: Tesseract configuration (uses default if None).
|
910
|
-
max_concurrent: Maximum concurrent processes.
|
911
|
-
|
912
|
-
Returns:
|
913
|
-
List of OCR results in the same order as input.
|
914
|
-
"""
|
915
1365
|
if not image_paths:
|
916
1366
|
return []
|
917
1367
|
|
@@ -936,16 +1386,6 @@ class TesseractProcessPool:
|
|
936
1386
|
config: TesseractConfig | None = None,
|
937
1387
|
max_concurrent: int | None = None,
|
938
1388
|
) -> list[ExtractionResult]:
|
939
|
-
"""Process a batch of image bytes in parallel.
|
940
|
-
|
941
|
-
Args:
|
942
|
-
image_bytes_list: List of image data as bytes.
|
943
|
-
config: Tesseract configuration (uses default if None).
|
944
|
-
max_concurrent: Maximum concurrent processes.
|
945
|
-
|
946
|
-
Returns:
|
947
|
-
List of OCR results in the same order as input.
|
948
|
-
"""
|
949
1389
|
if not image_bytes_list:
|
950
1390
|
return []
|
951
1391
|
|
@@ -966,15 +1406,12 @@ class TesseractProcessPool:
|
|
966
1406
|
return [self._result_from_dict(result_dict) for result_dict in result_dicts]
|
967
1407
|
|
968
1408
|
def get_system_info(self) -> dict[str, Any]:
|
969
|
-
"""Get system information from the process manager."""
|
970
1409
|
return self.process_manager.get_system_info()
|
971
1410
|
|
972
1411
|
def shutdown(self, wait: bool = True) -> None:
|
973
|
-
"""Shutdown the process pool."""
|
974
1412
|
self.process_manager.shutdown(wait=wait)
|
975
1413
|
|
976
1414
|
async def __aenter__(self) -> Self:
|
977
|
-
"""Async context manager entry."""
|
978
1415
|
return self
|
979
1416
|
|
980
1417
|
async def __aexit__(
|
@@ -983,5 +1420,4 @@ class TesseractProcessPool:
|
|
983
1420
|
exc_val: BaseException | None,
|
984
1421
|
exc_tb: object,
|
985
1422
|
) -> None:
|
986
|
-
"""Async context manager exit."""
|
987
1423
|
self.shutdown()
|