kreuzberg-3.0.0-py3-none-any.whl → kreuzberg-3.1.0-py3-none-any.whl
- kreuzberg/__init__.py +4 -1
- kreuzberg/_extractors/__init__.py +0 -0
- kreuzberg/_extractors/_base.py +92 -0
- kreuzberg/_extractors/_html.py +34 -0
- kreuzberg/_extractors/_image.py +74 -0
- kreuzberg/_extractors/_pandoc.py +613 -0
- kreuzberg/_extractors/_pdf.py +171 -0
- kreuzberg/_extractors/_presentation.py +233 -0
- kreuzberg/_extractors/_spread_sheet.py +125 -0
- kreuzberg/_gmft.py +174 -0
- kreuzberg/_ocr/__init__.py +17 -0
- kreuzberg/_ocr/_base.py +54 -0
- kreuzberg/_ocr/_easyocr.py +376 -0
- kreuzberg/_ocr/_paddleocr.py +283 -0
- kreuzberg/_ocr/_tesseract.py +342 -0
- kreuzberg/_types.py +31 -4
- kreuzberg/_utils/__init__.py +0 -0
- kreuzberg/_utils/_string.py +39 -0
- kreuzberg/_utils/_sync.py +121 -0
- kreuzberg/_utils/_tmp.py +37 -0
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/METADATA +14 -19
- kreuzberg-3.1.0.dist-info/RECORD +33 -0
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/WHEEL +1 -1
- kreuzberg-3.0.0.dist-info/RECORD +0 -15
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/licenses/LICENSE +0 -0
- {kreuzberg-3.0.0.dist-info → kreuzberg-3.1.0.dist-info}/top_level.txt +0 -0
kreuzberg/_ocr/_paddleocr.py
ADDED
@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+import platform
+from dataclasses import dataclass
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+
+from PIL import Image
+
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+
+
+PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class PaddleOCRConfig:
+    """Configuration options for PaddleOCR.
+
+    This TypedDict provides type hints and documentation for all PaddleOCR parameters.
+    """
+
+    cls_image_shape: str = "3,48,192"
+    """Image shape for classification algorithm in format 'channels,height,width'."""
+    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
+    """Detection algorithm."""
+    det_db_box_thresh: float = 0.5
+    """Score threshold for detected boxes. Boxes below this value are discarded."""
+    det_db_thresh: float = 0.3
+    """Binarization threshold for DB output map."""
+    det_db_unclip_ratio: float = 2.0
+    """Expansion ratio for detected text boxes."""
+    det_east_cover_thresh: float = 0.1
+    """Score threshold for EAST output boxes."""
+    det_east_nms_thresh: float = 0.2
+    """NMS threshold for EAST model output boxes."""
+    det_east_score_thresh: float = 0.8
+    """Binarization threshold for EAST output map."""
+    det_max_side_len: int = 960
+    """Maximum size of image long side. Images exceeding this will be proportionally resized."""
+    drop_score: float = 0.5
+    """Filter recognition results by confidence score. Results below this are discarded."""
+    enable_mkldnn: bool = False
+    """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
+    gpu_mem: int = 8000
+    """GPU memory size (in MB) to use for initialization."""
+    language: str = "en"
+    """Language to use for OCR."""
+    max_text_length: int = 25
+    """Maximum text length that the recognition algorithm can recognize."""
+    rec: bool = True
+    """Enable text recognition when using the ocr() function."""
+    rec_algorithm: Literal[
+        "CRNN",
+        "SRN",
+        "NRTR",
+        "SAR",
+        "SEED",
+        "SVTR",
+        "SVTR_LCNet",
+        "ViTSTR",
+        "ABINet",
+        "VisionLAN",
+        "SPIN",
+        "RobustScanner",
+        "RFL",
+    ] = "CRNN"
+    """Recognition algorithm."""
+    rec_image_shape: str = "3,32,320"
+    """Image shape for recognition algorithm in format 'channels,height,width'."""
+    table: bool = True
+    """Whether to enable table recognition."""
+    use_angle_cls: bool = True
+    """Whether to use text orientation classification model."""
+    use_gpu: bool = False
+    """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package"""
+    use_space_char: bool = True
+    """Whether to recognize spaces."""
+    use_zero_copy_run: bool = False
+    """Whether to enable zero_copy_run for inference optimization."""
+
+
+class PaddleBackend(OCRBackend[PaddleOCRConfig]):
+    _paddle_ocr: ClassVar[Any] = None
+
+    async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Asynchronously process an image and extract its text and metadata using PaddleOCR.
+
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        import numpy as np
+
+        await self._init_paddle_ocr(**kwargs)
+        image_np = np.array(image)
+        try:
+            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
+            return self._process_paddle_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
+
+    async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Asynchronously process a file and extract its text and metadata using PaddleOCR.
+
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        await self._init_paddle_ocr(**kwargs)
+        try:
+            image = await run_sync(Image.open, path)
+            return await self.process_image(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
+
+    @staticmethod
+    def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+        """Process PaddleOCR result into an ExtractionResult with metadata.
+
+        Args:
+            result: The raw result from PaddleOCR.
+            image: The original PIL image.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        """
+        text_content = ""
+        confidence_sum = 0
+        confidence_count = 0
+
+        for page_result in result:
+            if not page_result:
+                continue
+
+            sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
+            line_groups: list[list[Any]] = []
+            current_line: list[Any] = []
+            prev_y: float | None = None
+
+            for box in sorted_boxes:
+                box_points, (_, _) = box
+                current_y = sum(point[1] for point in box_points) / 4
+                min_box_distance = 20
+
+                if prev_y is None or abs(current_y - prev_y) > min_box_distance:
+                    if current_line:
+                        line_groups.append(current_line)
+                    current_line = [box]
+                else:
+                    current_line.append(box)
+
+                prev_y = current_y
+
+            if current_line:
+                line_groups.append(current_line)
+
+            for line in line_groups:
+                line_sorted = sorted(line, key=lambda x: x[0][0][0])
+
+                for box in line_sorted:
+                    _, (text, confidence) = box
+                    if text:
+                        text_content += text + " "
+                        confidence_sum += confidence
+                        confidence_count += 1
+
+                text_content += "\n"
+
+        width, height = image.size
+        metadata = Metadata(
+            width=width,
+            height=height,
+        )
+
+        return ExtractionResult(
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+        )
+
+    @classmethod
+    def _is_mkldnn_supported(cls) -> bool:
+        """Check if the current architecture supports MKL-DNN optimization.
+
+        Returns:
+            True if MKL-DNN is supported on this architecture.
+        """
+        system = platform.system().lower()
+        processor = platform.processor().lower()
+        machine = platform.machine().lower()
+
+        if system in ("linux", "windows"):
+            return "intel" in processor or "x86" in machine or "amd64" in machine or "x86_64" in machine
+
+        if system == "darwin":
+            return machine == "x86_64"
+
+        return False
+
+    @classmethod
+    async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
+        """Initialize PaddleOCR with the provided configuration.
+
+        Args:
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Raises:
+            MissingDependencyError: If PaddleOCR is not installed.
+            OCRError: If initialization fails.
+        """
+        if cls._paddle_ocr is not None:
+            return
+
+        try:
+            from paddleocr import PaddleOCR
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+            ) from e
+
+        language = cls._validate_language_code(kwargs.pop("language", "en"))
+        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
+        kwargs.setdefault("use_angle_cls", True)
+        kwargs.setdefault("use_gpu", has_gpu_package)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
+        kwargs.setdefault("det_db_thresh", 0.3)
+        kwargs.setdefault("det_db_box_thresh", 0.5)
+        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+
+        try:
+            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
+
+    @staticmethod
+    def _validate_language_code(lang_code: str) -> str:
+        """Convert a language code to PaddleOCR format.
+
+        Args:
+            lang_code: ISO language code or language name
+
+        Raises:
+            ValidationError: If the language is not supported by PaddleOCR
+
+        Returns:
+            Language code compatible with PaddleOCR
+        """
+        normalized = lang_code.lower()
+        if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
+            return normalized
+
+        raise ValidationError(
+            "The provided language code is not supported by PaddleOCR",
+            context={
+                "language_code": lang_code,
+                "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )
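For orientation, here is a minimal usage sketch of the new PaddleOCR backend. It assumes the top-level `extract_file` coroutine and the `ExtractionConfig` re-export from `kreuzberg`, neither of which is shown in this diff, and the input filename is hypothetical; unset fields keep the dataclass defaults from the file above.

import asyncio

from kreuzberg import ExtractionConfig, extract_file  # assumed top-level API, not part of this diff
from kreuzberg._ocr._paddleocr import PaddleOCRConfig


async def main() -> None:
    # Route OCR through the new PaddleOCR backend and tune a few detection fields.
    config = ExtractionConfig(
        ocr_backend="paddleocr",
        ocr_config=PaddleOCRConfig(language="en", use_angle_cls=True, det_db_box_thresh=0.6),
    )
    result = await extract_file("scanned_page.png", config=config)  # hypothetical input file
    print(result.content)


asyncio.run(main())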
kreuzberg/_ocr/_tesseract.py
ADDED
@@ -0,0 +1,342 @@
+from __future__ import annotations
+
+import re
+import sys
+from dataclasses import dataclass
+from enum import Enum
+from typing import TYPE_CHECKING, Any, ClassVar, Final
+
+from anyio import Path as AsyncPath
+from anyio import run_process
+
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image
+
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+
+
+TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
+    "afr",
+    "amh",
+    "ara",
+    "asm",
+    "aze",
+    "aze_cyrl",
+    "bel",
+    "ben",
+    "bod",
+    "bos",
+    "bre",
+    "bul",
+    "cat",
+    "ceb",
+    "ces",
+    "chi_sim",
+    "chi_tra",
+    "chr",
+    "cos",
+    "cym",
+    "dan",
+    "dan_frak",
+    "deu",
+    "deu_frak",
+    "deu_latf",
+    "dzo",
+    "ell",
+    "eng",
+    "enm",
+    "epo",
+    "equ",
+    "est",
+    "eus",
+    "fao",
+    "fas",
+    "fil",
+    "fin",
+    "fra",
+    "frk",
+    "frm",
+    "fry",
+    "gla",
+    "gle",
+    "glg",
+    "grc",
+    "guj",
+    "hat",
+    "heb",
+    "hin",
+    "hrv",
+    "hun",
+    "hye",
+    "iku",
+    "ind",
+    "isl",
+    "ita",
+    "ita_old",
+    "jav",
+    "jpn",
+    "kan",
+    "kat",
+    "kat_old",
+    "kaz",
+    "khm",
+    "kir",
+    "kmr",
+    "kor",
+    "kor_vert",
+    "kur",
+    "lao",
+    "lat",
+    "lav",
+    "lit",
+    "ltz",
+    "mal",
+    "mar",
+    "mkd",
+    "mlt",
+    "mon",
+    "mri",
+    "msa",
+    "mya",
+    "nep",
+    "nld",
+    "nor",
+    "oci",
+    "ori",
+    "osd",
+    "pan",
+    "pol",
+    "por",
+    "pus",
+    "que",
+    "ron",
+    "rus",
+    "san",
+    "sin",
+    "slk",
+    "slk_frak",
+    "slv",
+    "snd",
+    "spa",
+    "spa_old",
+    "sqi",
+    "srp",
+    "srp_latn",
+    "sun",
+    "swa",
+    "swe",
+    "syr",
+    "tam",
+    "tat",
+    "tel",
+    "tgk",
+    "tgl",
+    "tha",  # codespell:ignore
+    "tir",
+    "ton",
+    "tur",
+    "uig",
+    "ukr",
+    "urd",
+    "uzb",
+    "uzb_cyrl",
+    "vie",  # codespell:ignore
+    "yid",
+    "yor",
+}
+
+MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
+
+
+class PSMMode(Enum):
+    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
+
+    OSD_ONLY = 0
+    """Orientation and script detection only."""
+    AUTO_OSD = 1
+    """Automatic page segmentation with orientation and script detection."""
+    AUTO_ONLY = 2
+    """Automatic page segmentation without OSD."""
+    AUTO = 3
+    """Fully automatic page segmentation (default)."""
+    SINGLE_COLUMN = 4
+    """Assume a single column of text."""
+    SINGLE_BLOCK_VERTICAL = 5
+    """Assume a single uniform block of vertically aligned text."""
+    SINGLE_BLOCK = 6
+    """Assume a single uniform block of text."""
+    SINGLE_LINE = 7
+    """Treat the image as a single text line."""
+    SINGLE_WORD = 8
+    """Treat the image as a single word."""
+    CIRCLE_WORD = 9
+    """Treat the image as a single word in a circle."""
+    SINGLE_CHAR = 10
+    """Treat the image as a single character."""
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class TesseractConfig:
+    """Configuration options for Tesseract OCR engine."""
+
+    classify_use_pre_adapted_templates: bool = True
+    """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
+    language: str = "eng"
+    """Language code to use for OCR.
+    Examples:
+        - 'eng' for English
+        - 'deu' for German
+        - multiple languages combined with '+', e.g. 'eng+deu')
+    """
+    language_model_ngram_on: bool = True
+    """Enable or disable the use of n-gram-based language models for improved text recognition."""
+    psm: PSMMode = PSMMode.AUTO
+    """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
+    tessedit_dont_blkrej_good_wds: bool = True
+    """If True, prevents block rejection of words identified as good, improving text output quality."""
+    tessedit_dont_rowrej_good_wds: bool = True
+    """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
+    tessedit_enable_dict_correction: bool = True
+    """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
+    tessedit_use_primary_params_model: bool = True
+    """If True, forces the use of the primary parameters model for text recognition."""
+    textord_space_size_is_variable: bool = True
+    """Allow variable spacing between words, useful for text with irregular spacing."""
+    thresholding_method: bool = False
+    """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
+
+
+class TesseractBackend(OCRBackend[TesseractConfig]):
+    _version_checked: ClassVar[bool] = False
+
+    async def process_image(
+        self,
+        image: Image,
+        **kwargs: Unpack[TesseractConfig],
+    ) -> ExtractionResult:
+        await self._validate_tesseract_version()
+        image_path, unlink = await create_temp_file(".png")
+        await run_sync(image.save, str(image_path), format="PNG")
+        try:
+            return await self.process_file(image_path, **kwargs)
+        finally:
+            await unlink()
+
+    async def process_file(
+        self,
+        path: Path,
+        **kwargs: Unpack[TesseractConfig],
+    ) -> ExtractionResult:
+        await self._validate_tesseract_version()
+        output_path, unlink = await create_temp_file(".txt")
+        language = self._validate_language_code(kwargs.pop("language", "eng"))
+        psm = kwargs.pop("psm", PSMMode.AUTO)
+        try:
+            output_base = str(output_path).replace(".txt", "")
+            command = [
+                "tesseract",
+                str(path),
+                output_base,
+                "-l",
+                language,
+                "--psm",
+                str(psm.value),
+                "--oem",
+                "1",
+                "--loglevel",
+                "OFF",
+            ]
+            for kwarg, value in kwargs.items():
+                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+
+            env: dict[str, Any] | None = None
+            if sys.platform.startswith("linux"):
+                # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
+                env = {"OMP_THREAD_LIMIT": "1"}
+
+            result = await run_process(command, env=env)
+
+            if not result.returncode == 0:
+                raise OCRError(
+                    "OCR failed with a non-0 return code.",
+                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+                )
+
+            output = await AsyncPath(output_path).read_text("utf-8")
+            return ExtractionResult(
+                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+            )
+        except (RuntimeError, OSError) as e:
+            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+        finally:
+            await unlink()
+
+    @classmethod
+    async def _validate_tesseract_version(cls) -> None:
+        """Validate that Tesseract is installed and is version 5 or above.
+
+        Raises:
+            MissingDependencyError: If Tesseract is not installed or is below version 5.
+        """
+        try:
+            if cls._version_checked:
+                return
+
+            command = ["tesseract", "--version"]
+            result = await run_process(command)
+            version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+            if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
+                raise MissingDependencyError(
+                    "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
+                )
+
+            cls._version_checked = True
+        except FileNotFoundError as e:
+            raise MissingDependencyError(
+                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
+            ) from e
+
+    @staticmethod
+    def _validate_language_code(language_code: str) -> str:
+        """Convert a language code to Tesseract format.
+
+        Args:
+            language_code: Tesseract supported language code or multiple language codes connected with '+'
+
+        Raises:
+            ValidationError: If the language is not supported by Tesseract
+
+        Returns:
+            Language code compatible with Tesseract
+        """
+        normalized = language_code.lower()
+        if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
+            return normalized
+
+        if "+" in normalized and all(lang in TESSERACT_SUPPORTED_LANGUAGE_CODES for lang in normalized.split("+")):
+            return normalized
+
+        raise ValidationError(
+            "The provided language code is not supported by Tesseract",
+            context={
+                "language_code": normalized
+                if "+" not in normalized
+                else ",".join(
+                    [lang for lang in normalized.split("+") if lang not in TESSERACT_SUPPORTED_LANGUAGE_CODES]
+                ),
+                "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )
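A comparable sketch for the Tesseract backend, again assuming the top-level `extract_file`/`ExtractionConfig` API that is not part of this diff and a hypothetical input file; the combined language code and PSM value come straight from `TesseractConfig` and `PSMMode` above.

import asyncio

from kreuzberg import ExtractionConfig, extract_file  # assumed top-level API, not part of this diff
from kreuzberg._ocr._tesseract import PSMMode, TesseractConfig


async def main() -> None:
    # Requires a system-wide tesseract >= 5 on $PATH; the backend validates this.
    config = ExtractionConfig(
        ocr_backend="tesseract",
        ocr_config=TesseractConfig(language="eng+deu", psm=PSMMode.SINGLE_BLOCK),
    )
    result = await extract_file("scan.pdf", config=config)  # hypothetical input file
    print(result.content)


asyncio.run(main())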
kreuzberg/_types.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import sys
 from collections.abc import Awaitable
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
 
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
@@ -14,6 +14,10 @@ else: # pragma: no cover
     from typing import NotRequired
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
+    from PIL.Image import Image
+
+    from kreuzberg._gmft import GMFTConfig
     from kreuzberg._ocr._easyocr import EasyOCRConfig
     from kreuzberg._ocr._paddleocr import PaddleOCRConfig
     from kreuzberg._ocr._tesseract import TesseractConfig
@@ -21,6 +25,19 @@ if TYPE_CHECKING:
 OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
 
 
+class TableData(TypedDict):
+    """Table data, returned from table extraction."""
+
+    cropped_image: Image
+    """The cropped image of the table."""
+    df: DataFrame
+    """The table data as a pandas DataFrame."""
+    page_number: int
+    """The page number of the table."""
+    text: str
+    """The table text as a markdown string."""
+
+
 class Metadata(TypedDict, total=False):
     """Base metadata common to all document types.
 
@@ -88,12 +105,14 @@ class ExtractionResult:
 
     content: str
    """The extracted content."""
-    chunks: list[str]
-    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
     mime_type: str
     """The mime type of the extracted content. Is either text/plain or text/markdown."""
     metadata: Metadata
     """The metadata of the content."""
+    tables: list[TableData] = field(default_factory=list)
+    """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
+    chunks: list[str] = field(default_factory=list)
+    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
 
 
 PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
@@ -114,14 +133,22 @@ class ExtractionConfig:
     """Whether to force OCR."""
     chunk_content: bool = False
     """Whether to chunk the content into smaller chunks."""
+    extract_tables: bool = False
+    """Whether to extract tables from the content. This requires the 'gmft' dependency."""
     max_chars: int = DEFAULT_MAX_CHARACTERS
     """The size of each chunk in characters."""
     max_overlap: int = DEFAULT_MAX_OVERLAP
     """The overlap between chunks in characters."""
     ocr_backend: OcrBackendType | None = "tesseract"
-    """The OCR backend to use."""
+    """The OCR backend to use.
+
+    Notes:
+        - If set to 'None', OCR will not be performed.
+    """
     ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
     """Configuration to pass to the OCR backend."""
+    gmft_config: GMFTConfig | None = None
+    """GMFT configuration."""
     post_processing_hooks: list[PostProcessingHook] | None = None
     """Post processing hooks to call after processing is done and before the final result is returned."""
     validators: list[ValidationHook] | None = None
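The headline change in `_types.py` is the new `TableData` structure plus the `extract_tables`/`gmft_config` switches on `ExtractionConfig` and the `tables` field on `ExtractionResult`. A short sketch of how these fit together, under the same assumption about the top-level `extract_file` API and with a hypothetical input file:

import asyncio

from kreuzberg import ExtractionConfig, extract_file  # assumed top-level API, not part of this diff


async def main() -> None:
    # Opting in requires the optional 'gmft' dependency noted in the docstring above.
    config = ExtractionConfig(extract_tables=True)
    result = await extract_file("report.pdf", config=config)  # hypothetical input file

    # result.tables is a list of TableData TypedDicts as defined in the diff above.
    for table in result.tables:
        print(f"page {table['page_number']}:")
        print(table["text"])        # markdown rendering of the table
        print(table["df"].shape)    # same table as a pandas DataFrame


asyncio.run(main())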