kreuzberg 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
kreuzberg/_ocr/_paddleocr.py ADDED
@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+import platform
+from dataclasses import dataclass
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+
+from PIL import Image
+
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+
+
+PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class PaddleOCRConfig:
+    """Configuration options for PaddleOCR.
+
+    This dataclass provides type hints and documentation for all PaddleOCR parameters.
+    """
+
+    cls_image_shape: str = "3,48,192"
+    """Image shape for classification algorithm in format 'channels,height,width'."""
+    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
+    """Detection algorithm."""
+    det_db_box_thresh: float = 0.5
+    """Score threshold for detected boxes. Boxes below this value are discarded."""
+    det_db_thresh: float = 0.3
+    """Binarization threshold for DB output map."""
+    det_db_unclip_ratio: float = 2.0
+    """Expansion ratio for detected text boxes."""
+    det_east_cover_thresh: float = 0.1
+    """Score threshold for EAST output boxes."""
+    det_east_nms_thresh: float = 0.2
+    """NMS threshold for EAST model output boxes."""
+    det_east_score_thresh: float = 0.8
+    """Binarization threshold for EAST output map."""
+    det_max_side_len: int = 960
+    """Maximum size of image long side. Images exceeding this will be proportionally resized."""
+    drop_score: float = 0.5
+    """Filter recognition results by confidence score. Results below this are discarded."""
+    enable_mkldnn: bool = False
+    """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
+    gpu_mem: int = 8000
+    """GPU memory size (in MB) to use for initialization."""
+    language: str = "en"
+    """Language to use for OCR."""
+    max_text_length: int = 25
+    """Maximum text length that the recognition algorithm can recognize."""
+    rec: bool = True
+    """Enable text recognition when using the ocr() function."""
+    rec_algorithm: Literal[
+        "CRNN",
+        "SRN",
+        "NRTR",
+        "SAR",
+        "SEED",
+        "SVTR",
+        "SVTR_LCNet",
+        "ViTSTR",
+        "ABINet",
+        "VisionLAN",
+        "SPIN",
+        "RobustScanner",
+        "RFL",
+    ] = "CRNN"
+    """Recognition algorithm."""
+    rec_image_shape: str = "3,32,320"
+    """Image shape for recognition algorithm in format 'channels,height,width'."""
+    table: bool = True
+    """Whether to enable table recognition."""
+    use_angle_cls: bool = True
+    """Whether to use text orientation classification model."""
+    use_gpu: bool = False
+    """Whether to use GPU for inference. Requires installing the paddlepaddle-gpu package."""
+    use_space_char: bool = True
+    """Whether to recognize spaces."""
+    use_zero_copy_run: bool = False
+    """Whether to enable zero_copy_run for inference optimization."""
+
+
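As an aside for readers of this diff: the new config dataclass is normally wired in through `ExtractionConfig` rather than by instantiating the backend directly. A minimal sketch, assuming the top-level `extract_file`/`ExtractionConfig` API of the kreuzberg 3.x series and a made-up file name; the only import taken verbatim from this diff is `kreuzberg._ocr._paddleocr`, and the `ExtractionConfig` fields used here appear in the `kreuzberg/_types.py` hunks further down.

```python
# Sketch only: selecting the PaddleOCR backend and tuning it via PaddleOCRConfig.
import asyncio

from kreuzberg import ExtractionConfig, extract_file  # assumed top-level 3.x API
from kreuzberg._ocr._paddleocr import PaddleOCRConfig


async def main() -> None:
    config = ExtractionConfig(
        ocr_backend="paddleocr",  # one of the OcrBackendType literals in kreuzberg/_types.py
        ocr_config=PaddleOCRConfig(language="en", use_angle_cls=True, det_db_box_thresh=0.6),
    )
    result = await extract_file("scanned.pdf", config=config)  # hypothetical input file
    print(result.content)


asyncio.run(main())
```

Keeping backend selection declarative this way also lets the library defer the heavy `paddleocr` import until the backend is actually initialized, as the lazy `_init_paddle_ocr` below shows.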
+class PaddleBackend(OCRBackend[PaddleOCRConfig]):
+    _paddle_ocr: ClassVar[Any] = None
+
+    async def process_image(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Asynchronously process an image and extract its text and metadata using PaddleOCR.
+
+        Args:
+            image: An instance of PIL.Image representing the input image.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If OCR processing fails.
+        """
+        import numpy as np
+
+        await self._init_paddle_ocr(**kwargs)
+        image_np = np.array(image)
+        try:
+            result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
+            return self._process_paddle_result(result, image)
+        except Exception as e:
+            raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
+
+    async def process_file(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
+        """Asynchronously process a file and extract its text and metadata using PaddleOCR.
+
+        Args:
+            path: A Path object representing the file to be processed.
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+
+        Raises:
+            OCRError: If file loading or OCR processing fails.
+        """
+        await self._init_paddle_ocr(**kwargs)
+        try:
+            image = await run_sync(Image.open, path)
+            return await self.process_image(image, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
+
+    @staticmethod
+    def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
+        """Process PaddleOCR result into an ExtractionResult with metadata.
+
+        Args:
+            result: The raw result from PaddleOCR.
+            image: The original PIL image.
+
+        Returns:
+            ExtractionResult: The extraction result containing text content, mime type, and metadata.
+        """
+        text_content = ""
+        confidence_sum = 0
+        confidence_count = 0
+
+        for page_result in result:
+            if not page_result:
+                continue
+
+            sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
+            line_groups: list[list[Any]] = []
+            current_line: list[Any] = []
+            prev_y: float | None = None
+
+            for box in sorted_boxes:
+                box_points, (_, _) = box
+                current_y = sum(point[1] for point in box_points) / 4
+                min_box_distance = 20
+
+                if prev_y is None or abs(current_y - prev_y) > min_box_distance:
+                    if current_line:
+                        line_groups.append(current_line)
+                    current_line = [box]
+                else:
+                    current_line.append(box)
+
+                prev_y = current_y
+
+            if current_line:
+                line_groups.append(current_line)
+
+            for line in line_groups:
+                line_sorted = sorted(line, key=lambda x: x[0][0][0])
+
+                for box in line_sorted:
+                    _, (text, confidence) = box
+                    if text:
+                        text_content += text + " "
+                        confidence_sum += confidence
+                        confidence_count += 1
+
+            text_content += "\n"
+
+        width, height = image.size
+        metadata = Metadata(
+            width=width,
+            height=height,
+        )
+
+        return ExtractionResult(
+            content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
+        )
+
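The box-grouping loop in `_process_paddle_result` above reads top-to-bottom, then left-to-right: boxes are bucketed into a new line whenever their mean y-coordinate jumps by more than 20 pixels, and each line is re-sorted by x before its text is joined. A self-contained sketch with fabricated box data (mirroring the `[points, (text, confidence)]` shape the method consumes):

```python
# Stand-alone illustration of the line-grouping logic; all box values are fabricated.
boxes = [
    [[[10, 12], [80, 12], [80, 30], [10, 30]], ("Hello", 0.99)],
    [[[90, 11], [150, 11], [150, 29], [90, 29]], ("world", 0.98)],
    [[[10, 60], [120, 60], [120, 80], [10, 80]], ("Second line", 0.97)],
]

MIN_BOX_DISTANCE = 20  # same threshold as min_box_distance above

lines = []
prev_y = None
for box in sorted(boxes, key=lambda b: b[0][0][1]):  # sort by top-left y
    center_y = sum(point[1] for point in box[0]) / 4
    if prev_y is None or abs(center_y - prev_y) > MIN_BOX_DISTANCE:
        lines.append([box])  # y gap exceeded: start a new line
    else:
        lines[-1].append(box)  # close enough vertically: same line
    prev_y = center_y

for line in lines:
    print(" ".join(text for _, (text, _) in sorted(line, key=lambda b: b[0][0][0])))
# -> "Hello world" then "Second line"
```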
+    @classmethod
+    def _is_mkldnn_supported(cls) -> bool:
+        """Check if the current architecture supports MKL-DNN optimization.
+
+        Returns:
+            True if MKL-DNN is supported on this architecture.
+        """
+        system = platform.system().lower()
+        processor = platform.processor().lower()
+        machine = platform.machine().lower()
+
+        if system in ("linux", "windows"):
+            return "intel" in processor or "x86" in machine or "amd64" in machine or "x86_64" in machine
+
+        if system == "darwin":
+            return machine == "x86_64"
+
+        return False
+
+    @classmethod
+    async def _init_paddle_ocr(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
+        """Initialize PaddleOCR with the provided configuration.
+
+        Args:
+            **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
+
+        Raises:
+            MissingDependencyError: If PaddleOCR is not installed.
+            OCRError: If initialization fails.
+        """
+        if cls._paddle_ocr is not None:
+            return
+
+        try:
+            from paddleocr import PaddleOCR
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
+            ) from e
+
+        language = cls._validate_language_code(kwargs.pop("language", "en"))
+        has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
+        kwargs.setdefault("use_angle_cls", True)
+        kwargs.setdefault("use_gpu", has_gpu_package)
+        kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not has_gpu_package)
+        kwargs.setdefault("det_db_thresh", 0.3)
+        kwargs.setdefault("det_db_box_thresh", 0.5)
+        kwargs.setdefault("det_db_unclip_ratio", 1.6)
+
+        try:
+            cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
+        except Exception as e:
+            raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
+
+    @staticmethod
+    def _validate_language_code(lang_code: str) -> str:
+        """Convert a language code to PaddleOCR format.
+
+        Args:
+            lang_code: ISO language code or language name
+
+        Raises:
+            ValidationError: If the language is not supported by PaddleOCR
+
+        Returns:
+            Language code compatible with PaddleOCR
+        """
+        normalized = lang_code.lower()
+        if normalized in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
+            return normalized
+
+        raise ValidationError(
+            "The provided language code is not supported by PaddleOCR",
+            context={
+                "language_code": lang_code,
+                "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )
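Note that this validator accepts PaddleOCR's own language names, not ISO 639-1 codes, and matches case-insensitively. A small sketch of the behaviour (it calls the private helper directly purely for illustration and assumes `ValidationError` keeps the `context` mapping it was constructed with):

```python
from kreuzberg._ocr._paddleocr import PaddleBackend
from kreuzberg.exceptions import ValidationError

print(PaddleBackend._validate_language_code("German"))  # -> "german" (case-insensitive)

try:
    PaddleBackend._validate_language_code("de")  # ISO 639-1 code, not in the supported set
except ValidationError as exc:
    print(exc.context["supported_languages"])  # "ch,en,french,german,japan,korean"
```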
kreuzberg/_ocr/_tesseract.py ADDED
@@ -0,0 +1,342 @@
+from __future__ import annotations
+
+import re
+import sys
+from dataclasses import dataclass
+from enum import Enum
+from typing import TYPE_CHECKING, Any, ClassVar, Final
+
+from anyio import Path as AsyncPath
+from anyio import run_process
+
+from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+from kreuzberg._ocr._base import OCRBackend
+from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image
+
+try:  # pragma: no cover
+    from typing import Unpack  # type: ignore[attr-defined]
+except ImportError:  # pragma: no cover
+    from typing_extensions import Unpack
+
+
+TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
+    "afr",
+    "amh",
+    "ara",
+    "asm",
+    "aze",
+    "aze_cyrl",
+    "bel",
+    "ben",
+    "bod",
+    "bos",
+    "bre",
+    "bul",
+    "cat",
+    "ceb",
+    "ces",
+    "chi_sim",
+    "chi_tra",
+    "chr",
+    "cos",
+    "cym",
+    "dan",
+    "dan_frak",
+    "deu",
+    "deu_frak",
+    "deu_latf",
+    "dzo",
+    "ell",
+    "eng",
+    "enm",
+    "epo",
+    "equ",
+    "est",
+    "eus",
+    "fao",
+    "fas",
+    "fil",
+    "fin",
+    "fra",
+    "frk",
+    "frm",
+    "fry",
+    "gla",
+    "gle",
+    "glg",
+    "grc",
+    "guj",
+    "hat",
+    "heb",
+    "hin",
+    "hrv",
+    "hun",
+    "hye",
+    "iku",
+    "ind",
+    "isl",
+    "ita",
+    "ita_old",
+    "jav",
+    "jpn",
+    "kan",
+    "kat",
+    "kat_old",
+    "kaz",
+    "khm",
+    "kir",
+    "kmr",
+    "kor",
+    "kor_vert",
+    "kur",
+    "lao",
+    "lat",
+    "lav",
+    "lit",
+    "ltz",
+    "mal",
+    "mar",
+    "mkd",
+    "mlt",
+    "mon",
+    "mri",
+    "msa",
+    "mya",
+    "nep",
+    "nld",
+    "nor",
+    "oci",
+    "ori",
+    "osd",
+    "pan",
+    "pol",
+    "por",
+    "pus",
+    "que",
+    "ron",
+    "rus",
+    "san",
+    "sin",
+    "slk",
+    "slk_frak",
+    "slv",
+    "snd",
+    "spa",
+    "spa_old",
+    "sqi",
+    "srp",
+    "srp_latn",
+    "sun",
+    "swa",
+    "swe",
+    "syr",
+    "tam",
+    "tat",
+    "tel",
+    "tgk",
+    "tgl",
+    "tha",  # codespell:ignore
+    "tir",
+    "ton",
+    "tur",
+    "uig",
+    "ukr",
+    "urd",
+    "uzb",
+    "uzb_cyrl",
+    "vie",  # codespell:ignore
+    "yid",
+    "yor",
+}
+
+MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
+
+
+class PSMMode(Enum):
+    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
+
+    OSD_ONLY = 0
+    """Orientation and script detection only."""
+    AUTO_OSD = 1
+    """Automatic page segmentation with orientation and script detection."""
+    AUTO_ONLY = 2
+    """Automatic page segmentation without OSD."""
+    AUTO = 3
+    """Fully automatic page segmentation (default)."""
+    SINGLE_COLUMN = 4
+    """Assume a single column of text."""
+    SINGLE_BLOCK_VERTICAL = 5
+    """Assume a single uniform block of vertically aligned text."""
+    SINGLE_BLOCK = 6
+    """Assume a single uniform block of text."""
+    SINGLE_LINE = 7
+    """Treat the image as a single text line."""
+    SINGLE_WORD = 8
+    """Treat the image as a single word."""
+    CIRCLE_WORD = 9
+    """Treat the image as a single word in a circle."""
+    SINGLE_CHAR = 10
+    """Treat the image as a single character."""
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class TesseractConfig:
+    """Configuration options for Tesseract OCR engine."""
+
+    classify_use_pre_adapted_templates: bool = True
+    """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
+    language: str = "eng"
+    """Language code to use for OCR.
+    Examples:
+        - 'eng' for English
+        - 'deu' for German
+        - multiple languages combined with '+', e.g. 'eng+deu'
+    """
+    language_model_ngram_on: bool = True
+    """Enable or disable the use of n-gram-based language models for improved text recognition."""
+    psm: PSMMode = PSMMode.AUTO
+    """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
+    tessedit_dont_blkrej_good_wds: bool = True
+    """If True, prevents block rejection of words identified as good, improving text output quality."""
+    tessedit_dont_rowrej_good_wds: bool = True
+    """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
+    tessedit_enable_dict_correction: bool = True
+    """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
+    tessedit_use_primary_params_model: bool = True
+    """If True, forces the use of the primary parameters model for text recognition."""
+    textord_space_size_is_variable: bool = True
+    """Allow variable spacing between words, useful for text with irregular spacing."""
+    thresholding_method: bool = False
+    """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
+
+
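As with the PaddleOCR backend, `TesseractConfig` is normally passed through `ExtractionConfig`. A minimal sketch, again assuming the top-level kreuzberg 3.x API and a made-up file name, that combines a multi-language code with an explicit page segmentation mode:

```python
# Sketch: Tesseract tuned for a single block of mixed English/German text.
import asyncio

from kreuzberg import ExtractionConfig, extract_file  # assumed top-level 3.x API
from kreuzberg._ocr._tesseract import PSMMode, TesseractConfig


async def main() -> None:
    config = ExtractionConfig(
        ocr_backend="tesseract",
        ocr_config=TesseractConfig(language="eng+deu", psm=PSMMode.SINGLE_BLOCK),
    )
    result = await extract_file("letter.png", config=config)  # hypothetical input file
    print(result.content)


asyncio.run(main())
```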
+class TesseractBackend(OCRBackend[TesseractConfig]):
+    _version_checked: ClassVar[bool] = False
+
+    async def process_image(
+        self,
+        image: Image,
+        **kwargs: Unpack[TesseractConfig],
+    ) -> ExtractionResult:
+        await self._validate_tesseract_version()
+        image_path, unlink = await create_temp_file(".png")
+        await run_sync(image.save, str(image_path), format="PNG")
+        try:
+            return await self.process_file(image_path, **kwargs)
+        finally:
+            await unlink()
+
+    async def process_file(
+        self,
+        path: Path,
+        **kwargs: Unpack[TesseractConfig],
+    ) -> ExtractionResult:
+        await self._validate_tesseract_version()
+        output_path, unlink = await create_temp_file(".txt")
+        language = self._validate_language_code(kwargs.pop("language", "eng"))
+        psm = kwargs.pop("psm", PSMMode.AUTO)
+        try:
+            output_base = str(output_path).replace(".txt", "")
+            command = [
+                "tesseract",
+                str(path),
+                output_base,
+                "-l",
+                language,
+                "--psm",
+                str(psm.value),
+                "--oem",
+                "1",
+                "--loglevel",
+                "OFF",
+            ]
+            for kwarg, value in kwargs.items():
+                command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+
+            env: dict[str, Any] | None = None
+            if sys.platform.startswith("linux"):
+                # we have to prevent multithreading this way otherwise we will get deadlocks ~keep
+                env = {"OMP_THREAD_LIMIT": "1"}
+
+            result = await run_process(command, env=env)
+
+            if not result.returncode == 0:
+                raise OCRError(
+                    "OCR failed with a non-0 return code.",
+                    context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+                )
+
+            output = await AsyncPath(output_path).read_text("utf-8")
+            return ExtractionResult(
+                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+            )
+        except (RuntimeError, OSError) as e:
+            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+        finally:
+            await unlink()
+
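Every config field left in `kwargs` after `language` and `psm` are popped is forwarded to Tesseract as a `-c name=0|1` variable, so the dataclass maps one-to-one onto the CLI invocation assembled above. A small illustration with fabricated values (the real command is built inside `process_file`):

```python
# Illustration: how remaining TesseractConfig fields become -c flags on the command line.
kwargs = {"tessedit_enable_dict_correction": True, "thresholding_method": False}  # sample values

command = ["tesseract", "page.png", "/tmp/out", "-l", "eng", "--psm", "3", "--oem", "1", "--loglevel", "OFF"]
for kwarg, value in kwargs.items():
    command.extend(["-c", f"{kwarg}={1 if value else 0}"])

print(" ".join(command))
# tesseract page.png /tmp/out -l eng --psm 3 --oem 1 --loglevel OFF \
#     -c tessedit_enable_dict_correction=1 -c thresholding_method=0
```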
+    @classmethod
+    async def _validate_tesseract_version(cls) -> None:
+        """Validate that Tesseract is installed and is version 5 or above.
+
+        Raises:
+            MissingDependencyError: If Tesseract is not installed or is below version 5.
+        """
+        try:
+            if cls._version_checked:
+                return
+
+            command = ["tesseract", "--version"]
+            result = await run_process(command)
+            version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+            if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
+                raise MissingDependencyError(
+                    "Tesseract version 5 is a required system dependency. Please install it on your system and make sure it's available in $PATH."
+                )
+
+            cls._version_checked = True
+        except FileNotFoundError as e:
+            raise MissingDependencyError(
+                "Tesseract version 5 is a required system dependency. Please install it on your system and make sure it's available in $PATH."
+            ) from e
+
+    @staticmethod
+    def _validate_language_code(language_code: str) -> str:
+        """Convert a language code to Tesseract format.
+
+        Args:
+            language_code: Tesseract supported language code or multiple language codes connected with '+'
+
+        Raises:
+            ValidationError: If the language is not supported by Tesseract
+
+        Returns:
+            Language code compatible with Tesseract
+        """
+        normalized = language_code.lower()
+        if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
+            return normalized
+
+        if "+" in normalized and all(lang in TESSERACT_SUPPORTED_LANGUAGE_CODES for lang in normalized.split("+")):
+            return normalized
+
+        raise ValidationError(
+            "The provided language code is not supported by Tesseract",
+            context={
+                "language_code": normalized
+                if "+" not in normalized
+                else ",".join(
+                    [lang for lang in normalized.split("+") if lang not in TESSERACT_SUPPORTED_LANGUAGE_CODES]
+                ),
+                "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
+            },
+        )
kreuzberg/_types.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import sys
 from collections.abc import Awaitable
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
 
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
@@ -14,6 +14,10 @@
     from typing import NotRequired
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
+    from PIL.Image import Image
+
+    from kreuzberg._gmft import GMFTConfig
     from kreuzberg._ocr._easyocr import EasyOCRConfig
     from kreuzberg._ocr._paddleocr import PaddleOCRConfig
     from kreuzberg._ocr._tesseract import TesseractConfig
@@ -21,6 +25,19 @@ if TYPE_CHECKING:
 OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
 
 
+class TableData(TypedDict):
+    """Table data, returned from table extraction."""
+
+    cropped_image: Image
+    """The cropped image of the table."""
+    df: DataFrame
+    """The table data as a pandas DataFrame."""
+    page_number: int
+    """The page number of the table."""
+    text: str
+    """The table text as a markdown string."""
+
+
 class Metadata(TypedDict, total=False):
     """Base metadata common to all document types.
 
@@ -88,12 +105,14 @@ class ExtractionResult:
 
     content: str
     """The extracted content."""
-    chunks: list[str]
-    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
     mime_type: str
    """The mime type of the extracted content. Is either text/plain or text/markdown."""
     metadata: Metadata
     """The metadata of the content."""
+    tables: list[TableData] = field(default_factory=list)
+    """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
+    chunks: list[str] = field(default_factory=list)
+    """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
 
 
 PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
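With the `tables` field in place, callers can iterate extracted tables directly on the result. A sketch assuming the top-level `extract_file` API and the `extract_tables` flag introduced in the next hunk (table extraction also requires the optional `gmft` dependency):

```python
# Sketch: consuming the new ExtractionResult.tables field.
import asyncio

from kreuzberg import ExtractionConfig, extract_file  # assumed top-level 3.x API


async def main() -> None:
    result = await extract_file("report.pdf", config=ExtractionConfig(extract_tables=True))
    for table in result.tables:  # list[TableData]
        print(f"page {table['page_number']}: {table['df'].shape[0]} rows")
        print(table["text"])  # markdown rendering of the table


asyncio.run(main())
```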
@@ -114,14 +133,22 @@ class ExtractionConfig:
     """Whether to force OCR."""
     chunk_content: bool = False
     """Whether to chunk the content into smaller chunks."""
+    extract_tables: bool = False
+    """Whether to extract tables from the content. This requires the 'gmft' dependency."""
     max_chars: int = DEFAULT_MAX_CHARACTERS
     """The size of each chunk in characters."""
     max_overlap: int = DEFAULT_MAX_OVERLAP
     """The overlap between chunks in characters."""
     ocr_backend: OcrBackendType | None = "tesseract"
-    """The OCR backend to use."""
+    """The OCR backend to use.
+
+    Notes:
+        - If set to 'None', OCR will not be performed.
+    """
     ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
     """Configuration to pass to the OCR backend."""
+    gmft_config: GMFTConfig | None = None
+    """GMFT configuration."""
     post_processing_hooks: list[PostProcessingHook] | None = None
     """Post processing hooks to call after processing is done and before the final result is returned."""
     validators: list[ValidationHook] | None = None
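Taken together, the 3.1.0 additions are driven entirely from `ExtractionConfig`, and the switch to `field(default_factory=list)` means results can be constructed without passing `tables` or `chunks`. A closing sketch (top-level import assumed from the kreuzberg 3.x API; `GMFTConfig` defaults are assumed usable as-is, so it is omitted here; the chunking numbers are made up):

```python
# Sketch: the configuration surface added in 3.1.0.
from kreuzberg import ExtractionConfig  # assumed top-level 3.x API
from kreuzberg._types import ExtractionResult

# Table extraction plus chunking, keeping the default Tesseract backend.
config = ExtractionConfig(extract_tables=True, chunk_content=True, max_chars=2000, max_overlap=100)

# Digital-born documents can skip OCR entirely, as the expanded docstring spells out.
no_ocr = ExtractionConfig(ocr_backend=None)

# The default_factory fields mean tables/chunks no longer have to be passed explicitly.
result = ExtractionResult(content="hello", mime_type="text/plain", metadata={})
assert result.tables == [] and result.chunks == []
```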