kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import sys
|
4
|
-
from collections.abc import Awaitable, Callable
|
4
|
+
from collections.abc import Awaitable, Callable, Iterable, Mapping
|
5
5
|
from dataclasses import asdict, dataclass, field
|
6
|
+
from enum import Enum
|
6
7
|
from typing import TYPE_CHECKING, Any, Literal, TypedDict
|
7
8
|
|
8
9
|
import msgspec
|
@@ -15,32 +16,492 @@ from kreuzberg._utils._table import (
|
|
15
16
|
)
|
16
17
|
from kreuzberg.exceptions import ValidationError
|
17
18
|
|
19
|
+
if TYPE_CHECKING:
|
20
|
+
from kreuzberg._utils._device import DeviceType
|
21
|
+
|
18
22
|
if sys.version_info < (3, 11): # pragma: no cover
|
19
23
|
from typing_extensions import NotRequired
|
20
24
|
else: # pragma: no cover
|
21
25
|
from typing import NotRequired
|
22
26
|
|
23
27
|
if TYPE_CHECKING:
|
24
|
-
from
|
25
|
-
from PIL.Image import Image
|
28
|
+
from pathlib import Path
|
26
29
|
|
27
|
-
from
|
28
|
-
from
|
29
|
-
from kreuzberg._language_detection import LanguageDetectionConfig
|
30
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
31
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
32
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
30
|
+
from PIL.Image import Image
|
31
|
+
from polars import DataFrame
|
33
32
|
|
34
33
|
OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
|
34
|
+
OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
|
35
35
|
|
36
36
|
|
37
|
-
class
|
38
|
-
|
37
|
+
class ConfigDict:
|
38
|
+
def to_dict(self, include_none: bool = False) -> dict[str, Any]:
|
39
|
+
result = msgspec.to_builtins(
|
40
|
+
self,
|
41
|
+
builtin_types=(type(None),),
|
42
|
+
order="deterministic",
|
43
|
+
)
|
44
|
+
|
45
|
+
if include_none:
|
46
|
+
return result # type: ignore[no-any-return]
|
47
|
+
|
48
|
+
return {k: v for k, v in result.items() if v is not None}
|
49
|
+
|
39
50
|
|
51
|
+
class PSMMode(Enum):
|
52
|
+
OSD_ONLY = 0
|
53
|
+
"""Orientation and script detection only."""
|
54
|
+
AUTO_OSD = 1
|
55
|
+
"""Automatic page segmentation with orientation and script detection."""
|
56
|
+
AUTO_ONLY = 2
|
57
|
+
"""Automatic page segmentation without OSD."""
|
58
|
+
AUTO = 3
|
59
|
+
"""Fully automatic page segmentation (default)."""
|
60
|
+
SINGLE_COLUMN = 4
|
61
|
+
"""Assume a single column of text."""
|
62
|
+
SINGLE_BLOCK_VERTICAL = 5
|
63
|
+
"""Assume a single uniform block of vertically aligned text."""
|
64
|
+
SINGLE_BLOCK = 6
|
65
|
+
"""Assume a single uniform block of text."""
|
66
|
+
SINGLE_LINE = 7
|
67
|
+
"""Treat the image as a single text line."""
|
68
|
+
SINGLE_WORD = 8
|
69
|
+
"""Treat the image as a single word."""
|
70
|
+
CIRCLE_WORD = 9
|
71
|
+
"""Treat the image as a single word in a circle."""
|
72
|
+
SINGLE_CHAR = 10
|
73
|
+
"""Treat the image as a single character."""
|
74
|
+
|
75
|
+
|
76
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
77
|
+
class TesseractConfig(ConfigDict):
|
78
|
+
classify_use_pre_adapted_templates: bool = True
|
79
|
+
"""Whether to use pre-adapted templates during classification to improve recognition accuracy."""
|
80
|
+
language: str = "eng"
|
81
|
+
"""Language code to use for OCR.
|
82
|
+
Examples:
|
83
|
+
- 'eng' for English
|
84
|
+
- 'deu' for German
|
85
|
+
- multiple languages combined with '+', e.g. 'eng+deu'
|
86
|
+
"""
|
87
|
+
language_model_ngram_on: bool = False
|
88
|
+
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
89
|
+
Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
|
90
|
+
psm: PSMMode = PSMMode.AUTO
|
91
|
+
"""Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
|
92
|
+
tessedit_dont_blkrej_good_wds: bool = True
|
93
|
+
"""If True, prevents block rejection of words identified as good, improving text output quality."""
|
94
|
+
tessedit_dont_rowrej_good_wds: bool = True
|
95
|
+
"""If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
|
96
|
+
tessedit_enable_dict_correction: bool = True
|
97
|
+
"""Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
|
98
|
+
tessedit_char_whitelist: str = ""
|
99
|
+
"""Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
|
100
|
+
tessedit_use_primary_params_model: bool = True
|
101
|
+
"""If True, forces the use of the primary parameters model for text recognition."""
|
102
|
+
textord_space_size_is_variable: bool = True
|
103
|
+
"""Allow variable spacing between words, useful for text with irregular spacing."""
|
104
|
+
thresholding_method: bool = False
|
105
|
+
"""Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
|
106
|
+
output_format: OutputFormatType = "markdown"
|
107
|
+
"""Output format: 'markdown' (default), 'text', 'tsv' (for structured data), or 'hocr' (HTML-based)."""
|
108
|
+
enable_table_detection: bool = False
|
109
|
+
"""Enable table structure detection from TSV output."""
|
110
|
+
table_column_threshold: int = 20
|
111
|
+
"""Pixel threshold for column clustering in table detection."""
|
112
|
+
table_row_threshold_ratio: float = 0.5
|
113
|
+
"""Row threshold as ratio of mean text height for table detection."""
|
114
|
+
table_min_confidence: float = 30.0
|
115
|
+
"""Minimum confidence score to include a word in table extraction."""
|
116
|
+
|
117
|
+
|
118
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
119
|
+
class EasyOCRConfig(ConfigDict):
|
120
|
+
add_margin: float = 0.1
|
121
|
+
"""Extend bounding boxes in all directions."""
|
122
|
+
adjust_contrast: float = 0.5
|
123
|
+
"""Target contrast level for low contrast text."""
|
124
|
+
beam_width: int = 5
|
125
|
+
"""Beam width for beam search in recognition."""
|
126
|
+
canvas_size: int = 2560
|
127
|
+
"""Maximum image dimension for detection."""
|
128
|
+
contrast_ths: float = 0.1
|
129
|
+
"""Contrast threshold for preprocessing."""
|
130
|
+
decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
|
131
|
+
"""Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
|
132
|
+
height_ths: float = 0.5
|
133
|
+
"""Maximum difference in box height for merging."""
|
134
|
+
language: str | list[str] = "en"
|
135
|
+
"""Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
|
136
|
+
a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
|
137
|
+
link_threshold: float = 0.4
|
138
|
+
"""Link confidence threshold."""
|
139
|
+
low_text: float = 0.4
|
140
|
+
"""Text low-bound score."""
|
141
|
+
mag_ratio: float = 1.0
|
142
|
+
"""Image magnification ratio."""
|
143
|
+
min_size: int = 10
|
144
|
+
"""Minimum text box size in pixels."""
|
145
|
+
rotation_info: list[int] | None = None
|
146
|
+
"""List of angles to try for detection."""
|
147
|
+
slope_ths: float = 0.1
|
148
|
+
"""Maximum slope for merging text boxes."""
|
149
|
+
text_threshold: float = 0.7
|
150
|
+
"""Text confidence threshold."""
|
151
|
+
use_gpu: bool = False
|
152
|
+
"""Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
|
153
|
+
device: DeviceType = "auto"
|
154
|
+
"""Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
|
155
|
+
gpu_memory_limit: float | None = None
|
156
|
+
"""Maximum GPU memory to use in GB. None for no limit."""
|
157
|
+
fallback_to_cpu: bool = True
|
158
|
+
"""Whether to fallback to CPU if requested device is unavailable."""
|
159
|
+
width_ths: float = 0.5
|
160
|
+
"""Maximum horizontal distance for merging boxes."""
|
161
|
+
x_ths: float = 1.0
|
162
|
+
"""Maximum horizontal distance for paragraph merging."""
|
163
|
+
y_ths: float = 0.5
|
164
|
+
"""Maximum vertical distance for paragraph merging."""
|
165
|
+
ycenter_ths: float = 0.5
|
166
|
+
"""Maximum shift in y direction for merging."""
|
167
|
+
|
168
|
+
|
169
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
170
|
+
class PaddleOCRConfig(ConfigDict):
|
171
|
+
cls_image_shape: str = "3,48,192"
|
172
|
+
"""Image shape for classification algorithm in format 'channels,height,width'."""
|
173
|
+
det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
|
174
|
+
"""Detection algorithm."""
|
175
|
+
det_db_box_thresh: float = 0.5
|
176
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_box_thresh' instead. Score threshold for detected boxes."""
|
177
|
+
det_db_thresh: float = 0.3
|
178
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_thresh' instead. Binarization threshold for DB output map."""
|
179
|
+
det_db_unclip_ratio: float = 2.0
|
180
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Use 'text_det_unclip_ratio' instead. Expansion ratio for detected text boxes."""
|
181
|
+
det_east_cover_thresh: float = 0.1
|
182
|
+
"""Score threshold for EAST output boxes."""
|
183
|
+
det_east_nms_thresh: float = 0.2
|
184
|
+
"""NMS threshold for EAST model output boxes."""
|
185
|
+
det_east_score_thresh: float = 0.8
|
186
|
+
"""Binarization threshold for EAST output map."""
|
187
|
+
det_max_side_len: int = 960
|
188
|
+
"""Maximum size of image long side. Images exceeding this will be proportionally resized."""
|
189
|
+
det_model_dir: str | None = None
|
190
|
+
"""Directory for detection model. If None, uses default model location."""
|
191
|
+
drop_score: float = 0.5
|
192
|
+
"""Filter recognition results by confidence score. Results below this are discarded."""
|
193
|
+
enable_mkldnn: bool = False
|
194
|
+
"""Whether to enable MKL-DNN acceleration (Intel CPU only)."""
|
195
|
+
gpu_mem: int = 8000
|
196
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. GPU memory size (in MB) to use for initialization."""
|
197
|
+
language: str = "en"
|
198
|
+
"""Language to use for OCR."""
|
199
|
+
max_text_length: int = 25
|
200
|
+
"""Maximum text length that the recognition algorithm can recognize."""
|
201
|
+
rec: bool = True
|
202
|
+
"""Enable text recognition when using the ocr() function."""
|
203
|
+
rec_algorithm: Literal[
|
204
|
+
"CRNN",
|
205
|
+
"SRN",
|
206
|
+
"NRTR",
|
207
|
+
"SAR",
|
208
|
+
"SEED",
|
209
|
+
"SVTR",
|
210
|
+
"SVTR_LCNet",
|
211
|
+
"ViTSTR",
|
212
|
+
"ABINet",
|
213
|
+
"VisionLAN",
|
214
|
+
"SPIN",
|
215
|
+
"RobustScanner",
|
216
|
+
"RFL",
|
217
|
+
] = "CRNN"
|
218
|
+
"""Recognition algorithm."""
|
219
|
+
rec_image_shape: str = "3,32,320"
|
220
|
+
"""Image shape for recognition algorithm in format 'channels,height,width'."""
|
221
|
+
rec_model_dir: str | None = None
|
222
|
+
"""Directory for recognition model. If None, uses default model location."""
|
223
|
+
table: bool = True
|
224
|
+
"""Whether to enable table recognition."""
|
225
|
+
use_angle_cls: bool = True
|
226
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Use 'use_textline_orientation' instead. Whether to use text orientation classification model."""
|
227
|
+
use_gpu: bool = False
|
228
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. Use hardware acceleration flags instead."""
|
229
|
+
device: DeviceType = "auto"
|
230
|
+
"""Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
|
231
|
+
gpu_memory_limit: float | None = None
|
232
|
+
"""DEPRECATED in PaddleOCR 3.2.0+: Parameter no longer supported. Maximum GPU memory to use in GB."""
|
233
|
+
fallback_to_cpu: bool = True
|
234
|
+
"""Whether to fallback to CPU if requested device is unavailable."""
|
235
|
+
use_space_char: bool = True
|
236
|
+
"""Whether to recognize spaces."""
|
237
|
+
use_zero_copy_run: bool = False
|
238
|
+
"""Whether to enable zero_copy_run for inference optimization."""
|
239
|
+
|
240
|
+
text_det_thresh: float = 0.3
|
241
|
+
"""Binarization threshold for text detection output map (replaces det_db_thresh)."""
|
242
|
+
text_det_box_thresh: float = 0.5
|
243
|
+
"""Score threshold for detected text boxes (replaces det_db_box_thresh)."""
|
244
|
+
text_det_unclip_ratio: float = 2.0
|
245
|
+
"""Expansion ratio for detected text boxes (replaces det_db_unclip_ratio)."""
|
246
|
+
use_textline_orientation: bool = True
|
247
|
+
"""Whether to use text line orientation classification model (replaces use_angle_cls)."""
|
248
|
+
|
249
|
+
|
250
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
251
|
+
class GMFTConfig(ConfigDict):
|
252
|
+
verbosity: int = 0
|
253
|
+
"""
|
254
|
+
Verbosity level for logging.
|
255
|
+
|
256
|
+
0: errors only
|
257
|
+
1: print warnings
|
258
|
+
2: print warnings and info
|
259
|
+
3: print warnings, info, and debug
|
260
|
+
"""
|
261
|
+
formatter_base_threshold: float = 0.3
|
262
|
+
"""
|
263
|
+
Base threshold for the confidence demanded of a table feature (row/column).
|
264
|
+
|
265
|
+
Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
|
266
|
+
"""
|
267
|
+
cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
|
268
|
+
default_factory=lambda: {
|
269
|
+
0: 0.3,
|
270
|
+
1: 0.3,
|
271
|
+
2: 0.3,
|
272
|
+
3: 0.3,
|
273
|
+
4: 0.5,
|
274
|
+
5: 0.5,
|
275
|
+
6: 99,
|
276
|
+
},
|
277
|
+
hash=False,
|
278
|
+
)
|
279
|
+
"""
|
280
|
+
Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
|
281
|
+
|
282
|
+
But low confidences may be better than too high confidence (see formatter_base_threshold)
|
283
|
+
"""
|
284
|
+
detector_base_threshold: float = 0.9
|
285
|
+
"""Minimum confidence score required for a table"""
|
286
|
+
remove_null_rows: bool = True
|
287
|
+
"""
|
288
|
+
Flag to remove rows with no text.
|
289
|
+
"""
|
290
|
+
enable_multi_header: bool = False
|
291
|
+
"""
|
292
|
+
Enable multi-indices in the dataframe.
|
293
|
+
|
294
|
+
If false, then multiple headers will be merged column-wise.
|
295
|
+
"""
|
296
|
+
semantic_spanning_cells: bool = False
|
297
|
+
"""
|
298
|
+
[Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
|
299
|
+
"""
|
300
|
+
semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
|
301
|
+
"""
|
302
|
+
[Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
|
303
|
+
|
304
|
+
Possible values: 'algorithm', 'deep', None.
|
305
|
+
"""
|
306
|
+
large_table_if_n_rows_removed: int = 8
|
307
|
+
"""
|
308
|
+
If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
|
309
|
+
"""
|
310
|
+
large_table_threshold: int = 10
|
311
|
+
"""
|
312
|
+
With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
|
313
|
+
|
314
|
+
Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
|
315
|
+
"""
|
316
|
+
large_table_row_overlap_threshold: float = 0.2
|
317
|
+
"""
|
318
|
+
With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
|
319
|
+
|
320
|
+
Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
|
321
|
+
"""
|
322
|
+
large_table_maximum_rows: int = 1000
|
323
|
+
"""
|
324
|
+
Maximum number of rows allowed for a large table.
|
325
|
+
"""
|
326
|
+
force_large_table_assumption: bool | None = None
|
327
|
+
"""
|
328
|
+
Force the large table assumption to be applied, regardless of the number of rows and overlap.
|
329
|
+
"""
|
330
|
+
total_overlap_reject_threshold: float = 0.9
|
331
|
+
"""
|
332
|
+
Reject if total overlap is > 90% of table area.
|
333
|
+
"""
|
334
|
+
total_overlap_warn_threshold: float = 0.1
|
335
|
+
"""
|
336
|
+
Warn if total overlap is > 10% of table area.
|
337
|
+
"""
|
338
|
+
nms_warn_threshold: int = 5
|
339
|
+
"""
|
340
|
+
Warn if non maxima suppression removes > 5 rows.
|
341
|
+
"""
|
342
|
+
iob_reject_threshold: float = 0.05
|
343
|
+
"""
|
344
|
+
Reject if iob between textbox and cell is < 5%.
|
345
|
+
"""
|
346
|
+
iob_warn_threshold: float = 0.5
|
347
|
+
"""
|
348
|
+
Warn if iob between textbox and cell is < 50%.
|
349
|
+
"""
|
350
|
+
|
351
|
+
|
352
|
+
@dataclass(frozen=True, slots=True)
|
353
|
+
class LanguageDetectionConfig(ConfigDict):
|
354
|
+
low_memory: bool = True
|
355
|
+
"""If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
|
356
|
+
Defaults to True for better memory efficiency."""
|
357
|
+
top_k: int = 3
|
358
|
+
"""Maximum number of languages to return for multilingual detection."""
|
359
|
+
multilingual: bool = False
|
360
|
+
"""If True, uses multilingual detection to handle mixed-language text.
|
361
|
+
If False, uses single language detection."""
|
362
|
+
cache_dir: str | None = None
|
363
|
+
"""Custom directory for model cache. If None, uses system default."""
|
364
|
+
allow_fallback: bool = True
|
365
|
+
"""If True, falls back to small model if large model fails."""
|
366
|
+
|
367
|
+
|
368
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
369
|
+
class SpacyEntityExtractionConfig(ConfigDict):
|
370
|
+
model_cache_dir: str | Path | None = None
|
371
|
+
"""Directory to cache spaCy models. If None, uses spaCy's default."""
|
372
|
+
language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
|
373
|
+
"""Mapping of language codes to spaCy model names.
|
374
|
+
|
375
|
+
If None, uses default mappings:
|
376
|
+
- en: en_core_web_sm
|
377
|
+
- de: de_core_news_sm
|
378
|
+
- fr: fr_core_news_sm
|
379
|
+
- es: es_core_news_sm
|
380
|
+
- pt: pt_core_news_sm
|
381
|
+
- it: it_core_news_sm
|
382
|
+
- nl: nl_core_news_sm
|
383
|
+
- zh: zh_core_web_sm
|
384
|
+
- ja: ja_core_news_sm
|
385
|
+
"""
|
386
|
+
fallback_to_multilingual: bool = True
|
387
|
+
"""If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
|
388
|
+
max_doc_length: int = 1000000
|
389
|
+
"""Maximum document length for spaCy processing."""
|
390
|
+
batch_size: int = 1000
|
391
|
+
"""Batch size for processing multiple texts."""
|
392
|
+
|
393
|
+
def __post_init__(self) -> None:
|
394
|
+
if self.language_models is None:
|
395
|
+
object.__setattr__(self, "language_models", self._get_default_language_models())
|
396
|
+
|
397
|
+
if isinstance(self.language_models, dict):
|
398
|
+
object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
|
399
|
+
|
400
|
+
@staticmethod
|
401
|
+
def _get_default_language_models() -> dict[str, str]:
|
402
|
+
return {
|
403
|
+
"en": "en_core_web_sm",
|
404
|
+
"de": "de_core_news_sm",
|
405
|
+
"fr": "fr_core_news_sm",
|
406
|
+
"es": "es_core_news_sm",
|
407
|
+
"pt": "pt_core_news_sm",
|
408
|
+
"it": "it_core_news_sm",
|
409
|
+
"nl": "nl_core_news_sm",
|
410
|
+
"zh": "zh_core_web_sm",
|
411
|
+
"ja": "ja_core_news_sm",
|
412
|
+
"ko": "ko_core_news_sm",
|
413
|
+
"ru": "ru_core_news_sm",
|
414
|
+
"pl": "pl_core_news_sm",
|
415
|
+
"ro": "ro_core_news_sm",
|
416
|
+
"el": "el_core_news_sm",
|
417
|
+
"da": "da_core_news_sm",
|
418
|
+
"fi": "fi_core_news_sm",
|
419
|
+
"nb": "nb_core_news_sm",
|
420
|
+
"sv": "sv_core_news_sm",
|
421
|
+
"ca": "ca_core_news_sm",
|
422
|
+
"hr": "hr_core_news_sm",
|
423
|
+
"lt": "lt_core_news_sm",
|
424
|
+
"mk": "mk_core_news_sm",
|
425
|
+
"sl": "sl_core_news_sm",
|
426
|
+
"uk": "uk_core_news_sm",
|
427
|
+
"xx": "xx_ent_wiki_sm",
|
428
|
+
}
|
429
|
+
|
430
|
+
def get_model_for_language(self, language_code: str) -> str | None:
|
431
|
+
if not self.language_models:
|
432
|
+
return None
|
433
|
+
|
434
|
+
models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
|
435
|
+
|
436
|
+
if language_code in models_dict:
|
437
|
+
return models_dict[language_code]
|
438
|
+
|
439
|
+
base_lang = language_code.split("-")[0].lower()
|
440
|
+
if base_lang in models_dict:
|
441
|
+
return models_dict[base_lang]
|
442
|
+
|
443
|
+
return None
|
444
|
+
|
445
|
+
def get_fallback_model(self) -> str | None:
|
446
|
+
return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
|
447
|
+
|
448
|
+
|
449
|
+
class BoundingBox(TypedDict):
|
450
|
+
left: int
|
451
|
+
"""X coordinate of the left edge."""
|
452
|
+
top: int
|
453
|
+
"""Y coordinate of the top edge."""
|
454
|
+
width: int
|
455
|
+
"""Width of the bounding box."""
|
456
|
+
height: int
|
457
|
+
"""Height of the bounding box."""
|
458
|
+
|
459
|
+
|
460
|
+
class TSVWord(TypedDict):
|
461
|
+
level: int
|
462
|
+
"""Hierarchy level (1=page, 2=block, 3=para, 4=line, 5=word)."""
|
463
|
+
page_num: int
|
464
|
+
"""Page number."""
|
465
|
+
block_num: int
|
466
|
+
"""Block number within the page."""
|
467
|
+
par_num: int
|
468
|
+
"""Paragraph number within the block."""
|
469
|
+
line_num: int
|
470
|
+
"""Line number within the paragraph."""
|
471
|
+
word_num: int
|
472
|
+
"""Word number within the line."""
|
473
|
+
left: int
|
474
|
+
"""X coordinate of the left edge of the word."""
|
475
|
+
top: int
|
476
|
+
"""Y coordinate of the top edge of the word."""
|
477
|
+
width: int
|
478
|
+
"""Width of the word bounding box."""
|
479
|
+
height: int
|
480
|
+
"""Height of the word bounding box."""
|
481
|
+
conf: float
|
482
|
+
"""Confidence score (0-100)."""
|
483
|
+
text: str
|
484
|
+
"""The recognized text content."""
|
485
|
+
|
486
|
+
|
487
|
+
class TableCell(TypedDict):
|
488
|
+
row: int
|
489
|
+
"""Row index (0-based)."""
|
490
|
+
col: int
|
491
|
+
"""Column index (0-based)."""
|
492
|
+
text: str
|
493
|
+
"""Cell text content."""
|
494
|
+
bbox: BoundingBox
|
495
|
+
"""Bounding box of the cell."""
|
496
|
+
confidence: float
|
497
|
+
"""Average confidence of words in the cell."""
|
498
|
+
|
499
|
+
|
500
|
+
class TableData(TypedDict):
|
40
501
|
cropped_image: Image
|
41
502
|
"""The cropped image of the table."""
|
42
|
-
df: DataFrame
|
43
|
-
"""The table data as a
|
503
|
+
df: DataFrame | None
|
504
|
+
"""The table data as a polars DataFrame."""
|
44
505
|
page_number: int
|
45
506
|
"""The page number of the table."""
|
46
507
|
text: str
|
@@ -48,12 +509,6 @@ class TableData(TypedDict):
|
|
48
509
|
|
49
510
|
|
50
511
|
class Metadata(TypedDict, total=False):
|
51
|
-
"""Base metadata common to all document types.
|
52
|
-
|
53
|
-
All fields will only be included if they contain non-empty values.
|
54
|
-
Any field that would be empty or None is omitted from the dictionary.
|
55
|
-
"""
|
56
|
-
|
57
512
|
authors: NotRequired[list[str]]
|
58
513
|
"""List of document authors."""
|
59
514
|
categories: NotRequired[list[str]]
|
@@ -106,8 +561,6 @@ class Metadata(TypedDict, total=False):
|
|
106
561
|
"""Version identifier or revision number."""
|
107
562
|
width: NotRequired[int]
|
108
563
|
"""Width of the document page/slide/image, if applicable."""
|
109
|
-
|
110
|
-
# Email-specific fields
|
111
564
|
email_from: NotRequired[str]
|
112
565
|
"""Email sender (from field)."""
|
113
566
|
email_to: NotRequired[str]
|
@@ -120,25 +573,26 @@ class Metadata(TypedDict, total=False):
|
|
120
573
|
"""Email date or document date."""
|
121
574
|
attachments: NotRequired[list[str]]
|
122
575
|
"""List of attachment names."""
|
123
|
-
|
124
|
-
# Additional metadata fields for various extractors
|
125
576
|
content: NotRequired[str]
|
126
577
|
"""Content metadata field."""
|
127
578
|
parse_error: NotRequired[str]
|
128
579
|
"""Parse error information."""
|
129
580
|
warning: NotRequired[str]
|
130
581
|
"""Warning messages."""
|
131
|
-
|
132
|
-
# Table extraction metadata
|
133
582
|
table_count: NotRequired[int]
|
134
583
|
"""Number of tables extracted from the document."""
|
584
|
+
tables_detected: NotRequired[int]
|
585
|
+
"""Number of tables detected in the document."""
|
135
586
|
tables_summary: NotRequired[str]
|
136
587
|
"""Summary of table extraction results."""
|
137
588
|
quality_score: NotRequired[float]
|
138
589
|
"""Quality score for extracted content (0.0-1.0)."""
|
590
|
+
source_format: NotRequired[str]
|
591
|
+
"""Source format of the extracted content."""
|
592
|
+
error: NotRequired[str]
|
593
|
+
"""Error message if extraction failed."""
|
139
594
|
|
140
595
|
|
141
|
-
# Cache valid metadata keys at module level for performance
|
142
596
|
_VALID_METADATA_KEYS = {
|
143
597
|
"authors",
|
144
598
|
"categories",
|
@@ -182,14 +636,9 @@ _VALID_METADATA_KEYS = {
|
|
182
636
|
|
183
637
|
|
184
638
|
def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
|
185
|
-
"""Normalize any dict to proper Metadata TypedDict.
|
186
|
-
|
187
|
-
Filters out invalid keys and ensures type safety.
|
188
|
-
"""
|
189
639
|
if not data:
|
190
640
|
return {}
|
191
641
|
|
192
|
-
# Filter and return only valid metadata
|
193
642
|
normalized: Metadata = {}
|
194
643
|
for key, value in data.items():
|
195
644
|
if key in _VALID_METADATA_KEYS and value is not None:
|
@@ -200,8 +649,6 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
|
|
200
649
|
|
201
650
|
@dataclass(frozen=True, slots=True)
|
202
651
|
class Entity:
|
203
|
-
"""Represents an extracted entity with type, text, and position."""
|
204
|
-
|
205
652
|
type: str
|
206
653
|
"""e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
|
207
654
|
text: str
|
@@ -214,8 +661,6 @@ class Entity:
|
|
214
661
|
|
215
662
|
@dataclass(slots=True)
|
216
663
|
class ExtractionResult:
|
217
|
-
"""The result of a file extraction."""
|
218
|
-
|
219
664
|
content: str
|
220
665
|
"""The extracted content."""
|
221
666
|
mime_type: str
|
@@ -240,57 +685,30 @@ class ExtractionResult:
|
|
240
685
|
"""Internal layout data from OCR, not for public use."""
|
241
686
|
|
242
687
|
def to_dict(self, include_none: bool = False) -> dict[str, Any]:
|
243
|
-
"""Converts the ExtractionResult to a dictionary.
|
244
|
-
|
245
|
-
Args:
|
246
|
-
include_none: If True, include fields with None values.
|
247
|
-
If False (default), exclude None values.
|
248
|
-
|
249
|
-
Returns:
|
250
|
-
Dictionary representation of the ExtractionResult.
|
251
|
-
"""
|
252
|
-
# Use msgspec.to_builtins for efficient conversion
|
253
|
-
# The builtin_types parameter allows DataFrames to pass through
|
254
688
|
result = msgspec.to_builtins(
|
255
689
|
self,
|
256
|
-
builtin_types=(type(None),),
|
257
|
-
order="deterministic",
|
690
|
+
builtin_types=(type(None),),
|
691
|
+
order="deterministic",
|
258
692
|
)
|
259
693
|
|
260
694
|
if include_none:
|
261
695
|
return result # type: ignore[no-any-return]
|
262
696
|
|
263
|
-
# Remove None values to match expected behavior
|
264
697
|
return {k: v for k, v in result.items() if v is not None}
|
265
698
|
|
266
699
|
def export_tables_to_csv(self) -> list[str]:
|
267
|
-
"""Export all tables to CSV format.
|
268
|
-
|
269
|
-
Returns:
|
270
|
-
List of CSV strings, one per table
|
271
|
-
"""
|
272
700
|
if not self.tables: # pragma: no cover
|
273
701
|
return []
|
274
702
|
|
275
703
|
return [export_table_to_csv(table) for table in self.tables]
|
276
704
|
|
277
705
|
def export_tables_to_tsv(self) -> list[str]:
|
278
|
-
"""Export all tables to TSV format.
|
279
|
-
|
280
|
-
Returns:
|
281
|
-
List of TSV strings, one per table
|
282
|
-
"""
|
283
706
|
if not self.tables: # pragma: no cover
|
284
707
|
return []
|
285
708
|
|
286
709
|
return [export_table_to_tsv(table) for table in self.tables]
|
287
710
|
|
288
711
|
def get_table_summaries(self) -> list[dict[str, Any]]:
|
289
|
-
"""Get structural information for all tables.
|
290
|
-
|
291
|
-
Returns:
|
292
|
-
List of table structure dictionaries
|
293
|
-
"""
|
294
712
|
if not self.tables: # pragma: no cover
|
295
713
|
return []
|
296
714
|
|
@@ -302,21 +720,15 @@ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
|
|
302
720
|
|
303
721
|
|
304
722
|
@dataclass(unsafe_hash=True, slots=True)
|
305
|
-
class ExtractionConfig:
|
306
|
-
"""Represents configuration settings for an extraction process.
|
307
|
-
|
308
|
-
This class encapsulates the configuration options for extracting text
|
309
|
-
from images or documents using Optical Character Recognition (OCR). It
|
310
|
-
provides options to customize the OCR behavior, select the backend
|
311
|
-
engine, and configure engine-specific parameters.
|
312
|
-
"""
|
313
|
-
|
723
|
+
class ExtractionConfig(ConfigDict):
|
314
724
|
force_ocr: bool = False
|
315
725
|
"""Whether to force OCR."""
|
316
726
|
chunk_content: bool = False
|
317
727
|
"""Whether to chunk the content into smaller chunks."""
|
318
728
|
extract_tables: bool = False
|
319
729
|
"""Whether to extract tables from the content. This requires the 'gmft' dependency."""
|
730
|
+
extract_tables_from_ocr: bool = False
|
731
|
+
"""Extract tables from OCR output using TSV format (Tesseract only)."""
|
320
732
|
max_chars: int = DEFAULT_MAX_CHARACTERS
|
321
733
|
"""The size of each chunk in characters."""
|
322
734
|
max_overlap: int = DEFAULT_MAX_OVERLAP
|
@@ -359,6 +771,10 @@ class ExtractionConfig:
|
|
359
771
|
"""Whether to apply quality post-processing to improve extraction results."""
|
360
772
|
pdf_password: str | list[str] = ""
|
361
773
|
"""Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
|
774
|
+
html_to_markdown_config: HTMLToMarkdownConfig | None = None
|
775
|
+
"""Configuration for HTML to Markdown conversion. If None, uses default settings."""
|
776
|
+
use_cache: bool = True
|
777
|
+
"""Whether to use caching for extraction results. Set to False to disable all caching."""
|
362
778
|
|
363
779
|
def __post_init__(self) -> None:
|
364
780
|
if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
|
@@ -367,9 +783,6 @@ class ExtractionConfig:
|
|
367
783
|
object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
|
368
784
|
if self.validators is not None and isinstance(self.validators, list):
|
369
785
|
object.__setattr__(self, "validators", tuple(self.validators))
|
370
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
|
371
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
372
|
-
from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
|
373
786
|
|
374
787
|
if self.ocr_backend is None and self.ocr_config is not None:
|
375
788
|
raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
|
@@ -385,29 +798,111 @@ class ExtractionConfig:
|
|
385
798
|
)
|
386
799
|
|
387
800
|
def get_config_dict(self) -> dict[str, Any]:
|
388
|
-
"""Returns the OCR configuration object based on the backend specified.
|
389
|
-
|
390
|
-
Returns:
|
391
|
-
A dict of the OCR configuration or an empty dict if no backend is provided.
|
392
|
-
"""
|
393
801
|
if self.ocr_backend is None:
|
394
|
-
return {}
|
802
|
+
return {"use_cache": self.use_cache}
|
395
803
|
|
396
804
|
if self.ocr_config is not None:
|
397
|
-
|
398
|
-
|
805
|
+
config_dict = asdict(self.ocr_config)
|
806
|
+
config_dict["use_cache"] = self.use_cache
|
807
|
+
return config_dict
|
399
808
|
|
400
|
-
# Lazy load and cache default configs instead of creating new instances
|
401
809
|
match self.ocr_backend:
|
402
810
|
case "tesseract":
|
403
|
-
|
404
|
-
|
405
|
-
return
|
811
|
+
config_dict = asdict(TesseractConfig())
|
812
|
+
config_dict["use_cache"] = self.use_cache
|
813
|
+
return config_dict
|
406
814
|
case "easyocr":
|
407
|
-
|
815
|
+
config_dict = asdict(EasyOCRConfig())
|
816
|
+
config_dict["use_cache"] = self.use_cache
|
817
|
+
return config_dict
|
818
|
+
case _:
|
819
|
+
config_dict = asdict(PaddleOCRConfig())
|
820
|
+
config_dict["use_cache"] = self.use_cache
|
821
|
+
return config_dict
|
822
|
+
|
823
|
+
def to_dict(self, include_none: bool = False) -> dict[str, Any]:
|
824
|
+
result = msgspec.to_builtins(
|
825
|
+
self,
|
826
|
+
builtin_types=(type(None),),
|
827
|
+
order="deterministic",
|
828
|
+
)
|
829
|
+
|
830
|
+
for field_name, value in result.items():
|
831
|
+
if hasattr(value, "to_dict"):
|
832
|
+
result[field_name] = value.to_dict(include_none=include_none)
|
833
|
+
|
834
|
+
if include_none:
|
835
|
+
return result # type: ignore[no-any-return]
|
836
|
+
|
837
|
+
return {k: v for k, v in result.items() if v is not None}
|
408
838
|
|
409
|
-
return asdict(EasyOCRConfig())
|
410
|
-
case _: # paddleocr or any other backend
|
411
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
412
839
|
|
413
|
-
|
840
|
+
@dataclass(frozen=True)
|
841
|
+
class HTMLToMarkdownConfig:
|
842
|
+
stream_processing: bool = False
|
843
|
+
"""Enable streaming mode for processing large HTML documents."""
|
844
|
+
chunk_size: int = 1024
|
845
|
+
"""Size of chunks when stream_processing is enabled."""
|
846
|
+
chunk_callback: Callable[[str], None] | None = None
|
847
|
+
"""Callback function invoked for each chunk during stream processing."""
|
848
|
+
progress_callback: Callable[[int, int], None] | None = None
|
849
|
+
"""Callback function for progress updates (current, total)."""
|
850
|
+
parser: str | None = "lxml"
|
851
|
+
"""BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
|
852
|
+
autolinks: bool = True
|
853
|
+
"""Convert URLs to clickable links automatically."""
|
854
|
+
bullets: str = "*+-"
|
855
|
+
"""Characters to use for unordered list bullets."""
|
856
|
+
code_language: str = ""
|
857
|
+
"""Default language for code blocks."""
|
858
|
+
code_language_callback: Callable[[Any], str] | None = None
|
859
|
+
"""Callback to determine code language dynamically."""
|
860
|
+
convert: str | Iterable[str] | None = None
|
861
|
+
"""HTML tags to convert. If None, all supported tags are converted."""
|
862
|
+
convert_as_inline: bool = False
|
863
|
+
"""Convert block elements as inline elements."""
|
864
|
+
custom_converters: Mapping[Any, Any] | None = None
|
865
|
+
"""Custom converters for specific HTML elements."""
|
866
|
+
default_title: bool = False
|
867
|
+
"""Use a default title if none is found."""
|
868
|
+
escape_asterisks: bool = True
|
869
|
+
"""Escape asterisks in text to prevent unintended emphasis."""
|
870
|
+
escape_misc: bool = True
|
871
|
+
"""Escape miscellaneous characters that have special meaning in Markdown."""
|
872
|
+
escape_underscores: bool = True
|
873
|
+
"""Escape underscores in text to prevent unintended emphasis."""
|
874
|
+
extract_metadata: bool = True
|
875
|
+
"""Extract metadata from HTML head section."""
|
876
|
+
heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
|
877
|
+
"""Style for markdown headings."""
|
878
|
+
highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
|
879
|
+
"""Style for highlighting text."""
|
880
|
+
keep_inline_images_in: Iterable[str] | None = None
|
881
|
+
"""HTML tags where inline images should be preserved."""
|
882
|
+
newline_style: Literal["spaces", "backslash"] = "spaces"
|
883
|
+
"""Style for line breaks in markdown."""
|
884
|
+
strip: str | Iterable[str] | None = None
|
885
|
+
"""HTML tags to strip completely from output."""
|
886
|
+
strip_newlines: bool = False
|
887
|
+
"""Strip newlines from the output."""
|
888
|
+
strong_em_symbol: Literal["*", "_"] = "*"
|
889
|
+
"""Symbol to use for strong/emphasis formatting."""
|
890
|
+
sub_symbol: str = ""
|
891
|
+
"""Symbol to use for subscript text."""
|
892
|
+
sup_symbol: str = ""
|
893
|
+
"""Symbol to use for superscript text."""
|
894
|
+
wrap: bool = False
|
895
|
+
"""Enable text wrapping."""
|
896
|
+
wrap_width: int = 80
|
897
|
+
"""Width for text wrapping when wrap is True."""
|
898
|
+
preprocess_html: bool = True
|
899
|
+
"""Enable HTML preprocessing to clean up the input."""
|
900
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
|
901
|
+
"""Preprocessing level for cleaning HTML."""
|
902
|
+
remove_navigation: bool = True
|
903
|
+
"""Remove navigation elements from HTML."""
|
904
|
+
remove_forms: bool = True
|
905
|
+
"""Remove form elements from HTML."""
|
906
|
+
|
907
|
+
def to_dict(self) -> dict[str, Any]:
|
908
|
+
return {key: value for key, value in self.__dict__.items() if value is not None}
|