kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -2,17 +2,15 @@ from __future__ import annotations
 
 import platform
 import warnings
-from dataclasses import dataclass
 from importlib.util import find_spec
-from
-from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Final
 
 from PIL import Image
 
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr._base import OCRBackend
-from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg._utils._device import DeviceInfo,
+from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
+from kreuzberg._utils._device import DeviceInfo, validate_device_request
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
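Note: `PaddleOCRConfig` moves out of this module and into `kreuzberg._types`, so downstream imports change accordingly. A minimal sketch, assuming the relocated class keeps the fields and defaults of the definition removed in the next hunk:

    # 3.11.4 (removed): the config class was defined in the backend module.
    # from kreuzberg._ocr._paddleocr import PaddleOCRConfig
    # 3.13.0: import it from the shared types module instead.
    from kreuzberg._types import PaddleOCRConfig

    # language and use_angle_cls are fields of the removed 3.11.4 class;
    # this assumes the relocated class keeps them.
    config = PaddleOCRConfig(language="en", use_angle_cls=True)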
@@ -20,91 +18,23 @@ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
 if TYPE_CHECKING:
     from pathlib import Path
 
-
 try:  # pragma: no cover
     from typing import Unpack  # type: ignore[attr-defined]
 except ImportError:  # pragma: no cover
     from typing_extensions import Unpack
 
+try:
+    import numpy as np
+    from paddleocr import PaddleOCR
 
-
+    HAS_PADDLEOCR = True
+except ImportError:
+    HAS_PADDLEOCR = False
+    np = None  # type: ignore[assignment]
+    PaddleOCR = None
 
 
-
-class PaddleOCRConfig:
-    """Configuration options for PaddleOCR.
-
-    This TypedDict provides type hints and documentation for all PaddleOCR parameters.
-    """
-
-    cls_image_shape: str = "3,48,192"
-    """Image shape for classification algorithm in format 'channels,height,width'."""
-    det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
-    """Detection algorithm."""
-    det_db_box_thresh: float = 0.5
-    """Score threshold for detected boxes. Boxes below this value are discarded."""
-    det_db_thresh: float = 0.3
-    """Binarization threshold for DB output map."""
-    det_db_unclip_ratio: float = 2.0
-    """Expansion ratio for detected text boxes."""
-    det_east_cover_thresh: float = 0.1
-    """Score threshold for EAST output boxes."""
-    det_east_nms_thresh: float = 0.2
-    """NMS threshold for EAST model output boxes."""
-    det_east_score_thresh: float = 0.8
-    """Binarization threshold for EAST output map."""
-    det_max_side_len: int = 960
-    """Maximum size of image long side. Images exceeding this will be proportionally resized."""
-    det_model_dir: str | None = None
-    """Directory for detection model. If None, uses default model location."""
-    drop_score: float = 0.5
-    """Filter recognition results by confidence score. Results below this are discarded."""
-    enable_mkldnn: bool = False
-    """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
-    gpu_mem: int = 8000
-    """GPU memory size (in MB) to use for initialization."""
-    language: str = "en"
-    """Language to use for OCR."""
-    max_text_length: int = 25
-    """Maximum text length that the recognition algorithm can recognize."""
-    rec: bool = True
-    """Enable text recognition when using the ocr() function."""
-    rec_algorithm: Literal[
-        "CRNN",
-        "SRN",
-        "NRTR",
-        "SAR",
-        "SEED",
-        "SVTR",
-        "SVTR_LCNet",
-        "ViTSTR",
-        "ABINet",
-        "VisionLAN",
-        "SPIN",
-        "RobustScanner",
-        "RFL",
-    ] = "CRNN"
-    """Recognition algorithm."""
-    rec_image_shape: str = "3,32,320"
-    """Image shape for recognition algorithm in format 'channels,height,width'."""
-    rec_model_dir: str | None = None
-    """Directory for recognition model. If None, uses default model location."""
-    table: bool = True
-    """Whether to enable table recognition."""
-    use_angle_cls: bool = True
-    """Whether to use text orientation classification model."""
-    use_gpu: bool = False
-    """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
-    device: DeviceType = "auto"
-    """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
-    gpu_memory_limit: float | None = None
-    """Maximum GPU memory to use in GB. None for no limit."""
-    fallback_to_cpu: bool = True
-    """Whether to fallback to CPU if requested device is unavailable."""
-    use_space_char: bool = True
-    """Whether to recognize spaces."""
-    use_zero_copy_run: bool = False
-    """Whether to enable zero_copy_run for inference optimization."""
+PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
 
 
 class PaddleBackend(OCRBackend[PaddleOCRConfig]):
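Note: the per-call `from paddleocr import PaddleOCR` imports are replaced by a single module-level probe. A standalone sketch of the guard pattern this hunk introduces (the `require_paddleocr` helper is illustrative, not part of kreuzberg):

    # Probe the optional dependency once at import time and record availability.
    try:
        import numpy as np
        from paddleocr import PaddleOCR

        HAS_PADDLEOCR = True
    except ImportError:
        HAS_PADDLEOCR = False
        np = None  # type: ignore[assignment]
        PaddleOCR = None

    # Fail lazily, only when the backend is actually used.
    def require_paddleocr() -> None:  # illustrative helper
        if not HAS_PADDLEOCR or PaddleOCR is None:
            raise ImportError("paddleocr is required for the PaddleOCR backend")

This keeps `import kreuzberg` working when paddleocr is not installed, while still raising a clear error (in kreuzberg, a `MissingDependencyError`) on first use.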
@@ -123,8 +53,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
-        import numpy as np  # noqa: PLC0415
-
         await self._init_paddle_ocr(**kwargs)
 
         if image.mode != "RGB":
@@ -258,12 +186,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         if cls._paddle_ocr is not None:
             return
 
-        try:
-            from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_PADDLEOCR or PaddleOCR is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-            ) from e
+            )
 
         language = cls._validate_language_code(kwargs.pop("language", "en"))
 
@@ -379,8 +305,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         Raises:
             OCRError: If OCR processing fails.
         """
-        import numpy as np  # noqa: PLC0415
-
         self._init_paddle_ocr_sync(**kwargs)
 
         if image.mode != "RGB":
@@ -427,12 +351,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
         if cls._paddle_ocr is not None:
             return
 
-        try:
-            from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:  # pragma: no cover
+        if not HAS_PADDLEOCR or PaddleOCR is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-            ) from e
+            )
 
         language = cls._validate_language_code(kwargs.pop("language", "en"))
 
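Note: both init paths still normalize the `language` kwarg through `cls._validate_language_code`, which can now check membership in the module-level `PADDLEOCR_SUPPORTED_LANGUAGE_CODES` set. A sketch of the implied check; the standalone function below is illustrative, and kreuzberg raises its own `ValidationError` rather than `ValueError`:

    PADDLEOCR_SUPPORTED_LANGUAGE_CODES = {"ch", "en", "french", "german", "japan", "korean"}

    def validate_language_code(language: str) -> str:  # illustrative stand-in
        normalized = language.lower()
        if normalized not in PADDLEOCR_SUPPORTED_LANGUAGE_CODES:
            raise ValueError(f"unsupported PaddleOCR language code: {language!r}")
        return normalized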
kreuzberg/_ocr/_table_extractor.py
ADDED
@@ -0,0 +1,260 @@
+from __future__ import annotations
+
+import csv
+from io import StringIO
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from kreuzberg.exceptions import ParsingError
+
+if TYPE_CHECKING:
+    from kreuzberg._types import TSVWord
+
+
+def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
+    """Parse TSV output into structured word data.
+
+    Args:
+        tsv_data: Raw TSV output from Tesseract.
+        min_confidence: Minimum confidence score to include a word.
+
+    Returns:
+        List of word dictionaries with position and text data.
+
+    Raises:
+        ParsingError: If TSV data cannot be parsed.
+    """
+    try:
+        reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
+        words: list[TSVWord] = []
+
+        for row in reader:
+            if row.get("level") == "5" and row.get("text", "").strip():
+                try:
+                    conf = float(row["conf"])
+                    if conf < min_confidence:
+                        continue
+
+                    words.append(
+                        {
+                            "level": int(row["level"]),
+                            "page_num": int(row["page_num"]),
+                            "block_num": int(row["block_num"]),
+                            "par_num": int(row["par_num"]),
+                            "line_num": int(row["line_num"]),
+                            "word_num": int(row["word_num"]),
+                            "left": int(row["left"]),
+                            "top": int(row["top"]),
+                            "width": int(row["width"]),
+                            "height": int(row["height"]),
+                            "conf": conf,
+                            "text": row["text"],
+                        }
+                    )
+                except (ValueError, KeyError):
+                    continue
+
+        return words
+
+    except Exception as e:
+        raise ParsingError("Failed to parse TSV data", context={"error": str(e)}) from e
+
+
+def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
+    """Detect columns using X position clustering.
+
+    Args:
+        words: List of word dictionaries from TSV.
+        column_threshold: Pixel threshold for column clustering.
+
+    Returns:
+        Sorted list of column X positions.
+    """
+    if not words:
+        return []
+
+    x_positions = sorted({w["left"] for w in words})
+
+    if len(x_positions) == 1:
+        return x_positions
+
+    columns = []
+    current_group = [x_positions[0]]
+
+    for x in x_positions[1:]:
+        if x - current_group[-1] <= column_threshold:
+            current_group.append(x)
+        else:
+            columns.append(int(np.median(current_group)))
+            current_group = [x]
+
+    columns.append(int(np.median(current_group)))
+    return columns
+
+
+def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
+    """Detect rows using Y position clustering.
+
+    Args:
+        words: List of word dictionaries from TSV.
+        row_threshold_ratio: Row threshold as ratio of mean text height.
+
+    Returns:
+        Sorted list of row Y positions.
+    """
+    if not words:
+        return []
+
+    y_centers = sorted(w["top"] + w["height"] / 2 for w in words)
+
+    if len(y_centers) == 1:
+        return [int(y_centers[0])]
+
+    mean_height = np.mean([w["height"] for w in words])
+    threshold = mean_height * row_threshold_ratio
+
+    rows = []
+    current_group = [y_centers[0]]
+
+    for y in y_centers[1:]:
+        if y - np.mean(current_group) <= threshold:
+            current_group.append(y)
+        else:
+            rows.append(int(np.median(current_group)))
+            current_group = [y]
+
+    rows.append(int(np.median(current_group)))
+    return rows
+
+
+def _find_closest_index(value: float, positions: list[int]) -> int:
+    """Find index of closest position.
+
+    Args:
+        value: The value to match.
+        positions: List of positions to search.
+
+    Returns:
+        Index of the closest position.
+    """
+    if not positions:
+        return 0
+
+    distances = [abs(value - pos) for pos in positions]
+    return distances.index(min(distances))
+
+
+def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
+    """Remove completely empty rows and columns.
+
+    Args:
+        table: 2D table array.
+
+    Returns:
+        Cleaned table with empty rows/columns removed.
+    """
+    if not table:
+        return table
+
+    table = [row for row in table if any(cell.strip() for cell in row)]
+
+    if not table:
+        return []
+
+    non_empty_cols = [
+        col_idx for col_idx in range(len(table[0])) if any(row[col_idx].strip() for row in table if col_idx < len(row))
+    ]
+
+    if not non_empty_cols:
+        return []
+
+    return [[row[col_idx] if col_idx < len(row) else "" for col_idx in non_empty_cols] for row in table]
+
+
+def reconstruct_table(
+    words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
+) -> list[list[str]]:
+    """Reconstruct table from words and detected structure.
+
+    Args:
+        words: List of word dictionaries from TSV.
+        column_threshold: Pixel threshold for column clustering.
+        row_threshold_ratio: Row threshold as ratio of mean text height.
+
+    Returns:
+        2D list representing the table structure.
+    """
+    if not words:
+        return []
+
+    col_positions = detect_columns(words, column_threshold=column_threshold)
+    row_positions = detect_rows(words, row_threshold_ratio=row_threshold_ratio)
+
+    if not col_positions or not row_positions:
+        return []
+
+    table: list[list[str]] = [[""] * len(col_positions) for _ in range(len(row_positions))]
+
+    for word in words:
+        col_idx = _find_closest_index(word["left"], col_positions)
+
+        y_center = word["top"] + word["height"] / 2
+        row_idx = _find_closest_index(y_center, row_positions)
+
+        if table[row_idx][col_idx]:
+            table[row_idx][col_idx] += " " + word["text"]
+        else:
+            table[row_idx][col_idx] = word["text"]
+
+    return _remove_empty_rows_cols(table)
+
+
+def to_markdown(table: list[list[str]]) -> str:
+    """Convert table to markdown format.
+
+    Args:
+        table: 2D list representing the table.
+
+    Returns:
+        Markdown-formatted table string.
+    """
+    if not table or not table[0]:
+        return ""
+
+    lines = []
+
+    lines.append("| " + " | ".join(str(cell) for cell in table[0]) + " |")
+
+    lines.append("| " + " | ".join(["---"] * len(table[0])) + " |")
+
+    for row in table[1:]:
+        padded_row = list(row) + [""] * (len(table[0]) - len(row))
+        lines.append("| " + " | ".join(str(cell) for cell in padded_row[: len(table[0])]) + " |")
+
+    return "\n".join(lines)
+
+
+def extract_table_from_tsv(
+    tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
+) -> str:
+    """Extract table from TSV data and convert to markdown.
+
+    Args:
+        tsv_data: Raw TSV output from Tesseract.
+        column_threshold: Pixel threshold for column clustering.
+        row_threshold_ratio: Row threshold as ratio of mean text height.
+        min_confidence: Minimum confidence score to include a word.
+
+    Returns:
+        Markdown-formatted table string, or empty string if no table detected.
+    """
+    words = extract_words(tsv_data, min_confidence=min_confidence)
+    if not words:
+        return ""
+
+    table = reconstruct_table(words, column_threshold=column_threshold, row_threshold_ratio=row_threshold_ratio)
+    if not table:
+        return ""
+
+    return to_markdown(table)
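Note: end to end, the new module turns Tesseract's word-level TSV output into a markdown table: `extract_words` keeps level-5 rows above the confidence cutoff, `detect_columns` and `detect_rows` cluster X and Y positions, `reconstruct_table` snaps each word to its nearest cell, and `to_markdown` renders the grid. A usage sketch against a hand-made two-by-two TSV sample (illustrative, not real Tesseract output):

    from kreuzberg._ocr._table_extractor import extract_table_from_tsv

    # Tesseract TSV columns, in order.
    fields = "level page_num block_num par_num line_num word_num left top width height conf text".split()
    rows = [
        ["5", "1", "1", "1", "1", "1", "10", "10", "40", "20", "96.0", "Name"],
        ["5", "1", "1", "1", "1", "2", "200", "10", "40", "20", "95.0", "Age"],
        ["5", "1", "1", "1", "2", "1", "10", "60", "40", "20", "93.0", "Ada"],
        ["5", "1", "1", "1", "2", "2", "200", "60", "40", "20", "91.0", "36"],
    ]
    tsv = "\n".join(["\t".join(fields)] + ["\t".join(r) for r in rows])

    print(extract_table_from_tsv(tsv))
    # | Name | Age |
    # | --- | --- |
    # | Ada | 36 |

With the default column_threshold=20, the left positions 10 and 200 cluster into two columns; with row_threshold_ratio=0.5 and a mean word height of 20, the Y centers 20 and 70 cluster into two rows.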