kreuzberg-3.11.4-py3-none-any.whl → kreuzberg-3.13.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_paddleocr.py
@@ -2,17 +2,15 @@ from __future__ import annotations

  import platform
  import warnings
- from dataclasses import dataclass
  from importlib.util import find_spec
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
+ from typing import TYPE_CHECKING, Any, ClassVar, Final

  from PIL import Image

  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
  from kreuzberg._ocr._base import OCRBackend
- from kreuzberg._types import ExtractionResult, Metadata
- from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
+ from kreuzberg._types import ExtractionResult, Metadata, PaddleOCRConfig
+ from kreuzberg._utils._device import DeviceInfo, validate_device_request
  from kreuzberg._utils._string import normalize_spaces
  from kreuzberg._utils._sync import run_sync
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -20,91 +18,23 @@ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
  if TYPE_CHECKING:
      from pathlib import Path

-
  try:  # pragma: no cover
      from typing import Unpack  # type: ignore[attr-defined]
  except ImportError:  # pragma: no cover
      from typing_extensions import Unpack

+ try:
+     import numpy as np
+     from paddleocr import PaddleOCR

- PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
+     HAS_PADDLEOCR = True
+ except ImportError:
+     HAS_PADDLEOCR = False
+     np = None  # type: ignore[assignment]
+     PaddleOCR = None


- @dataclass(unsafe_hash=True, frozen=True, slots=True)
- class PaddleOCRConfig:
-     """Configuration options for PaddleOCR.
-
-     This TypedDict provides type hints and documentation for all PaddleOCR parameters.
-     """
-
-     cls_image_shape: str = "3,48,192"
-     """Image shape for classification algorithm in format 'channels,height,width'."""
-     det_algorithm: Literal["DB", "EAST", "SAST", "PSE", "FCE", "PAN", "CT", "DB++", "Layout"] = "DB"
-     """Detection algorithm."""
-     det_db_box_thresh: float = 0.5
-     """Score threshold for detected boxes. Boxes below this value are discarded."""
-     det_db_thresh: float = 0.3
-     """Binarization threshold for DB output map."""
-     det_db_unclip_ratio: float = 2.0
-     """Expansion ratio for detected text boxes."""
-     det_east_cover_thresh: float = 0.1
-     """Score threshold for EAST output boxes."""
-     det_east_nms_thresh: float = 0.2
-     """NMS threshold for EAST model output boxes."""
-     det_east_score_thresh: float = 0.8
-     """Binarization threshold for EAST output map."""
-     det_max_side_len: int = 960
-     """Maximum size of image long side. Images exceeding this will be proportionally resized."""
-     det_model_dir: str | None = None
-     """Directory for detection model. If None, uses default model location."""
-     drop_score: float = 0.5
-     """Filter recognition results by confidence score. Results below this are discarded."""
-     enable_mkldnn: bool = False
-     """Whether to enable MKL-DNN acceleration (Intel CPU only)."""
-     gpu_mem: int = 8000
-     """GPU memory size (in MB) to use for initialization."""
-     language: str = "en"
-     """Language to use for OCR."""
-     max_text_length: int = 25
-     """Maximum text length that the recognition algorithm can recognize."""
-     rec: bool = True
-     """Enable text recognition when using the ocr() function."""
-     rec_algorithm: Literal[
-         "CRNN",
-         "SRN",
-         "NRTR",
-         "SAR",
-         "SEED",
-         "SVTR",
-         "SVTR_LCNet",
-         "ViTSTR",
-         "ABINet",
-         "VisionLAN",
-         "SPIN",
-         "RobustScanner",
-         "RFL",
-     ] = "CRNN"
-     """Recognition algorithm."""
-     rec_image_shape: str = "3,32,320"
-     """Image shape for recognition algorithm in format 'channels,height,width'."""
-     rec_model_dir: str | None = None
-     """Directory for recognition model. If None, uses default model location."""
-     table: bool = True
-     """Whether to enable table recognition."""
-     use_angle_cls: bool = True
-     """Whether to use text orientation classification model."""
-     use_gpu: bool = False
-     """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
-     device: DeviceType = "auto"
-     """Device to use for inference. Options: 'cpu', 'cuda', 'auto'. Note: MPS not supported by PaddlePaddle."""
-     gpu_memory_limit: float | None = None
-     """Maximum GPU memory to use in GB. None for no limit."""
-     fallback_to_cpu: bool = True
-     """Whether to fallback to CPU if requested device is unavailable."""
-     use_space_char: bool = True
-     """Whether to recognize spaces."""
-     use_zero_copy_run: bool = False
-     """Whether to enable zero_copy_run for inference optimization."""
+ PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}


  class PaddleBackend(OCRBackend[PaddleOCRConfig]):
@@ -123,8 +53,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
          Raises:
              OCRError: If OCR processing fails.
          """
-         import numpy as np  # noqa: PLC0415
-
          await self._init_paddle_ocr(**kwargs)

          if image.mode != "RGB":
@@ -258,12 +186,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
          if cls._paddle_ocr is not None:
              return

-         try:
-             from paddleocr import PaddleOCR  # noqa: PLC0415
-         except ImportError as e:  # pragma: no cover
+         if not HAS_PADDLEOCR or PaddleOCR is None:
              raise MissingDependencyError.create_for_package(
                  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-             ) from e
+             )

          language = cls._validate_language_code(kwargs.pop("language", "en"))

@@ -379,8 +305,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
          Raises:
              OCRError: If OCR processing fails.
          """
-         import numpy as np  # noqa: PLC0415
-
          self._init_paddle_ocr_sync(**kwargs)

          if image.mode != "RGB":
@@ -427,12 +351,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
          if cls._paddle_ocr is not None:
              return

-         try:
-             from paddleocr import PaddleOCR  # noqa: PLC0415
-         except ImportError as e:  # pragma: no cover
+         if not HAS_PADDLEOCR or PaddleOCR is None:
              raise MissingDependencyError.create_for_package(
                  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
-             ) from e
+             )

          language = cls._validate_language_code(kwargs.pop("language", "en"))

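The recurring change in the kreuzberg/_ocr/_paddleocr.py hunks above moves the optional paddleocr import from inside each method to a single guarded module-level import, with the availability check deferred to backend initialization. A minimal standalone sketch of that guard pattern, assuming a hypothetical optional dependency named fancy_ocr (the module and function names below are illustrative, not part of kreuzberg):

from __future__ import annotations

# Probe the optional dependency once, at import time.
try:
    import fancy_ocr  # hypothetical optional dependency

    HAS_FANCY_OCR = True
except ImportError:
    HAS_FANCY_OCR = False
    fancy_ocr = None  # type: ignore[assignment]


def init_backend() -> None:
    # Fail at first use rather than at import, so the rest of the
    # package stays importable when the extra is not installed.
    if not HAS_FANCY_OCR or fancy_ocr is None:
        raise ImportError("install the 'fancy-ocr' extra to use this backend")

Compared with the old call-site try/except, this trades one probe at package import for a cheap flag check on every OCR call.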
kreuzberg/_ocr/_table_extractor.py (new file)
@@ -0,0 +1,260 @@
+ from __future__ import annotations
+
+ import csv
+ from io import StringIO
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+
+ from kreuzberg.exceptions import ParsingError
+
+ if TYPE_CHECKING:
+     from kreuzberg._types import TSVWord
+
+
+ def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
+     """Parse TSV output into structured word data.
+
+     Args:
+         tsv_data: Raw TSV output from Tesseract.
+         min_confidence: Minimum confidence score to include a word.
+
+     Returns:
+         List of word dictionaries with position and text data.
+
+     Raises:
+         ParsingError: If TSV data cannot be parsed.
+     """
+     try:
+         reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
+         words: list[TSVWord] = []
+
+         for row in reader:
+             if row.get("level") == "5" and row.get("text", "").strip():
+                 try:
+                     conf = float(row["conf"])
+                     if conf < min_confidence:
+                         continue
+
+                     words.append(
+                         {
+                             "level": int(row["level"]),
+                             "page_num": int(row["page_num"]),
+                             "block_num": int(row["block_num"]),
+                             "par_num": int(row["par_num"]),
+                             "line_num": int(row["line_num"]),
+                             "word_num": int(row["word_num"]),
+                             "left": int(row["left"]),
+                             "top": int(row["top"]),
+                             "width": int(row["width"]),
+                             "height": int(row["height"]),
+                             "conf": conf,
+                             "text": row["text"],
+                         }
+                     )
+                 except (ValueError, KeyError):
+                     continue
+
+         return words
+
+     except Exception as e:
+         raise ParsingError("Failed to parse TSV data", context={"error": str(e)}) from e
+
+
+ def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
+     """Detect columns using X position clustering.
+
+     Args:
+         words: List of word dictionaries from TSV.
+         column_threshold: Pixel threshold for column clustering.
+
+     Returns:
+         Sorted list of column X positions.
+     """
+     if not words:
+         return []
+
+     x_positions = sorted({w["left"] for w in words})
+
+     if len(x_positions) == 1:
+         return x_positions
+
+     columns = []
+     current_group = [x_positions[0]]
+
+     for x in x_positions[1:]:
+         if x - current_group[-1] <= column_threshold:
+             current_group.append(x)
+         else:
+             columns.append(int(np.median(current_group)))
+             current_group = [x]
+
+     columns.append(int(np.median(current_group)))
+     return columns
+
+
+ def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
+     """Detect rows using Y position clustering.
+
+     Args:
+         words: List of word dictionaries from TSV.
+         row_threshold_ratio: Row threshold as ratio of mean text height.
+
+     Returns:
+         Sorted list of row Y positions.
+     """
+     if not words:
+         return []
+
+     y_centers = sorted(w["top"] + w["height"] / 2 for w in words)
+
+     if len(y_centers) == 1:
+         return [int(y_centers[0])]
+
+     mean_height = np.mean([w["height"] for w in words])
+     threshold = mean_height * row_threshold_ratio
+
+     rows = []
+     current_group = [y_centers[0]]
+
+     for y in y_centers[1:]:
+         if y - np.mean(current_group) <= threshold:
+             current_group.append(y)
+         else:
+             rows.append(int(np.median(current_group)))
+             current_group = [y]
+
+     rows.append(int(np.median(current_group)))
+     return rows
+
+
+ def _find_closest_index(value: float, positions: list[int]) -> int:
+     """Find index of closest position.
+
+     Args:
+         value: The value to match.
+         positions: List of positions to search.
+
+     Returns:
+         Index of the closest position.
+     """
+     if not positions:
+         return 0
+
+     distances = [abs(value - pos) for pos in positions]
+     return distances.index(min(distances))
+
+
+ def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
+     """Remove completely empty rows and columns.
+
+     Args:
+         table: 2D table array.
+
+     Returns:
+         Cleaned table with empty rows/columns removed.
+     """
+     if not table:
+         return table
+
+     table = [row for row in table if any(cell.strip() for cell in row)]
+
+     if not table:
+         return []
+
+     non_empty_cols = [
+         col_idx for col_idx in range(len(table[0])) if any(row[col_idx].strip() for row in table if col_idx < len(row))
+     ]
+
+     if not non_empty_cols:
+         return []
+
+     return [[row[col_idx] if col_idx < len(row) else "" for col_idx in non_empty_cols] for row in table]
+
+
+ def reconstruct_table(
+     words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
+ ) -> list[list[str]]:
+     """Reconstruct table from words and detected structure.
+
+     Args:
+         words: List of word dictionaries from TSV.
+         column_threshold: Pixel threshold for column clustering.
+         row_threshold_ratio: Row threshold as ratio of mean text height.
+
+     Returns:
+         2D list representing the table structure.
+     """
+     if not words:
+         return []
+
+     col_positions = detect_columns(words, column_threshold=column_threshold)
+     row_positions = detect_rows(words, row_threshold_ratio=row_threshold_ratio)
+
+     if not col_positions or not row_positions:
+         return []
+
+     table: list[list[str]] = [[""] * len(col_positions) for _ in range(len(row_positions))]
+
+     for word in words:
+         col_idx = _find_closest_index(word["left"], col_positions)
+
+         y_center = word["top"] + word["height"] / 2
+         row_idx = _find_closest_index(y_center, row_positions)
+
+         if table[row_idx][col_idx]:
+             table[row_idx][col_idx] += " " + word["text"]
+         else:
+             table[row_idx][col_idx] = word["text"]
+
+     return _remove_empty_rows_cols(table)
+
+
+ def to_markdown(table: list[list[str]]) -> str:
+     """Convert table to markdown format.
+
+     Args:
+         table: 2D list representing the table.
+
+     Returns:
+         Markdown-formatted table string.
+     """
+     if not table or not table[0]:
+         return ""
+
+     lines = []
+
+     lines.append("| " + " | ".join(str(cell) for cell in table[0]) + " |")
+
+     lines.append("| " + " | ".join(["---"] * len(table[0])) + " |")
+
+     for row in table[1:]:
+         padded_row = list(row) + [""] * (len(table[0]) - len(row))
+         lines.append("| " + " | ".join(str(cell) for cell in padded_row[: len(table[0])]) + " |")
+
+     return "\n".join(lines)
+
+
+ def extract_table_from_tsv(
+     tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
+ ) -> str:
+     """Extract table from TSV data and convert to markdown.
+
+     Args:
+         tsv_data: Raw TSV output from Tesseract.
+         column_threshold: Pixel threshold for column clustering.
+         row_threshold_ratio: Row threshold as ratio of mean text height.
+         min_confidence: Minimum confidence score to include a word.
+
+     Returns:
+         Markdown-formatted table string, or empty string if no table detected.
+     """
+     words = extract_words(tsv_data, min_confidence=min_confidence)
+     if not words:
+         return ""
+
+     table = reconstruct_table(words, column_threshold=column_threshold, row_threshold_ratio=row_threshold_ratio)
+     if not table:
+         return ""
+
+     return to_markdown(table)
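Taken together, the new module reconstructs tables from Tesseract's word-level TSV output: extract_words filters words by confidence, detect_columns and detect_rows cluster word positions into a grid, reconstruct_table snaps each word to its nearest cell, and to_markdown renders the result. A minimal usage sketch of the top-level extract_table_from_tsv entry point; the TSV sample and its pixel coordinates are invented for illustration:

from kreuzberg._ocr._table_extractor import extract_table_from_tsv

# Hypothetical Tesseract TSV (level 5 rows are words). Coordinates are
# chosen so the two columns sit ~190 px apart (well over the 20 px
# column_threshold) and the two text lines cluster into two rows.
HEADER = "\t".join(
    ["level", "page_num", "block_num", "par_num", "line_num",
     "word_num", "left", "top", "width", "height", "conf", "text"]
)
ROWS = [
    (5, 1, 1, 1, 1, 1, 10, 10, 60, 20, 96.0, "Name"),
    (5, 1, 1, 1, 1, 2, 200, 10, 40, 20, 95.0, "Qty"),
    (5, 1, 1, 1, 2, 1, 10, 50, 70, 20, 93.0, "Apple"),
    (5, 1, 1, 1, 2, 2, 200, 50, 15, 20, 92.0, "3"),
]
TSV = HEADER + "\n" + "\n".join("\t".join(map(str, r)) for r in ROWS)

print(extract_table_from_tsv(TSV, min_confidence=30.0))
# | Name | Qty |
# | --- | --- |
# | Apple | 3 |

Note that the first detected row doubles as the markdown header row; there is no separate heuristic for distinguishing header cells from body cells.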