kreuzberg-3.11.4-py3-none-any.whl → kreuzberg-3.13.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_table_extractor.py (new file)
@@ -0,0 +1,184 @@
+ from __future__ import annotations
+
+ import csv
+ from io import StringIO
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+
+ from kreuzberg.exceptions import ParsingError
+
+ if TYPE_CHECKING:
+     from kreuzberg._types import TSVWord
+
+
+ def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
+     try:
+         reader = csv.DictReader(StringIO(tsv_data), delimiter="\t")
+         words: list[TSVWord] = []
+
+         for row in reader:
+             if row.get("level") == "5" and row.get("text", "").strip():
+                 try:
+                     conf = float(row["conf"])
+                     if conf < min_confidence:
+                         continue
+
+                     words.append(
+                         {
+                             "level": int(row["level"]),
+                             "page_num": int(row["page_num"]),
+                             "block_num": int(row["block_num"]),
+                             "par_num": int(row["par_num"]),
+                             "line_num": int(row["line_num"]),
+                             "word_num": int(row["word_num"]),
+                             "left": int(row["left"]),
+                             "top": int(row["top"]),
+                             "width": int(row["width"]),
+                             "height": int(row["height"]),
+                             "conf": conf,
+                             "text": row["text"],
+                         }
+                     )
+                 except (ValueError, KeyError):
+                     continue
+
+         return words
+
+     except Exception as e:
+         raise ParsingError("Failed to parse TSV data", context={"error": str(e)}) from e
+
+
+ def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
+     if not words:
+         return []
+
+     x_positions = sorted({w["left"] for w in words})
+
+     if len(x_positions) == 1:
+         return x_positions
+
+     columns = []
+     current_group = [x_positions[0]]
+
+     for x in x_positions[1:]:
+         if x - current_group[-1] <= column_threshold:
+             current_group.append(x)
+         else:
+             columns.append(int(np.median(current_group)))
+             current_group = [x]
+
+     columns.append(int(np.median(current_group)))
+     return columns
+
+
+ def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
+     if not words:
+         return []
+
+     y_centers = sorted(w["top"] + w["height"] / 2 for w in words)
+
+     if len(y_centers) == 1:
+         return [int(y_centers[0])]
+
+     mean_height = np.mean([w["height"] for w in words])
+     threshold = mean_height * row_threshold_ratio
+
+     rows = []
+     current_group = [y_centers[0]]
+
+     for y in y_centers[1:]:
+         if y - np.mean(current_group) <= threshold:
+             current_group.append(y)
+         else:
+             rows.append(int(np.median(current_group)))
+             current_group = [y]
+
+     rows.append(int(np.median(current_group)))
+     return rows
+
+
+ def _find_closest_index(value: float, positions: list[int]) -> int:
+     if not positions:
+         return 0
+
+     distances = [abs(value - pos) for pos in positions]
+     return distances.index(min(distances))
+
+
+ def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
+     if not table:
+         return table
+
+     table = [row for row in table if any(cell.strip() for cell in row)]
+
+     if not table:
+         return []
+
+     non_empty_cols = [
+         col_idx for col_idx in range(len(table[0])) if any(row[col_idx].strip() for row in table if col_idx < len(row))
+     ]
+
+     if not non_empty_cols:
+         return []
+
+     return [[row[col_idx] if col_idx < len(row) else "" for col_idx in non_empty_cols] for row in table]
+
+
+ def reconstruct_table(
+     words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
+ ) -> list[list[str]]:
+     if not words:
+         return []
+
+     col_positions = detect_columns(words, column_threshold=column_threshold)
+     row_positions = detect_rows(words, row_threshold_ratio=row_threshold_ratio)
+
+     if not col_positions or not row_positions:
+         return []
+
+     table: list[list[str]] = [[""] * len(col_positions) for _ in range(len(row_positions))]
+
+     for word in words:
+         col_idx = _find_closest_index(word["left"], col_positions)
+
+         y_center = word["top"] + word["height"] / 2
+         row_idx = _find_closest_index(y_center, row_positions)
+
+         if table[row_idx][col_idx]:
+             table[row_idx][col_idx] += " " + word["text"]
+         else:
+             table[row_idx][col_idx] = word["text"]
+
+     return _remove_empty_rows_cols(table)
+
+
+ def to_markdown(table: list[list[str]]) -> str:
+     if not table or not table[0]:
+         return ""
+
+     lines = []
+
+     lines.append("| " + " | ".join(str(cell) for cell in table[0]) + " |")
+
+     lines.append("| " + " | ".join(["---"] * len(table[0])) + " |")
+
+     for row in table[1:]:
+         padded_row = list(row) + [""] * (len(table[0]) - len(row))
+         lines.append("| " + " | ".join(str(cell) for cell in padded_row[: len(table[0])]) + " |")
+
+     return "\n".join(lines)
+
+
+ def extract_table_from_tsv(
+     tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
+ ) -> str:
+     words = extract_words(tsv_data, min_confidence=min_confidence)
+     if not words:
+         return ""
+
+     table = reconstruct_table(words, column_threshold=column_threshold, row_threshold_ratio=row_threshold_ratio)
+     if not table:
+         return ""
+
+     return to_markdown(table)
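
The new kreuzberg/_ocr/_table_extractor.py module turns Tesseract's word-level TSV output (level-5 rows) into a Markdown table: extract_words keeps words above a confidence threshold, detect_columns and detect_rows cluster word positions into a grid, reconstruct_table snaps each word to its nearest row and column, and to_markdown renders the result. The sketch below is a minimal, illustrative usage example: the TSV sample is fabricated, and TSVWord is assumed to be a TypedDict in kreuzberg._types with the keys populated in extract_words (its definition is not part of this hunk).

# Illustrative only: made-up Tesseract-style TSV with two words per line
# on two lines (column names match Tesseract's TSV header; level 5 = word).
from kreuzberg._ocr._table_extractor import extract_table_from_tsv

tsv = (
    "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\t"
    "left\ttop\twidth\theight\tconf\ttext\n"
    "5\t1\t1\t1\t1\t1\t10\t10\t40\t12\t96.0\tName\n"
    "5\t1\t1\t1\t1\t2\t120\t10\t40\t12\t95.0\tScore\n"
    "5\t1\t1\t1\t2\t1\t10\t40\t40\t12\t93.0\tAlice\n"
    "5\t1\t1\t1\t2\t2\t120\t40\t20\t12\t91.0\t42\n"
)

print(extract_table_from_tsv(tsv))
# | Name | Score |
# | --- | --- |
# | Alice | 42 |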