kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,184 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import csv
|
4
|
+
from io import StringIO
|
5
|
+
from typing import TYPE_CHECKING
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
|
9
|
+
from kreuzberg.exceptions import ParsingError
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from kreuzberg._types import TSVWord
|
13
|
+
|
14
|
+
|
15
|
+
def extract_words(tsv_data: str, *, min_confidence: float = 30.0) -> list[TSVWord]:
    """Parse OCR TSV output into a list of word records.

    Only rows whose ``level`` column equals ``"5"`` and whose ``text`` column
    is non-blank are considered. Rows whose confidence is below
    ``min_confidence`` are dropped, and rows with missing or non-numeric
    fields are skipped rather than failing the whole parse.

    Args:
        tsv_data: Tab-separated text whose first line is the column header.
        min_confidence: Minimum ``conf`` value a row needs to be kept.

    Returns:
        One dict per accepted row, with the integer layout fields, the float
        ``conf`` and the raw ``text``, in input order.

    Raises:
        ParsingError: If reading the TSV data fails unexpectedly.
    """
    try:
        # QUOTE_NONE: OCR TSV output is not CSV-quoted, so a recognised word
        # that contains a double quote must be read literally. With default
        # quoting a word starting with '"' opens a quoted field that swallows
        # the following rows.
        reader = csv.DictReader(StringIO(tsv_data), delimiter="\t", quoting=csv.QUOTE_NONE)
        words: list[TSVWord] = []

        for row in reader:
            if row.get("level") != "5":
                continue

            # DictReader fills columns missing from a short row with None.
            text = row.get("text")
            if not text or not text.strip():
                continue

            try:
                conf = float(row["conf"])
                if conf < min_confidence:
                    continue

                words.append(
                    {
                        "level": int(row["level"]),
                        "page_num": int(row["page_num"]),
                        "block_num": int(row["block_num"]),
                        "par_num": int(row["par_num"]),
                        "line_num": int(row["line_num"]),
                        "word_num": int(row["word_num"]),
                        "left": int(row["left"]),
                        "top": int(row["top"]),
                        "width": int(row["width"]),
                        "height": int(row["height"]),
                        "conf": conf,
                        "text": text,
                    }
                )
            except (ValueError, KeyError, TypeError):
                # Malformed row (missing column, non-numeric or absent value).
                continue

        return words

    except Exception as e:
        raise ParsingError("Failed to parse TSV data", context={"error": str(e)}) from e
|
50
|
+
|
51
|
+
|
52
|
+
def detect_columns(words: list[TSVWord], *, column_threshold: int = 20) -> list[int]:
    """Cluster the words' ``left`` coordinates into column positions.

    Distinct ``left`` values are sorted and chained into a cluster while the
    gap to the previous value is at most ``column_threshold``. Each cluster is
    reported as the integer-truncated median of its members.

    Args:
        words: Word records providing a ``left`` value.
        column_threshold: Largest gap between neighbouring ``left`` values
            that still keeps them in the same column.

    Returns:
        Column positions in ascending order; empty when ``words`` is empty.
    """
    if not words:
        return []

    lefts = sorted({word["left"] for word in words})
    if len(lefts) == 1:
        return lefts

    clusters = [[lefts[0]]]
    for left in lefts[1:]:
        latest = clusters[-1]
        # Compare against the previous value, not the cluster's start.
        if left - latest[-1] <= column_threshold:
            latest.append(left)
        else:
            clusters.append([left])

    return [int(np.median(cluster)) for cluster in clusters]
|
73
|
+
|
74
|
+
|
75
|
+
def detect_rows(words: list[TSVWord], *, row_threshold_ratio: float = 0.5) -> list[int]:
    """Cluster the words' vertical centres into row positions.

    A word's centre is ``top + height / 2``. Sorted centres join the current
    cluster while they lie within ``mean word height * row_threshold_ratio``
    of that cluster's running mean. Each cluster is reported as the
    integer-truncated median of its members.

    Args:
        words: Word records providing ``top`` and ``height`` values.
        row_threshold_ratio: Fraction of the mean word height used as the
            grouping tolerance.

    Returns:
        Row positions in ascending order; empty when ``words`` is empty.
    """
    if not words:
        return []

    centers = [word["top"] + word["height"] / 2 for word in words]
    centers.sort()
    if len(centers) == 1:
        return [int(centers[0])]

    heights = [word["height"] for word in words]
    tolerance = np.mean(heights) * row_threshold_ratio

    clusters = [[centers[0]]]
    for center in centers[1:]:
        latest = clusters[-1]
        if center - np.mean(latest) <= tolerance:
            latest.append(center)
        else:
            clusters.append([center])

    return [int(np.median(cluster)) for cluster in clusters]
|
99
|
+
|
100
|
+
|
101
|
+
def _find_closest_index(value: float, positions: list[int]) -> int:
|
102
|
+
if not positions:
|
103
|
+
return 0
|
104
|
+
|
105
|
+
distances = [abs(value - pos) for pos in positions]
|
106
|
+
return distances.index(min(distances))
|
107
|
+
|
108
|
+
|
109
|
+
def _remove_empty_rows_cols(table: list[list[str]]) -> list[list[str]]:
|
110
|
+
if not table:
|
111
|
+
return table
|
112
|
+
|
113
|
+
table = [row for row in table if any(cell.strip() for cell in row)]
|
114
|
+
|
115
|
+
if not table:
|
116
|
+
return []
|
117
|
+
|
118
|
+
non_empty_cols = [
|
119
|
+
col_idx for col_idx in range(len(table[0])) if any(row[col_idx].strip() for row in table if col_idx < len(row))
|
120
|
+
]
|
121
|
+
|
122
|
+
if not non_empty_cols:
|
123
|
+
return []
|
124
|
+
|
125
|
+
return [[row[col_idx] if col_idx < len(row) else "" for col_idx in non_empty_cols] for row in table]
|
126
|
+
|
127
|
+
|
128
|
+
def reconstruct_table(
    words: list[TSVWord], *, column_threshold: int = 20, row_threshold_ratio: float = 0.5
) -> list[list[str]]:
    """Arrange word records into a grid of cell strings.

    Column and row positions come from ``detect_columns`` and
    ``detect_rows``. Each word goes into the cell whose column is nearest to
    its ``left`` value and whose row is nearest to its vertical centre. Words
    sharing a cell are joined with single spaces in input order. Blank rows
    and columns are removed from the result.

    Args:
        words: Word records with ``left``, ``top``, ``height`` and ``text``.
        column_threshold: Forwarded to ``detect_columns``.
        row_threshold_ratio: Forwarded to ``detect_rows``.

    Returns:
        The table as a list of rows, or an empty list when there are no
        words or no columns/rows could be detected.
    """
    if not words:
        return []

    col_positions = detect_columns(words, column_threshold=column_threshold)
    row_positions = detect_rows(words, row_threshold_ratio=row_threshold_ratio)
    if not (col_positions and row_positions):
        return []

    grid: list[list[str]] = [[""] * len(col_positions) for _ in row_positions]

    for word in words:
        col = _find_closest_index(word["left"], col_positions)
        row = _find_closest_index(word["top"] + word["height"] / 2, row_positions)

        existing = grid[row][col]
        grid[row][col] = existing + " " + word["text"] if existing else word["text"]

    return _remove_empty_rows_cols(grid)
|
154
|
+
|
155
|
+
|
156
|
+
def to_markdown(table: list[list[str]]) -> str:
    """Render a table as a pipe-delimited Markdown table.

    The first row is the header and fixes the column count. Later rows are
    padded with empty cells or truncated to that width. Inside cell text,
    ``|`` is escaped as ``\\|`` and line breaks become spaces, so cell content
    cannot change the table's column or row structure.

    Args:
        table: Rows of cell values; each value is converted with ``str``.

    Returns:
        The Markdown table without a trailing newline, or an empty string
        when the table or its first row is empty.
    """
    if not table or not table[0]:
        return ""

    width = len(table[0])

    def render_row(cells: list[str]) -> str:
        # Neutralise characters that would be read as table syntax.
        rendered = [
            str(cell).replace("\r\n", " ").replace("\n", " ").replace("\r", " ").replace("|", "\\|")
            for cell in cells
        ]
        return "| " + " | ".join(rendered) + " |"

    lines = [
        render_row(table[0]),
        "| " + " | ".join(["---"] * width) + " |",
    ]

    for row in table[1:]:
        padded_row = list(row) + [""] * (width - len(row))
        lines.append(render_row(padded_row[:width]))

    return "\n".join(lines)
|
171
|
+
|
172
|
+
|
173
|
+
def extract_table_from_tsv(
    tsv_data: str, *, column_threshold: int = 20, row_threshold_ratio: float = 0.5, min_confidence: float = 30.0
) -> str:
    """Convert OCR TSV output directly into a Markdown table.

    Chains ``extract_words``, ``reconstruct_table`` and ``to_markdown``.

    Args:
        tsv_data: Tab-separated OCR output passed to ``extract_words``.
        column_threshold: Forwarded to ``reconstruct_table``.
        row_threshold_ratio: Forwarded to ``reconstruct_table``.
        min_confidence: Forwarded to ``extract_words``.

    Returns:
        The Markdown table, or an empty string when no words pass the
        confidence filter or no table could be reconstructed.
    """
    words = extract_words(tsv_data, min_confidence=min_confidence)
    table = (
        reconstruct_table(words, column_threshold=column_threshold, row_threshold_ratio=row_threshold_ratio)
        if words
        else []
    )
    return to_markdown(table) if table else ""
|