xfmr-zem 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfmr_zem/cli.py +32 -3
- xfmr_zem/client.py +59 -8
- xfmr_zem/server.py +21 -4
- xfmr_zem/servers/data_juicer/server.py +1 -1
- xfmr_zem/servers/instruction_gen/server.py +1 -1
- xfmr_zem/servers/io/server.py +1 -1
- xfmr_zem/servers/llm/parameters.yml +10 -0
- xfmr_zem/servers/nemo_curator/server.py +1 -1
- xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
- xfmr_zem/servers/ocr/engines.py +242 -0
- xfmr_zem/servers/ocr/install_models.py +63 -0
- xfmr_zem/servers/ocr/parameters.yml +4 -0
- xfmr_zem/servers/ocr/server.py +44 -0
- xfmr_zem/servers/profiler/parameters.yml +4 -0
- xfmr_zem/servers/sinks/parameters.yml +6 -0
- xfmr_zem/servers/unstructured/parameters.yml +6 -0
- xfmr_zem/servers/unstructured/server.py +62 -0
- xfmr_zem/zenml_wrapper.py +20 -7
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/METADATA +19 -1
- xfmr_zem-0.2.5.dist-info/RECORD +58 -0
- xfmr_zem-0.2.2.dist-info/RECORD +0 -23
- /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/WHEEL +0 -0
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/entry_points.txt +0 -0
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
from collections import Counter
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
from huggingface_hub import snapshot_download
|
|
24
|
+
|
|
25
|
+
from .recognizer import Recognizer
|
|
26
|
+
|
|
27
|
+
def get_project_base_directory():
    """Return the absolute directory that contains this module file."""
    module_file = Path(__file__).resolve()
    return module_file.parent
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class TableStructureRecognizer(Recognizer):
|
|
32
|
+
labels = [
|
|
33
|
+
"table",
|
|
34
|
+
"table column",
|
|
35
|
+
"table row",
|
|
36
|
+
"table column header",
|
|
37
|
+
"table projected row header",
|
|
38
|
+
"table spanning cell",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
def __init__(self):
    """Initialise the recognizer for the "tsr" model.

    The parent initialiser is first given the ``onnx`` directory that sits
    next to this module.  If that raises, the ``InfiniFlow/deepdoc``
    repository is downloaded into the same directory with
    ``snapshot_download`` and the parent initialiser is called again with
    the path it returns.  An exception from the second attempt propagates.
    """
    model_dir = os.path.join(get_project_base_directory(), "onnx")
    try:
        super().__init__(self.labels, "tsr", model_dir)
    except Exception:
        # Deliberate best-effort fallback, but record why the first attempt
        # failed instead of swallowing it silently.
        logging.warning(
            "Initialising 'tsr' from %s failed; falling back to "
            "snapshot_download of InfiniFlow/deepdoc",
            model_dir, exc_info=True)
        super().__init__(self.labels, "tsr", snapshot_download(
            repo_id="InfiniFlow/deepdoc",
            local_dir=model_dir,
            local_dir_use_symlinks=False))
|
|
50
|
+
|
|
51
|
+
def __call__(self, images, thr=0.2):
    """Detect table components and snap their edges to shared bounds.

    The parent ``__call__`` is invoked with ``images`` and ``thr``; each
    detection it yields is read through its ``"type"``, ``"score"`` and
    ``"bbox"`` entries and converted to a dict with the keys ``label``,
    ``score``, ``x0``, ``x1``, ``top`` and ``bottom``.

    Row-like and header-like components are widened to a common left/right
    bound; ``"table column"`` components are stretched to a common
    top/bottom bound.

    Returns a list with one list of component dicts per kept table.
    """
    def spans_full_width(label):
        # ``> 0`` (not ``>= 0``): the word must not be at the very start.
        return label.find("row") > 0 or label.find("header") > 0

    detected = super().__call__(images, thr)
    aligned = []
    # align left&right for rows, align top&bottom for columns
    for detections in detected:
        parts = [
            {
                "label": det["type"],
                "score": det["score"],
                "x0": det["bbox"][0],
                "x1": det["bbox"][2],
                "top": det["bbox"][1],
                "bottom": det["bbox"][-1],
            }
            for det in detections
        ]
        # NOTE(review): tables without detections, or without any
        # row/header detection, are skipped entirely, so the result can be
        # shorter than the parent's output - confirm callers do not index
        # the result by image position.
        if not parts:
            continue
        wide = [p for p in parts if spans_full_width(p["label"])]
        if not wide:
            continue

        lefts = [p["x0"] for p in wide]
        rights = [p["x1"] for p in wide]
        # With more than four samples use the mean, otherwise the extreme.
        left = np.mean(lefts) if len(lefts) > 4 else np.min(lefts)
        right = np.mean(rights) if len(rights) > 4 else np.max(rights)
        for p in wide:
            if p["x0"] > left:
                p["x0"] = left
            if p["x1"] < right:
                p["x1"] = right

        columns = [p for p in parts if p["label"] == "table column"]
        if columns:
            tops = [p["top"] for p in columns]
            bottoms = [p["bottom"] for p in columns]
            top = np.median(tops) if len(tops) > 4 else np.min(tops)
            bottom = np.median(bottoms) if len(bottoms) > 4 else np.max(bottoms)
            for p in columns:
                if p["top"] > top:
                    p["top"] = top
                if p["bottom"] < bottom:
                    p["bottom"] = bottom

        aligned.append(parts)
    return aligned
|
|
95
|
+
|
|
96
|
+
@staticmethod
|
|
97
|
+
def is_caption(bx):
|
|
98
|
+
patt = [
|
|
99
|
+
r"[图表]+[ 0-9::]{2,}"
|
|
100
|
+
]
|
|
101
|
+
if any([re.match(p, bx["text"].strip()) for p in patt]) \
|
|
102
|
+
or bx["layout_type"].find("caption") >= 0:
|
|
103
|
+
return True
|
|
104
|
+
return False
|
|
105
|
+
|
|
106
|
+
@staticmethod
|
|
107
|
+
def blockType(b):
|
|
108
|
+
patt = [
|
|
109
|
+
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
|
110
|
+
(r"^(20|19)[0-9]{2}年$", "Dt"),
|
|
111
|
+
(r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
|
|
112
|
+
("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
|
|
113
|
+
(r"^第*[一二三四1-4]季度$", "Dt"),
|
|
114
|
+
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
|
|
115
|
+
(r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
|
|
116
|
+
("^[0-9.,+%/ -]+$", "Nu"),
|
|
117
|
+
(r"^[0-9A-Z/\._~-]+$", "Ca"),
|
|
118
|
+
(r"^[A-Z]*[a-z' -]+$", "En"),
|
|
119
|
+
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
|
|
120
|
+
(r"^.{1}$", "Sg")
|
|
121
|
+
]
|
|
122
|
+
for p, n in patt:
|
|
123
|
+
if re.search(p, b["text"].strip()):
|
|
124
|
+
return n
|
|
125
|
+
|
|
126
|
+
@staticmethod
def construct_table(boxes, is_english=False, markdown=True, **kwargs):
    """Arrange text boxes into a row/column grid and render it.

    Args:
        boxes: list of box dicts.  The code reads ``text``,
            ``layout_type``, ``top``, ``bottom``, ``x0``, ``x1`` and
            ``page_number`` from every box and, when present, ``R``,
            ``R_top``, ``R_bott``, ``C``, ``C_left``, ``C_right`` and
            ``H``.  The list is modified in place: caption boxes are
            popped, and ``btype``, ``rn`` (row index) and ``cn`` (column
            index) are written to the remaining boxes.
        is_english: when true, caption fragments are separated by a space
            and the flag is forwarded to the text renderer.
        markdown: render a Markdown table (str) when true, otherwise a
            list of descriptive text lines.
        **kwargs: accepted but not used by this method.

    Returns:
        ``[]`` when no non-caption box is left, otherwise the rendered
        table.
    """
    # Pull caption boxes out of the list and concatenate their text.
    cap = ""
    i = 0
    while i < len(boxes):
        if not TableStructureRecognizer.is_caption(boxes[i]):
            i += 1
            continue
        # Fix: the original ``cap + " "`` discarded its result, so English
        # caption fragments were glued together without a separator.
        if is_english and cap:
            cap += " "
        cap += boxes[i]["text"]
        boxes.pop(i)

    if not boxes:
        return []
    for b in boxes:
        b["btype"] = TableStructureRecognizer.blockType(b)
    # Most frequent block type; used below to spot header rows.
    max_type = Counter([b["btype"] for b in boxes]).items()
    max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
    logging.debug("MAXTYPE: " + str(max_type))

    # ---- group boxes into rows -------------------------------------
    rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
    rowh = np.min(rowh) if rowh else 0
    boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
    boxes[0]["rn"] = 0
    rows = [[boxes[0]]]
    btm = boxes[0]["bottom"]
    for b in boxes[1:]:
        b["rn"] = len(rows) - 1
        lst_r = rows[-1]
        if lst_r[-1].get("R", "") != b.get("R", "") \
                or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                    ):  # new row
            btm = b["bottom"]
            b["rn"] += 1
            rows.append([b])
            continue
        # Same row: keep a running average of the row's bottom edge.
        btm = (btm + b["bottom"]) / 2.
        rows[-1].append(b)

    # ---- group boxes into columns ----------------------------------
    colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
    colwm = np.min(colwm) if colwm else 0
    crosspage = len(set([b["page_number"] for b in boxes])) > 1
    if crosspage:
        boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
    else:
        boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
    boxes[0]["cn"] = 0
    cols = [[boxes[0]]]
    right = boxes[0]["x1"]
    for b in boxes[1:]:
        b["cn"] = len(cols) - 1
        lst_c = cols[-1]
        if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
            "page_number"]) \
                or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
            right = b["x1"]
            b["cn"] += 1
            cols.append([b])
            continue
        right = (right + b["x1"]) / 2.
        cols[-1].append(b)

    # Grid of cells; every cell is a list of boxes.
    tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
    for b in boxes:
        tbl[b["rn"]][b["cn"]].append(b)

    if len(rows) >= 4:
        # remove single in column
        j = 0
        while j < len(tbl[0]):
            # e counts occupied cells in column j (stops at 2); ii is the
            # row of the last occupied cell seen.
            e, ii = 0, 0
            for i in range(len(tbl)):
                if tbl[i][j]:
                    e += 1
                    ii = i
                if e > 1:
                    break
            if e > 1:
                j += 1
                continue
            # f / ff: the left / right neighbour in row ii holds text, or
            # column j is at that edge of the table.
            f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                 [j - 1][0].get("text")) or j == 0
            ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                  [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
            if f and ff:
                j += 1
                continue
            bx = tbl[ii][j][0]
            logging.debug("Relocate column single: " + bx["text"])
            # j column only has one value
            left, right = 100000, 100000
            if j > 0 and not f:
                for i in range(len(tbl)):
                    if tbl[i][j - 1]:
                        left = min(left, np.min(
                            [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
            if j + 1 < len(tbl[0]) and not ff:
                for i in range(len(tbl)):
                    if tbl[i][j + 1]:
                        right = min(right, np.min(
                            [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
            assert left < 100000 or right < 100000
            if left < right:
                # Closer to the left neighbour: merge into column j - 1.
                for jj in range(j, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j - 1]:
                    tbl[ii][j - 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j - 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)

            else:
                # Otherwise merge into column j + 1.
                for jj in range(j + 1, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j + 1]:
                    tbl[ii][j + 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j + 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            cols.pop(j)
    assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
        len(cols), len(tbl[0]))

    if len(cols) >= 4:
        # remove single in row
        i = 0
        while i < len(tbl):
            e, jj = 0, 0
            for j in range(len(tbl[i])):
                if tbl[i][j]:
                    e += 1
                    jj = j
                if e > 1:
                    break
            if e > 1:
                i += 1
                continue
            # f / ff: the cell above / below in column jj holds text, or
            # row i is at that edge of the table.
            f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                 [jj][0].get("text")) or i == 0
            ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                  [jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                i += 1
                continue

            bx = tbl[i][jj][0]
            logging.debug("Relocate row single: " + bx["text"])
            # i row only has one value
            up, down = 100000, 100000
            if i > 0 and not f:
                for j in range(len(tbl[i - 1])):
                    if tbl[i - 1][j]:
                        up = min(up, np.min(
                            [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
            if i + 1 < len(tbl) and not ff:
                for j in range(len(tbl[i + 1])):
                    if tbl[i + 1][j]:
                        down = min(down, np.min(
                            [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
            assert up < 100000 or down < 100000
            if up < down:
                # Closer to the row above: merge into row i - 1.
                for ii in range(i, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i - 1][jj]:
                    tbl[i - 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i - 1][jj] = tbl[i][jj]
                tbl.pop(i)

            else:
                # Otherwise merge into row i + 1.
                for ii in range(i + 1, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i + 1][jj]:
                    tbl[i + 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i + 1][jj] = tbl[i][jj]
                tbl.pop(i)
            rows.pop(i)

    # which rows are headers
    hdset = set([])
    for i in range(len(tbl)):
        # cnt: occupied cells in the row; h: cells that look like headers.
        cnt, h = 0, 0
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            cnt += 1
            if max_type == "Nu" and arr[0]["btype"] == "Nu":
                continue
            if any([a.get("H") for a in arr]) \
                    or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                h += 1
        if h / cnt > 0.5:
            hdset.add(i)

    if markdown:
        return TableStructureRecognizer.__markdown_table(cap, hdset,
                                                         TableStructureRecognizer.__cal_spans(boxes, rows,
                                                                                              cols, tbl, False)
                                                         )

    return TableStructureRecognizer.__desc_table(cap, hdset,
                                                 TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl,
                                                                                      False),
                                                 is_english)
|
|
343
|
+
|
|
344
|
+
@staticmethod
|
|
345
|
+
def __markdown_table(cap, hdset, tbl):
|
|
346
|
+
# Construct Markdown table
|
|
347
|
+
md = ""
|
|
348
|
+
if cap:
|
|
349
|
+
md += f"**{cap}**\n\n"
|
|
350
|
+
header_row = None
|
|
351
|
+
for i, row in enumerate(tbl):
|
|
352
|
+
if header_row is None and (i in hdset or i == 0):
|
|
353
|
+
header_row = i
|
|
354
|
+
break
|
|
355
|
+
if header_row is None:
|
|
356
|
+
header_row = 0
|
|
357
|
+
# Compose header
|
|
358
|
+
headers = []
|
|
359
|
+
for cell in tbl[header_row]:
|
|
360
|
+
if not cell:
|
|
361
|
+
headers.append("")
|
|
362
|
+
else:
|
|
363
|
+
txt = " ".join([c["text"] for c in cell])
|
|
364
|
+
headers.append(txt)
|
|
365
|
+
md += "| " + " | ".join(headers) + " |\n"
|
|
366
|
+
md += "| " + " | ".join(["---"] * len(headers)) + " |\n"
|
|
367
|
+
# Compose rows
|
|
368
|
+
for i, row in enumerate(tbl):
|
|
369
|
+
if i == header_row:
|
|
370
|
+
continue
|
|
371
|
+
row_cells = []
|
|
372
|
+
for cell in row:
|
|
373
|
+
if not cell:
|
|
374
|
+
row_cells.append("")
|
|
375
|
+
else:
|
|
376
|
+
txt = " ".join([c["text"] for c in cell])
|
|
377
|
+
row_cells.append(txt)
|
|
378
|
+
md += "| " + " | ".join(row_cells) + " |\n"
|
|
379
|
+
return md
|
|
380
|
+
|
|
381
|
+
@staticmethod
def __desc_table(cap, hdr_rowno, tbl, is_english):
    """Describe the table as a list of text lines, roughly one per data row.

    Args:
        cap: caption text; when non-empty it is appended to every line.
        hdr_rowno: set of header row indices.  It is modified in place:
            header rows whose cells are all empty are removed from it.
        tbl: grid of cells, each cell being a list of boxes with a
            ``"text"`` entry (or a falsy value for an empty cell).
        is_english: selects the English or the Chinese connector words.

    Returns:
        A list of strings.  Cells of a data row are written as
        ``header:text`` and joined with ``"; "``; rows of tables with at
        most two columns and no usable header are joined with ``":"`` and
        may be merged into the previous line when short.
    """
    # get text of every column in header row to become header text
    clmno = len(tbl[0])
    rowno = len(tbl)
    headers = {}
    hdrset = set()  # filled below but never read in this method
    lst_hdr = []
    de = "的" if not is_english else " for "
    for r in sorted(list(hdr_rowno)):
        headers[r] = ["" for _ in range(clmno)]
        for i in range(clmno):
            if not tbl[r][i]:
                continue
            txt = " ".join([a["text"].strip() for a in tbl[r][i]])
            headers[r][i] = txt
            hdrset.add(txt)
        # A header row without any text is not a header after all.
        if all([not t for t in headers[r]]):
            del headers[r]
            hdr_rowno.remove(r)
            continue
        # Fill empty header cells from the previous kept header row.
        for j in range(clmno):
            if headers[r][j]:
                continue
            if j >= len(lst_hdr):
                break
            headers[r][j] = lst_hdr[j]
        lst_hdr = headers[r]
    # Combine the texts of directly consecutive header rows column by
    # column, so the lower row carries the combined header text.
    for i in range(rowno):
        if i not in hdr_rowno:
            continue
        for j in range(i + 1, rowno):
            if j not in hdr_rowno:
                break
            for k in range(clmno):
                if not headers[j - 1][k]:
                    continue
                # Already contained: nothing to add.
                if headers[j][k].find(headers[j - 1][k]) >= 0:
                    continue
                if len(headers[j][k]) > len(headers[j - 1][k]):
                    headers[j][k] += (de if headers[j][k]
                                      else "") + headers[j - 1][k]
                else:
                    headers[j][k] = headers[j - 1][k] \
                        + (de if headers[j - 1][k] else "") \
                        + headers[j][k]

    logging.debug(
        f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
    row_txt = []
    for i in range(rowno):
        if i in hdr_rowno:
            continue
        rtxt = []

        def append(delimer):
            # Join the collected cell texts and either merge them into the
            # previous output line (combined length under 64) or start a
            # new line.
            nonlocal rtxt, row_txt
            rtxt = delimer.join(rtxt)
            if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
                row_txt[-1] += "\n" + rtxt
            else:
                row_txt.append(rtxt)

        # r: nearest header row above row i (stays 0 when there is none).
        r = 0
        if len(headers.items()):
            _arr = [(i - r, r) for r, _ in headers.items() if r < i]
            if _arr:
                _, r = min(_arr, key=lambda x: x[0])

        # No header available and at most two columns: emit the cell texts
        # joined with ":".
        if r not in headers and clmno <= 2:
            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if txt:
                    rtxt.append(txt)
            if rtxt:
                append(":")
            continue

        # General case: prefix every non-empty cell with its header text.
        for j in range(clmno):
            if not tbl[i][j]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[i][j]])
            if not txt:
                continue
            ctt = headers[r][j] if r in headers else ""
            if ctt:
                ctt += ":"
            ctt += txt
            if ctt:
                rtxt.append(ctt)

        if rtxt:
            row_txt.append("; ".join(rtxt))

    # Attribute every line to the caption, if there is one.
    if cap:
        if is_english:
            from_ = " in "
        else:
            from_ = "来自"
        row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
    return row_txt
|
|
484
|
+
|
|
485
|
+
@staticmethod
|
|
486
|
+
def __cal_spans(boxes, rows, cols, tbl, markdown=True):
|
|
487
|
+
# caculate span
|
|
488
|
+
clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
|
|
489
|
+
for cln in cols]
|
|
490
|
+
crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
|
|
491
|
+
for cln in cols]
|
|
492
|
+
rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
|
|
493
|
+
for row in rows]
|
|
494
|
+
rbtm = [np.mean([c.get("R_btm", c["bottom"])
|
|
495
|
+
for c in row]) for row in rows]
|
|
496
|
+
for b in boxes:
|
|
497
|
+
if "SP" not in b:
|
|
498
|
+
continue
|
|
499
|
+
b["colspan"] = [b["cn"]]
|
|
500
|
+
b["rowspan"] = [b["rn"]]
|
|
501
|
+
# col span
|
|
502
|
+
for j in range(0, len(clft)):
|
|
503
|
+
if j == b["cn"]:
|
|
504
|
+
continue
|
|
505
|
+
if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
|
|
506
|
+
continue
|
|
507
|
+
if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
|
|
508
|
+
continue
|
|
509
|
+
b["colspan"].append(j)
|
|
510
|
+
# row span
|
|
511
|
+
for j in range(0, len(rtop)):
|
|
512
|
+
if j == b["rn"]:
|
|
513
|
+
continue
|
|
514
|
+
if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
|
|
515
|
+
continue
|
|
516
|
+
if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
|
|
517
|
+
continue
|
|
518
|
+
b["rowspan"].append(j)
|
|
519
|
+
|
|
520
|
+
def join(arr):
|
|
521
|
+
if not arr:
|
|
522
|
+
return ""
|
|
523
|
+
return "".join([t["text"] for t in arr])
|
|
524
|
+
|
|
525
|
+
# rm the spaning cells
|
|
526
|
+
for i in range(len(tbl)):
|
|
527
|
+
for j, arr in enumerate(tbl[i]):
|
|
528
|
+
if not arr:
|
|
529
|
+
continue
|
|
530
|
+
if all(["rowspan" not in a and "colspan" not in a for a in arr]):
|
|
531
|
+
continue
|
|
532
|
+
rowspan, colspan = [], []
|
|
533
|
+
for a in arr:
|
|
534
|
+
if isinstance(a.get("rowspan", 0), list):
|
|
535
|
+
rowspan.extend(a["rowspan"])
|
|
536
|
+
if isinstance(a.get("colspan", 0), list):
|
|
537
|
+
colspan.extend(a["colspan"])
|
|
538
|
+
rowspan, colspan = set(rowspan), set(colspan)
|
|
539
|
+
if len(rowspan) < 2 and len(colspan) < 2:
|
|
540
|
+
for a in arr:
|
|
541
|
+
if "rowspan" in a:
|
|
542
|
+
del a["rowspan"]
|
|
543
|
+
if "colspan" in a:
|
|
544
|
+
del a["colspan"]
|
|
545
|
+
continue
|
|
546
|
+
rowspan, colspan = sorted(rowspan), sorted(colspan)
|
|
547
|
+
rowspan = list(range(rowspan[0], rowspan[-1] + 1))
|
|
548
|
+
colspan = list(range(colspan[0], colspan[-1] + 1))
|
|
549
|
+
assert i in rowspan, rowspan
|
|
550
|
+
assert j in colspan, colspan
|
|
551
|
+
arr = []
|
|
552
|
+
for r in rowspan:
|
|
553
|
+
for c in colspan:
|
|
554
|
+
arr_txt = join(arr)
|
|
555
|
+
if tbl[r][c] and join(tbl[r][c]) != arr_txt:
|
|
556
|
+
arr.extend(tbl[r][c])
|
|
557
|
+
tbl[r][c] = None if markdown else arr
|
|
558
|
+
for a in arr:
|
|
559
|
+
if len(rowspan) > 1:
|
|
560
|
+
a["rowspan"] = len(rowspan)
|
|
561
|
+
elif "rowspan" in a:
|
|
562
|
+
del a["rowspan"]
|
|
563
|
+
if len(colspan) > 1:
|
|
564
|
+
a["colspan"] = len(colspan)
|
|
565
|
+
elif "colspan" in a:
|
|
566
|
+
del a["colspan"]
|
|
567
|
+
tbl[rowspan[0]][colspan[0]] = arr
|
|
568
|
+
|
|
569
|
+
return tbl
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import base64
|
|
17
|
+
import datetime
|
|
18
|
+
import io
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import pickle
|
|
22
|
+
import time
|
|
23
|
+
import uuid
|
|
24
|
+
import logging
|
|
25
|
+
import copy
|
|
26
|
+
from enum import Enum, IntEnum
|
|
27
|
+
|
|
28
|
+
from . import file_utils
|
|
29
|
+
|
|
30
|
+
def read_config(conf_name=None):
    """Return an empty configuration mapping; ``conf_name`` is ignored."""
    # Simplified: no RagFlow conf files are read, so there is nothing to load.
    empty_config = {}
    return empty_config
|
|
33
|
+
|
|
34
|
+
# Module-level configuration mapping, built once at import time by
# read_config() and looked up by get_base_config().
CONFIGS = read_config()
|
|
35
|
+
|
|
36
|
+
def get_base_config(key, default=None):
    """Look up ``key`` in ``CONFIGS``.

    Returns ``None`` when ``key`` is ``None`` (``default`` is not used in
    that case), otherwise ``CONFIGS.get(key, default)``.
    """
    return None if key is None else CONFIGS.get(key, default)
|
|
40
|
+
|
|
41
|
+
class BaseType:
    """Base class whose instances can export their attributes as a dict."""

    def to_dict(self):
        """Return the instance attributes with leading underscores removed from the names."""
        return {name.lstrip("_"): value for name, value in self.__dict__.items()}
|
|
44
|
+
|
|
45
|
+
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder with support for dates, enums, sets and ``BaseType``."""

    def default(self, obj):
        """Convert ``obj`` to a JSON-serialisable value.

        ``datetime`` is checked before ``date`` because every ``datetime``
        is also a ``date``.  Anything not handled here is passed to
        ``json.JSONEncoder.default``.
        """
        if isinstance(obj, datetime.datetime):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        if isinstance(obj, datetime.date):
            return obj.strftime('%Y-%m-%d')
        if isinstance(obj, datetime.timedelta):
            return str(obj)
        # IntEnum derives from Enum, so one check covers both.
        if isinstance(obj, Enum):
            return obj.value
        if isinstance(obj, set):
            return list(obj)
        if isinstance(obj, BaseType):
            return obj.to_dict()
        return super().default(obj)
|
|
61
|
+
|
|
62
|
+
def json_dumps(src, byte=False, indent=None):
    """Serialise ``src`` to JSON using ``CustomJSONEncoder``.

    Returns a ``str``, or UTF-8 encoded ``bytes`` when ``byte`` is true.
    ``indent`` is forwarded to ``json.dumps``.
    """
    text = json.dumps(src, indent=indent, cls=CustomJSONEncoder)
    return text.encode(encoding="utf-8") if byte else text
|
|
67
|
+
|
|
68
|
+
def json_loads(src):
    """Parse JSON from ``src``; ``bytes`` input is decoded as UTF-8 first."""
    text = src.decode(encoding="utf-8") if isinstance(src, bytes) else src
    return json.loads(text)
|
|
72
|
+
|
|
73
|
+
def current_timestamp():
    """Return the current ``time.time()`` value in milliseconds, truncated to ``int``."""
    millis = time.time() * 1000
    return int(millis)
|
|
75
|
+
|
|
76
|
+
def get_uuid():
    """Return a new ``uuid.uuid1()`` value as a 32-character hex string."""
    generated = uuid.uuid1()
    return generated.hex
|
|
78
|
+
|
|
79
|
+
def datetime_format(date_time: datetime.datetime) -> datetime.datetime:
    """Return a new ``datetime`` built from the year-to-second fields only.

    Only year, month, day, hour, minute and second are copied, so the
    microsecond is reset to zero and no ``tzinfo`` is carried over.
    """
    fields = (
        date_time.year,
        date_time.month,
        date_time.day,
        date_time.hour,
        date_time.minute,
        date_time.second,
    )
    return datetime.datetime(*fields)
|