deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,617 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
from collections import Counter
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
from ..common.model_store import resolve_vision_model_dir
|
|
23
|
+
from ..depend import rag_tokenizer
|
|
24
|
+
|
|
25
|
+
from .recognizer import Recognizer
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TableStructureRecognizer(Recognizer):
|
|
29
|
+
labels = [
|
|
30
|
+
"table",
|
|
31
|
+
"table column",
|
|
32
|
+
"table row",
|
|
33
|
+
"table column header",
|
|
34
|
+
"table projected row header",
|
|
35
|
+
"table spanning cell",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
def __init__(
    self,
    model_dir: str | None = None,
    model_home: str | None = None,
    model_provider: str | None = None,
    offline: bool | None = None,
):
    """Set up the recognizer with the ``"tsr"`` model from a model directory.

    Args:
        model_dir: Directory holding the model files. When falsy, the
            directory is obtained from ``resolve_vision_model_dir``.
        model_home: Forwarded to ``resolve_vision_model_dir`` as ``model_home``.
        model_provider: Forwarded to ``resolve_vision_model_dir`` as ``provider``.
        offline: Forwarded to ``resolve_vision_model_dir`` as ``offline``.
    """
    resolved_dir = model_dir or resolve_vision_model_dir(
        model_home=model_home,
        provider=model_provider,
        offline=offline,
    )
    # Kept on the instance because the Ascend backend falls back to it.
    self.model_dir = resolved_dir
    super().__init__(self.labels, "tsr", resolved_dir)
|
|
53
|
+
|
|
54
|
+
def __call__(self, images, thr=0.2):
    """Detect table structure elements and snap them to shared table edges.

    The backend is chosen by the ``TABLE_STRUCTURE_RECOGNIZER_TYPE``
    environment variable: ``"onnx"`` (the default) or ``"ascend"``.

    Args:
        images: Table images, passed unchanged to the selected backend.
        thr: Score threshold, passed unchanged to the selected backend.

    Returns:
        A list with exactly one entry per table returned by the backend, in
        the same order. Each entry is a list of dicts with the keys
        ``label``, ``score``, ``x0``, ``x1``, ``top`` and ``bottom``.
        Row/header elements are widened to a common left/right edge and
        ``"table column"`` elements are stretched to a common top/bottom
        edge. A table without detections yields an empty list.

    Raises:
        RuntimeError: If the environment variable names an unknown backend.
    """
    backend = os.getenv("TABLE_STRUCTURE_RECOGNIZER_TYPE", "onnx").lower()
    if backend not in ("onnx", "ascend"):
        raise RuntimeError("Unsupported table structure recognizer type.")

    if backend == "onnx":
        logging.debug("Using Onnx table structure recognizer")
        tbls = super().__call__(images, thr)
    else:  # ascend
        logging.debug("Using Ascend table structure recognizer")
        tbls = self._run_ascend_tsr(images, thr)

    def spans_table_width(label):
        # "> 0" (not ">= 0") is kept from the original matching rule: the
        # keyword must not sit at the very start of the label.
        return label.find("row") > 0 or label.find("header") > 0

    res = []
    for tbl in tbls:
        lts = [
            {
                "label": b["type"],
                "score": b["score"],
                "x0": b["bbox"][0],
                "x1": b["bbox"][2],
                "top": b["bbox"][1],
                "bottom": b["bbox"][-1],
            }
            for b in tbl
        ]
        # Append first and align in place afterwards. Previously a table with
        # no detections (or no row/header detections) was skipped entirely,
        # which shifted every later entry and broke the one-result-per-table
        # correspondence with the backend output.
        res.append(lts)

        # Align left & right edges of rows and headers.
        wide = [b for b in lts if spans_table_width(b["label"])]
        if wide:
            lefts = [b["x0"] for b in wide]
            rights = [b["x1"] for b in wide]
            # With few samples use the extreme edge, otherwise the mean.
            left = np.mean(lefts) if len(lefts) > 4 else np.min(lefts)
            right = np.mean(rights) if len(rights) > 4 else np.max(rights)
            for b in wide:
                if b["x0"] > left:
                    b["x0"] = left
                if b["x1"] < right:
                    b["x1"] = right

        # Align top & bottom edges of columns.
        columns = [b for b in lts if b["label"] == "table column"]
        if columns:
            tops = [b["top"] for b in columns]
            bottoms = [b["bottom"] for b in columns]
            top = np.median(tops) if len(tops) > 4 else np.min(tops)
            bottom = np.median(bottoms) if len(bottoms) > 4 else np.max(bottoms)
            for b in columns:
                if b["top"] > top:
                    b["top"] = top
                if b["bottom"] < bottom:
                    b["bottom"] = bottom

    return res
|
|
112
|
+
|
|
113
|
+
@staticmethod
|
|
114
|
+
def is_caption(bx):
|
|
115
|
+
patt = [r"[图表]+[ 0-9::]{2,}"]
|
|
116
|
+
if any([re.match(p, bx["text"].strip()) for p in patt]) or bx.get("layout_type", "").find("caption") >= 0:
|
|
117
|
+
return True
|
|
118
|
+
return False
|
|
119
|
+
|
|
120
|
+
@staticmethod
|
|
121
|
+
def blockType(b):
|
|
122
|
+
patt = [
|
|
123
|
+
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
|
124
|
+
(r"^(20|19)[0-9]{2}年$", "Dt"),
|
|
125
|
+
(r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
|
|
126
|
+
("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
|
|
127
|
+
(r"^第*[一二三四1-4]季度$", "Dt"),
|
|
128
|
+
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
|
|
129
|
+
(r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
|
|
130
|
+
("^[0-9.,+%/ -]+$", "Nu"),
|
|
131
|
+
(r"^[0-9A-Z/\._~-]+$", "Ca"),
|
|
132
|
+
(r"^[A-Z]*[a-z' -]+$", "En"),
|
|
133
|
+
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
|
|
134
|
+
(r"^.{1}$", "Sg"),
|
|
135
|
+
]
|
|
136
|
+
for p, n in patt:
|
|
137
|
+
if re.search(p, b["text"].strip()):
|
|
138
|
+
return n
|
|
139
|
+
tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
|
|
140
|
+
if len(tks) > 3:
|
|
141
|
+
if len(tks) < 12:
|
|
142
|
+
return "Tx"
|
|
143
|
+
else:
|
|
144
|
+
return "Lx"
|
|
145
|
+
|
|
146
|
+
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
|
147
|
+
return "Nr"
|
|
148
|
+
|
|
149
|
+
return "Ot"
|
|
150
|
+
|
|
151
|
+
@staticmethod
|
|
152
|
+
def construct_table(boxes, is_english=False, html=True, **kwargs):
|
|
153
|
+
cap = ""
|
|
154
|
+
i = 0
|
|
155
|
+
while i < len(boxes):
|
|
156
|
+
if TableStructureRecognizer.is_caption(boxes[i]):
|
|
157
|
+
if is_english:
|
|
158
|
+
cap += " "
|
|
159
|
+
cap += boxes[i]["text"]
|
|
160
|
+
boxes.pop(i)
|
|
161
|
+
i -= 1
|
|
162
|
+
i += 1
|
|
163
|
+
|
|
164
|
+
if not boxes:
|
|
165
|
+
return []
|
|
166
|
+
for b in boxes:
|
|
167
|
+
b["btype"] = TableStructureRecognizer.blockType(b)
|
|
168
|
+
max_type = Counter([b["btype"] for b in boxes]).items()
|
|
169
|
+
max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
|
|
170
|
+
logging.debug("MAXTYPE: " + max_type)
|
|
171
|
+
|
|
172
|
+
rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
|
|
173
|
+
rowh = np.min(rowh) if rowh else 0
|
|
174
|
+
boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
|
|
175
|
+
# for b in boxes:print(b)
|
|
176
|
+
boxes[0]["rn"] = 0
|
|
177
|
+
rows = [[boxes[0]]]
|
|
178
|
+
btm = boxes[0]["bottom"]
|
|
179
|
+
for b in boxes[1:]:
|
|
180
|
+
b["rn"] = len(rows) - 1
|
|
181
|
+
lst_r = rows[-1]
|
|
182
|
+
if lst_r[-1].get("R", "") != b.get("R", "") or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")): # new row
|
|
183
|
+
btm = b["bottom"]
|
|
184
|
+
b["rn"] += 1
|
|
185
|
+
rows.append([b])
|
|
186
|
+
continue
|
|
187
|
+
btm = (btm + b["bottom"]) / 2.0
|
|
188
|
+
rows[-1].append(b)
|
|
189
|
+
|
|
190
|
+
colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
|
|
191
|
+
colwm = np.min(colwm) if colwm else 0
|
|
192
|
+
crosspage = len(set([b["page_number"] for b in boxes])) > 1
|
|
193
|
+
if crosspage:
|
|
194
|
+
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
|
|
195
|
+
else:
|
|
196
|
+
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
|
|
197
|
+
boxes[0]["cn"] = 0
|
|
198
|
+
cols = [[boxes[0]]]
|
|
199
|
+
right = boxes[0]["x1"]
|
|
200
|
+
for b in boxes[1:]:
|
|
201
|
+
b["cn"] = len(cols) - 1
|
|
202
|
+
lst_c = cols[-1]
|
|
203
|
+
if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1]["page_number"]) or (
|
|
204
|
+
b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")
|
|
205
|
+
): # new col
|
|
206
|
+
right = b["x1"]
|
|
207
|
+
b["cn"] += 1
|
|
208
|
+
cols.append([b])
|
|
209
|
+
continue
|
|
210
|
+
right = (right + b["x1"]) / 2.0
|
|
211
|
+
cols[-1].append(b)
|
|
212
|
+
|
|
213
|
+
tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
|
|
214
|
+
for b in boxes:
|
|
215
|
+
tbl[b["rn"]][b["cn"]].append(b)
|
|
216
|
+
|
|
217
|
+
if len(rows) >= 4:
|
|
218
|
+
# remove single in column
|
|
219
|
+
j = 0
|
|
220
|
+
while j < len(tbl[0]):
|
|
221
|
+
e, ii = 0, 0
|
|
222
|
+
for i in range(len(tbl)):
|
|
223
|
+
if tbl[i][j]:
|
|
224
|
+
e += 1
|
|
225
|
+
ii = i
|
|
226
|
+
if e > 1:
|
|
227
|
+
break
|
|
228
|
+
if e > 1:
|
|
229
|
+
j += 1
|
|
230
|
+
continue
|
|
231
|
+
f = (j > 0 and tbl[ii][j - 1] and tbl[ii][j - 1][0].get("text")) or j == 0
|
|
232
|
+
ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii][j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
|
|
233
|
+
if f and ff:
|
|
234
|
+
j += 1
|
|
235
|
+
continue
|
|
236
|
+
bx = tbl[ii][j][0]
|
|
237
|
+
logging.debug("Relocate column single: " + bx["text"])
|
|
238
|
+
# j column only has one value
|
|
239
|
+
left, right = 100000, 100000
|
|
240
|
+
if j > 0 and not f:
|
|
241
|
+
for i in range(len(tbl)):
|
|
242
|
+
if tbl[i][j - 1]:
|
|
243
|
+
left = min(left, np.min([bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
|
|
244
|
+
if j + 1 < len(tbl[0]) and not ff:
|
|
245
|
+
for i in range(len(tbl)):
|
|
246
|
+
if tbl[i][j + 1]:
|
|
247
|
+
right = min(right, np.min([a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
|
|
248
|
+
assert left < 100000 or right < 100000
|
|
249
|
+
if left < right:
|
|
250
|
+
for jj in range(j, len(tbl[0])):
|
|
251
|
+
for i in range(len(tbl)):
|
|
252
|
+
for a in tbl[i][jj]:
|
|
253
|
+
a["cn"] -= 1
|
|
254
|
+
if tbl[ii][j - 1]:
|
|
255
|
+
tbl[ii][j - 1].extend(tbl[ii][j])
|
|
256
|
+
else:
|
|
257
|
+
tbl[ii][j - 1] = tbl[ii][j]
|
|
258
|
+
for i in range(len(tbl)):
|
|
259
|
+
tbl[i].pop(j)
|
|
260
|
+
|
|
261
|
+
else:
|
|
262
|
+
for jj in range(j + 1, len(tbl[0])):
|
|
263
|
+
for i in range(len(tbl)):
|
|
264
|
+
for a in tbl[i][jj]:
|
|
265
|
+
a["cn"] -= 1
|
|
266
|
+
if tbl[ii][j + 1]:
|
|
267
|
+
tbl[ii][j + 1].extend(tbl[ii][j])
|
|
268
|
+
else:
|
|
269
|
+
tbl[ii][j + 1] = tbl[ii][j]
|
|
270
|
+
for i in range(len(tbl)):
|
|
271
|
+
tbl[i].pop(j)
|
|
272
|
+
cols.pop(j)
|
|
273
|
+
assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (len(cols), len(tbl[0]))
|
|
274
|
+
|
|
275
|
+
if len(cols) >= 4:
|
|
276
|
+
# remove single in row
|
|
277
|
+
i = 0
|
|
278
|
+
while i < len(tbl):
|
|
279
|
+
e, jj = 0, 0
|
|
280
|
+
for j in range(len(tbl[i])):
|
|
281
|
+
if tbl[i][j]:
|
|
282
|
+
e += 1
|
|
283
|
+
jj = j
|
|
284
|
+
if e > 1:
|
|
285
|
+
break
|
|
286
|
+
if e > 1:
|
|
287
|
+
i += 1
|
|
288
|
+
continue
|
|
289
|
+
f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1][jj][0].get("text")) or i == 0
|
|
290
|
+
ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1][jj][0].get("text")) or i + 1 >= len(tbl)
|
|
291
|
+
if f and ff:
|
|
292
|
+
i += 1
|
|
293
|
+
continue
|
|
294
|
+
|
|
295
|
+
bx = tbl[i][jj][0]
|
|
296
|
+
logging.debug("Relocate row single: " + bx["text"])
|
|
297
|
+
# i row only has one value
|
|
298
|
+
up, down = 100000, 100000
|
|
299
|
+
if i > 0 and not f:
|
|
300
|
+
for j in range(len(tbl[i - 1])):
|
|
301
|
+
if tbl[i - 1][j]:
|
|
302
|
+
up = min(up, np.min([bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
|
|
303
|
+
if i + 1 < len(tbl) and not ff:
|
|
304
|
+
for j in range(len(tbl[i + 1])):
|
|
305
|
+
if tbl[i + 1][j]:
|
|
306
|
+
down = min(down, np.min([a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
|
|
307
|
+
assert up < 100000 or down < 100000
|
|
308
|
+
if up < down:
|
|
309
|
+
for ii in range(i, len(tbl)):
|
|
310
|
+
for j in range(len(tbl[ii])):
|
|
311
|
+
for a in tbl[ii][j]:
|
|
312
|
+
a["rn"] -= 1
|
|
313
|
+
if tbl[i - 1][jj]:
|
|
314
|
+
tbl[i - 1][jj].extend(tbl[i][jj])
|
|
315
|
+
else:
|
|
316
|
+
tbl[i - 1][jj] = tbl[i][jj]
|
|
317
|
+
tbl.pop(i)
|
|
318
|
+
|
|
319
|
+
else:
|
|
320
|
+
for ii in range(i + 1, len(tbl)):
|
|
321
|
+
for j in range(len(tbl[ii])):
|
|
322
|
+
for a in tbl[ii][j]:
|
|
323
|
+
a["rn"] -= 1
|
|
324
|
+
if tbl[i + 1][jj]:
|
|
325
|
+
tbl[i + 1][jj].extend(tbl[i][jj])
|
|
326
|
+
else:
|
|
327
|
+
tbl[i + 1][jj] = tbl[i][jj]
|
|
328
|
+
tbl.pop(i)
|
|
329
|
+
rows.pop(i)
|
|
330
|
+
|
|
331
|
+
# which rows are headers
|
|
332
|
+
hdset = set([])
|
|
333
|
+
for i in range(len(tbl)):
|
|
334
|
+
cnt, h = 0, 0
|
|
335
|
+
for j, arr in enumerate(tbl[i]):
|
|
336
|
+
if not arr:
|
|
337
|
+
continue
|
|
338
|
+
cnt += 1
|
|
339
|
+
if max_type == "Nu" and arr[0]["btype"] == "Nu":
|
|
340
|
+
continue
|
|
341
|
+
if any([a.get("H") for a in arr]) or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
|
|
342
|
+
h += 1
|
|
343
|
+
if h / cnt > 0.5:
|
|
344
|
+
hdset.add(i)
|
|
345
|
+
|
|
346
|
+
if html:
|
|
347
|
+
return TableStructureRecognizer.__html_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, True))
|
|
348
|
+
|
|
349
|
+
return TableStructureRecognizer.__desc_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, False), is_english)
|
|
350
|
+
|
|
351
|
+
@staticmethod
def __html_table(cap, hdset, tbl):
    """Render a cell grid as an HTML ``<table>`` string.

    Args:
        cap: Caption text; a ``<caption>`` element is emitted when non-empty.
        hdset: Set of header row indexes. It is modified: the cell texts of
            emitted header rows are added to it, and a later header row whose
            texts are all already present is skipped as a repeat.
        tbl: Grid of cells. A cell is ``None`` (covered by a span, nothing is
            emitted), an empty list (empty cell) or a list of box dicts.

    Returns:
        The HTML markup, one table row per line.
    """
    # Construct HTML.
    html = "<table>"
    if cap:
        html += f"<caption>{cap}</caption>"
    for i, cells in enumerate(tbl):
        # Only strings are added to hdset below, so membership of the row
        # index cannot change while this row is being rendered.
        is_header = i in hdset
        row = "<tr>"
        txts = []
        for arr in cells:
            if arr is None:
                continue
            if not arr:
                row += "<th></th>" if is_header else "<td></td>"
                continue
            # Half of the smallest box height, capped at 10, is the sort threshold.
            h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10)
            txt = " ".join([c["text"] for c in Recognizer.sort_Y_firstly(arr, h)])
            txts.append(txt)
            first = arr[0]
            sp = ""
            if first.get("colspan"):
                sp = "colspan={}".format(first["colspan"])
            if first.get("rowspan"):
                sp += " rowspan={}".format(first["rowspan"])
            if is_header:
                row += f"<th {sp} >" + txt + "</th>"
            else:
                row += f"<td {sp} >" + txt + "</td>"

        if is_header:
            if all(t in hdset for t in txts):
                continue
            for t in txts:
                hdset.add(t)

        if row != "<tr>":
            row += "</tr>"
        else:
            row = ""
        html += "\n" + row
    html += "\n</table>"
    return html
|
|
394
|
+
|
|
395
|
+
@staticmethod
def __desc_table(cap, hdr_rowno, tbl, is_english):
    """Describe a cell grid as text, one string per data row (or row group).

    Args:
        cap: Caption text; when non-empty a source suffix naming the caption
            is appended to every returned string.
        hdr_rowno: Set of header row indexes. Header rows whose cells are all
            empty are removed from it.
        tbl: Grid of cells; each cell is a (possibly empty) list of box dicts
            with a ``"text"`` entry.
        is_english: Selects the English joining words.

    Returns:
        List of strings. With a governing header row, cells are rendered as
        ``header:value`` joined by ``"; "``. Without one, and with at most two
        columns, cells are joined by ``":"`` and short lines are merged into
        the previous entry.
    """
    clmno = len(tbl[0])
    rowno = len(tbl)
    de = " for " if is_english else "的"

    # Text of every column in each header row becomes the header text.
    headers = {}
    prev_titles = []
    for r in sorted(hdr_rowno):
        titles = ["" for _ in range(clmno)]
        for c in range(clmno):
            if tbl[r][c]:
                titles[c] = " ".join([a["text"].strip() for a in tbl[r][c]])
        if not any(titles):
            # A header row without any text is not a header after all.
            hdr_rowno.remove(r)
            continue
        # Fill leading gaps from the previous header row.
        for c in range(clmno):
            if titles[c]:
                continue
            if c >= len(prev_titles):
                break
            titles[c] = prev_titles[c]
        headers[r] = titles
        prev_titles = titles

    # Fold each header row into the consecutive header row below it.
    for start in range(rowno):
        if start not in hdr_rowno:
            continue
        for cur in range(start + 1, rowno):
            if cur not in hdr_rowno:
                break
            above, below = headers[cur - 1], headers[cur]
            for k in range(clmno):
                if not above[k]:
                    continue
                if below[k].find(above[k]) >= 0:
                    continue
                if len(below[k]) > len(above[k]):
                    below[k] += (de if below[k] else "") + above[k]
                else:
                    below[k] = above[k] + (de if above[k] else "") + below[k]

    logging.debug(f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
    row_txt = []
    for i in range(rowno):
        if i in hdr_rowno:
            continue
        # Closest header row above this row; 0 when there is none.
        r = max((hr for hr in headers if hr < i), default=0)
        cells = []

        if r not in headers and clmno <= 2:
            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if txt:
                    cells.append(txt)
            if cells:
                line = ":".join(cells)
                # Short lines are appended to the previous entry.
                if row_txt and len(row_txt[-1]) + len(line) < 64:
                    row_txt[-1] += "\n" + line
                else:
                    row_txt.append(line)
            continue

        for j in range(clmno):
            if not tbl[i][j]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[i][j]])
            if not txt:
                continue
            title = headers[r][j] if r in headers else ""
            cells.append((title + ":" if title else "") + txt)

        if cells:
            row_txt.append("; ".join(cells))

    if cap:
        from_ = " in " if is_english else "来自"
        row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
    return row_txt
|
|
494
|
+
|
|
495
|
+
@staticmethod
def __cal_spans(boxes, rows, cols, tbl, html=True):
    """Work out row/column spans and merge the cells each span covers.

    Boxes carrying an ``"SP"`` key are treated as spanning cells. A column
    (row) belongs to the span when its horizontal (vertical) centre lies
    within the box's ``H_left``..``H_right`` (``H_top``..``H_bott``) range.

    Args:
        boxes: All boxes of the table; each has ``rn``/``cn`` grid indexes.
        rows: Boxes grouped by row, used to average the row extents.
        cols: Boxes grouped by column, used to average the column extents.
        tbl: Grid of cells (lists of boxes); modified in place.
        html: When truthy, covered cells become ``None`` and only the
            top-left cell of the span keeps the merged boxes. Otherwise every
            covered cell refers to the merged box list.

    Returns:
        ``tbl``. On merged boxes, ``rowspan``/``colspan`` hold the span size
        when it exceeds one and are removed otherwise.
    """

    def row_bottom(c):
        # The row-bottom key written on boxes is "R_bott" (it is read under
        # that name when row heights are computed). The former lookup of
        # "R_btm" never matched, so the row extent silently degraded to the
        # box's own bottom. "R_btm" is still honoured for compatibility.
        own_bottom = c["bottom"]
        return c.get("R_bott", c.get("R_btm", own_bottom))

    # Calculate the average extent of every column and row.
    clft = [np.mean([c.get("C_left", c["x0"]) for c in cln]) for cln in cols]
    crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln]) for cln in cols]
    rtop = [np.mean([c.get("R_top", c["top"]) for c in row]) for row in rows]
    rbtm = [np.mean([row_bottom(c) for c in row]) for row in rows]

    for b in boxes:
        if "SP" not in b:
            continue
        b["colspan"] = [b["cn"]]
        b["rowspan"] = [b["rn"]]
        # Column span: collect columns whose centre lies inside the span.
        for j in range(len(clft)):
            if j == b["cn"]:
                continue
            if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                continue
            if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                continue
            b["colspan"].append(j)
        # Row span: collect rows whose centre lies inside the span.
        for j in range(len(rtop)):
            if j == b["rn"]:
                continue
            if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                continue
            if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                continue
            b["rowspan"].append(j)

    def cell_text(arr):
        if not arr:
            return ""
        return "".join([t["text"] for t in arr])

    # Remove the cells covered by spanning cells.
    for i in range(len(tbl)):
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            if all("rowspan" not in a and "colspan" not in a for a in arr):
                continue
            rowspan, colspan = [], []
            for a in arr:
                # Only list values are pending spans; ints are final sizes.
                if isinstance(a.get("rowspan", 0), list):
                    rowspan.extend(a["rowspan"])
                if isinstance(a.get("colspan", 0), list):
                    colspan.extend(a["colspan"])
            rowspan, colspan = set(rowspan), set(colspan)
            if len(rowspan) < 2 and len(colspan) < 2:
                # Not a real span: drop the markers.
                for a in arr:
                    if "rowspan" in a:
                        del a["rowspan"]
                    if "colspan" in a:
                        del a["colspan"]
                continue
            rowspan, colspan = sorted(rowspan), sorted(colspan)
            # Make the span contiguous.
            rowspan = list(range(rowspan[0], rowspan[-1] + 1))
            colspan = list(range(colspan[0], colspan[-1] + 1))
            assert i in rowspan, rowspan
            assert j in colspan, colspan
            arr = []
            for r in rowspan:
                for c in colspan:
                    arr_txt = cell_text(arr)
                    # Skip a cell whose text equals what is collected so far.
                    if tbl[r][c] and cell_text(tbl[r][c]) != arr_txt:
                        arr.extend(tbl[r][c])
                    tbl[r][c] = None if html else arr
            for a in arr:
                if len(rowspan) > 1:
                    a["rowspan"] = len(rowspan)
                elif "rowspan" in a:
                    del a["rowspan"]
                if len(colspan) > 1:
                    a["colspan"] = len(colspan)
                elif "colspan" in a:
                    del a["colspan"]
            tbl[rowspan[0]][colspan[0]] = arr

    return tbl
|
|
576
|
+
|
|
577
|
+
def _run_ascend_tsr(self, image_list, thr=0.2, batch_size=16):
    """Run table structure recognition with the Ascend ``tsr.om`` model.

    The model directory is taken from the ``DEEPDOC_ASCEND_MODEL_DIR``
    environment variable, falling back to ``self.model_dir``. The device id
    is read from ``ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID`` (default 0).

    Args:
        image_list: Images; entries that are not ``numpy`` arrays are
            converted with ``np.array``.
        thr: Score threshold; values below 0.08 are raised to 0.08.
        batch_size: Number of images handed to ``self.preprocess`` at once.

    Returns:
        List with one ``self.postprocess`` result per preprocessed input.

    Raises:
        FileNotFoundError: If no model directory is configured.
        ValueError: If ``tsr.om`` does not exist in the model directory.
    """
    import math

    from ais_bench.infer.interface import InferSession

    model_root = os.getenv("DEEPDOC_ASCEND_MODEL_DIR") or self.model_dir
    if not model_root:
        raise FileNotFoundError(
            "Ascend table structure recognizer requires DEEPDOC_ASCEND_MODEL_DIR or an explicit model_dir."
        )

    model_file_path = os.path.join(model_root, "tsr.om")
    if not os.path.exists(model_file_path):
        raise ValueError(f"Model file not found: {model_file_path}")

    session = InferSession(
        device_id=int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0)),
        model_path=model_file_path,
    )

    images = [im if isinstance(im, np.ndarray) else np.array(im) for im in image_list]
    conf_thr = max(thr, 0.08)
    total = len(images)

    results = []
    for batch_no in range(math.ceil(float(total) / batch_size)):
        begin = batch_no * batch_size
        end = min(begin + batch_size, total)
        for ins in self.preprocess(images[begin:end]):
            # Feed the "image" entry when present, else the first model input.
            feed_key = "image" if "image" in ins else self.input_names[0]
            output_list = session.infer(feeds=[ins[feed_key]], mode="static")
            results.append(self.postprocess(output_list, ins, conf_thr))
    return results
|