deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,617 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import logging
17
+ import os
18
+ import re
19
+ from collections import Counter
20
+
21
+ import numpy as np
22
+ from ..common.model_store import resolve_vision_model_dir
23
+ from ..depend import rag_tokenizer
24
+
25
+ from .recognizer import Recognizer
26
+
27
+
28
+ class TableStructureRecognizer(Recognizer):
29
+ labels = [
30
+ "table",
31
+ "table column",
32
+ "table row",
33
+ "table column header",
34
+ "table projected row header",
35
+ "table spanning cell",
36
+ ]
37
+
38
def __init__(
    self,
    model_dir: str | None = None,
    model_home: str | None = None,
    model_provider: str | None = None,
    offline: bool | None = None,
):
    """Set up the table-structure recognizer.

    Args:
        model_dir: Directory holding the model files. When falsy, the
            directory is obtained from ``resolve_vision_model_dir``.
        model_home: Forwarded to ``resolve_vision_model_dir`` as ``model_home``.
        model_provider: Forwarded to ``resolve_vision_model_dir`` as ``provider``.
        offline: Forwarded to ``resolve_vision_model_dir`` as ``offline``.
    """
    # Only consult the model store when the caller gave no explicit directory.
    resolved_dir = model_dir or resolve_vision_model_dir(
        model_home=model_home,
        provider=model_provider,
        offline=offline,
    )
    # Kept on the instance because the Ascend backend reads it later.
    self.model_dir = resolved_dir
    super().__init__(self.labels, "tsr", resolved_dir)
53
+
54
def __call__(self, images, thr=0.2):
    """Detect table structure elements and normalise their extents.

    The backend is chosen with the ``TABLE_STRUCTURE_RECOGNIZER_TYPE``
    environment variable (``"onnx"`` by default, or ``"ascend"``).

    For every table, row/header boxes are widened to a common left/right
    edge and ``"table column"`` boxes are stretched to a common top/bottom.

    Args:
        images: Table images, passed through to the selected backend.
        thr: Score threshold, passed through to the selected backend.

    Returns:
        A list with exactly one entry per table returned by the backend, in
        the same order. Each entry is a list of dicts with the keys
        ``label``, ``score``, ``x0``, ``x1``, ``top`` and ``bottom``. A
        table without detections yields an empty list, so the result stays
        index-aligned with the backend output.

    Raises:
        RuntimeError: If the environment variable names an unknown backend.
    """
    backend = os.getenv("TABLE_STRUCTURE_RECOGNIZER_TYPE", "onnx").lower()
    if backend not in ["onnx", "ascend"]:
        raise RuntimeError("Unsupported table structure recognizer type.")

    if backend == "onnx":
        logging.debug("Using Onnx table structure recognizer")
        tbls = super().__call__(images, thr)
    else:  # ascend
        logging.debug("Using Ascend table structure recognizer")
        tbls = self._run_ascend_tsr(images, thr)

    def spans_table_width(label):
        # Same predicate as before: "row"/"header" found anywhere but at index 0.
        return label.find("row") > 0 or label.find("header") > 0

    res = []
    # align left&right for rows, align top&bottom for columns
    for tbl in tbls:
        lts = [
            {
                "label": b["type"],
                "score": b["score"],
                "x0": b["bbox"][0],
                "x1": b["bbox"][2],
                "top": b["bbox"][1],
                "bottom": b["bbox"][-1],
            }
            for b in tbl
        ]

        # Previously a table with no detections, or with no row/header boxes,
        # was skipped entirely, which shifted every following table by one
        # position relative to the input. Each step below is now simply
        # skipped when it has nothing to work on, and the table is always
        # appended.
        row_like = [b for b in lts if spans_table_width(b["label"])]
        if row_like:
            lefts = [b["x0"] for b in row_like]
            rights = [b["x1"] for b in row_like]
            # With few samples take the extreme; with more, the mean is used.
            left = np.mean(lefts) if len(lefts) > 4 else np.min(lefts)
            right = np.mean(rights) if len(rights) > 4 else np.max(rights)
            for b in row_like:
                if b["x0"] > left:
                    b["x0"] = left
                if b["x1"] < right:
                    b["x1"] = right

        columns = [b for b in lts if b["label"] == "table column"]
        if columns:
            tops = [b["top"] for b in columns]
            bottoms = [b["bottom"] for b in columns]
            top = np.median(tops) if len(tops) > 4 else np.min(tops)
            bottom = np.median(bottoms) if len(bottoms) > 4 else np.max(bottoms)
            for b in columns:
                if b["top"] > top:
                    b["top"] = top
                if b["bottom"] < bottom:
                    b["bottom"] = bottom

        res.append(lts)
    return res
112
+
113
@staticmethod
def is_caption(bx):
    """Tell whether a text box is a figure/table caption.

    A box counts as a caption when its stripped text starts with one or more
    of the characters 图/表 followed by at least two characters out of
    space, digit, ASCII colon or full-width colon, or when its
    ``layout_type`` contains ``"caption"``.

    Args:
        bx: Box dict with a ``"text"`` entry and an optional
            ``"layout_type"`` entry (a missing or ``None`` value is treated
            as an empty string).

    Returns:
        ``True`` for a caption box, ``False`` otherwise.
    """
    # The character class used to list the ASCII colon twice, so captions
    # written with a full-width colon (U+FF1A), e.g. "图1：...", were missed.
    # It is spelled as an escape to survive any further text normalisation.
    patt = [r"[图表]+[ 0-9:\uff1a]{2,}"]
    text = bx["text"].strip()
    if any(re.match(p, text) for p in patt):
        return True
    layout_type = bx.get("layout_type") or ""
    return layout_type.find("caption") >= 0
119
+
120
@staticmethod
def blockType(b):
    """Classify the text of a box into a coarse content type.

    The stripped text is tested against an ordered list of patterns and the
    first hit wins:

    * ``"Dt"`` - dates, years, months, quarters
    * ``"Nu"`` - numbers
    * ``"Ca"`` - upper-case / digit codes
    * ``"En"`` - lower-case words
    * ``"NE"`` - a number followed by a unit or other short suffix
    * ``"Sg"`` - a single character

    When no pattern matches, the text is tokenised with ``rag_tokenizer``
    and tokens longer than one character are counted: more than three gives
    ``"Tx"`` (fewer than 12 tokens) or ``"Lx"``; a single token tagged
    ``"nr"`` gives ``"Nr"``; anything else gives ``"Ot"``.

    Args:
        b: Box dict with a ``"text"`` entry.

    Returns:
        The type code as a string.
    """
    patt = [
        (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
        (r"^(20|19)[0-9]{2}年$", "Dt"),
        (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
        (r"^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
        (r"^第*[一二三四1-4]季度$", "Dt"),
        (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
        (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
        (r"^[0-9.,+%/ -]+$", "Nu"),
        (r"^[0-9A-Z/\._~-]+$", "Ca"),
        (r"^[A-Z]*[a-z' -]+$", "En"),
        # The class used to list "()" twice; the second pair is restored as
        # the full-width parentheses U+FF08/U+FF09, written as escapes so
        # they cannot be folded back into ASCII.
        (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()\uff08\uff09' -]+$", "NE"),
        (r"^.{1}$", "Sg"),
    ]
    text = b["text"].strip()
    for p, n in patt:
        if re.search(p, text):
            return n

    # Fall back to token statistics; one-character tokens are ignored.
    tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
    if len(tks) > 3:
        return "Tx" if len(tks) < 12 else "Lx"

    if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
        return "Nr"

    return "Ot"
150
+
151
@staticmethod
def construct_table(boxes, is_english=False, html=True, **kwargs):
    """Assemble recognised text boxes into a table and render it.

    Steps, in order: pull caption boxes out of ``boxes``; classify every
    remaining box with ``blockType``; group boxes into rows and columns;
    fold columns/rows that hold a single box into a neighbour; decide which
    rows are headers; render through ``__html_table`` or ``__desc_table``.

    Args:
        boxes: List of box dicts. The code reads ``text``, ``top``,
            ``bottom``, ``x0``, ``x1`` and ``page_number`` and, when
            present, ``R``/``R_top``/``R_bott``, ``C``/``C_left``/``C_right``
            and ``H``. Caption boxes are popped from this list in place.
        is_english: Selects the separators used for the caption and the
            descriptive rendering.
        html: When true the HTML rendering is returned, otherwise the
            descriptive rendering.
        **kwargs: Accepted but not used in this method.

    Returns:
        ``[]`` when nothing but captions was supplied; otherwise whatever
        ``__html_table`` (``html=True``) or ``__desc_table`` returns.
    """
    # Collect caption text and remove caption boxes from the list.
    cap = ""
    i = 0
    while i < len(boxes):
        if TableStructureRecognizer.is_caption(boxes[i]):
            if is_english:
                cap += " "
            cap += boxes[i]["text"]
            boxes.pop(i)
            i -= 1  # stay on the same index after the pop
        i += 1

    if not boxes:
        return []
    for b in boxes:
        b["btype"] = TableStructureRecognizer.blockType(b)
    # Most frequent block type over all boxes; used below for header detection.
    max_type = Counter([b["btype"] for b in boxes]).items()
    max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
    logging.debug("MAXTYPE: " + max_type)

    # Smallest row height among boxes that carry a row id; half of it is
    # handed to the sort helper (its exact role is defined in Recognizer,
    # not visible here).
    rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
    rowh = np.min(rowh) if rowh else 0
    boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
    # for b in boxes:print(b)
    # Walk the sorted boxes and assign each a row number "rn".
    boxes[0]["rn"] = 0
    rows = [[boxes[0]]]
    btm = boxes[0]["bottom"]
    for b in boxes[1:]:
        b["rn"] = len(rows) - 1
        lst_r = rows[-1]
        # A new row starts when the row id changes, or - with the differing
        # defaults "-1"/"-2" making the test true when both ids are missing -
        # when the box starts at or below the running bottom (3 units slack).
        if lst_r[-1].get("R", "") != b.get("R", "") or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")):  # new row
            btm = b["bottom"]
            b["rn"] += 1
            rows.append([b])
            continue
        btm = (btm + b["bottom"]) / 2.0
        rows[-1].append(b)

    # Same procedure for columns, assigning a column number "cn".
    colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
    colwm = np.min(colwm) if colwm else 0
    crosspage = len(set([b["page_number"] for b in boxes])) > 1
    if crosspage:
        boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
    else:
        boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
    boxes[0]["cn"] = 0
    cols = [[boxes[0]]]
    right = boxes[0]["x1"]
    for b in boxes[1:]:
        b["cn"] = len(cols) - 1
        lst_c = cols[-1]
        # A new column starts when the column id grows by exactly one on the
        # same page, or when the box starts right of the running right edge
        # and the column ids differ (or are both missing).
        if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1]["page_number"]) or (
            b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")
        ):  # new col
            right = b["x1"]
            b["cn"] += 1
            cols.append([b])
            continue
        right = (right + b["x1"]) / 2.0
        cols[-1].append(b)

    # tbl[row][col] is the list of boxes that landed in that cell.
    tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
    for b in boxes:
        tbl[b["rn"]][b["cn"]].append(b)

    if len(rows) >= 4:
        # remove single in column
        j = 0
        while j < len(tbl[0]):
            # e counts occupied cells in column j (stops at 2); ii is the
            # row of the last occupied cell seen.
            e, ii = 0, 0
            for i in range(len(tbl)):
                if tbl[i][j]:
                    e += 1
                    ii = i
                if e > 1:
                    break
            if e > 1:
                j += 1
                continue
            # f / ff: the left / right neighbour cell in row ii already has
            # text, or there is no such neighbour (table edge).
            f = (j > 0 and tbl[ii][j - 1] and tbl[ii][j - 1][0].get("text")) or j == 0
            ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii][j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
            if f and ff:
                j += 1
                continue
            bx = tbl[ii][j][0]
            logging.debug("Relocate column single: " + bx["text"])
            # j column only has one value
            # Horizontal gap to the nearest box in the left / right column;
            # 100000 stands for "no candidate".
            left, right = 100000, 100000
            if j > 0 and not f:
                for i in range(len(tbl)):
                    if tbl[i][j - 1]:
                        left = min(left, np.min([bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
            if j + 1 < len(tbl[0]) and not ff:
                for i in range(len(tbl)):
                    if tbl[i][j + 1]:
                        right = min(right, np.min([a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
            assert left < 100000 or right < 100000
            if left < right:
                # Merge into the left column: columns j.. shift left by one.
                for jj in range(j, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j - 1]:
                    tbl[ii][j - 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j - 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)

            else:
                # Merge into the right column: only columns after j shift.
                for jj in range(j + 1, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j + 1]:
                    tbl[ii][j + 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j + 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            # j is not advanced: after the pop it already names the next column.
            cols.pop(j)
    assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (len(cols), len(tbl[0]))

    if len(cols) >= 4:
        # remove single in row
        i = 0
        while i < len(tbl):
            # e counts occupied cells in row i (stops at 2); jj is the
            # column of the last occupied cell seen.
            e, jj = 0, 0
            for j in range(len(tbl[i])):
                if tbl[i][j]:
                    e += 1
                    jj = j
                if e > 1:
                    break
            if e > 1:
                i += 1
                continue
            # f / ff: the cell above / below already has text, or the row is
            # at the table edge.
            f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1][jj][0].get("text")) or i == 0
            ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1][jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                i += 1
                continue

            bx = tbl[i][jj][0]
            logging.debug("Relocate row single: " + bx["text"])
            # i row only has one value
            # Vertical gap to the nearest box in the row above / below;
            # 100000 stands for "no candidate".
            up, down = 100000, 100000
            if i > 0 and not f:
                for j in range(len(tbl[i - 1])):
                    if tbl[i - 1][j]:
                        up = min(up, np.min([bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
            if i + 1 < len(tbl) and not ff:
                for j in range(len(tbl[i + 1])):
                    if tbl[i + 1][j]:
                        down = min(down, np.min([a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
            assert up < 100000 or down < 100000
            if up < down:
                # Merge into the row above: rows i.. shift up by one.
                for ii in range(i, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i - 1][jj]:
                    tbl[i - 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i - 1][jj] = tbl[i][jj]
                tbl.pop(i)

            else:
                # Merge into the row below: only rows after i shift.
                for ii in range(i + 1, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i + 1][jj]:
                    tbl[i + 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i + 1][jj] = tbl[i][jj]
                tbl.pop(i)
            # i is not advanced: after the pop it already names the next row.
            rows.pop(i)

    # which rows are headers
    hdset = set([])
    for i in range(len(tbl)):
        # cnt: occupied cells in the row; h: cells that vote "header".
        cnt, h = 0, 0
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            cnt += 1
            if max_type == "Nu" and arr[0]["btype"] == "Nu":
                continue
            # A cell votes "header" when any of its boxes has a truthy "H",
            # or when the table is mostly numeric and this cell is not.
            if any([a.get("H") for a in arr]) or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                h += 1
        # NOTE(review): divides by cnt; relies on every row keeping at least
        # one occupied cell - confirm.
        if h / cnt > 0.5:
            hdset.add(i)

    if html:
        return TableStructureRecognizer.__html_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, True))

    return TableStructureRecognizer.__desc_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, False), is_english)
350
+
351
@staticmethod
def __html_table(cap, hdset, tbl):
    """Render the cell grid as an HTML ``<table>`` string.

    Args:
        cap: Caption text; a ``<caption>`` element is emitted when non-empty.
        hdset: Set of header row indexes. It is also used as a scratch set:
            the texts of emitted header rows are added to it so that a later
            header row repeating only those texts is skipped.
        tbl: Grid of cells. A cell is ``None`` (emits nothing), an empty
            list (emits an empty cell) or a list of box dicts.

    Returns:
        The HTML markup, one row per line.
    """
    # construct HTML
    chunks = ["<table>"]
    if cap:
        chunks.append(f"<caption>{cap}</caption>")

    for row_idx, cells in enumerate(tbl):
        is_header = row_idx in hdset
        tag = "th" if is_header else "td"
        rendered = []
        texts = []
        for cell in cells:
            if cell is None:
                continue
            if not cell:
                rendered.append(f"<{tag}></{tag}>")
                continue
            # Tolerance handed to the sort helper: half the shortest box
            # height, capped at 10.
            tolerance = min(np.min([c["bottom"] - c["top"] for c in cell]) / 2, 10)
            text = " ".join([c["text"] for c in Recognizer.sort_Y_firstly(cell, tolerance)])
            texts.append(text)

            first = cell[0]
            span = "colspan={}".format(first["colspan"]) if first.get("colspan") else ""
            if first.get("rowspan"):
                span += " rowspan={}".format(first["rowspan"])
            rendered.append(f"<{tag} {span} >" + text + f"</{tag}>")

        if is_header:
            # Drop a header row whose texts were all seen before.
            if all(t in hdset for t in texts):
                continue
            hdset.update(texts)

        # A row that produced no cell still contributes its line break.
        row_html = "<tr>" + "".join(rendered) + "</tr>" if rendered else ""
        chunks.append("\n" + row_html)

    chunks.append("\n</table>")
    return "".join(chunks)
394
+
395
@staticmethod
def __desc_table(cap, hdr_rowno, tbl, is_english):
    """Render the cell grid as descriptive text lines.

    Each data row becomes ``header:value`` pairs joined by ``"; "``, using
    the nearest header row above it. Tables of at most two columns with no
    applicable header row are rendered as ``a:b`` lines, and short
    consecutive lines are packed together with a newline.

    Args:
        cap: Caption text; when non-empty a source suffix is added per line.
        hdr_rowno: Set of header row indexes. Header rows without any text
            are removed from this set in place.
        tbl: Grid of cells, each a (possibly empty) list of box dicts.
        is_english: Selects the connector words used in the output.

    Returns:
        List of text lines.
    """
    n_cols = len(tbl[0])
    n_rows = len(tbl)
    connector = " for " if is_english else "的"

    # get text of every column in header row to become header text
    headers = {}
    prev_header = []
    for r in sorted(hdr_rowno):
        texts = []
        for i in range(n_cols):
            cell = tbl[r][i]
            texts.append(" ".join([a["text"].strip() for a in cell]) if cell else "")
        if not any(texts):
            # A header row without text is treated as an ordinary row.
            hdr_rowno.remove(r)
            continue
        # Blank header cells inherit from the previous header row, stopping
        # at the first blank column the previous header cannot supply.
        for j, text in enumerate(texts):
            if text:
                continue
            if j >= len(prev_header):
                break
            texts[j] = prev_header[j]
        headers[r] = texts
        prev_header = texts

    # Fold each run of consecutive header rows downwards.
    for i in range(n_rows):
        if i not in hdr_rowno:
            continue
        for j in range(i + 1, n_rows):
            if j not in hdr_rowno:
                break
            upper, lower = headers[j - 1], headers[j]
            for k in range(n_cols):
                above = upper[k]
                if not above or lower[k].find(above) >= 0:
                    continue
                if len(lower[k]) > len(above):
                    lower[k] += (connector if lower[k] else "") + above
                else:
                    lower[k] = above + (connector if above else "") + lower[k]

    logging.debug(f">>>>>>>>>>>>>>>>>{cap}:SIZE:{n_rows}X{n_cols} Header: {hdr_rowno}")

    row_txt = []
    for i in range(n_rows):
        if i in hdr_rowno:
            continue

        # Nearest header row above this row; row 0 when there is none.
        candidates = [h for h in headers if h < i]
        r = max(candidates) if candidates else 0

        # Non-empty cell texts of this row with their column index.
        values = []
        for j in range(n_cols):
            cell = tbl[i][j]
            if not cell:
                continue
            text = "".join([a["text"].strip() for a in cell])
            if text:
                values.append((j, text))
        if not values:
            continue

        if r not in headers and n_cols <= 2:
            line = ":".join([text for _, text in values])
            # Pack short lines into the previous entry.
            if row_txt and len(row_txt[-1]) + len(line) < 64:
                row_txt[-1] += "\n" + line
            else:
                row_txt.append(line)
            continue

        header = headers[r] if r in headers else None
        parts = []
        for j, text in values:
            label = header[j] if header is not None else ""
            parts.append(label + ":" + text if label else text)
        row_txt.append("; ".join(parts))

    if cap:
        from_ = " in " if is_english else "来自"
        row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
    return row_txt
494
+
495
@staticmethod
def __cal_spans(boxes, rows, cols, tbl, html=True):
    """Work out row/column spans and merge the cells they cover.

    For every box carrying an ``"SP"`` entry, each row/column whose centre
    lies inside the box's ``H_top``..``H_bott`` / ``H_left``..``H_right``
    extent is added to its span. Cells covered by a span of more than one
    row or column are then merged into the top-left cell of the span.

    Args:
        boxes: All box dicts; boxes with ``"SP"`` must also provide
            ``cn``, ``rn``, ``H_left``, ``H_right``, ``H_top``, ``H_bott``.
        rows: Boxes grouped per row, used to average row top/bottom.
        cols: Boxes grouped per column, used to average column left/right.
        tbl: Grid of cells (lists of boxes); modified in place.
        html: When true, covered cells other than the top-left one become
            ``None``; otherwise every covered cell refers to the merged list.

    Returns:
        ``tbl``. Boxes of merged cells carry integer ``rowspan`` /
        ``colspan`` entries when the respective span exceeds one.
    """

    def row_bottom(c):
        # Row bottoms are stored under "R_bott" by the rest of this class;
        # the old lookup used "R_btm", which always fell through to the
        # text bottom. "R_btm" is still honoured as a fallback.
        if "R_bott" in c:
            return c["R_bott"]
        return c.get("R_btm", c["bottom"])

    # calculate span
    clft = [np.mean([c.get("C_left", c["x0"]) for c in cln]) for cln in cols]
    crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln]) for cln in cols]
    rtop = [np.mean([c.get("R_top", c["top"]) for c in row]) for row in rows]
    rbtm = [np.mean([row_bottom(c) for c in row]) for row in rows]
    for b in boxes:
        if "SP" not in b:
            continue
        b["colspan"] = [b["cn"]]
        b["rowspan"] = [b["rn"]]
        # col span: keep columns whose centre lies within the box extent
        for j in range(0, len(clft)):
            if j == b["cn"]:
                continue
            if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                continue
            if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                continue
            b["colspan"].append(j)
        # row span: keep rows whose centre lies within the box extent
        for j in range(0, len(rtop)):
            if j == b["rn"]:
                continue
            if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                continue
            if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                continue
            b["rowspan"].append(j)

    def join(arr):
        if not arr:
            return ""
        return "".join([t["text"] for t in arr])

    # remove the spanning cells
    for i in range(len(tbl)):
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            if all("rowspan" not in a and "colspan" not in a for a in arr):
                continue
            # Only list-valued spans are pending; integer values were
            # written by an earlier merge.
            rowspan, colspan = [], []
            for a in arr:
                if isinstance(a.get("rowspan", 0), list):
                    rowspan.extend(a["rowspan"])
                if isinstance(a.get("colspan", 0), list):
                    colspan.extend(a["colspan"])
            rowspan, colspan = set(rowspan), set(colspan)
            if len(rowspan) < 2 and len(colspan) < 2:
                # Nothing to merge: drop the span markers.
                for a in arr:
                    if "rowspan" in a:
                        del a["rowspan"]
                    if "colspan" in a:
                        del a["colspan"]
                continue
            rowspan, colspan = sorted(rowspan), sorted(colspan)
            # Make the span contiguous.
            rowspan = list(range(rowspan[0], rowspan[-1] + 1))
            colspan = list(range(colspan[0], colspan[-1] + 1))
            assert i in rowspan, rowspan
            assert j in colspan, colspan
            arr = []
            for r in rowspan:
                for c in colspan:
                    arr_txt = join(arr)
                    # Skip a cell whose text equals what was gathered so far.
                    if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                        arr.extend(tbl[r][c])
                    tbl[r][c] = None if html else arr
            for a in arr:
                if len(rowspan) > 1:
                    a["rowspan"] = len(rowspan)
                elif "rowspan" in a:
                    del a["rowspan"]
                if len(colspan) > 1:
                    a["colspan"] = len(colspan)
                elif "colspan" in a:
                    del a["colspan"]
            tbl[rowspan[0]][colspan[0]] = arr

    return tbl
576
+
577
def _run_ascend_tsr(self, image_list, thr=0.2, batch_size=16):
    """Run table-structure inference through an ``ais_bench`` session.

    The model file ``tsr.om`` is looked up in ``DEEPDOC_ASCEND_MODEL_DIR``
    or, when that is unset or empty, in ``self.model_dir``. The device id
    comes from ``ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID`` (default 0).

    Args:
        image_list: Images; anything that is not already an ``np.ndarray``
            is converted with ``np.array``.
        thr: Requested confidence threshold; values below 0.08 are raised
            to 0.08.
        batch_size: Number of images handed to ``self.preprocess`` at once.

    Returns:
        List with one ``self.postprocess`` result per preprocessed input.

    Raises:
        FileNotFoundError: If no model directory is configured.
        ValueError: If ``tsr.om`` does not exist in the model directory.
    """
    import math

    from ais_bench.infer.interface import InferSession

    model_root = os.getenv("DEEPDOC_ASCEND_MODEL_DIR") or self.model_dir
    if not model_root:
        raise FileNotFoundError(
            "Ascend table structure recognizer requires DEEPDOC_ASCEND_MODEL_DIR or an explicit model_dir."
        )

    om_path = os.path.join(model_root, "tsr.om")
    if not os.path.exists(om_path):
        raise ValueError(f"Model file not found: {om_path}")

    session = InferSession(
        device_id=int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0)),
        model_path=om_path,
    )

    arrays = [im if isinstance(im, np.ndarray) else np.array(im) for im in image_list]
    conf_thr = max(thr, 0.08)
    total = len(arrays)

    detections = []
    for batch_no in range(math.ceil(float(total) / batch_size)):
        begin = batch_no * batch_size
        end = min(begin + batch_size, total)
        for ins in self.preprocess(arrays[begin:end]):
            # Prefer the "image" entry; otherwise use the first declared input name.
            key = "image" if "image" in ins else self.input_names[0]
            outputs = session.infer(feeds=[ins[key]], mode="static")
            detections.append(self.postprocess(outputs, ins, conf_thr))
    return detections