xfmr-zem 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. xfmr_zem/cli.py +32 -3
  2. xfmr_zem/client.py +59 -8
  3. xfmr_zem/server.py +21 -4
  4. xfmr_zem/servers/data_juicer/server.py +1 -1
  5. xfmr_zem/servers/instruction_gen/server.py +1 -1
  6. xfmr_zem/servers/io/server.py +1 -1
  7. xfmr_zem/servers/llm/parameters.yml +10 -0
  8. xfmr_zem/servers/nemo_curator/server.py +1 -1
  9. xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
  10. xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
  11. xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
  12. xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
  13. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
  14. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
  15. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
  16. xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
  17. xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
  18. xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
  19. xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
  20. xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
  21. xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
  22. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
  23. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
  24. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  25. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
  26. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
  27. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  28. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
  29. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
  30. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
  31. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
  32. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
  33. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
  34. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
  35. xfmr_zem/servers/ocr/engines.py +242 -0
  36. xfmr_zem/servers/ocr/install_models.py +63 -0
  37. xfmr_zem/servers/ocr/parameters.yml +4 -0
  38. xfmr_zem/servers/ocr/server.py +44 -0
  39. xfmr_zem/servers/profiler/parameters.yml +4 -0
  40. xfmr_zem/servers/sinks/parameters.yml +6 -0
  41. xfmr_zem/servers/unstructured/parameters.yml +6 -0
  42. xfmr_zem/servers/unstructured/server.py +62 -0
  43. xfmr_zem/zenml_wrapper.py +20 -7
  44. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/METADATA +19 -1
  45. xfmr_zem-0.2.5.dist-info/RECORD +58 -0
  46. xfmr_zem-0.2.2.dist-info/RECORD +0 -23
  47. /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
  48. /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
  49. /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
  50. /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
  51. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/WHEEL +0 -0
  52. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/entry_points.txt +0 -0
  53. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,569 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import logging
17
+ import os
18
+ import re
19
+ from collections import Counter
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ from huggingface_hub import snapshot_download
24
+
25
+ from .recognizer import Recognizer
26
+
27
def get_project_base_directory():
    """Return the absolute directory that contains this module file."""
    module_file = Path(__file__).resolve()
    return module_file.parent
29
+
30
+
31
class TableStructureRecognizer(Recognizer):
    """Recognize table structure and rebuild tables from positioned text boxes.

    ``__call__`` post-processes the detections returned by the base
    ``Recognizer``. The static helpers classify cell text and assemble a flat
    list of text boxes into either a Markdown table or a list of descriptive
    strings.
    """

    # Labels handed to the base ``Recognizer``; ``__call__`` matches on these
    # exact strings (e.g. "table column").
    labels = [
        "table",
        "table column",
        "table row",
        "table column header",
        "table projected row header",
        "table spanning cell",
    ]

    def __init__(self):
        """Initialise the base recognizer for the "tsr" model.

        The ``onnx`` directory next to this module is tried first. If the
        base initialiser raises, the ``InfiniFlow/deepdoc`` repository is
        downloaded into that directory with ``snapshot_download`` and the
        initialisation is retried with the returned path.
        """
        model_dir = os.path.join(get_project_base_directory(), "onnx")
        try:
            super().__init__(self.labels, "tsr", model_dir)
        except Exception as exc:
            # Deliberate best-effort fallback, but leave a trace of why the
            # local load failed instead of swallowing it silently.
            logging.warning(
                "Could not initialise 'tsr' recognizer from %s (%s); "
                "downloading InfiniFlow/deepdoc", model_dir, exc)
            super().__init__(self.labels, "tsr", snapshot_download(
                repo_id="InfiniFlow/deepdoc",
                local_dir=model_dir,
                local_dir_use_symlinks=False))

    def __call__(self, images, thr=0.2):
        """Run the base recognizer and snap row/column boxes to shared edges.

        Args:
            images: Forwarded unchanged to ``Recognizer.__call__``.
            thr: Threshold forwarded to ``Recognizer.__call__``.

        Returns:
            One list per retained table. Each item is a dict with the keys
            ``label``, ``score``, ``x0``, ``x1``, ``top`` and ``bottom``.
            Tables without detections, or without any row/header detection,
            are left out of the result.
        """

        def spans_width(label):
            # Row-like and header-like labels share the table's horizontal
            # extent. ``> 0`` (not ``>= 0``) mirrors the label strings, which
            # all start with "table".
            return label.find("row") > 0 or label.find("header") > 0

        tbls = super().__call__(images, thr)
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
            lts = [{"label": b["type"],
                    "score": b["score"],
                    "x0": b["bbox"][0], "x1": b["bbox"][2],
                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
                    } for b in tbl]
            if not lts:
                continue

            left = [b["x0"] for b in lts if spans_width(b["label"])]
            right = [b["x1"] for b in lts if spans_width(b["label"])]
            if not left:
                # NOTE(review): a table with no row/header detections is
                # dropped here, whereas one with no columns is kept below.
                continue
            # With few samples use the extreme edge, otherwise the mean.
            left = np.mean(left) if len(left) > 4 else np.min(left)
            right = np.mean(right) if len(right) > 4 else np.max(right)
            for b in lts:
                if spans_width(b["label"]):
                    if b["x0"] > left:
                        b["x0"] = left
                    if b["x1"] < right:
                        b["x1"] = right

            top = [b["top"] for b in lts if b["label"] == "table column"]
            bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
            if not top:
                res.append(lts)
                continue
            # Columns use the median (not the mean) once there are enough.
            top = np.median(top) if len(top) > 4 else np.min(top)
            bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
            for b in lts:
                if b["label"] == "table column":
                    if b["top"] > top:
                        b["top"] = top
                    if b["bottom"] < bottom:
                        b["bottom"] = bottom

            res.append(lts)
        return res

    @staticmethod
    def is_caption(bx):
        """Return True when ``bx`` looks like a figure/table caption.

        A box is a caption when its stripped ``text`` starts with the caption
        pattern below or when its ``layout_type`` contains "caption".
        """
        # NOTE(review): the duplicated ':' in the character class looks like
        # a full-width colon that was normalised away - confirm upstream.
        patt = [
            r"[图表]+[ 0-9::]{2,}"
        ]
        text = bx["text"].strip()
        if any(re.match(p, text) for p in patt) \
                or bx["layout_type"].find("caption") >= 0:
            return True
        return False

    @staticmethod
    def blockType(b):
        """Classify the stripped ``text`` of a box by the first matching regex.

        Returns:
            One of the tags "Dt", "Nu", "Ca", "En", "NE" or "Sg" (the second
            element of the first matching pattern), or ``None`` when no
            pattern matches.
        """
        # Order matters: the first matching pattern wins.
        # NOTE(review): the repeated '()' and the '¥' in the "NE" pattern
        # look like normalised full-width characters - confirm upstream.
        patt = [
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
            ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
            ("^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg")
        ]
        text = b["text"].strip()
        for p, n in patt:
            if re.search(p, text):
                return n
        return None

    @staticmethod
    def construct_table(boxes, is_english=False, markdown=True, **kwargs):
        """Assemble text boxes into a table.

        Caption boxes are removed from ``boxes`` (the list is mutated) and
        concatenated into the caption. The remaining boxes are grouped into
        rows and columns, a lone cell in an otherwise empty column or row is
        merged into its nearest neighbour, and header rows are guessed.

        Args:
            boxes: List of dicts. The code reads ``text``, ``layout_type``,
                ``top``, ``bottom``, ``x0``, ``x1`` and ``page_number`` and,
                when present, ``R``/``R_top``/``R_bott``,
                ``C``/``C_left``/``C_right``, ``H`` and ``SP`` - presumably
                annotations from an earlier layout step; verify against the
                caller. Each box gains ``btype``, ``rn`` and ``cn``.
            is_english: Separates multiple captions with a space and selects
                the English connectors in the descriptive output.
            markdown: When true return a Markdown string, otherwise a list
                of descriptive strings.
            **kwargs: Accepted and ignored.

        Returns:
            ``[]`` when no non-caption box remains; otherwise a Markdown
            string (``markdown=True``) or a list of strings.
        """
        cap = ""
        i = 0
        while i < len(boxes):
            if TableStructureRecognizer.is_caption(boxes[i]):
                if is_english and cap:
                    # Fix: the separator used to be computed and discarded
                    # (``cap + " "``), gluing English captions together.
                    cap += " "
                cap += boxes[i]["text"]
                boxes.pop(i)
                i -= 1
            i += 1

        if not boxes:
            return []
        for b in boxes:
            b["btype"] = TableStructureRecognizer.blockType(b)
        max_type = Counter([b["btype"] for b in boxes]).items()
        max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
        logging.debug("MAXTYPE: %s", max_type)

        # ---- group boxes into rows ----
        rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
        rowh = np.min(rowh) if rowh else 0
        boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
        boxes[0]["rn"] = 0
        rows = [[boxes[0]]]
        btm = boxes[0]["bottom"]
        for b in boxes[1:]:
            b["rn"] = len(rows) - 1
            lst_r = rows[-1]
            if lst_r[-1].get("R", "") != b.get("R", "") \
                    or (b["top"] >= btm - 3
                        and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                        ):  # new row
                btm = b["bottom"]
                b["rn"] += 1
                rows.append([b])
                continue
            btm = (btm + b["bottom"]) / 2.
            rows[-1].append(b)

        # ---- group boxes into columns ----
        colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
        colwm = np.min(colwm) if colwm else 0
        crosspage = len(set([b["page_number"] for b in boxes])) > 1
        if crosspage:
            boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
        else:
            boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
        boxes[0]["cn"] = 0
        cols = [[boxes[0]]]
        right = boxes[0]["x1"]
        for b in boxes[1:]:
            b["cn"] = len(cols) - 1
            lst_c = cols[-1]
            if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1
                    and b["page_number"] == lst_c[-1]["page_number"]) \
                    or (b["x0"] >= right
                        and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
                right = b["x1"]
                b["cn"] += 1
                cols.append([b])
                continue
            right = (right + b["x1"]) / 2.
            cols[-1].append(b)

        tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
        for b in boxes:
            tbl[b["rn"]][b["cn"]].append(b)

        if len(rows) >= 4:
            # remove single in column
            j = 0
            while j < len(tbl[0]):
                # e counts non-empty cells in column j (capped at 2),
                # ii remembers the row of the last one seen.
                e, ii = 0, 0
                for i in range(len(tbl)):
                    if tbl[i][j]:
                        e += 1
                        ii = i
                    if e > 1:
                        break
                if e > 1:
                    j += 1
                    continue
                # f / ff: the left / right neighbour in the same row already
                # holds text (or does not exist), so it is not a merge target.
                f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                     [j - 1][0].get("text")) or j == 0
                ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                      [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
                if f and ff:
                    j += 1
                    continue
                bx = tbl[ii][j][0]
                logging.debug("Relocate column single: %s", bx["text"])
                # j column only has one value
                left, right = 100000, 100000
                if j > 0 and not f:
                    for i in range(len(tbl)):
                        if tbl[i][j - 1]:
                            left = min(left, np.min(
                                [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
                if j + 1 < len(tbl[0]) and not ff:
                    for i in range(len(tbl)):
                        if tbl[i][j + 1]:
                            right = min(right, np.min(
                                [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
                assert left < 100000 or right < 100000
                if left < right:
                    # Merge into the left column; column j and everything to
                    # its right shift one index down.
                    for jj in range(j, len(tbl[0])):
                        for i in range(len(tbl)):
                            for a in tbl[i][jj]:
                                a["cn"] -= 1
                    if tbl[ii][j - 1]:
                        tbl[ii][j - 1].extend(tbl[ii][j])
                    else:
                        tbl[ii][j - 1] = tbl[ii][j]
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                else:
                    # Merge into the right column; only columns after j shift.
                    for jj in range(j + 1, len(tbl[0])):
                        for i in range(len(tbl)):
                            for a in tbl[i][jj]:
                                a["cn"] -= 1
                    if tbl[ii][j + 1]:
                        tbl[ii][j + 1].extend(tbl[ii][j])
                    else:
                        tbl[ii][j + 1] = tbl[ii][j]
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                cols.pop(j)
        assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
            len(cols), len(tbl[0]))

        if len(cols) >= 4:
            # remove single in row
            i = 0
            while i < len(tbl):
                # e counts non-empty cells in row i (capped at 2),
                # jj remembers the column of the last one seen.
                e, jj = 0, 0
                for j in range(len(tbl[i])):
                    if tbl[i][j]:
                        e += 1
                        jj = j
                    if e > 1:
                        break
                if e > 1:
                    i += 1
                    continue
                f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                     [jj][0].get("text")) or i == 0
                ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                      [jj][0].get("text")) or i + 1 >= len(tbl)
                if f and ff:
                    i += 1
                    continue

                bx = tbl[i][jj][0]
                logging.debug("Relocate row single: %s", bx["text"])
                # i row only has one value
                up, down = 100000, 100000
                if i > 0 and not f:
                    for j in range(len(tbl[i - 1])):
                        if tbl[i - 1][j]:
                            up = min(up, np.min(
                                [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
                if i + 1 < len(tbl) and not ff:
                    for j in range(len(tbl[i + 1])):
                        if tbl[i + 1][j]:
                            down = min(down, np.min(
                                [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
                assert up < 100000 or down < 100000
                if up < down:
                    # Merge into the row above; row i and below shift up.
                    for ii in range(i, len(tbl)):
                        for j in range(len(tbl[ii])):
                            for a in tbl[ii][j]:
                                a["rn"] -= 1
                    if tbl[i - 1][jj]:
                        tbl[i - 1][jj].extend(tbl[i][jj])
                    else:
                        tbl[i - 1][jj] = tbl[i][jj]
                    tbl.pop(i)
                else:
                    # Merge into the row below; only rows after i shift up.
                    for ii in range(i + 1, len(tbl)):
                        for j in range(len(tbl[ii])):
                            for a in tbl[ii][j]:
                                a["rn"] -= 1
                    if tbl[i + 1][jj]:
                        tbl[i + 1][jj].extend(tbl[i][jj])
                    else:
                        tbl[i + 1][jj] = tbl[i][jj]
                    tbl.pop(i)
                rows.pop(i)

        # which rows are headers
        hdset = set()
        for i in range(len(tbl)):
            cnt, h = 0, 0
            for arr in tbl[i]:
                if not arr:
                    continue
                cnt += 1
                if max_type == "Nu" and arr[0]["btype"] == "Nu":
                    continue
                if any(a.get("H") for a in arr) \
                        or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                    h += 1
            # ``cnt`` guard: never divide by zero on a row without cells.
            if cnt and h / cnt > 0.5:
                hdset.add(i)

        spanned = TableStructureRecognizer.__cal_spans(
            boxes, rows, cols, tbl, False)
        if markdown:
            return TableStructureRecognizer.__markdown_table(
                cap, hdset, spanned)
        return TableStructureRecognizer.__desc_table(
            cap, hdset, spanned, is_english)

    @staticmethod
    def __markdown_table(cap, hdset, tbl):
        """Render ``tbl`` as a Markdown table.

        Args:
            cap: Caption text; when non-empty it is emitted in bold above
                the table.
            hdset: Accepted for signature compatibility. Row 0 is always
                rendered as the header line, so this argument does not
                influence the output.
            tbl: Rows of cells; each cell is a (possibly empty) list of
                boxes whose ``text`` values are joined with a space.

        Returns:
            The Markdown source as a single string.
        """

        def cell_text(cell):
            return " ".join([c["text"] for c in cell]) if cell else ""

        md = ""
        if cap:
            md += f"**{cap}**\n\n"
        header_row = 0
        # Compose header
        headers = [cell_text(cell) for cell in tbl[header_row]]
        md += "| " + " | ".join(headers) + " |\n"
        md += "| " + " | ".join(["---"] * len(headers)) + " |\n"
        # Compose rows
        for i, row in enumerate(tbl):
            if i == header_row:
                continue
            md += "| " + " | ".join(cell_text(cell) for cell in row) + " |\n"
        return md

    @staticmethod
    def __desc_table(cap, hdr_rowno, tbl, is_english):
        """Describe every non-header row as "header:value" text.

        Args:
            cap: Caption text; when non-empty a source suffix naming the
                caption is appended to every returned string.
            hdr_rowno: Set of header row indices. It is mutated: header rows
                whose cells are all empty are removed.
            tbl: Rows of cells; each cell is a (possibly empty) list of
                boxes carrying ``text``.
            is_english: Selects the connector words used when combining
                stacked headers and when naming the caption.

        Returns:
            A list of strings, one or more per non-header row.
        """
        # get text of every column in header row to become header text
        clmno = len(tbl[0])
        rowno = len(tbl)
        headers = {}
        lst_hdr = []
        de = "的" if not is_english else " for "
        for r in sorted(list(hdr_rowno)):
            headers[r] = ["" for _ in range(clmno)]
            for i in range(clmno):
                if not tbl[r][i]:
                    continue
                headers[r][i] = " ".join(
                    [a["text"].strip() for a in tbl[r][i]])
            if all(not t for t in headers[r]):
                del headers[r]
                hdr_rowno.remove(r)
                continue
            # Empty header cells inherit the text of the previous header row.
            for j in range(clmno):
                if headers[r][j]:
                    continue
                if j >= len(lst_hdr):
                    break
                headers[r][j] = lst_hdr[j]
            lst_hdr = headers[r]

        # Combine consecutive header rows into compound header text.
        for i in range(rowno):
            if i not in hdr_rowno:
                continue
            for j in range(i + 1, rowno):
                if j not in hdr_rowno:
                    break
                for k in range(clmno):
                    if not headers[j - 1][k]:
                        continue
                    if headers[j][k].find(headers[j - 1][k]) >= 0:
                        continue
                    if len(headers[j][k]) > len(headers[j - 1][k]):
                        headers[j][k] += (de if headers[j][k]
                                          else "") + headers[j - 1][k]
                    else:
                        headers[j][k] = headers[j - 1][k] \
                            + (de if headers[j - 1][k] else "") \
                            + headers[j][k]

        logging.debug(
            f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
        row_txt = []
        for i in range(rowno):
            if i in hdr_rowno:
                continue
            rtxt = []

            # Nearest header row above row i; 0 when there is none.
            above = [h for h in headers if h < i]
            r = max(above) if above else 0

            if r not in headers and clmno <= 2:
                # Narrow table without a header: emit "cell:cell" and pack
                # short rows onto the previous entry.
                for j in range(clmno):
                    if not tbl[i][j]:
                        continue
                    txt = "".join([a["text"].strip() for a in tbl[i][j]])
                    if txt:
                        rtxt.append(txt)
                if rtxt:
                    joined = ":".join(rtxt)
                    if row_txt and len(row_txt[-1]) + len(joined) < 64:
                        row_txt[-1] += "\n" + joined
                    else:
                        row_txt.append(joined)
                continue

            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if not txt:
                    continue
                ctt = headers[r][j] if r in headers else ""
                if ctt:
                    ctt += ":"
                ctt += txt
                if ctt:
                    rtxt.append(ctt)

            if rtxt:
                row_txt.append("; ".join(rtxt))

        if cap:
            if is_english:
                from_ = " in "
            else:
                from_ = "来自"
            row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
        return row_txt

    @staticmethod
    def __cal_spans(boxes, rows, cols, tbl, markdown=True):
        """Compute row/column spans for boxes flagged with ``SP``.

        Args:
            boxes: All boxes; those containing the key ``SP`` must also
                carry ``H_left``, ``H_right``, ``H_top`` and ``H_bott``.
            rows: Boxes grouped by row, used to average row top/bottom.
            cols: Boxes grouped by column, used to average column edges.
            tbl: Grid of cells (lists of boxes); modified in place.
            markdown: When true, covered cells are set to ``None`` and only
                the top-left cell keeps the merged boxes; otherwise every
                covered cell references the merged list.

        Returns:
            The same ``tbl`` object.
        """
        # calculate span
        clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
                for cln in cols]
        crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
                for cln in cols]
        rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
                for row in rows]
        # Fix: the row-bottom annotation is stored as "R_bott" everywhere
        # else in this class; the former "R_btm" lookup never matched.
        rbtm = [np.mean([c.get("R_bott", c["bottom"])
                         for c in row]) for row in rows]
        for b in boxes:
            if "SP" not in b:
                continue
            b["colspan"] = [b["cn"]]
            b["rowspan"] = [b["rn"]]
            # col span: a column is covered when its horizontal midpoint
            # lies inside [H_left, H_right]
            for j in range(0, len(clft)):
                if j == b["cn"]:
                    continue
                if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                    continue
                if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                    continue
                b["colspan"].append(j)
            # row span: same test on the vertical midpoint
            for j in range(0, len(rtop)):
                if j == b["rn"]:
                    continue
                if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                    continue
                if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                    continue
                b["rowspan"].append(j)

        def join(arr):
            if not arr:
                return ""
            return "".join([t["text"] for t in arr])

        # rm the spanning cells
        for i in range(len(tbl)):
            for j, arr in enumerate(tbl[i]):
                if not arr:
                    continue
                if all("rowspan" not in a and "colspan" not in a for a in arr):
                    continue
                rowspan, colspan = [], []
                for a in arr:
                    if isinstance(a.get("rowspan", 0), list):
                        rowspan.extend(a["rowspan"])
                    if isinstance(a.get("colspan", 0), list):
                        colspan.extend(a["colspan"])
                rowspan, colspan = set(rowspan), set(colspan)
                if len(rowspan) < 2 and len(colspan) < 2:
                    # Not actually spanning: drop the span markers.
                    for a in arr:
                        if "rowspan" in a:
                            del a["rowspan"]
                        if "colspan" in a:
                            del a["colspan"]
                    continue
                rowspan, colspan = sorted(rowspan), sorted(colspan)
                rowspan = list(range(rowspan[0], rowspan[-1] + 1))
                colspan = list(range(colspan[0], colspan[-1] + 1))
                assert i in rowspan, rowspan
                assert j in colspan, colspan
                arr = []
                for r in rowspan:
                    for c in colspan:
                        arr_txt = join(arr)
                        if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                            arr.extend(tbl[r][c])
                        tbl[r][c] = None if markdown else arr
                for a in arr:
                    if len(rowspan) > 1:
                        a["rowspan"] = len(rowspan)
                    elif "rowspan" in a:
                        del a["rowspan"]
                    if len(colspan) > 1:
                        a["colspan"] = len(colspan)
                    elif "colspan" in a:
                        del a["colspan"]
                tbl[rowspan[0]][colspan[0]] = arr

        return tbl
@@ -0,0 +1,81 @@
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import base64
17
+ import datetime
18
+ import io
19
+ import json
20
+ import os
21
+ import pickle
22
+ import time
23
+ import uuid
24
+ import logging
25
+ import copy
26
+ from enum import Enum, IntEnum
27
+
28
+ from . import file_utils
29
+
30
def read_config(conf_name=None):
    """Return the base configuration mapping.

    ``conf_name`` is accepted but unused: no configuration file is read and
    a new empty dict is returned on every call.
    """
    # Simplified: return empty config as we don't use RagFlow conf files
    return dict()


# Module-wide configuration, evaluated once at import time.
CONFIGS = read_config()
35
+
36
def get_base_config(key, default=None):
    """Look up ``key`` in the module-level ``CONFIGS`` mapping.

    Returns ``None`` when ``key`` is ``None`` (``default`` is ignored in that
    case); otherwise the stored value, or ``default`` if the key is absent.
    """
    return None if key is None else CONFIGS.get(key, default)
40
+
41
class BaseType:
    """Base class whose instances can be dumped to a plain dict."""

    def to_dict(self):
        """Return the instance attributes with leading underscores removed
        from every attribute name."""
        return {name.lstrip("_"): value
                for name, value in self.__dict__.items()}
44
+
45
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that also handles dates, durations, enums, sets and
    ``BaseType`` instances."""

    def default(self, obj):
        """Convert ``obj`` to a JSON-serialisable value.

        Anything not handled here is delegated to ``json.JSONEncoder``,
        which raises ``TypeError``.
        """
        # datetime must be tested before date: datetime is a date subclass.
        if isinstance(obj, datetime.datetime):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        if isinstance(obj, datetime.date):
            return obj.strftime('%Y-%m-%d')
        if isinstance(obj, datetime.timedelta):
            return str(obj)
        if isinstance(obj, (Enum, IntEnum)):
            return obj.value
        if isinstance(obj, set):
            return list(obj)
        if isinstance(obj, BaseType):
            return obj.to_dict()
        return super().default(obj)
61
+
62
def json_dumps(src, byte=False, indent=None):
    """Serialise ``src`` to JSON using ``CustomJSONEncoder``.

    Returns UTF-8 encoded ``bytes`` when ``byte`` is true, otherwise ``str``.
    ``indent`` is passed straight to ``json.dumps``.
    """
    text = json.dumps(src, cls=CustomJSONEncoder, indent=indent)
    return text.encode(encoding="utf-8") if byte else text
67
+
68
def json_loads(src):
    """Parse JSON from ``src``; ``bytes`` input is decoded as UTF-8 first."""
    text = src.decode(encoding="utf-8") if isinstance(src, bytes) else src
    return json.loads(text)
72
+
73
def current_timestamp():
    """Return the current ``time.time()`` value in whole milliseconds."""
    millis = time.time() * 1000
    return int(millis)
75
+
76
def get_uuid():
    """Return a new ``uuid.uuid1()`` identifier as a 32-character hex string."""
    generated = uuid.uuid1()
    return generated.hex
78
+
79
def datetime_format(date_time: datetime.datetime) -> datetime.datetime:
    """Return a new ``datetime`` built from the year-to-second fields only.

    Only those six fields are copied, so microseconds and any ``tzinfo`` on
    the input are not carried over.
    """
    fields = (
        date_time.year,
        date_time.month,
        date_time.day,
        date_time.hour,
        date_time.minute,
        date_time.second,
    )
    return datetime.datetime(*fields)