magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. magic_pdf/filter/pdf_meta_scan.py +3 -17
  2. magic_pdf/libs/commons.py +0 -161
  3. magic_pdf/libs/draw_bbox.py +2 -3
  4. magic_pdf/libs/markdown_utils.py +0 -21
  5. magic_pdf/libs/pdf_image_tools.py +2 -1
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  8. magic_pdf/model/magic_model.py +0 -30
  9. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  11. magic_pdf/para/para_split_v3.py +7 -2
  12. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  13. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  14. magic_pdf/pre_proc/cut_image.py +0 -37
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  16. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  17. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  18. magic_pdf/rw/S3ReaderWriter.py +1 -1
  19. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  20. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
  21. magic_pdf/dict2md/mkcontent.py +0 -438
  22. magic_pdf/layout/__init__.py +0 -0
  23. magic_pdf/layout/bbox_sort.py +0 -681
  24. magic_pdf/layout/layout_det_utils.py +0 -182
  25. magic_pdf/layout/layout_sort.py +0 -921
  26. magic_pdf/layout/layout_spiler_recog.py +0 -101
  27. magic_pdf/layout/mcol_sort.py +0 -336
  28. magic_pdf/libs/calc_span_stats.py +0 -239
  29. magic_pdf/libs/detect_language_from_model.py +0 -21
  30. magic_pdf/libs/nlp_utils.py +0 -203
  31. magic_pdf/libs/textbase.py +0 -33
  32. magic_pdf/libs/vis_utils.py +0 -308
  33. magic_pdf/para/block_continuation_processor.py +0 -562
  34. magic_pdf/para/block_termination_processor.py +0 -480
  35. magic_pdf/para/commons.py +0 -222
  36. magic_pdf/para/denoise.py +0 -246
  37. magic_pdf/para/draw.py +0 -121
  38. magic_pdf/para/exceptions.py +0 -198
  39. magic_pdf/para/layout_match_processor.py +0 -40
  40. magic_pdf/para/para_split.py +0 -807
  41. magic_pdf/para/para_split_v2.py +0 -959
  42. magic_pdf/para/raw_processor.py +0 -207
  43. magic_pdf/para/stats.py +0 -268
  44. magic_pdf/para/title_processor.py +0 -1014
  45. magic_pdf/pdf_parse_union_core.py +0 -345
  46. magic_pdf/post_proc/__init__.py +0 -0
  47. magic_pdf/post_proc/detect_para.py +0 -3472
  48. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  49. magic_pdf/post_proc/remove_footnote.py +0 -153
  50. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  51. magic_pdf/pre_proc/detect_equation.py +0 -134
  52. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  53. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  54. magic_pdf/pre_proc/detect_footnote.py +0 -170
  55. magic_pdf/pre_proc/detect_header.py +0 -64
  56. magic_pdf/pre_proc/detect_images.py +0 -647
  57. magic_pdf/pre_proc/detect_page_number.py +0 -64
  58. magic_pdf/pre_proc/detect_tables.py +0 -62
  59. magic_pdf/pre_proc/equations_replace.py +0 -550
  60. magic_pdf/pre_proc/fix_image.py +0 -244
  61. magic_pdf/pre_proc/fix_table.py +0 -270
  62. magic_pdf/pre_proc/main_text_font.py +0 -23
  63. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  64. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  65. magic_pdf/pre_proc/post_layout_split.py +0 -0
  66. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  67. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  68. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  69. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  70. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  71. magic_pdf/pre_proc/statistics.py +0 -12
  72. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  73. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
  74. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  75. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,647 +0,0 @@
1
- import collections # 统计库
2
- import re
3
- from magic_pdf.libs.commons import fitz # pyMuPDF库
4
-
5
-
6
- #--------------------------------------- Tool Functions --------------------------------------#
7
# Normalise text: keep only the characters a-z, A-Z and 0-9.
def remove_special_chars(s: str) -> str:
    """Return *s* with every character that is not an ASCII letter or digit removed."""
    return re.sub(r"[^a-zA-Z0-9]", "", s)
12
-
13
def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect1 and rect2 have exactly the same four edges.

    Each rectangle is given as (left, up, right, down).
    """
    edge_pairs = ((L1, L2), (U1, U2), (R1, R2), (D1, D2))
    return all(first == second for first, second in edge_pairs)
16
-
17
def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect2 lies entirely inside rect1 (shared edges count as inside).

    Each rectangle is given as (left, up, right, down). rect2 must itself be
    well ordered (L2 <= R2 and U2 <= D2) for the result to be True.
    """
    inside_horizontally = L1 <= L2 and L2 <= R2 and R2 <= R1
    inside_vertically = U1 <= U2 and U2 <= D2 and D2 <= D1
    return inside_horizontally and inside_vertically
20
-
21
def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect1 and rect2 intersect; touching along a single edge counts.

    Each rectangle is given as (left, up, right, down).
    """
    # Edges of the candidate intersection rectangle.
    left_edge = max(L1, L2)
    right_edge = min(R1, R2)
    top_edge = max(U1, U2)
    bottom_edge = min(D1, D2)
    return left_edge <= right_edge and top_edge <= bottom_edge
24
-
25
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """Return the overlap area as a fraction of rect1's area and of rect2's area.

    Each rectangle is given as (left, up, right, down). The result is
    ``(overlap / area1, overlap / area2)``; it is ``(0, 0)`` when the
    rectangles do not intersect or when either rectangle has zero area.
    """
    # Edges of the intersection rectangle (only meaningful if they intersect).
    inter_left, inter_right = max(L1, L2), min(R1, R2)
    inter_top, inter_bottom = max(U1, U2), min(D1, D2)
    if inter_right < inter_left or inter_bottom < inter_top:
        return 0, 0

    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    if area_1 == 0 or area_2 == 0:
        # Avoid dividing by a degenerate rectangle's area.
        return 0, 0

    inter_area = (inter_right - inter_left) * (inter_bottom - inter_top)
    return inter_area / area_1, inter_area / area_2
35
-
36
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """Return the shared interval length as a fraction of each segment's length.

    Each segment is given as (start, end). The result is
    ``(overlap / len1, overlap / len2)``; it is ``(0, 0)`` when the segments
    are disjoint or when either segment has zero length.
    """
    overlap_start = max(L1, L2)
    overlap_end = min(R1, R2)
    if overlap_start > overlap_end:
        return 0, 0
    if L1 == R1 or L2 == R2:
        # A zero-length segment would make the ratio undefined.
        return 0, 0
    shared = overlap_end - overlap_start
    return shared / (R1 - L1), shared / (R2 - L2)
44
-
45
-
46
# Decide whether a rect is really just a line.
def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
    """Return True when the rectangle (left, up, right, down) is thin enough to be a line.

    A rectangle counts as a line when its width or height is at most 3, or
    when one side is at least 30 times longer than the other. Otherwise
    False is returned (the previous implementation fell off the end and
    returned None, contradicting the ``bool`` annotation).
    """
    width = R - L
    height = D - U
    if width <= 3 or height <= 3:
        return True
    # Both sides are > 3 here, so neither division can hit zero.
    if width / height >= 30 or height / width >= 30:
        return True
    return False
54
-
55
-
56
-
57
def _boxes_should_merge(L1, U1, R1, D1, L2, U2, R2, D2) -> bool:
    """Return True when two figure boxes overlap enough to be fused into one box."""
    s1 = abs(R1 - L1) * abs(D1 - U1)
    s2 = abs(R2 - L2) * abs(D2 - U2)
    ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
    if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
        return True
    # ratio_1 > 0.2 implies s1 != 0, so the division is safe.
    if ratio_1 > 0.2 and s2 / s1 > 5:
        return True
    # The centre of one box lies inside the other.
    if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
        return True
    if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
        return True
    return False


def _collect_text_span_bboxes(page):
    """Return the normalised (L, U, R, D) bbox of every text span, sorted by (U, L)."""
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
    )["blocks"]
    span_bboxs = []
    for block in blocks:
        for line in block['lines']:
            for span in line['spans']:
                L, U, R, D = span['bbox']
                span_bboxs.append((min(L, R), min(U, D), max(L, R), max(U, D)))
    span_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    return span_bboxs


def _collect_image_bboxes(page, junk_img_bojids, pageL, pageU, pageR, pageD):
    """Return bboxes of embedded raster images that look trustworthy.

    Images listed in *junk_img_bojids*, images outside the page, and images
    spanning the full page width or height are skipped. Images whose boxes
    overlap each other suspiciously are then discarded as well.
    """
    img_bboxs = []
    for raw_img in page.get_images():
        xref = raw_img[0]
        if xref in junk_img_bojids:
            continue
        try:
            rec = page.get_image_rects(xref, transform=True)[0][0]
            L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
        except Exception:
            # Best effort: an image whose placement cannot be resolved is skipped.
            continue
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
            continue
        if pageL == L and R == pageR:
            continue
        if pageU == U and D == pageD:
            continue
        img_bboxs.append((L, U, R, D))

    # Overlapping images mean the reported size/position is unreliable: drop them.
    imgs_ok = [True] * len(img_bboxs)
    for i, (L1, U1, R1, D1) in enumerate(img_bboxs):
        for j in range(i + 1, len(img_bboxs)):
            L2, U2, R2, D2 = img_bboxs[j]
            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
            s1 = abs(R1 - L1) * abs(D1 - U1)
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if ratio_1 > 0 and ratio_2 > 0:
                if ratio_1 == 1 and ratio_2 > 0.8:
                    imgs_ok[i] = False
                elif ratio_1 > 0.8 and ratio_2 == 1:
                    imgs_ok[j] = False
                elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                    imgs_ok[i] = False
                    imgs_ok[j] = False
                elif s1 / s2 > 5 and ratio_2 > 0.5:
                    imgs_ok[j] = False
                elif s2 / s1 > 5 and ratio_1 > 0.5:
                    imgs_ok[i] = False

    # Filter against the flags themselves. The previous code re-used
    # len(imgs) after imgs had already been shortened, which truncated the list.
    return [bbox for bbox, ok in zip(img_bboxs, imgs_ok) if ok]


def _collect_svg_bboxes(page, textLine_blocks, pageL, pageU, pageR, pageD):
    """Group vector drawings into figure boxes.

    Returns ``(svg_final_bboxs, svg_final_bboxs_2)``: the grouped boxes after
    the neighbour-merge pass, and the list built from the boxes that were not
    absorbed, each appearing once as-is and once with padding.
    """
    eps_ERROR = 5  # padding around detected svgs, in case the PyMuPDF rects are imprecise

    # De-duplicate drawings that share exactly the same (normalised) rect.
    seen_rects = set()
    rects = []
    for drawing in page.get_drawings():
        L, U, R, D = drawing['rect'].irect
        normalised = (min(L, R), min(U, D), max(L, R), max(U, D))
        if normalised not in seen_rects:
            seen_rects.add(normalised)
            rects.append((L, U, R, D))

    # How far each drawing reaches past the page: +1 per page edge it touches
    # or crosses, or +4 when it sits inside the margins yet covers >= 70% of the page.
    svg_exceedPage = [0] * len(rects)
    for i, (L, U, R, D) in enumerate(rects):
        _, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
            if ratio_2 >= 0.7:
                svg_exceedPage[i] += 4
        else:
            if L <= pageL:
                svg_exceedPage[i] += 1
            if pageR <= R:
                svg_exceedPage[i] += 1
            if U <= pageU:
                svg_exceedPage[i] += 1
            if pageD <= D:
                svg_exceedPage[i] += 1

    # With two or more out-of-bounds drawings the hand-written rules are not
    # reliable, so svg detection is skipped for this page.
    if len([x for x in svg_exceedPage if x >= 1]) >= 2:
        rects = []
        svg_exceedPage = []

    count = len(rects)
    svg_childs = [[] for _ in range(count)]
    svg_parents = [[] for _ in range(count)]
    svg_overlaps = [[] for _ in range(count)]  # svg_overlaps[i]: indices of drawings intersecting drawing i
    svg_visited = [False] * count

    # Build the containment / intersection graph.
    for i in range(count):
        L1, U1, R1, D1 = rects[i]
        for j in range(count):
            if i == j:
                continue
            L2, U2, R2, D2 = rects[j]
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_childs[i].append(j)
                svg_parents[j].append(i)
            elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_overlaps[i].append(j)

    # Take the outline of each connected group, visiting the largest drawings first.
    svg_final_bboxs = []
    by_area_desc = sorted(
        range(count),
        key=lambda i: -(rects[i][2] - rects[i][0]) * (rects[i][3] - rects[i][1]),
    )
    for i in by_area_desc:
        if svg_visited[i]:
            continue
        svg_visited[i] = True
        L, U, R, D = rects[i]
        if check_rect_isLine(L, U, R, D):
            svg_visited[i] = False
            continue

        cur_block_element_cnt = 0  # elements inside the candidate region, excluding the outermost frame
        if len(svg_parents[i]) == 0:
            # An ordinary top-level frame.
            cur_block_element_cnt += len(svg_childs[i])
            if svg_exceedPage[i] == 0:
                # Tolerance: it may already lie inside an accepted box.
                if any(pL <= L <= R <= pR and pU <= U <= D <= pD for pL, pU, pR, pD in svg_final_bboxs):
                    continue
                # Breadth-first search over intersecting drawings.
                q = collections.deque(svg_overlaps[i])
                while q:
                    j = q.popleft()
                    svg_visited[j] = True
                    L2, U2, R2, D2 = rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1
                    cur_block_element_cnt += len(svg_childs[j])
                    for k in svg_overlaps[j]:
                        if not svg_visited[k] and svg_exceedPage[k] == 0:
                            svg_visited[k] = True
                            q.append(k)
            elif svg_exceedPage[i] <= 2:
                # Tolerance: it may already lie inside an accepted box.
                if any(sL <= L <= R <= sR and sU <= U <= D <= sD for sL, sU, sR, sD in svg_final_bboxs):
                    continue
                # Replace the frame by the bounding box of its in-page children.
                L, U, R, D = pageR, pageD, pageL, pageU
                for j in svg_childs[i]:
                    if svg_visited[j]:
                        continue
                    if svg_exceedPage[j] >= 1:
                        continue
                    svg_visited[j] = True
                    L2, U2, R2, D2 = rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1

        # A line is not worth keeping.
        if check_rect_isLine(L, U, R, D):
            continue
        # Neither is a region holding fewer than 3 elements.
        if cur_block_element_cnt < 3:
            continue
        # A region swallowing many text spans is probably a mis-detection.
        contain_textLineBlock_cnt = sum(
            1
            for L2, U2, R2, D2 in textLine_blocks
            if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2)
        )
        if contain_textLineBlock_cnt >= 10:
            continue
        svg_final_bboxs.append((L, U, R, D))

    # Detected svgs may contain or sit next to each other: merge them.
    svg_final_visited = [False] * len(svg_final_bboxs)
    for i in range(len(svg_final_bboxs)):
        L1, U1, R1, D1 = svg_final_bboxs[i]
        for j in range(i + 1, len(svg_final_bboxs)):
            L2, U2, R2, D2 = svg_final_bboxs[j]
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_final_visited[j] = True
                continue
            # Side by side horizontally.
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(L2 - R1) >= 20:
                    continue
                svg_final_bboxs[i] = (min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2))
                svg_final_visited[j] = True
                continue
            # Stacked vertically. The first segment is (L1, R1); the previous
            # code passed R2 here by mistake.
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(U2 - D1) >= 20:
                    continue
                svg_final_bboxs[i] = (min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2))
                svg_final_visited[j] = True

    # Every surviving box is recorded twice: as-is and padded by eps_ERROR.
    svg_final_bboxs_2 = []
    for (L, U, R, D), absorbed in zip(svg_final_bboxs, svg_final_visited):
        if absorbed:
            continue
        svg_final_bboxs_2.append((L, U, R, D))
        svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))

    return svg_final_bboxs, svg_final_bboxs_2


def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=None):
    """Locate the figure regions of one PDF page.

    Candidates come from three sources: embedded raster images, groups of
    vector drawings, and the ``figure`` detections of the DocXChain layout
    model. They are cross-checked, de-duplicated and merged.

    :param page_ID: index of the page inside the document. Kept for interface
        compatibility; it does not influence the returned boxes.
    :param page: the page as read by fitz (PyMuPDF).
    :param json_from_DocXchain_obj: DocXChain output for this page. The keys
        read here are ``page_info`` (``width``, ``height``) and
        ``layout_dets`` (each with ``poly``, ``category_id``, ``score``).
    :param junk_img_bojids: image ids (first field of ``page.get_images()``
        entries) to ignore. ``None`` means no image is ignored.
    :return: list of ``[L, U, R, D]`` boxes.
    """
    if junk_img_bojids is None:
        junk_img_bojids = []

    # Page extent, taken from a 72 dpi rendering.
    DPI = 72
    pix = page.get_pixmap(dpi=DPI)
    pageL = 0
    pageR = int(pix.w)
    pageU = 0
    pageD = int(pix.h)

    textLine_blocks = _collect_text_span_bboxes(page)
    img_bboxs = _collect_image_bboxes(page, junk_img_bojids, pageL, pageU, pageR, pageD)
    svg_final_bboxs, svg_final_bboxs_2 = _collect_svg_bboxes(page, textLine_blocks, pageL, pageU, pageR, pageD)

    # ----- figure boxes reported by DocXChain, rescaled to page coordinates -----
    figure_bbox_from_DocXChain = []
    xf_json = json_from_DocXchain_obj
    LR_scaleRatio = xf_json['page_info']['width'] / (pageR - pageL)
    UD_scaleRatio = xf_json['page_info']['height'] / (pageD - pageU)
    for det in xf_json['layout_dets']:
        # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
        L = det['poly'][0] / LR_scaleRatio
        U = det['poly'][1] / UD_scaleRatio
        R = det['poly'][2] / LR_scaleRatio
        D = det['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if det["category_id"] == 1 and det['score'] >= 0.3:
            figure_bbox_from_DocXChain.append((L, U, R, D))
    figure_from_DocXChain_visited = [False] * len(figure_bbox_from_DocXChain)
    figure_bbox_from_DocXChain_overlappedRatio = [0.0] * len(figure_bbox_from_DocXChain)

    # ----- match DocXChain figures against the raster images -----
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        for L2, U2, R2, D2 in img_bboxs:
            s1 = abs(R1 - L1) * abs(D1 - U1)
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                if s2 / s1 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            else:
                # A substantial partial overlap.
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

    # ----- match DocXChain figures against the svg boxes -----
    svg_final_bboxs_2_badIdxs = set()
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        for j, (L2, U2, R2, D2) in enumerate(svg_final_bboxs_2):
            s1 = abs(R1 - L1) * abs(D1 - U1)
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.7:
                    figure_from_DocXChain_visited[i] = True
                else:
                    svg_final_bboxs_2_badIdxs.add(j)  # discard the svg, trust DocXChain
            else:
                # A substantial partial overlap.
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

    # Drop the svgs judged wrong above.
    svg_final_bboxs_2 = [b for idx, b in enumerate(svg_final_bboxs_2) if idx not in svg_final_bboxs_2_badIdxs]

    # A figure mostly covered by images/svgs counts as already found.
    for i, covered in enumerate(figure_bbox_from_DocXChain_overlappedRatio):
        if covered >= 0.7:
            figure_from_DocXChain_visited[i] = True

    # Figures only DocXChain found.
    figure_only_from_DocXChain_bboxs = [
        bbox
        for bbox, found in zip(figure_bbox_from_DocXChain, figure_from_DocXChain_visited)
        if not found
    ]

    img_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    svg_final_bboxs_2.sort(key=lambda LURD: (LURD[1], LURD[0]))
    figure_only_from_DocXChain_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    # NOTE(review): this concatenates svg_final_bboxs, not the filtered and
    # sorted svg_final_bboxs_2, so the "bad svg" removal above has no effect on
    # the result. Kept as-is to preserve output; confirm which list is intended.
    curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs

    # ----- final de-duplication, smallest boxes first -----
    curPage_all_fig_bboxs.sort(key=lambda LURD: ((LURD[2] - LURD[0]) * (LURD[3] - LURD[1]), LURD[0], LURD[1]))

    # Pass 1: drop boxes contained in, or mostly covered by, another box.
    final_duplicate = set()
    for i, (L1, U1, R1, D1) in enumerate(curPage_all_fig_bboxs):
        for j, (L2, U2, R2, D2) in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                final_duplicate.add((L1, U1, R1, D1))
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                    final_duplicate.add((L1, U1, R1, D1))
    curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]

    # Pass 2: replace overlapping pairs by their union.
    final_duplicate = set()
    final_synthetic_bboxs = []
    for i, (L1, U1, R1, D1) in enumerate(curPage_all_fig_bboxs):
        for j, (L2, U2, R2, D2) in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if _boxes_should_merge(L1, U1, R1, D1, L2, U2, R2, D2):
                final_duplicate.add((L1, U1, R1, D1))
                final_duplicate.add((L2, U2, R2, D2))
                final_synthetic_bboxs.append((min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)))
    curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
    final_synthetic_bboxs = list(set(final_synthetic_bboxs))

    # Pass 3: the unions may overlap each other too; fold them together.
    # The previous code never loaded box i here and compared against stale
    # coordinates left over from an earlier loop.
    image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]
    droped_img_idx = set()
    for i in range(len(image_bboxes)):
        if i in droped_img_idx:
            # Already folded into an earlier box.
            continue
        for j in range(i + 1, len(image_bboxes)):
            if j in droped_img_idx:
                continue
            L1, U1, R1, D1 = image_bboxes[i]  # re-read: box i grows as it absorbs others
            L2, U2, R2, D2 = image_bboxes[j]
            if _boxes_should_merge(L1, U1, R1, D1, L2, U2, R2, D2):
                image_bboxes[i] = [min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)]
                droped_img_idx.add(j)
    new_images = [bbox for idx, bbox in enumerate(image_bboxes) if idx not in droped_img_idx]

    images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
    images = images1 + new_images
    return images
647
-