magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,647 +0,0 @@
|
|
1
|
-
import collections # 统计库
|
2
|
-
import re
|
3
|
-
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
4
|
-
|
5
|
-
|
6
|
-
#--------------------------------------- Tool Functions --------------------------------------#
|
7
|
-
# 正则化,输入文本,输出只保留a-z,A-Z,0-9
|
8
|
-
def remove_special_chars(s: str) -> str:
    """Strip every character that is not an ASCII letter or digit from *s*."""
    keep_alnum_only = re.compile(r"[^a-zA-Z0-9]")
    return keep_alnum_only.sub("", s)
|
12
|
-
|
13
|
-
def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when both rectangles have exactly the same four edges."""
    edges_1 = (L1, U1, R1, D1)
    edges_2 = (L2, U2, R2, D2)
    # Compare edge by edge (left, top, right, bottom); any mismatch means "different".
    return all(edge_1 == edge_2 for edge_1, edge_2 in zip(edges_1, edges_2))
|
16
|
-
|
17
|
-
def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when rect2 lies entirely inside rect1 (shared edges count as inside)."""
    fits_horizontally = L1 <= L2 and L2 <= R2 and R2 <= R1
    if not fits_horizontally:
        return False
    fits_vertically = U1 <= U2 and U2 <= D2 and D2 <= D1
    return fits_vertically
|
20
|
-
|
21
|
-
def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
    """Return True when the two rectangles intersect; merely sharing an edge counts."""
    # The intersection spans [max of the lower bounds, min of the upper bounds] per axis.
    shared_left, shared_right = max(L1, L2), min(R1, R2)
    if not shared_left <= shared_right:
        return False
    shared_top, shared_bottom = max(U1, U2), min(D1, D2)
    return shared_top <= shared_bottom
|
24
|
-
|
25
|
-
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """Return (overlap / area of rect1, overlap / area of rect2).

    Both ratios are 0 when the rectangles do not intersect or when either
    rectangle has zero area.
    """
    inter_left, inter_right = max(L1, L2), min(R1, R2)
    inter_top, inter_bottom = max(U1, U2), min(D1, D2)
    if inter_right < inter_left or inter_bottom < inter_top:
        return 0, 0

    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    if area_1 == 0 or area_2 == 0:
        # Degenerate rectangle: avoid dividing by zero.
        return 0, 0

    inter_area = (inter_right - inter_left) * (inter_bottom - inter_top)
    return inter_area / area_1, inter_area / area_2
|
35
|
-
|
36
|
-
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """Return (overlap / length of line1, overlap / length of line2) for two 1-D intervals.

    Both ratios are 0 when the intervals are disjoint or either has zero length.
    """
    shared_start = max(L1, L2)
    shared_end = min(R1, R2)
    if shared_start > shared_end:
        return 0, 0
    if L1 == R1 or L2 == R2:
        # Zero-length interval: avoid dividing by zero.
        return 0, 0
    shared_length = shared_end - shared_start
    length_1 = R1 - L1
    length_2 = R2 - L2
    return shared_length / length_1, shared_length / length_2
|
44
|
-
|
45
|
-
|
46
|
-
# 判断rect其实是一条line
|
47
|
-
def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
    """Return True when the rectangle is thin enough to be treated as a line.

    A rectangle counts as a line when either side is at most 3 units long, or
    when one side is at least 30 times the other. The function always returns
    a real bool; it previously fell off the end and returned None for
    non-line rectangles.
    """
    width = R - L
    height = D - U
    if width <= 3 or height <= 3:
        return True
    # Both sides are > 3 here, so neither division can hit zero.
    return width / height >= 30 or height / width >= 30
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
def _rects_should_merge(box_1, box_2) -> bool:
    """Return True when two (L, U, R, D) boxes overlap enough to be fused into one figure."""
    L1, U1, R1, D1 = box_1
    L2, U2, R2, D2 = box_2
    s1 = abs(R1 - L1) * abs(D1 - U1)
    s2 = abs(R2 - L2) * abs(D2 - U2)
    ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
    if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
        return True
    # ratio_1 > 0.2 implies box_1 has non-zero area, so the division is safe.
    if ratio_1 > 0.2 and s2 / s1 > 5:
        return True
    # The centre of one box lies inside the other.
    if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
        return True
    return (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2)


def _text_span_bboxes(page) -> list:
    """Return the normalised (L, U, R, D) bbox of every text span on the page."""
    span_bboxes = []
    blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
    for block in blocks:
        for line in block['lines']:
            for span in line['spans']:
                L, U, R, D = span['bbox']
                span_bboxes.append((min(L, R), min(U, D), max(L, R), max(U, D)))
    return span_bboxes


def _image_bboxes(page, junk_img_bojids, page_rect) -> list:
    """Return integer bboxes of the page's raster images that are usable as figures."""
    pageL, pageU, pageR, pageD = page_rect
    bboxes = []
    for raw_img in page.get_images():
        xref = raw_img[0]
        if xref in junk_img_bojids:
            continue
        try:
            rec = page.get_image_rects(xref, transform=True)[0][0]
            L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
        except Exception:
            # Best effort: an image whose placement cannot be resolved is skipped.
            # (Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit propagate.)
            continue
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # Must be a non-empty rectangle fully inside the page.
        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
            continue
        # Skip images spanning the full page width or the full page height.
        if pageL == L and R == pageR:
            continue
        if pageU == U and D == pageD:
            continue
        bboxes.append((L, U, R, D))
    return bboxes


def _drop_conflicting_images(img_bboxs) -> list:
    """Remove image bboxes whose mutual overlap suggests an unreliable size or position."""
    keep = [True] * len(img_bboxs)
    for i, (L1, U1, R1, D1) in enumerate(img_bboxs):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j in range(i + 1, len(img_bboxs)):
            L2, U2, R2, D2 = img_bboxs[j]
            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
            if not (ratio_1 > 0 and ratio_2 > 0):
                continue
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if ratio_1 == 1 and ratio_2 > 0.8:
                keep[i] = False
            elif ratio_1 > 0.8 and ratio_2 == 1:
                keep[j] = False
            elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                keep[i] = False
                keep[j] = False
            elif s1 / s2 > 5 and ratio_2 > 0.5:
                keep[j] = False
            elif s2 / s1 > 5 and ratio_1 > 0.5:
                keep[i] = False
    # Filter against the ORIGINAL indices. The previous code re-evaluated
    # range(len(imgs)) after imgs had already been shrunk, truncating the list.
    return [bbox for bbox, ok in zip(img_bboxs, keep) if ok]


def _unique_drawing_rects(page) -> list:
    """Return the integer rect of each vector drawing, keeping the first of identical rects."""
    seen = set()
    rects = []
    for drawing in page.get_drawings():
        rect = tuple(drawing['rect'].irect)
        L, U, R, D = rect
        key = (min(L, R), min(U, D), max(L, R), max(U, D))
        if key not in seen:
            seen.add(key)
            rects.append(rect)
    return rects


def _page_overflow_scores(rects, page_rect) -> list:
    """Score each drawing rect for touching/exceeding the page; 0 means well inside."""
    pageL, pageU, pageR, pageD = page_rect
    scores = []
    for L, U, R, D in rects:
        score = 0
        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
            _, page_ratio = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
            if page_ratio >= 0.7:
                score += 4
        else:
            # One point per page edge that the rect reaches or crosses.
            if L <= pageL:
                score += 1
            if pageR <= R:
                score += 1
            if U <= pageU:
                score += 1
            if pageD <= D:
                score += 1
        scores.append(score)
    return scores


def _group_svg_rects(rects, exceed, text_bboxes, page_rect) -> list:
    """Group drawing rects into candidate figure bboxes.

    Rects are visited from largest to smallest area. A parentless rect is
    grown over the rects connected to it through overlaps (breadth-first), or,
    when it touches the page border, replaced by the extent of its children.
    Groups that are line-like, have fewer than 3 elements, or contain 10 or
    more text spans are rejected.

    NOTE(review): the source this was restored from had lost its indentation;
    the if/elif nesting below is the reading consistent with the code's logic
    and should be confirmed against the upstream file.
    """
    pageL, pageU, pageR, pageD = page_rect
    count = len(rects)
    children = [[] for _ in range(count)]
    parents = [[] for _ in range(count)]
    overlaps = [[] for _ in range(count)]
    for i, rect_i in enumerate(rects):
        for j, rect_j in enumerate(rects):
            if i == j:
                continue
            if check_rect1_contains_rect2(*rect_i, *rect_j):
                children[i].append(j)
                parents[j].append(i)
            elif check_rect1_overlaps_rect2(*rect_i, *rect_j):
                overlaps[i].append(j)

    def inside_accepted(L, U, R, D):
        return any(aL <= L <= R <= aR and aU <= U <= D <= aD for aL, aU, aR, aD in accepted)

    accepted = []
    visited = [False] * count
    by_area_desc = sorted(
        range(count),
        key=lambda i: -(rects[i][2] - rects[i][0]) * (rects[i][3] - rects[i][1]),
    )
    for i in by_area_desc:
        if visited[i]:
            continue
        L, U, R, D = rects[i]
        if check_rect_isLine(L, U, R, D):
            continue  # lines never seed a group and stay unvisited
        visited[i] = True

        element_cnt = 0  # elements in the group, not counting the seed rect itself
        if not parents[i]:
            element_cnt += len(children[i])
            if exceed[i] == 0:
                if inside_accepted(L, U, R, D):
                    continue
                # Breadth-first walk over the overlap graph, growing the bbox.
                queue = collections.deque(overlaps[i])
                while queue:
                    j = queue.popleft()
                    visited[j] = True
                    L2, U2, R2, D2 = rects[j]
                    L, U, R, D = min(L, L2), min(U, U2), max(R, R2), max(D, D2)
                    element_cnt += 1 + len(children[j])
                    for k in overlaps[j]:
                        if not visited[k] and exceed[k] == 0:
                            visited[k] = True
                            queue.append(k)
            elif exceed[i] <= 2:
                if inside_accepted(L, U, R, D):
                    continue
                # Border-touching container: use the extent of its in-page children.
                # Starts inverted so the first child defines the extent; with no
                # usable child it stays inverted and is rejected as a line below.
                L, U, R, D = pageR, pageD, pageL, pageU
                for j in children[i]:
                    if visited[j] or exceed[j] >= 1:
                        continue
                    visited[j] = True
                    L2, U2, R2, D2 = rects[j]
                    L, U, R, D = min(L, L2), min(U, U2), max(R, R2), max(D, D2)
                    element_cnt += 1

        if check_rect_isLine(L, U, R, D):
            continue
        if element_cnt < 3:
            continue
        # A box swallowing many text spans is likely a layout frame, not a figure.
        text_inside = sum(
            1 for L2, U2, R2, D2 in text_bboxes
            if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2)
        )
        if text_inside >= 10:
            continue
        accepted.append((L, U, R, D))
    return accepted


def _merge_adjacent_svg_bboxes(bboxes) -> list:
    """Merge contained / side-by-side / stacked svg boxes in place; return absorbed flags."""
    absorbed = [False] * len(bboxes)
    for i in range(len(bboxes)):
        L1, U1, R1, D1 = bboxes[i]
        for j in range(i + 1, len(bboxes)):
            L2, U2, R2, D2 = bboxes[j]
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                absorbed[j] = True
                continue
            # Side by side: similar vertical extent and a horizontal gap < 20.
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(L2 - R1) >= 20:
                    continue
            else:
                # Stacked: similar horizontal extent and a vertical gap < 20.
                # (Fixed: the first interval was (L1, R2) instead of (L1, R1).)
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if not (ratio_1 >= 0.7 and ratio_2 >= 0.7):
                    continue
                if abs(U2 - D1) >= 20:
                    continue
            # Accumulate into box i so a second merge does not discard the first.
            L1, U1, R1, D1 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
            bboxes[i] = (L1, U1, R1, D1)
            absorbed[j] = True
    return absorbed


def _layout_figure_bboxes(layout_json, page_rect) -> list:
    """Return page-space bboxes of layout detections with category_id 1 (figure) and score >= 0.3."""
    pageL, pageU, pageR, pageD = page_rect
    # Layout coordinates are divided by these factors to map them onto the page.
    LR_scaleRatio = layout_json['page_info']['width'] / (pageR - pageL)
    UD_scaleRatio = layout_json['page_info']['height'] / (pageD - pageU)
    figures = []
    for det in layout_json['layout_dets']:
        poly = det['poly']
        L = poly[0] / LR_scaleRatio
        U = poly[1] / UD_scaleRatio
        R = poly[2] / LR_scaleRatio
        D = poly[5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if det["category_id"] == 1 and det['score'] >= 0.3:
            figures.append((L, U, R, D))
    return figures


def _unmatched_layout_figures(figures, img_bboxs, svg_bboxs) -> list:
    """Return the layout figures not already explained by an image or svg bbox."""
    matched = [False] * len(figures)
    partial_overlap = [0.0] * len(figures)  # summed overlap ratio from non-matching boxes

    for i, (L1, U1, R1, D1) in enumerate(figures):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for L2, U2, R2, D2 in img_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                matched[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                if s2 / s1 > 0.8:
                    matched[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.8:
                    matched[i] = True
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                    matched[i] = True
                else:
                    partial_overlap[i] += ratio_1

    for i, (L1, U1, R1, D1) in enumerate(figures):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for L2, U2, R2, D2 in svg_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                matched[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                matched[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.7:
                    matched[i] = True
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                    matched[i] = True
                else:
                    partial_overlap[i] += ratio_1

    # A figure mostly covered by several partial overlaps also counts as explained.
    return [
        figure for i, figure in enumerate(figures)
        if not matched[i] and not partial_overlap[i] >= 0.7
    ]


def _merge_figure_bboxes(all_bboxs) -> list:
    """De-duplicate and fuse candidate figure bboxes; return a list of [L, U, R, D] lists."""
    boxes = sorted(all_bboxs, key=lambda b: ((b[2] - b[0]) * (b[3] - b[1]), b[0], b[1]))
    # Collapse exact duplicates first: identical boxes "contain" each other, so
    # the containment pass below would otherwise delete every copy.
    boxes = list(dict.fromkeys(boxes))

    # Pass 1: drop boxes contained in, or mostly covered by, another box.
    swallowed = set()
    for i, box_1 in enumerate(boxes):
        for j, box_2 in enumerate(boxes):
            if i == j:
                continue
            if check_rect1_contains_rect2(*box_2, *box_1):
                swallowed.add(box_1)
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(*box_1, *box_2)
                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                    swallowed.add(box_1)
    boxes = [box for box in boxes if box not in swallowed]

    # Pass 2: replace every overlapping pair by its union.
    fused = set()
    unions = []
    for i, box_1 in enumerate(boxes):
        for j, box_2 in enumerate(boxes):
            if i == j:
                continue
            if _rects_should_merge(box_1, box_2):
                fused.add(box_1)
                fused.add(box_2)
                unions.append((
                    min(box_1[0], box_2[0]), min(box_1[1], box_2[1]),
                    max(box_1[2], box_2[2]), max(box_1[3], box_2[3]),
                ))
    standalone = [list(box) for box in boxes if box not in fused]
    merged = [list(box) for box in set(unions)]

    # Pass 3: unions may overlap each other; fold later ones into earlier ones.
    # Fixed: box i is now read from ``merged[i]`` on every comparison (the old
    # loop used stale coordinates left over from pass 2), and boxes that were
    # already absorbed no longer absorb others, which could silently lose boxes.
    dropped = set()
    for i in range(len(merged)):
        if i in dropped:
            continue
        for j in range(i + 1, len(merged)):
            if j in dropped:
                continue
            if _rects_should_merge(merged[i], merged[j]):
                merged[i] = [
                    min(merged[i][0], merged[j][0]), min(merged[i][1], merged[j][1]),
                    max(merged[i][2], merged[j][2]), max(merged[i][3], merged[j][3]),
                ]
                dropped.add(j)

    return standalone + [box for i, box in enumerate(merged) if i not in dropped]


def parse_images(page_ID: int, page: "fitz.Page", json_from_DocXchain_obj: dict, junk_img_bojids=None):
    """Locate the figure regions on one PDF page.

    Candidates come from three sources: raster images reported by the page,
    groups of vector drawings, and layout detections with category_id 1 that
    neither of the first two already explains. The candidates are then
    de-duplicated and overlapping ones are fused.

    :param page_ID: index of the page in the document. Not used by the
        computation; kept so existing callers keep working.
    :param page: the PyMuPDF page to analyse.
    :param json_from_DocXchain_obj: layout-model result for this page. Reads
        ``page_info.width``, ``page_info.height`` and, per entry of
        ``layout_dets``, ``poly``, ``category_id`` and ``score``.
    :param junk_img_bojids: image xrefs to ignore; ``None`` means none.
    :return: list of ``[L, U, R, D]`` boxes in 72-dpi page pixel coordinates.
    """
    junk_ids = () if junk_img_bojids is None else junk_img_bojids

    # Page extent taken from a 72-dpi rendering, with the origin at (0, 0).
    pix = page.get_pixmap(dpi=72)
    page_rect = (0, 0, int(pix.w), int(pix.h))

    text_bboxes = _text_span_bboxes(page)
    img_bboxs = _drop_conflicting_images(_image_bboxes(page, junk_ids, page_rect))

    drawing_rects = _unique_drawing_rects(page)
    exceed = _page_overflow_scores(drawing_rects, page_rect)
    if sum(1 for score in exceed if score >= 1) >= 2:
        # With two or more border-touching drawings the grouping rules are
        # not trusted; vector drawings are ignored for this page.
        drawing_rects, exceed = [], []
    svg_final_bboxs = _group_svg_rects(drawing_rects, exceed, text_bboxes, page_rect)
    absorbed = _merge_adjacent_svg_bboxes(svg_final_bboxs)

    # Boxes compared against the layout figures: each surviving svg box, both
    # as-is and padded (the drawing rects may be slightly too tight).
    eps_error = 5
    svg_compare_bboxs = []
    for (L, U, R, D), gone in zip(svg_final_bboxs, absorbed):
        if gone:
            continue
        svg_compare_bboxs.append((L, U, R, D))
        svg_compare_bboxs.append((L - eps_error * 2, U - eps_error, R + eps_error * 2, D + eps_error))

    layout_figures = _layout_figure_bboxes(json_from_DocXchain_obj, page_rect)
    figure_only_bboxs = _unmatched_layout_figures(layout_figures, img_bboxs, svg_compare_bboxs)

    img_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    figure_only_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    # NOTE(review): the original concatenated ``svg_final_bboxs`` here even though
    # it had just built and filtered ``svg_final_bboxs_2`` (padded boxes, minus
    # svgs overruled by the layout model). That looks like a typo, but switching
    # changes the returned boxes, so the existing behaviour is kept.
    return _merge_figure_bboxes(img_bboxs + svg_final_bboxs + figure_only_bboxs)
|
647
|
-
|