magic-pdf 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +130 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/boxbase.py +169 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +227 -161
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +105 -15
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/para/para_split_v2.py +26 -27
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +68 -26
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +34 -29
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/magic_model.py
CHANGED
@@ -1,50 +1,38 @@
|
|
1
1
|
import json
|
2
|
-
import math
|
3
2
|
|
4
|
-
from magic_pdf.libs.
|
5
|
-
|
6
|
-
|
7
|
-
from magic_pdf.libs.commons import join_path
|
3
|
+
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
4
|
+
bbox_relative_pos, calculate_iou,
|
5
|
+
calculate_overlap_area_in_bbox1_area_ratio)
|
6
|
+
from magic_pdf.libs.commons import fitz, join_path
|
8
7
|
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
9
|
-
from magic_pdf.libs.ocr_content_type import ContentType
|
10
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
11
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
12
8
|
from magic_pdf.libs.local_math import float_gt
|
13
|
-
from magic_pdf.libs.boxbase import (
|
14
|
-
_is_in,
|
15
|
-
bbox_relative_pos,
|
16
|
-
bbox_distance,
|
17
|
-
_is_part_overlap,
|
18
|
-
calculate_overlap_area_in_bbox1_area_ratio,
|
19
|
-
calculate_iou,
|
20
|
-
)
|
21
9
|
from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
|
10
|
+
from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
|
11
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
12
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
22
13
|
|
23
14
|
CAPATION_OVERLAP_AREA_RATIO = 0.6
|
24
15
|
|
25
16
|
|
26
17
|
class MagicModel:
|
27
|
-
"""
|
28
|
-
每个函数没有得到元素的时候返回空list
|
29
|
-
|
30
|
-
"""
|
18
|
+
"""每个函数没有得到元素的时候返回空list."""
|
31
19
|
|
32
20
|
def __fix_axis(self):
|
33
21
|
for model_page_info in self.__model_list:
|
34
22
|
need_remove_list = []
|
35
|
-
page_no = model_page_info[
|
23
|
+
page_no = model_page_info['page_info']['page_no']
|
36
24
|
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
|
37
25
|
model_page_info, self.__docs[page_no]
|
38
26
|
)
|
39
|
-
layout_dets = model_page_info[
|
27
|
+
layout_dets = model_page_info['layout_dets']
|
40
28
|
for layout_det in layout_dets:
|
41
29
|
|
42
|
-
if layout_det.get(
|
30
|
+
if layout_det.get('bbox') is not None:
|
43
31
|
# 兼容直接输出bbox的模型数据,如paddle
|
44
|
-
x0, y0, x1, y1 = layout_det[
|
32
|
+
x0, y0, x1, y1 = layout_det['bbox']
|
45
33
|
else:
|
46
34
|
# 兼容直接输出poly的模型数据,如xxx
|
47
|
-
x0, y0, _, _, x1, y1, _, _ = layout_det[
|
35
|
+
x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
|
48
36
|
|
49
37
|
bbox = [
|
50
38
|
int(x0 / horizontal_scale_ratio),
|
@@ -52,7 +40,7 @@ class MagicModel:
|
|
52
40
|
int(x1 / horizontal_scale_ratio),
|
53
41
|
int(y1 / vertical_scale_ratio),
|
54
42
|
]
|
55
|
-
layout_det[
|
43
|
+
layout_det['bbox'] = bbox
|
56
44
|
# 删除高度或者宽度小于等于0的spans
|
57
45
|
if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
|
58
46
|
need_remove_list.append(layout_det)
|
@@ -62,9 +50,9 @@ class MagicModel:
|
|
62
50
|
def __fix_by_remove_low_confidence(self):
|
63
51
|
for model_page_info in self.__model_list:
|
64
52
|
need_remove_list = []
|
65
|
-
layout_dets = model_page_info[
|
53
|
+
layout_dets = model_page_info['layout_dets']
|
66
54
|
for layout_det in layout_dets:
|
67
|
-
if layout_det[
|
55
|
+
if layout_det['score'] <= 0.05:
|
68
56
|
need_remove_list.append(layout_det)
|
69
57
|
else:
|
70
58
|
continue
|
@@ -74,12 +62,12 @@ class MagicModel:
|
|
74
62
|
def __fix_by_remove_high_iou_and_low_confidence(self):
|
75
63
|
for model_page_info in self.__model_list:
|
76
64
|
need_remove_list = []
|
77
|
-
layout_dets = model_page_info[
|
65
|
+
layout_dets = model_page_info['layout_dets']
|
78
66
|
for layout_det1 in layout_dets:
|
79
67
|
for layout_det2 in layout_dets:
|
80
68
|
if layout_det1 == layout_det2:
|
81
69
|
continue
|
82
|
-
if layout_det1[
|
70
|
+
if layout_det1['category_id'] in [
|
83
71
|
0,
|
84
72
|
1,
|
85
73
|
2,
|
@@ -90,12 +78,12 @@ class MagicModel:
|
|
90
78
|
7,
|
91
79
|
8,
|
92
80
|
9,
|
93
|
-
] and layout_det2[
|
81
|
+
] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
|
94
82
|
if (
|
95
|
-
calculate_iou(layout_det1[
|
83
|
+
calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
|
96
84
|
> 0.9
|
97
85
|
):
|
98
|
-
if layout_det1[
|
86
|
+
if layout_det1['score'] < layout_det2['score']:
|
99
87
|
layout_det_need_remove = layout_det1
|
100
88
|
else:
|
101
89
|
layout_det_need_remove = layout_det2
|
@@ -118,6 +106,67 @@ class MagicModel:
|
|
118
106
|
self.__fix_by_remove_low_confidence()
|
119
107
|
"""删除高iou(>0.9)数据中置信度较低的那个"""
|
120
108
|
self.__fix_by_remove_high_iou_and_low_confidence()
|
109
|
+
self.__fix_footnote()
|
110
|
+
|
111
|
+
def __fix_footnote(self):
|
112
|
+
# 3: figure, 5: table, 7: footnote
|
113
|
+
for model_page_info in self.__model_list:
|
114
|
+
footnotes = []
|
115
|
+
figures = []
|
116
|
+
tables = []
|
117
|
+
|
118
|
+
for obj in model_page_info['layout_dets']:
|
119
|
+
if obj['category_id'] == 7:
|
120
|
+
footnotes.append(obj)
|
121
|
+
elif obj['category_id'] == 3:
|
122
|
+
figures.append(obj)
|
123
|
+
elif obj['category_id'] == 5:
|
124
|
+
tables.append(obj)
|
125
|
+
if len(footnotes) * len(figures) == 0:
|
126
|
+
continue
|
127
|
+
dis_figure_footnote = {}
|
128
|
+
dis_table_footnote = {}
|
129
|
+
|
130
|
+
for i in range(len(footnotes)):
|
131
|
+
for j in range(len(figures)):
|
132
|
+
pos_flag_count = sum(
|
133
|
+
list(
|
134
|
+
map(
|
135
|
+
lambda x: 1 if x else 0,
|
136
|
+
bbox_relative_pos(
|
137
|
+
footnotes[i]['bbox'], figures[j]['bbox']
|
138
|
+
),
|
139
|
+
)
|
140
|
+
)
|
141
|
+
)
|
142
|
+
if pos_flag_count > 1:
|
143
|
+
continue
|
144
|
+
dis_figure_footnote[i] = min(
|
145
|
+
bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
|
146
|
+
dis_figure_footnote.get(i, float('inf')),
|
147
|
+
)
|
148
|
+
for i in range(len(footnotes)):
|
149
|
+
for j in range(len(tables)):
|
150
|
+
pos_flag_count = sum(
|
151
|
+
list(
|
152
|
+
map(
|
153
|
+
lambda x: 1 if x else 0,
|
154
|
+
bbox_relative_pos(
|
155
|
+
footnotes[i]['bbox'], tables[j]['bbox']
|
156
|
+
),
|
157
|
+
)
|
158
|
+
)
|
159
|
+
)
|
160
|
+
if pos_flag_count > 1:
|
161
|
+
continue
|
162
|
+
|
163
|
+
dis_table_footnote[i] = min(
|
164
|
+
bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
|
165
|
+
dis_table_footnote.get(i, float('inf')),
|
166
|
+
)
|
167
|
+
for i in range(len(footnotes)):
|
168
|
+
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
|
169
|
+
footnotes[i]['category_id'] = CategoryId.ImageFootnote
|
121
170
|
|
122
171
|
def __reduct_overlap(self, bboxes):
|
123
172
|
N = len(bboxes)
|
@@ -126,76 +175,77 @@ class MagicModel:
|
|
126
175
|
for j in range(N):
|
127
176
|
if i == j:
|
128
177
|
continue
|
129
|
-
if _is_in(bboxes[i][
|
178
|
+
if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
|
130
179
|
keep[i] = False
|
131
|
-
|
132
180
|
return [bboxes[i] for i in range(N) if keep[i]]
|
133
181
|
|
134
182
|
def __tie_up_category_by_distance(
|
135
183
|
self, page_no, subject_category_id, object_category_id
|
136
184
|
):
|
137
|
-
"""
|
138
|
-
|
139
|
-
"""
|
185
|
+
"""假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object
|
186
|
+
只能属于一个 subject."""
|
140
187
|
ret = []
|
141
188
|
MAX_DIS_OF_POINT = 10**9 + 7
|
189
|
+
"""
|
190
|
+
subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。
|
191
|
+
筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
192
|
+
再求出筛选出的 subjects 和 object 的最短距离
|
193
|
+
"""
|
142
194
|
|
143
|
-
# subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
144
|
-
# 再求出筛选出的 subjects 和 object 的最短距离!
|
145
195
|
def may_find_other_nearest_bbox(subject_idx, object_idx):
|
146
|
-
ret = float(
|
196
|
+
ret = float('inf')
|
147
197
|
|
148
198
|
x0 = min(
|
149
|
-
all_bboxes[subject_idx][
|
199
|
+
all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
|
150
200
|
)
|
151
201
|
y0 = min(
|
152
|
-
all_bboxes[subject_idx][
|
202
|
+
all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
|
153
203
|
)
|
154
204
|
x1 = max(
|
155
|
-
all_bboxes[subject_idx][
|
205
|
+
all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
|
156
206
|
)
|
157
207
|
y1 = max(
|
158
|
-
all_bboxes[subject_idx][
|
208
|
+
all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
|
159
209
|
)
|
160
210
|
|
161
211
|
object_area = abs(
|
162
|
-
all_bboxes[object_idx][
|
212
|
+
all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
|
163
213
|
) * abs(
|
164
|
-
all_bboxes[object_idx][
|
214
|
+
all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
|
165
215
|
)
|
166
216
|
|
167
217
|
for i in range(len(all_bboxes)):
|
168
218
|
if (
|
169
219
|
i == subject_idx
|
170
|
-
or all_bboxes[i][
|
220
|
+
or all_bboxes[i]['category_id'] != subject_category_id
|
171
221
|
):
|
172
222
|
continue
|
173
|
-
if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i][
|
174
|
-
all_bboxes[i][
|
223
|
+
if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
|
224
|
+
all_bboxes[i]['bbox'], [x0, y0, x1, y1]
|
175
225
|
):
|
176
226
|
|
177
227
|
i_area = abs(
|
178
|
-
all_bboxes[i][
|
179
|
-
) * abs(all_bboxes[i][
|
228
|
+
all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
|
229
|
+
) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
|
180
230
|
if i_area >= object_area:
|
181
|
-
ret = min(float(
|
231
|
+
ret = min(float('inf'), dis[i][object_idx])
|
182
232
|
|
183
233
|
return ret
|
184
234
|
|
185
235
|
def expand_bbbox(idxes):
|
186
|
-
x0s = [all_bboxes[idx][
|
187
|
-
y0s = [all_bboxes[idx][
|
188
|
-
x1s = [all_bboxes[idx][
|
189
|
-
y1s = [all_bboxes[idx][
|
236
|
+
x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
|
237
|
+
y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
|
238
|
+
x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
|
239
|
+
y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
|
190
240
|
return min(x0s), min(y0s), max(x1s), max(y1s)
|
191
241
|
|
192
242
|
subjects = self.__reduct_overlap(
|
193
243
|
list(
|
194
244
|
map(
|
195
|
-
lambda x: {
|
245
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
196
246
|
filter(
|
197
|
-
lambda x: x[
|
198
|
-
self.__model_list[page_no][
|
247
|
+
lambda x: x['category_id'] == subject_category_id,
|
248
|
+
self.__model_list[page_no]['layout_dets'],
|
199
249
|
),
|
200
250
|
)
|
201
251
|
)
|
@@ -204,10 +254,10 @@ class MagicModel:
|
|
204
254
|
objects = self.__reduct_overlap(
|
205
255
|
list(
|
206
256
|
map(
|
207
|
-
lambda x: {
|
257
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
208
258
|
filter(
|
209
|
-
lambda x: x[
|
210
|
-
self.__model_list[page_no][
|
259
|
+
lambda x: x['category_id'] == object_category_id,
|
260
|
+
self.__model_list[page_no]['layout_dets'],
|
211
261
|
),
|
212
262
|
)
|
213
263
|
)
|
@@ -215,7 +265,7 @@ class MagicModel:
|
|
215
265
|
subject_object_relation_map = {}
|
216
266
|
|
217
267
|
subjects.sort(
|
218
|
-
key=lambda x: x[
|
268
|
+
key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
|
219
269
|
) # get the distance !
|
220
270
|
|
221
271
|
all_bboxes = []
|
@@ -223,18 +273,18 @@ class MagicModel:
|
|
223
273
|
for v in subjects:
|
224
274
|
all_bboxes.append(
|
225
275
|
{
|
226
|
-
|
227
|
-
|
228
|
-
|
276
|
+
'category_id': subject_category_id,
|
277
|
+
'bbox': v['bbox'],
|
278
|
+
'score': v['score'],
|
229
279
|
}
|
230
280
|
)
|
231
281
|
|
232
282
|
for v in objects:
|
233
283
|
all_bboxes.append(
|
234
284
|
{
|
235
|
-
|
236
|
-
|
237
|
-
|
285
|
+
'category_id': object_category_id,
|
286
|
+
'bbox': v['bbox'],
|
287
|
+
'score': v['score'],
|
238
288
|
}
|
239
289
|
)
|
240
290
|
|
@@ -244,18 +294,18 @@ class MagicModel:
|
|
244
294
|
for i in range(N):
|
245
295
|
for j in range(i):
|
246
296
|
if (
|
247
|
-
all_bboxes[i][
|
248
|
-
and all_bboxes[j][
|
297
|
+
all_bboxes[i]['category_id'] == subject_category_id
|
298
|
+
and all_bboxes[j]['category_id'] == subject_category_id
|
249
299
|
):
|
250
300
|
continue
|
251
301
|
|
252
|
-
dis[i][j] = bbox_distance(all_bboxes[i][
|
302
|
+
dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
|
253
303
|
dis[j][i] = dis[i][j]
|
254
304
|
|
255
305
|
used = set()
|
256
306
|
for i in range(N):
|
257
307
|
# 求第 i 个 subject 所关联的 object
|
258
|
-
if all_bboxes[i][
|
308
|
+
if all_bboxes[i]['category_id'] != subject_category_id:
|
259
309
|
continue
|
260
310
|
seen = set()
|
261
311
|
candidates = []
|
@@ -267,7 +317,7 @@ class MagicModel:
|
|
267
317
|
map(
|
268
318
|
lambda x: 1 if x else 0,
|
269
319
|
bbox_relative_pos(
|
270
|
-
all_bboxes[i][
|
320
|
+
all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
|
271
321
|
),
|
272
322
|
)
|
273
323
|
)
|
@@ -275,25 +325,28 @@ class MagicModel:
|
|
275
325
|
if pos_flag_count > 1:
|
276
326
|
continue
|
277
327
|
if (
|
278
|
-
all_bboxes[j][
|
328
|
+
all_bboxes[j]['category_id'] != object_category_id
|
279
329
|
or j in used
|
280
330
|
or dis[i][j] == MAX_DIS_OF_POINT
|
281
331
|
):
|
282
332
|
continue
|
283
333
|
left, right, _, _ = bbox_relative_pos(
|
284
|
-
all_bboxes[i][
|
334
|
+
all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
|
285
335
|
) # 由 pos_flag_count 相关逻辑保证本段逻辑准确性
|
286
336
|
if left or right:
|
287
|
-
one_way_dis = all_bboxes[i][
|
337
|
+
one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
|
288
338
|
else:
|
289
|
-
one_way_dis = all_bboxes[i][
|
339
|
+
one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
|
290
340
|
if dis[i][j] > one_way_dis:
|
291
341
|
continue
|
292
342
|
arr.append((dis[i][j], j))
|
293
343
|
|
294
344
|
arr.sort(key=lambda x: x[0])
|
295
345
|
if len(arr) > 0:
|
296
|
-
|
346
|
+
"""
|
347
|
+
bug: 离该subject 最近的 object 可能跨越了其它的 subject。
|
348
|
+
比如 [this subect] [some sbuject] [the nearest object of subject]
|
349
|
+
"""
|
297
350
|
if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
|
298
351
|
|
299
352
|
candidates.append(arr[0][1])
|
@@ -308,7 +361,7 @@ class MagicModel:
|
|
308
361
|
map(
|
309
362
|
lambda x: 1 if x else 0,
|
310
363
|
bbox_relative_pos(
|
311
|
-
all_bboxes[j][
|
364
|
+
all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
|
312
365
|
),
|
313
366
|
)
|
314
367
|
)
|
@@ -318,7 +371,7 @@ class MagicModel:
|
|
318
371
|
continue
|
319
372
|
|
320
373
|
if (
|
321
|
-
all_bboxes[k][
|
374
|
+
all_bboxes[k]['category_id'] != object_category_id
|
322
375
|
or k in used
|
323
376
|
or k in seen
|
324
377
|
or dis[j][k] == MAX_DIS_OF_POINT
|
@@ -327,17 +380,19 @@ class MagicModel:
|
|
327
380
|
continue
|
328
381
|
|
329
382
|
is_nearest = True
|
330
|
-
for
|
331
|
-
if
|
383
|
+
for ni in range(i + 1, N):
|
384
|
+
if ni in (j, k) or ni in used or ni in seen:
|
332
385
|
continue
|
333
386
|
|
334
|
-
if not float_gt(dis[
|
387
|
+
if not float_gt(dis[ni][k], dis[j][k]):
|
335
388
|
is_nearest = False
|
336
389
|
break
|
337
390
|
|
338
391
|
if is_nearest:
|
339
392
|
nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
|
340
|
-
n_dis = bbox_distance(
|
393
|
+
n_dis = bbox_distance(
|
394
|
+
all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
|
395
|
+
)
|
341
396
|
if float_gt(dis[i][j], n_dis):
|
342
397
|
continue
|
343
398
|
tmp.append(k)
|
@@ -350,7 +405,7 @@ class MagicModel:
|
|
350
405
|
# 已经获取到某个 figure 下所有的最靠近的 captions,以及最靠近这些 captions 的 captions 。
|
351
406
|
# 先扩一下 bbox,
|
352
407
|
ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
|
353
|
-
ix0, iy0, ix1, iy1 = all_bboxes[i][
|
408
|
+
ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
|
354
409
|
|
355
410
|
# 分成了 4 个截取空间,需要计算落在每个截取空间下 objects 合并后占据的矩形面积
|
356
411
|
caption_poses = [
|
@@ -366,17 +421,17 @@ class MagicModel:
|
|
366
421
|
for idx in seen:
|
367
422
|
if (
|
368
423
|
calculate_overlap_area_in_bbox1_area_ratio(
|
369
|
-
all_bboxes[idx][
|
424
|
+
all_bboxes[idx]['bbox'], bbox
|
370
425
|
)
|
371
426
|
> CAPATION_OVERLAP_AREA_RATIO
|
372
427
|
):
|
373
428
|
embed_arr.append(idx)
|
374
429
|
|
375
430
|
if len(embed_arr) > 0:
|
376
|
-
embed_x0 = min([all_bboxes[idx][
|
377
|
-
embed_y0 = min([all_bboxes[idx][
|
378
|
-
embed_x1 = max([all_bboxes[idx][
|
379
|
-
embed_y1 = max([all_bboxes[idx][
|
431
|
+
embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
|
432
|
+
embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
|
433
|
+
embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
|
434
|
+
embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
|
380
435
|
caption_areas.append(
|
381
436
|
int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
|
382
437
|
)
|
@@ -391,7 +446,7 @@ class MagicModel:
|
|
391
446
|
for j in seen:
|
392
447
|
if (
|
393
448
|
calculate_overlap_area_in_bbox1_area_ratio(
|
394
|
-
all_bboxes[j][
|
449
|
+
all_bboxes[j]['bbox'], caption_bbox
|
395
450
|
)
|
396
451
|
> CAPATION_OVERLAP_AREA_RATIO
|
397
452
|
):
|
@@ -400,30 +455,30 @@ class MagicModel:
|
|
400
455
|
|
401
456
|
for i in sorted(subject_object_relation_map.keys()):
|
402
457
|
result = {
|
403
|
-
|
404
|
-
|
405
|
-
|
458
|
+
'subject_body': all_bboxes[i]['bbox'],
|
459
|
+
'all': all_bboxes[i]['bbox'],
|
460
|
+
'score': all_bboxes[i]['score'],
|
406
461
|
}
|
407
462
|
|
408
463
|
if len(subject_object_relation_map[i]) > 0:
|
409
464
|
x0 = min(
|
410
|
-
[all_bboxes[j][
|
465
|
+
[all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
|
411
466
|
)
|
412
467
|
y0 = min(
|
413
|
-
[all_bboxes[j][
|
468
|
+
[all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
|
414
469
|
)
|
415
470
|
x1 = max(
|
416
|
-
[all_bboxes[j][
|
471
|
+
[all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
|
417
472
|
)
|
418
473
|
y1 = max(
|
419
|
-
[all_bboxes[j][
|
474
|
+
[all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
|
420
475
|
)
|
421
|
-
result[
|
422
|
-
result[
|
423
|
-
min(x0, all_bboxes[i][
|
424
|
-
min(y0, all_bboxes[i][
|
425
|
-
max(x1, all_bboxes[i][
|
426
|
-
max(y1, all_bboxes[i][
|
476
|
+
result['object_body'] = [x0, y0, x1, y1]
|
477
|
+
result['all'] = [
|
478
|
+
min(x0, all_bboxes[i]['bbox'][0]),
|
479
|
+
min(y0, all_bboxes[i]['bbox'][1]),
|
480
|
+
max(x1, all_bboxes[i]['bbox'][2]),
|
481
|
+
max(y1, all_bboxes[i]['bbox'][3]),
|
427
482
|
]
|
428
483
|
ret.append(result)
|
429
484
|
|
@@ -432,7 +487,7 @@ class MagicModel:
|
|
432
487
|
for i in subject_object_relation_map.keys():
|
433
488
|
for j in subject_object_relation_map[i]:
|
434
489
|
total_subject_object_dis += bbox_distance(
|
435
|
-
all_bboxes[i][
|
490
|
+
all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
|
436
491
|
)
|
437
492
|
|
438
493
|
# 计算未匹配的 subject 和 object 的距离(非精确版)
|
@@ -444,12 +499,12 @@ class MagicModel:
|
|
444
499
|
]
|
445
500
|
)
|
446
501
|
for i in range(N):
|
447
|
-
if all_bboxes[i][
|
502
|
+
if all_bboxes[i]['category_id'] != object_category_id or i in used:
|
448
503
|
continue
|
449
504
|
candidates = []
|
450
505
|
for j in range(N):
|
451
506
|
if (
|
452
|
-
all_bboxes[j][
|
507
|
+
all_bboxes[j]['category_id'] != subject_category_id
|
453
508
|
or j in with_caption_subject
|
454
509
|
):
|
455
510
|
continue
|
@@ -461,18 +516,28 @@ class MagicModel:
|
|
461
516
|
return ret, total_subject_object_dis
|
462
517
|
|
463
518
|
def get_imgs(self, page_no: int):
|
464
|
-
|
465
|
-
|
519
|
+
with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
|
520
|
+
with_footnotes, _ = self.__tie_up_category_by_distance(
|
521
|
+
page_no, 3, CategoryId.ImageFootnote
|
466
522
|
)
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
523
|
+
ret = []
|
524
|
+
N, M = len(with_captions), len(with_footnotes)
|
525
|
+
assert N == M
|
526
|
+
for i in range(N):
|
527
|
+
record = {
|
528
|
+
'score': with_captions[i]['score'],
|
529
|
+
'img_caption_bbox': with_captions[i].get('object_body', None),
|
530
|
+
'img_body_bbox': with_captions[i]['subject_body'],
|
531
|
+
'img_footnote_bbox': with_footnotes[i].get('object_body', None),
|
473
532
|
}
|
474
|
-
|
475
|
-
|
533
|
+
|
534
|
+
x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
|
535
|
+
y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
|
536
|
+
x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
|
537
|
+
y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
|
538
|
+
record['bbox'] = [x0, y0, x1, y1]
|
539
|
+
ret.append(record)
|
540
|
+
return ret
|
476
541
|
|
477
542
|
def get_tables(
|
478
543
|
self, page_no: int
|
@@ -484,26 +549,26 @@ class MagicModel:
|
|
484
549
|
assert N == M
|
485
550
|
for i in range(N):
|
486
551
|
record = {
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
552
|
+
'score': with_captions[i]['score'],
|
553
|
+
'table_caption_bbox': with_captions[i].get('object_body', None),
|
554
|
+
'table_body_bbox': with_captions[i]['subject_body'],
|
555
|
+
'table_footnote_bbox': with_footnotes[i].get('object_body', None),
|
491
556
|
}
|
492
557
|
|
493
|
-
x0 = min(with_captions[i][
|
494
|
-
y0 = min(with_captions[i][
|
495
|
-
x1 = max(with_captions[i][
|
496
|
-
y1 = max(with_captions[i][
|
497
|
-
record[
|
558
|
+
x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
|
559
|
+
y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
|
560
|
+
x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
|
561
|
+
y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
|
562
|
+
record['bbox'] = [x0, y0, x1, y1]
|
498
563
|
ret.append(record)
|
499
564
|
return ret
|
500
565
|
|
501
566
|
def get_equations(self, page_no: int) -> list: # 有坐标,也有字
|
502
567
|
inline_equations = self.__get_blocks_by_type(
|
503
|
-
ModelBlockTypeEnum.EMBEDDING.value, page_no, [
|
568
|
+
ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
|
504
569
|
)
|
505
570
|
interline_equations = self.__get_blocks_by_type(
|
506
|
-
ModelBlockTypeEnum.ISOLATED.value, page_no, [
|
571
|
+
ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
|
507
572
|
)
|
508
573
|
interline_equations_blocks = self.__get_blocks_by_type(
|
509
574
|
ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
|
@@ -525,17 +590,18 @@ class MagicModel:
|
|
525
590
|
def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标
|
526
591
|
text_spans = []
|
527
592
|
model_page_info = self.__model_list[page_no]
|
528
|
-
layout_dets = model_page_info[
|
593
|
+
layout_dets = model_page_info['layout_dets']
|
529
594
|
for layout_det in layout_dets:
|
530
|
-
if layout_det[
|
595
|
+
if layout_det['category_id'] == '15':
|
531
596
|
span = {
|
532
|
-
|
533
|
-
|
597
|
+
'bbox': layout_det['bbox'],
|
598
|
+
'content': layout_det['text'],
|
534
599
|
}
|
535
600
|
text_spans.append(span)
|
536
601
|
return text_spans
|
537
602
|
|
538
603
|
def get_all_spans(self, page_no: int) -> list:
|
604
|
+
|
539
605
|
def remove_duplicate_spans(spans):
|
540
606
|
new_spans = []
|
541
607
|
for span in spans:
|
@@ -545,7 +611,7 @@ class MagicModel:
|
|
545
611
|
|
546
612
|
all_spans = []
|
547
613
|
model_page_info = self.__model_list[page_no]
|
548
|
-
layout_dets = model_page_info[
|
614
|
+
layout_dets = model_page_info['layout_dets']
|
549
615
|
allow_category_id_list = [3, 5, 13, 14, 15]
|
550
616
|
"""当成span拼接的"""
|
551
617
|
# 3: 'image', # 图片
|
@@ -554,11 +620,11 @@ class MagicModel:
|
|
554
620
|
# 14: 'interline_equation', # 行间公式
|
555
621
|
# 15: 'text', # ocr识别文本
|
556
622
|
for layout_det in layout_dets:
|
557
|
-
category_id = layout_det[
|
623
|
+
category_id = layout_det['category_id']
|
558
624
|
if category_id in allow_category_id_list:
|
559
|
-
span = {
|
625
|
+
span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
|
560
626
|
if category_id == 3:
|
561
|
-
span[
|
627
|
+
span['type'] = ContentType.Image
|
562
628
|
elif category_id == 5:
|
563
629
|
# 获取table模型结果
|
564
630
|
latex = layout_det.get("latex", None)
|
@@ -569,14 +635,14 @@ class MagicModel:
|
|
569
635
|
span["html"] = html
|
570
636
|
span["type"] = ContentType.Table
|
571
637
|
elif category_id == 13:
|
572
|
-
span[
|
573
|
-
span[
|
638
|
+
span['content'] = layout_det['latex']
|
639
|
+
span['type'] = ContentType.InlineEquation
|
574
640
|
elif category_id == 14:
|
575
|
-
span[
|
576
|
-
span[
|
641
|
+
span['content'] = layout_det['latex']
|
642
|
+
span['type'] = ContentType.InterlineEquation
|
577
643
|
elif category_id == 15:
|
578
|
-
span[
|
579
|
-
span[
|
644
|
+
span['content'] = layout_det['text']
|
645
|
+
span['type'] = ContentType.Text
|
580
646
|
all_spans.append(span)
|
581
647
|
return remove_duplicate_spans(all_spans)
|
582
648
|
|
@@ -593,19 +659,19 @@ class MagicModel:
|
|
593
659
|
) -> list:
|
594
660
|
blocks = []
|
595
661
|
for page_dict in self.__model_list:
|
596
|
-
layout_dets = page_dict.get(
|
597
|
-
page_info = page_dict.get(
|
598
|
-
page_number = page_info.get(
|
662
|
+
layout_dets = page_dict.get('layout_dets', [])
|
663
|
+
page_info = page_dict.get('page_info', {})
|
664
|
+
page_number = page_info.get('page_no', -1)
|
599
665
|
if page_no != page_number:
|
600
666
|
continue
|
601
667
|
for item in layout_dets:
|
602
|
-
category_id = item.get(
|
603
|
-
bbox = item.get(
|
668
|
+
category_id = item.get('category_id', -1)
|
669
|
+
bbox = item.get('bbox', None)
|
604
670
|
|
605
671
|
if category_id == type:
|
606
672
|
block = {
|
607
|
-
|
608
|
-
|
673
|
+
'bbox': bbox,
|
674
|
+
'score': item.get('score'),
|
609
675
|
}
|
610
676
|
for col in extra_col:
|
611
677
|
block[col] = item.get(col, None)
|
@@ -616,28 +682,28 @@ class MagicModel:
|
|
616
682
|
return self.__model_list[page_no]
|
617
683
|
|
618
684
|
|
619
|
-
if __name__ ==
|
620
|
-
drw = DiskReaderWriter(r
|
685
|
+
if __name__ == '__main__':
|
686
|
+
drw = DiskReaderWriter(r'D:/project/20231108code-clean')
|
621
687
|
if 0:
|
622
|
-
pdf_file_path = r
|
623
|
-
model_file_path = r
|
688
|
+
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
689
|
+
model_file_path = r'linshixuqiu\19983-00_new.json'
|
624
690
|
pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
|
625
691
|
model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
|
626
692
|
model_list = json.loads(model_json_txt)
|
627
|
-
write_path = r
|
628
|
-
img_bucket_path =
|
693
|
+
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
694
|
+
img_bucket_path = 'imgs'
|
629
695
|
img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
|
630
|
-
pdf_docs = fitz.open(
|
696
|
+
pdf_docs = fitz.open('pdf', pdf_bytes)
|
631
697
|
magic_model = MagicModel(model_list, pdf_docs)
|
632
698
|
|
633
699
|
if 1:
|
634
700
|
model_list = json.loads(
|
635
|
-
drw.read(
|
701
|
+
drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
|
636
702
|
)
|
637
703
|
pdf_bytes = drw.read(
|
638
|
-
|
704
|
+
'/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
|
639
705
|
)
|
640
|
-
pdf_docs = fitz.open(
|
706
|
+
pdf_docs = fitz.open('pdf', pdf_bytes)
|
641
707
|
magic_model = MagicModel(model_list, pdf_docs)
|
642
708
|
for i in range(7):
|
643
709
|
print(magic_model.get_imgs(i))
|