magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +130 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/boxbase.py +188 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +283 -166
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +105 -15
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/para/para_split_v2.py +26 -27
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
magic_pdf/model/magic_model.py
CHANGED
@@ -1,50 +1,40 @@
|
|
1
1
|
import json
|
2
|
-
import math
|
3
2
|
|
4
|
-
from magic_pdf.libs.
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
4
|
+
bbox_relative_pos, box_area, calculate_iou,
|
5
|
+
calculate_overlap_area_in_bbox1_area_ratio,
|
6
|
+
get_overlap_area)
|
7
|
+
from magic_pdf.libs.commons import fitz, join_path
|
8
8
|
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
9
|
-
from magic_pdf.libs.ocr_content_type import ContentType
|
10
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
11
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
12
9
|
from magic_pdf.libs.local_math import float_gt
|
13
|
-
from magic_pdf.libs.boxbase import (
|
14
|
-
_is_in,
|
15
|
-
bbox_relative_pos,
|
16
|
-
bbox_distance,
|
17
|
-
_is_part_overlap,
|
18
|
-
calculate_overlap_area_in_bbox1_area_ratio,
|
19
|
-
calculate_iou,
|
20
|
-
)
|
21
10
|
from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
|
11
|
+
from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
|
12
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
22
14
|
|
23
15
|
CAPATION_OVERLAP_AREA_RATIO = 0.6
|
16
|
+
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
|
24
17
|
|
25
18
|
|
26
19
|
class MagicModel:
|
27
|
-
"""
|
28
|
-
每个函数没有得到元素的时候返回空list
|
29
|
-
|
30
|
-
"""
|
20
|
+
"""每个函数没有得到元素的时候返回空list."""
|
31
21
|
|
32
22
|
def __fix_axis(self):
|
33
23
|
for model_page_info in self.__model_list:
|
34
24
|
need_remove_list = []
|
35
|
-
page_no = model_page_info[
|
25
|
+
page_no = model_page_info['page_info']['page_no']
|
36
26
|
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
|
37
27
|
model_page_info, self.__docs[page_no]
|
38
28
|
)
|
39
|
-
layout_dets = model_page_info[
|
29
|
+
layout_dets = model_page_info['layout_dets']
|
40
30
|
for layout_det in layout_dets:
|
41
31
|
|
42
|
-
if layout_det.get(
|
32
|
+
if layout_det.get('bbox') is not None:
|
43
33
|
# 兼容直接输出bbox的模型数据,如paddle
|
44
|
-
x0, y0, x1, y1 = layout_det[
|
34
|
+
x0, y0, x1, y1 = layout_det['bbox']
|
45
35
|
else:
|
46
36
|
# 兼容直接输出poly的模型数据,如xxx
|
47
|
-
x0, y0, _, _, x1, y1, _, _ = layout_det[
|
37
|
+
x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
|
48
38
|
|
49
39
|
bbox = [
|
50
40
|
int(x0 / horizontal_scale_ratio),
|
@@ -52,7 +42,7 @@ class MagicModel:
|
|
52
42
|
int(x1 / horizontal_scale_ratio),
|
53
43
|
int(y1 / vertical_scale_ratio),
|
54
44
|
]
|
55
|
-
layout_det[
|
45
|
+
layout_det['bbox'] = bbox
|
56
46
|
# 删除高度或者宽度小于等于0的spans
|
57
47
|
if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
|
58
48
|
need_remove_list.append(layout_det)
|
@@ -62,9 +52,9 @@ class MagicModel:
|
|
62
52
|
def __fix_by_remove_low_confidence(self):
|
63
53
|
for model_page_info in self.__model_list:
|
64
54
|
need_remove_list = []
|
65
|
-
layout_dets = model_page_info[
|
55
|
+
layout_dets = model_page_info['layout_dets']
|
66
56
|
for layout_det in layout_dets:
|
67
|
-
if layout_det[
|
57
|
+
if layout_det['score'] <= 0.05:
|
68
58
|
need_remove_list.append(layout_det)
|
69
59
|
else:
|
70
60
|
continue
|
@@ -74,12 +64,12 @@ class MagicModel:
|
|
74
64
|
def __fix_by_remove_high_iou_and_low_confidence(self):
|
75
65
|
for model_page_info in self.__model_list:
|
76
66
|
need_remove_list = []
|
77
|
-
layout_dets = model_page_info[
|
67
|
+
layout_dets = model_page_info['layout_dets']
|
78
68
|
for layout_det1 in layout_dets:
|
79
69
|
for layout_det2 in layout_dets:
|
80
70
|
if layout_det1 == layout_det2:
|
81
71
|
continue
|
82
|
-
if layout_det1[
|
72
|
+
if layout_det1['category_id'] in [
|
83
73
|
0,
|
84
74
|
1,
|
85
75
|
2,
|
@@ -90,12 +80,12 @@ class MagicModel:
|
|
90
80
|
7,
|
91
81
|
8,
|
92
82
|
9,
|
93
|
-
] and layout_det2[
|
83
|
+
] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
|
94
84
|
if (
|
95
|
-
calculate_iou(layout_det1[
|
85
|
+
calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
|
96
86
|
> 0.9
|
97
87
|
):
|
98
|
-
if layout_det1[
|
88
|
+
if layout_det1['score'] < layout_det2['score']:
|
99
89
|
layout_det_need_remove = layout_det1
|
100
90
|
else:
|
101
91
|
layout_det_need_remove = layout_det2
|
@@ -118,6 +108,69 @@ class MagicModel:
|
|
118
108
|
self.__fix_by_remove_low_confidence()
|
119
109
|
"""删除高iou(>0.9)数据中置信度较低的那个"""
|
120
110
|
self.__fix_by_remove_high_iou_and_low_confidence()
|
111
|
+
self.__fix_footnote()
|
112
|
+
|
113
|
+
def __fix_footnote(self):
|
114
|
+
# 3: figure, 5: table, 7: footnote
|
115
|
+
for model_page_info in self.__model_list:
|
116
|
+
footnotes = []
|
117
|
+
figures = []
|
118
|
+
tables = []
|
119
|
+
|
120
|
+
for obj in model_page_info['layout_dets']:
|
121
|
+
if obj['category_id'] == 7:
|
122
|
+
footnotes.append(obj)
|
123
|
+
elif obj['category_id'] == 3:
|
124
|
+
figures.append(obj)
|
125
|
+
elif obj['category_id'] == 5:
|
126
|
+
tables.append(obj)
|
127
|
+
if len(footnotes) * len(figures) == 0:
|
128
|
+
continue
|
129
|
+
dis_figure_footnote = {}
|
130
|
+
dis_table_footnote = {}
|
131
|
+
|
132
|
+
for i in range(len(footnotes)):
|
133
|
+
for j in range(len(figures)):
|
134
|
+
pos_flag_count = sum(
|
135
|
+
list(
|
136
|
+
map(
|
137
|
+
lambda x: 1 if x else 0,
|
138
|
+
bbox_relative_pos(
|
139
|
+
footnotes[i]['bbox'], figures[j]['bbox']
|
140
|
+
),
|
141
|
+
)
|
142
|
+
)
|
143
|
+
)
|
144
|
+
if pos_flag_count > 1:
|
145
|
+
continue
|
146
|
+
dis_figure_footnote[i] = min(
|
147
|
+
bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
|
148
|
+
dis_figure_footnote.get(i, float('inf')),
|
149
|
+
)
|
150
|
+
for i in range(len(footnotes)):
|
151
|
+
for j in range(len(tables)):
|
152
|
+
pos_flag_count = sum(
|
153
|
+
list(
|
154
|
+
map(
|
155
|
+
lambda x: 1 if x else 0,
|
156
|
+
bbox_relative_pos(
|
157
|
+
footnotes[i]['bbox'], tables[j]['bbox']
|
158
|
+
),
|
159
|
+
)
|
160
|
+
)
|
161
|
+
)
|
162
|
+
if pos_flag_count > 1:
|
163
|
+
continue
|
164
|
+
|
165
|
+
dis_table_footnote[i] = min(
|
166
|
+
bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
|
167
|
+
dis_table_footnote.get(i, float('inf')),
|
168
|
+
)
|
169
|
+
for i in range(len(footnotes)):
|
170
|
+
if i not in dis_figure_footnote:
|
171
|
+
continue
|
172
|
+
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
|
173
|
+
footnotes[i]['category_id'] = CategoryId.ImageFootnote
|
121
174
|
|
122
175
|
def __reduct_overlap(self, bboxes):
|
123
176
|
N = len(bboxes)
|
@@ -126,76 +179,115 @@ class MagicModel:
|
|
126
179
|
for j in range(N):
|
127
180
|
if i == j:
|
128
181
|
continue
|
129
|
-
if _is_in(bboxes[i][
|
182
|
+
if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
|
130
183
|
keep[i] = False
|
131
|
-
|
132
184
|
return [bboxes[i] for i in range(N) if keep[i]]
|
133
185
|
|
134
186
|
def __tie_up_category_by_distance(
|
135
187
|
self, page_no, subject_category_id, object_category_id
|
136
188
|
):
|
137
|
-
"""
|
138
|
-
|
139
|
-
"""
|
189
|
+
"""假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object
|
190
|
+
只能属于一个 subject."""
|
140
191
|
ret = []
|
141
192
|
MAX_DIS_OF_POINT = 10**9 + 7
|
193
|
+
"""
|
194
|
+
subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。
|
195
|
+
筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
196
|
+
再求出筛选出的 subjects 和 object 的最短距离
|
197
|
+
"""
|
198
|
+
def search_overlap_between_boxes(
|
199
|
+
subject_idx, object_idx
|
200
|
+
):
|
201
|
+
idxes = [subject_idx, object_idx]
|
202
|
+
x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
|
203
|
+
y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
|
204
|
+
x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
|
205
|
+
y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
|
206
|
+
|
207
|
+
merged_bbox = [
|
208
|
+
min(x0s),
|
209
|
+
min(y0s),
|
210
|
+
max(x1s),
|
211
|
+
max(y1s),
|
212
|
+
]
|
213
|
+
ratio = 0
|
214
|
+
|
215
|
+
other_objects = list(
|
216
|
+
map(
|
217
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
218
|
+
filter(
|
219
|
+
lambda x: x['category_id']
|
220
|
+
not in (object_category_id, subject_category_id),
|
221
|
+
self.__model_list[page_no]['layout_dets'],
|
222
|
+
),
|
223
|
+
)
|
224
|
+
)
|
225
|
+
for other_object in other_objects:
|
226
|
+
ratio = max(
|
227
|
+
ratio,
|
228
|
+
get_overlap_area(
|
229
|
+
merged_bbox, other_object['bbox']
|
230
|
+
) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
|
231
|
+
)
|
232
|
+
if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
|
233
|
+
break
|
234
|
+
|
235
|
+
return ratio
|
142
236
|
|
143
|
-
# subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
144
|
-
# 再求出筛选出的 subjects 和 object 的最短距离!
|
145
237
|
def may_find_other_nearest_bbox(subject_idx, object_idx):
|
146
|
-
ret = float(
|
238
|
+
ret = float('inf')
|
147
239
|
|
148
240
|
x0 = min(
|
149
|
-
all_bboxes[subject_idx][
|
241
|
+
all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
|
150
242
|
)
|
151
243
|
y0 = min(
|
152
|
-
all_bboxes[subject_idx][
|
244
|
+
all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
|
153
245
|
)
|
154
246
|
x1 = max(
|
155
|
-
all_bboxes[subject_idx][
|
247
|
+
all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
|
156
248
|
)
|
157
249
|
y1 = max(
|
158
|
-
all_bboxes[subject_idx][
|
250
|
+
all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
|
159
251
|
)
|
160
252
|
|
161
253
|
object_area = abs(
|
162
|
-
all_bboxes[object_idx][
|
254
|
+
all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
|
163
255
|
) * abs(
|
164
|
-
all_bboxes[object_idx][
|
256
|
+
all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
|
165
257
|
)
|
166
258
|
|
167
259
|
for i in range(len(all_bboxes)):
|
168
260
|
if (
|
169
261
|
i == subject_idx
|
170
|
-
or all_bboxes[i][
|
262
|
+
or all_bboxes[i]['category_id'] != subject_category_id
|
171
263
|
):
|
172
264
|
continue
|
173
|
-
if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i][
|
174
|
-
all_bboxes[i][
|
265
|
+
if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
|
266
|
+
all_bboxes[i]['bbox'], [x0, y0, x1, y1]
|
175
267
|
):
|
176
268
|
|
177
269
|
i_area = abs(
|
178
|
-
all_bboxes[i][
|
179
|
-
) * abs(all_bboxes[i][
|
270
|
+
all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
|
271
|
+
) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
|
180
272
|
if i_area >= object_area:
|
181
|
-
ret = min(float(
|
273
|
+
ret = min(float('inf'), dis[i][object_idx])
|
182
274
|
|
183
275
|
return ret
|
184
276
|
|
185
277
|
def expand_bbbox(idxes):
|
186
|
-
x0s = [all_bboxes[idx][
|
187
|
-
y0s = [all_bboxes[idx][
|
188
|
-
x1s = [all_bboxes[idx][
|
189
|
-
y1s = [all_bboxes[idx][
|
278
|
+
x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
|
279
|
+
y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
|
280
|
+
x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
|
281
|
+
y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
|
190
282
|
return min(x0s), min(y0s), max(x1s), max(y1s)
|
191
283
|
|
192
284
|
subjects = self.__reduct_overlap(
|
193
285
|
list(
|
194
286
|
map(
|
195
|
-
lambda x: {
|
287
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
196
288
|
filter(
|
197
|
-
lambda x: x[
|
198
|
-
self.__model_list[page_no][
|
289
|
+
lambda x: x['category_id'] == subject_category_id,
|
290
|
+
self.__model_list[page_no]['layout_dets'],
|
199
291
|
),
|
200
292
|
)
|
201
293
|
)
|
@@ -204,10 +296,10 @@ class MagicModel:
|
|
204
296
|
objects = self.__reduct_overlap(
|
205
297
|
list(
|
206
298
|
map(
|
207
|
-
lambda x: {
|
299
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
208
300
|
filter(
|
209
|
-
lambda x: x[
|
210
|
-
self.__model_list[page_no][
|
301
|
+
lambda x: x['category_id'] == object_category_id,
|
302
|
+
self.__model_list[page_no]['layout_dets'],
|
211
303
|
),
|
212
304
|
)
|
213
305
|
)
|
@@ -215,7 +307,7 @@ class MagicModel:
|
|
215
307
|
subject_object_relation_map = {}
|
216
308
|
|
217
309
|
subjects.sort(
|
218
|
-
key=lambda x: x[
|
310
|
+
key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
|
219
311
|
) # get the distance !
|
220
312
|
|
221
313
|
all_bboxes = []
|
@@ -223,18 +315,18 @@ class MagicModel:
|
|
223
315
|
for v in subjects:
|
224
316
|
all_bboxes.append(
|
225
317
|
{
|
226
|
-
|
227
|
-
|
228
|
-
|
318
|
+
'category_id': subject_category_id,
|
319
|
+
'bbox': v['bbox'],
|
320
|
+
'score': v['score'],
|
229
321
|
}
|
230
322
|
)
|
231
323
|
|
232
324
|
for v in objects:
|
233
325
|
all_bboxes.append(
|
234
326
|
{
|
235
|
-
|
236
|
-
|
237
|
-
|
327
|
+
'category_id': object_category_id,
|
328
|
+
'bbox': v['bbox'],
|
329
|
+
'score': v['score'],
|
238
330
|
}
|
239
331
|
)
|
240
332
|
|
@@ -244,18 +336,27 @@ class MagicModel:
|
|
244
336
|
for i in range(N):
|
245
337
|
for j in range(i):
|
246
338
|
if (
|
247
|
-
all_bboxes[i][
|
248
|
-
and all_bboxes[j][
|
339
|
+
all_bboxes[i]['category_id'] == subject_category_id
|
340
|
+
and all_bboxes[j]['category_id'] == subject_category_id
|
249
341
|
):
|
250
342
|
continue
|
251
343
|
|
252
|
-
|
344
|
+
subject_idx, object_idx = i, j
|
345
|
+
if all_bboxes[j]['category_id'] == subject_category_id:
|
346
|
+
subject_idx, object_idx = j, i
|
347
|
+
|
348
|
+
if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
|
349
|
+
dis[i][j] = float('inf')
|
350
|
+
dis[j][i] = dis[i][j]
|
351
|
+
continue
|
352
|
+
|
353
|
+
dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
|
253
354
|
dis[j][i] = dis[i][j]
|
254
355
|
|
255
356
|
used = set()
|
256
357
|
for i in range(N):
|
257
358
|
# 求第 i 个 subject 所关联的 object
|
258
|
-
if all_bboxes[i][
|
359
|
+
if all_bboxes[i]['category_id'] != subject_category_id:
|
259
360
|
continue
|
260
361
|
seen = set()
|
261
362
|
candidates = []
|
@@ -267,7 +368,7 @@ class MagicModel:
|
|
267
368
|
map(
|
268
369
|
lambda x: 1 if x else 0,
|
269
370
|
bbox_relative_pos(
|
270
|
-
all_bboxes[i][
|
371
|
+
all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
|
271
372
|
),
|
272
373
|
)
|
273
374
|
)
|
@@ -275,25 +376,28 @@ class MagicModel:
|
|
275
376
|
if pos_flag_count > 1:
|
276
377
|
continue
|
277
378
|
if (
|
278
|
-
all_bboxes[j][
|
379
|
+
all_bboxes[j]['category_id'] != object_category_id
|
279
380
|
or j in used
|
280
381
|
or dis[i][j] == MAX_DIS_OF_POINT
|
281
382
|
):
|
282
383
|
continue
|
283
384
|
left, right, _, _ = bbox_relative_pos(
|
284
|
-
all_bboxes[i][
|
385
|
+
all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
|
285
386
|
) # 由 pos_flag_count 相关逻辑保证本段逻辑准确性
|
286
387
|
if left or right:
|
287
|
-
one_way_dis = all_bboxes[i][
|
388
|
+
one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
|
288
389
|
else:
|
289
|
-
one_way_dis = all_bboxes[i][
|
390
|
+
one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
|
290
391
|
if dis[i][j] > one_way_dis:
|
291
392
|
continue
|
292
393
|
arr.append((dis[i][j], j))
|
293
394
|
|
294
395
|
arr.sort(key=lambda x: x[0])
|
295
396
|
if len(arr) > 0:
|
296
|
-
|
397
|
+
"""
|
398
|
+
bug: 离该subject 最近的 object 可能跨越了其它的 subject。
|
399
|
+
比如 [this subect] [some sbuject] [the nearest object of subject]
|
400
|
+
"""
|
297
401
|
if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
|
298
402
|
|
299
403
|
candidates.append(arr[0][1])
|
@@ -308,7 +412,7 @@ class MagicModel:
|
|
308
412
|
map(
|
309
413
|
lambda x: 1 if x else 0,
|
310
414
|
bbox_relative_pos(
|
311
|
-
all_bboxes[j][
|
415
|
+
all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
|
312
416
|
),
|
313
417
|
)
|
314
418
|
)
|
@@ -318,7 +422,7 @@ class MagicModel:
|
|
318
422
|
continue
|
319
423
|
|
320
424
|
if (
|
321
|
-
all_bboxes[k][
|
425
|
+
all_bboxes[k]['category_id'] != object_category_id
|
322
426
|
or k in used
|
323
427
|
or k in seen
|
324
428
|
or dis[j][k] == MAX_DIS_OF_POINT
|
@@ -327,17 +431,19 @@ class MagicModel:
|
|
327
431
|
continue
|
328
432
|
|
329
433
|
is_nearest = True
|
330
|
-
for
|
331
|
-
if
|
434
|
+
for ni in range(i + 1, N):
|
435
|
+
if ni in (j, k) or ni in used or ni in seen:
|
332
436
|
continue
|
333
437
|
|
334
|
-
if not float_gt(dis[
|
438
|
+
if not float_gt(dis[ni][k], dis[j][k]):
|
335
439
|
is_nearest = False
|
336
440
|
break
|
337
441
|
|
338
442
|
if is_nearest:
|
339
443
|
nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
|
340
|
-
n_dis = bbox_distance(
|
444
|
+
n_dis = bbox_distance(
|
445
|
+
all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
|
446
|
+
)
|
341
447
|
if float_gt(dis[i][j], n_dis):
|
342
448
|
continue
|
343
449
|
tmp.append(k)
|
@@ -350,7 +456,7 @@ class MagicModel:
|
|
350
456
|
# 已经获取到某个 figure 下所有的最靠近的 captions,以及最靠近这些 captions 的 captions 。
|
351
457
|
# 先扩一下 bbox,
|
352
458
|
ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
|
353
|
-
ix0, iy0, ix1, iy1 = all_bboxes[i][
|
459
|
+
ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
|
354
460
|
|
355
461
|
# 分成了 4 个截取空间,需要计算落在每个截取空间下 objects 合并后占据的矩形面积
|
356
462
|
caption_poses = [
|
@@ -366,17 +472,17 @@ class MagicModel:
|
|
366
472
|
for idx in seen:
|
367
473
|
if (
|
368
474
|
calculate_overlap_area_in_bbox1_area_ratio(
|
369
|
-
all_bboxes[idx][
|
475
|
+
all_bboxes[idx]['bbox'], bbox
|
370
476
|
)
|
371
477
|
> CAPATION_OVERLAP_AREA_RATIO
|
372
478
|
):
|
373
479
|
embed_arr.append(idx)
|
374
480
|
|
375
481
|
if len(embed_arr) > 0:
|
376
|
-
embed_x0 = min([all_bboxes[idx][
|
377
|
-
embed_y0 = min([all_bboxes[idx][
|
378
|
-
embed_x1 = max([all_bboxes[idx][
|
379
|
-
embed_y1 = max([all_bboxes[idx][
|
482
|
+
embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
|
483
|
+
embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
|
484
|
+
embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
|
485
|
+
embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
|
380
486
|
caption_areas.append(
|
381
487
|
int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
|
382
488
|
)
|
@@ -391,7 +497,7 @@ class MagicModel:
|
|
391
497
|
for j in seen:
|
392
498
|
if (
|
393
499
|
calculate_overlap_area_in_bbox1_area_ratio(
|
394
|
-
all_bboxes[j][
|
500
|
+
all_bboxes[j]['bbox'], caption_bbox
|
395
501
|
)
|
396
502
|
> CAPATION_OVERLAP_AREA_RATIO
|
397
503
|
):
|
@@ -400,30 +506,30 @@ class MagicModel:
|
|
400
506
|
|
401
507
|
for i in sorted(subject_object_relation_map.keys()):
|
402
508
|
result = {
|
403
|
-
|
404
|
-
|
405
|
-
|
509
|
+
'subject_body': all_bboxes[i]['bbox'],
|
510
|
+
'all': all_bboxes[i]['bbox'],
|
511
|
+
'score': all_bboxes[i]['score'],
|
406
512
|
}
|
407
513
|
|
408
514
|
if len(subject_object_relation_map[i]) > 0:
|
409
515
|
x0 = min(
|
410
|
-
[all_bboxes[j][
|
516
|
+
[all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
|
411
517
|
)
|
412
518
|
y0 = min(
|
413
|
-
[all_bboxes[j][
|
519
|
+
[all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
|
414
520
|
)
|
415
521
|
x1 = max(
|
416
|
-
[all_bboxes[j][
|
522
|
+
[all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
|
417
523
|
)
|
418
524
|
y1 = max(
|
419
|
-
[all_bboxes[j][
|
525
|
+
[all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
|
420
526
|
)
|
421
|
-
result[
|
422
|
-
result[
|
423
|
-
min(x0, all_bboxes[i][
|
424
|
-
min(y0, all_bboxes[i][
|
425
|
-
max(x1, all_bboxes[i][
|
426
|
-
max(y1, all_bboxes[i][
|
527
|
+
result['object_body'] = [x0, y0, x1, y1]
|
528
|
+
result['all'] = [
|
529
|
+
min(x0, all_bboxes[i]['bbox'][0]),
|
530
|
+
min(y0, all_bboxes[i]['bbox'][1]),
|
531
|
+
max(x1, all_bboxes[i]['bbox'][2]),
|
532
|
+
max(y1, all_bboxes[i]['bbox'][3]),
|
427
533
|
]
|
428
534
|
ret.append(result)
|
429
535
|
|
@@ -432,7 +538,7 @@ class MagicModel:
|
|
432
538
|
for i in subject_object_relation_map.keys():
|
433
539
|
for j in subject_object_relation_map[i]:
|
434
540
|
total_subject_object_dis += bbox_distance(
|
435
|
-
all_bboxes[i][
|
541
|
+
all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
|
436
542
|
)
|
437
543
|
|
438
544
|
# 计算未匹配的 subject 和 object 的距离(非精确版)
|
@@ -444,12 +550,12 @@ class MagicModel:
|
|
444
550
|
]
|
445
551
|
)
|
446
552
|
for i in range(N):
|
447
|
-
if all_bboxes[i][
|
553
|
+
if all_bboxes[i]['category_id'] != object_category_id or i in used:
|
448
554
|
continue
|
449
555
|
candidates = []
|
450
556
|
for j in range(N):
|
451
557
|
if (
|
452
|
-
all_bboxes[j][
|
558
|
+
all_bboxes[j]['category_id'] != subject_category_id
|
453
559
|
or j in with_caption_subject
|
454
560
|
):
|
455
561
|
continue
|
@@ -461,18 +567,28 @@ class MagicModel:
|
|
461
567
|
return ret, total_subject_object_dis
|
462
568
|
|
463
569
|
def get_imgs(self, page_no: int):
|
464
|
-
|
465
|
-
|
570
|
+
with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
|
571
|
+
with_footnotes, _ = self.__tie_up_category_by_distance(
|
572
|
+
page_no, 3, CategoryId.ImageFootnote
|
466
573
|
)
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
574
|
+
ret = []
|
575
|
+
N, M = len(with_captions), len(with_footnotes)
|
576
|
+
assert N == M
|
577
|
+
for i in range(N):
|
578
|
+
record = {
|
579
|
+
'score': with_captions[i]['score'],
|
580
|
+
'img_caption_bbox': with_captions[i].get('object_body', None),
|
581
|
+
'img_body_bbox': with_captions[i]['subject_body'],
|
582
|
+
'img_footnote_bbox': with_footnotes[i].get('object_body', None),
|
473
583
|
}
|
474
|
-
|
475
|
-
|
584
|
+
|
585
|
+
x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
|
586
|
+
y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
|
587
|
+
x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
|
588
|
+
y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
|
589
|
+
record['bbox'] = [x0, y0, x1, y1]
|
590
|
+
ret.append(record)
|
591
|
+
return ret
|
476
592
|
|
477
593
|
def get_tables(
|
478
594
|
self, page_no: int
|
@@ -484,26 +600,26 @@ class MagicModel:
|
|
484
600
|
assert N == M
|
485
601
|
for i in range(N):
|
486
602
|
record = {
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
603
|
+
'score': with_captions[i]['score'],
|
604
|
+
'table_caption_bbox': with_captions[i].get('object_body', None),
|
605
|
+
'table_body_bbox': with_captions[i]['subject_body'],
|
606
|
+
'table_footnote_bbox': with_footnotes[i].get('object_body', None),
|
491
607
|
}
|
492
608
|
|
493
|
-
x0 = min(with_captions[i][
|
494
|
-
y0 = min(with_captions[i][
|
495
|
-
x1 = max(with_captions[i][
|
496
|
-
y1 = max(with_captions[i][
|
497
|
-
record[
|
609
|
+
x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
|
610
|
+
y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
|
611
|
+
x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
|
612
|
+
y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
|
613
|
+
record['bbox'] = [x0, y0, x1, y1]
|
498
614
|
ret.append(record)
|
499
615
|
return ret
|
500
616
|
|
501
617
|
def get_equations(self, page_no: int) -> list: # 有坐标,也有字
|
502
618
|
inline_equations = self.__get_blocks_by_type(
|
503
|
-
ModelBlockTypeEnum.EMBEDDING.value, page_no, [
|
619
|
+
ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
|
504
620
|
)
|
505
621
|
interline_equations = self.__get_blocks_by_type(
|
506
|
-
ModelBlockTypeEnum.ISOLATED.value, page_no, [
|
622
|
+
ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
|
507
623
|
)
|
508
624
|
interline_equations_blocks = self.__get_blocks_by_type(
|
509
625
|
ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
|
@@ -525,17 +641,18 @@ class MagicModel:
|
|
525
641
|
def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标
|
526
642
|
text_spans = []
|
527
643
|
model_page_info = self.__model_list[page_no]
|
528
|
-
layout_dets = model_page_info[
|
644
|
+
layout_dets = model_page_info['layout_dets']
|
529
645
|
for layout_det in layout_dets:
|
530
|
-
if layout_det[
|
646
|
+
if layout_det['category_id'] == '15':
|
531
647
|
span = {
|
532
|
-
|
533
|
-
|
648
|
+
'bbox': layout_det['bbox'],
|
649
|
+
'content': layout_det['text'],
|
534
650
|
}
|
535
651
|
text_spans.append(span)
|
536
652
|
return text_spans
|
537
653
|
|
538
654
|
def get_all_spans(self, page_no: int) -> list:
|
655
|
+
|
539
656
|
def remove_duplicate_spans(spans):
|
540
657
|
new_spans = []
|
541
658
|
for span in spans:
|
@@ -545,7 +662,7 @@ class MagicModel:
|
|
545
662
|
|
546
663
|
all_spans = []
|
547
664
|
model_page_info = self.__model_list[page_no]
|
548
|
-
layout_dets = model_page_info[
|
665
|
+
layout_dets = model_page_info['layout_dets']
|
549
666
|
allow_category_id_list = [3, 5, 13, 14, 15]
|
550
667
|
"""当成span拼接的"""
|
551
668
|
# 3: 'image', # 图片
|
@@ -554,29 +671,29 @@ class MagicModel:
|
|
554
671
|
# 14: 'interline_equation', # 行间公式
|
555
672
|
# 15: 'text', # ocr识别文本
|
556
673
|
for layout_det in layout_dets:
|
557
|
-
category_id = layout_det[
|
674
|
+
category_id = layout_det['category_id']
|
558
675
|
if category_id in allow_category_id_list:
|
559
|
-
span = {
|
676
|
+
span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
|
560
677
|
if category_id == 3:
|
561
|
-
span[
|
678
|
+
span['type'] = ContentType.Image
|
562
679
|
elif category_id == 5:
|
563
680
|
# 获取table模型结果
|
564
|
-
latex = layout_det.get(
|
565
|
-
html = layout_det.get(
|
681
|
+
latex = layout_det.get('latex', None)
|
682
|
+
html = layout_det.get('html', None)
|
566
683
|
if latex:
|
567
|
-
span[
|
684
|
+
span['latex'] = latex
|
568
685
|
elif html:
|
569
|
-
span[
|
570
|
-
span[
|
686
|
+
span['html'] = html
|
687
|
+
span['type'] = ContentType.Table
|
571
688
|
elif category_id == 13:
|
572
|
-
span[
|
573
|
-
span[
|
689
|
+
span['content'] = layout_det['latex']
|
690
|
+
span['type'] = ContentType.InlineEquation
|
574
691
|
elif category_id == 14:
|
575
|
-
span[
|
576
|
-
span[
|
692
|
+
span['content'] = layout_det['latex']
|
693
|
+
span['type'] = ContentType.InterlineEquation
|
577
694
|
elif category_id == 15:
|
578
|
-
span[
|
579
|
-
span[
|
695
|
+
span['content'] = layout_det['text']
|
696
|
+
span['type'] = ContentType.Text
|
580
697
|
all_spans.append(span)
|
581
698
|
return remove_duplicate_spans(all_spans)
|
582
699
|
|
@@ -593,19 +710,19 @@ class MagicModel:
|
|
593
710
|
) -> list:
|
594
711
|
blocks = []
|
595
712
|
for page_dict in self.__model_list:
|
596
|
-
layout_dets = page_dict.get(
|
597
|
-
page_info = page_dict.get(
|
598
|
-
page_number = page_info.get(
|
713
|
+
layout_dets = page_dict.get('layout_dets', [])
|
714
|
+
page_info = page_dict.get('page_info', {})
|
715
|
+
page_number = page_info.get('page_no', -1)
|
599
716
|
if page_no != page_number:
|
600
717
|
continue
|
601
718
|
for item in layout_dets:
|
602
|
-
category_id = item.get(
|
603
|
-
bbox = item.get(
|
719
|
+
category_id = item.get('category_id', -1)
|
720
|
+
bbox = item.get('bbox', None)
|
604
721
|
|
605
722
|
if category_id == type:
|
606
723
|
block = {
|
607
|
-
|
608
|
-
|
724
|
+
'bbox': bbox,
|
725
|
+
'score': item.get('score'),
|
609
726
|
}
|
610
727
|
for col in extra_col:
|
611
728
|
block[col] = item.get(col, None)
|
@@ -616,28 +733,28 @@ class MagicModel:
|
|
616
733
|
return self.__model_list[page_no]
|
617
734
|
|
618
735
|
|
619
|
-
if __name__ ==
|
620
|
-
drw = DiskReaderWriter(r
|
736
|
+
if __name__ == '__main__':
|
737
|
+
drw = DiskReaderWriter(r'D:/project/20231108code-clean')
|
621
738
|
if 0:
|
622
|
-
pdf_file_path = r
|
623
|
-
model_file_path = r
|
739
|
+
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
740
|
+
model_file_path = r'linshixuqiu\19983-00_new.json'
|
624
741
|
pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
|
625
742
|
model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
|
626
743
|
model_list = json.loads(model_json_txt)
|
627
|
-
write_path = r
|
628
|
-
img_bucket_path =
|
744
|
+
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
745
|
+
img_bucket_path = 'imgs'
|
629
746
|
img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
|
630
|
-
pdf_docs = fitz.open(
|
747
|
+
pdf_docs = fitz.open('pdf', pdf_bytes)
|
631
748
|
magic_model = MagicModel(model_list, pdf_docs)
|
632
749
|
|
633
750
|
if 1:
|
634
751
|
model_list = json.loads(
|
635
|
-
drw.read(
|
752
|
+
drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
|
636
753
|
)
|
637
754
|
pdf_bytes = drw.read(
|
638
|
-
|
755
|
+
'/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
|
639
756
|
)
|
640
|
-
pdf_docs = fitz.open(
|
757
|
+
pdf_docs = fitz.open('pdf', pdf_bytes)
|
641
758
|
magic_model = MagicModel(model_list, pdf_docs)
|
642
759
|
for i in range(7):
|
643
760
|
print(magic_model.get_imgs(i))
|