magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,50 +1,40 @@
1
1
  import json
2
- import math
3
2
 
4
- from magic_pdf.libs.commons import fitz
5
- from loguru import logger
6
-
7
- from magic_pdf.libs.commons import join_path
3
+ from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
4
+ bbox_relative_pos, box_area, calculate_iou,
5
+ calculate_overlap_area_in_bbox1_area_ratio,
6
+ get_overlap_area)
7
+ from magic_pdf.libs.commons import fitz, join_path
8
8
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
9
- from magic_pdf.libs.ocr_content_type import ContentType
10
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
11
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
12
9
  from magic_pdf.libs.local_math import float_gt
13
- from magic_pdf.libs.boxbase import (
14
- _is_in,
15
- bbox_relative_pos,
16
- bbox_distance,
17
- _is_part_overlap,
18
- calculate_overlap_area_in_bbox1_area_ratio,
19
- calculate_iou,
20
- )
21
10
  from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
11
+ from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
12
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
22
14
 
23
15
  CAPATION_OVERLAP_AREA_RATIO = 0.6
16
+ MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
24
17
 
25
18
 
26
19
  class MagicModel:
27
- """
28
- 每个函数没有得到元素的时候返回空list
29
-
30
- """
20
+ """每个函数没有得到元素的时候返回空list."""
31
21
 
32
22
  def __fix_axis(self):
33
23
  for model_page_info in self.__model_list:
34
24
  need_remove_list = []
35
- page_no = model_page_info["page_info"]["page_no"]
25
+ page_no = model_page_info['page_info']['page_no']
36
26
  horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
37
27
  model_page_info, self.__docs[page_no]
38
28
  )
39
- layout_dets = model_page_info["layout_dets"]
29
+ layout_dets = model_page_info['layout_dets']
40
30
  for layout_det in layout_dets:
41
31
 
42
- if layout_det.get("bbox") is not None:
32
+ if layout_det.get('bbox') is not None:
43
33
  # 兼容直接输出bbox的模型数据,如paddle
44
- x0, y0, x1, y1 = layout_det["bbox"]
34
+ x0, y0, x1, y1 = layout_det['bbox']
45
35
  else:
46
36
  # 兼容直接输出poly的模型数据,如xxx
47
- x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
37
+ x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
48
38
 
49
39
  bbox = [
50
40
  int(x0 / horizontal_scale_ratio),
@@ -52,7 +42,7 @@ class MagicModel:
52
42
  int(x1 / horizontal_scale_ratio),
53
43
  int(y1 / vertical_scale_ratio),
54
44
  ]
55
- layout_det["bbox"] = bbox
45
+ layout_det['bbox'] = bbox
56
46
  # 删除高度或者宽度小于等于0的spans
57
47
  if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
58
48
  need_remove_list.append(layout_det)
@@ -62,9 +52,9 @@ class MagicModel:
62
52
  def __fix_by_remove_low_confidence(self):
63
53
  for model_page_info in self.__model_list:
64
54
  need_remove_list = []
65
- layout_dets = model_page_info["layout_dets"]
55
+ layout_dets = model_page_info['layout_dets']
66
56
  for layout_det in layout_dets:
67
- if layout_det["score"] <= 0.05:
57
+ if layout_det['score'] <= 0.05:
68
58
  need_remove_list.append(layout_det)
69
59
  else:
70
60
  continue
@@ -74,12 +64,12 @@ class MagicModel:
74
64
  def __fix_by_remove_high_iou_and_low_confidence(self):
75
65
  for model_page_info in self.__model_list:
76
66
  need_remove_list = []
77
- layout_dets = model_page_info["layout_dets"]
67
+ layout_dets = model_page_info['layout_dets']
78
68
  for layout_det1 in layout_dets:
79
69
  for layout_det2 in layout_dets:
80
70
  if layout_det1 == layout_det2:
81
71
  continue
82
- if layout_det1["category_id"] in [
72
+ if layout_det1['category_id'] in [
83
73
  0,
84
74
  1,
85
75
  2,
@@ -90,12 +80,12 @@ class MagicModel:
90
80
  7,
91
81
  8,
92
82
  9,
93
- ] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
83
+ ] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
94
84
  if (
95
- calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
85
+ calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
96
86
  > 0.9
97
87
  ):
98
- if layout_det1["score"] < layout_det2["score"]:
88
+ if layout_det1['score'] < layout_det2['score']:
99
89
  layout_det_need_remove = layout_det1
100
90
  else:
101
91
  layout_det_need_remove = layout_det2
@@ -118,6 +108,69 @@ class MagicModel:
118
108
  self.__fix_by_remove_low_confidence()
119
109
  """删除高iou(>0.9)数据中置信度较低的那个"""
120
110
  self.__fix_by_remove_high_iou_and_low_confidence()
111
+ self.__fix_footnote()
112
+
113
+ def __fix_footnote(self):
114
+ # 3: figure, 5: table, 7: footnote
115
+ for model_page_info in self.__model_list:
116
+ footnotes = []
117
+ figures = []
118
+ tables = []
119
+
120
+ for obj in model_page_info['layout_dets']:
121
+ if obj['category_id'] == 7:
122
+ footnotes.append(obj)
123
+ elif obj['category_id'] == 3:
124
+ figures.append(obj)
125
+ elif obj['category_id'] == 5:
126
+ tables.append(obj)
127
+ if len(footnotes) * len(figures) == 0:
128
+ continue
129
+ dis_figure_footnote = {}
130
+ dis_table_footnote = {}
131
+
132
+ for i in range(len(footnotes)):
133
+ for j in range(len(figures)):
134
+ pos_flag_count = sum(
135
+ list(
136
+ map(
137
+ lambda x: 1 if x else 0,
138
+ bbox_relative_pos(
139
+ footnotes[i]['bbox'], figures[j]['bbox']
140
+ ),
141
+ )
142
+ )
143
+ )
144
+ if pos_flag_count > 1:
145
+ continue
146
+ dis_figure_footnote[i] = min(
147
+ bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
148
+ dis_figure_footnote.get(i, float('inf')),
149
+ )
150
+ for i in range(len(footnotes)):
151
+ for j in range(len(tables)):
152
+ pos_flag_count = sum(
153
+ list(
154
+ map(
155
+ lambda x: 1 if x else 0,
156
+ bbox_relative_pos(
157
+ footnotes[i]['bbox'], tables[j]['bbox']
158
+ ),
159
+ )
160
+ )
161
+ )
162
+ if pos_flag_count > 1:
163
+ continue
164
+
165
+ dis_table_footnote[i] = min(
166
+ bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
167
+ dis_table_footnote.get(i, float('inf')),
168
+ )
169
+ for i in range(len(footnotes)):
170
+ if i not in dis_figure_footnote:
171
+ continue
172
+ if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
173
+ footnotes[i]['category_id'] = CategoryId.ImageFootnote
121
174
 
122
175
  def __reduct_overlap(self, bboxes):
123
176
  N = len(bboxes)
@@ -126,76 +179,115 @@ class MagicModel:
126
179
  for j in range(N):
127
180
  if i == j:
128
181
  continue
129
- if _is_in(bboxes[i]["bbox"], bboxes[j]["bbox"]):
182
+ if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
130
183
  keep[i] = False
131
-
132
184
  return [bboxes[i] for i in range(N) if keep[i]]
133
185
 
134
186
  def __tie_up_category_by_distance(
135
187
  self, page_no, subject_category_id, object_category_id
136
188
  ):
137
- """
138
- 假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object 只能属于一个 subject
139
- """
189
+ """假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object
190
+ 只能属于一个 subject."""
140
191
  ret = []
141
192
  MAX_DIS_OF_POINT = 10**9 + 7
193
+ """
194
+ subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。
195
+ 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
196
+ 再求出筛选出的 subjects 和 object 的最短距离
197
+ """
198
+ def search_overlap_between_boxes(
199
+ subject_idx, object_idx
200
+ ):
201
+ idxes = [subject_idx, object_idx]
202
+ x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
203
+ y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
204
+ x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
205
+ y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
206
+
207
+ merged_bbox = [
208
+ min(x0s),
209
+ min(y0s),
210
+ max(x1s),
211
+ max(y1s),
212
+ ]
213
+ ratio = 0
214
+
215
+ other_objects = list(
216
+ map(
217
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
218
+ filter(
219
+ lambda x: x['category_id']
220
+ not in (object_category_id, subject_category_id),
221
+ self.__model_list[page_no]['layout_dets'],
222
+ ),
223
+ )
224
+ )
225
+ for other_object in other_objects:
226
+ ratio = max(
227
+ ratio,
228
+ get_overlap_area(
229
+ merged_bbox, other_object['bbox']
230
+ ) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
231
+ )
232
+ if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
233
+ break
234
+
235
+ return ratio
142
236
 
143
- # subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
144
- # 再求出筛选出的 subjects 和 object 的最短距离!
145
237
  def may_find_other_nearest_bbox(subject_idx, object_idx):
146
- ret = float("inf")
238
+ ret = float('inf')
147
239
 
148
240
  x0 = min(
149
- all_bboxes[subject_idx]["bbox"][0], all_bboxes[object_idx]["bbox"][0]
241
+ all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
150
242
  )
151
243
  y0 = min(
152
- all_bboxes[subject_idx]["bbox"][1], all_bboxes[object_idx]["bbox"][1]
244
+ all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
153
245
  )
154
246
  x1 = max(
155
- all_bboxes[subject_idx]["bbox"][2], all_bboxes[object_idx]["bbox"][2]
247
+ all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
156
248
  )
157
249
  y1 = max(
158
- all_bboxes[subject_idx]["bbox"][3], all_bboxes[object_idx]["bbox"][3]
250
+ all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
159
251
  )
160
252
 
161
253
  object_area = abs(
162
- all_bboxes[object_idx]["bbox"][2] - all_bboxes[object_idx]["bbox"][0]
254
+ all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
163
255
  ) * abs(
164
- all_bboxes[object_idx]["bbox"][3] - all_bboxes[object_idx]["bbox"][1]
256
+ all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
165
257
  )
166
258
 
167
259
  for i in range(len(all_bboxes)):
168
260
  if (
169
261
  i == subject_idx
170
- or all_bboxes[i]["category_id"] != subject_category_id
262
+ or all_bboxes[i]['category_id'] != subject_category_id
171
263
  ):
172
264
  continue
173
- if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]["bbox"]) or _is_in(
174
- all_bboxes[i]["bbox"], [x0, y0, x1, y1]
265
+ if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
266
+ all_bboxes[i]['bbox'], [x0, y0, x1, y1]
175
267
  ):
176
268
 
177
269
  i_area = abs(
178
- all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
179
- ) * abs(all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1])
270
+ all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
271
+ ) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
180
272
  if i_area >= object_area:
181
- ret = min(float("inf"), dis[i][object_idx])
273
+ ret = min(float('inf'), dis[i][object_idx])
182
274
 
183
275
  return ret
184
276
 
185
277
  def expand_bbbox(idxes):
186
- x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
187
- y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
188
- x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
189
- y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
278
+ x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
279
+ y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
280
+ x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
281
+ y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
190
282
  return min(x0s), min(y0s), max(x1s), max(y1s)
191
283
 
192
284
  subjects = self.__reduct_overlap(
193
285
  list(
194
286
  map(
195
- lambda x: {"bbox": x["bbox"], "score": x["score"]},
287
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
196
288
  filter(
197
- lambda x: x["category_id"] == subject_category_id,
198
- self.__model_list[page_no]["layout_dets"],
289
+ lambda x: x['category_id'] == subject_category_id,
290
+ self.__model_list[page_no]['layout_dets'],
199
291
  ),
200
292
  )
201
293
  )
@@ -204,10 +296,10 @@ class MagicModel:
204
296
  objects = self.__reduct_overlap(
205
297
  list(
206
298
  map(
207
- lambda x: {"bbox": x["bbox"], "score": x["score"]},
299
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
208
300
  filter(
209
- lambda x: x["category_id"] == object_category_id,
210
- self.__model_list[page_no]["layout_dets"],
301
+ lambda x: x['category_id'] == object_category_id,
302
+ self.__model_list[page_no]['layout_dets'],
211
303
  ),
212
304
  )
213
305
  )
@@ -215,7 +307,7 @@ class MagicModel:
215
307
  subject_object_relation_map = {}
216
308
 
217
309
  subjects.sort(
218
- key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2
310
+ key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
219
311
  ) # get the distance !
220
312
 
221
313
  all_bboxes = []
@@ -223,18 +315,18 @@ class MagicModel:
223
315
  for v in subjects:
224
316
  all_bboxes.append(
225
317
  {
226
- "category_id": subject_category_id,
227
- "bbox": v["bbox"],
228
- "score": v["score"],
318
+ 'category_id': subject_category_id,
319
+ 'bbox': v['bbox'],
320
+ 'score': v['score'],
229
321
  }
230
322
  )
231
323
 
232
324
  for v in objects:
233
325
  all_bboxes.append(
234
326
  {
235
- "category_id": object_category_id,
236
- "bbox": v["bbox"],
237
- "score": v["score"],
327
+ 'category_id': object_category_id,
328
+ 'bbox': v['bbox'],
329
+ 'score': v['score'],
238
330
  }
239
331
  )
240
332
 
@@ -244,18 +336,27 @@ class MagicModel:
244
336
  for i in range(N):
245
337
  for j in range(i):
246
338
  if (
247
- all_bboxes[i]["category_id"] == subject_category_id
248
- and all_bboxes[j]["category_id"] == subject_category_id
339
+ all_bboxes[i]['category_id'] == subject_category_id
340
+ and all_bboxes[j]['category_id'] == subject_category_id
249
341
  ):
250
342
  continue
251
343
 
252
- dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
344
+ subject_idx, object_idx = i, j
345
+ if all_bboxes[j]['category_id'] == subject_category_id:
346
+ subject_idx, object_idx = j, i
347
+
348
+ if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
349
+ dis[i][j] = float('inf')
350
+ dis[j][i] = dis[i][j]
351
+ continue
352
+
353
+ dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
253
354
  dis[j][i] = dis[i][j]
254
355
 
255
356
  used = set()
256
357
  for i in range(N):
257
358
  # 求第 i 个 subject 所关联的 object
258
- if all_bboxes[i]["category_id"] != subject_category_id:
359
+ if all_bboxes[i]['category_id'] != subject_category_id:
259
360
  continue
260
361
  seen = set()
261
362
  candidates = []
@@ -267,7 +368,7 @@ class MagicModel:
267
368
  map(
268
369
  lambda x: 1 if x else 0,
269
370
  bbox_relative_pos(
270
- all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
371
+ all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
271
372
  ),
272
373
  )
273
374
  )
@@ -275,25 +376,28 @@ class MagicModel:
275
376
  if pos_flag_count > 1:
276
377
  continue
277
378
  if (
278
- all_bboxes[j]["category_id"] != object_category_id
379
+ all_bboxes[j]['category_id'] != object_category_id
279
380
  or j in used
280
381
  or dis[i][j] == MAX_DIS_OF_POINT
281
382
  ):
282
383
  continue
283
384
  left, right, _, _ = bbox_relative_pos(
284
- all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
385
+ all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
285
386
  ) # 由 pos_flag_count 相关逻辑保证本段逻辑准确性
286
387
  if left or right:
287
- one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
388
+ one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
288
389
  else:
289
- one_way_dis = all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1]
390
+ one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
290
391
  if dis[i][j] > one_way_dis:
291
392
  continue
292
393
  arr.append((dis[i][j], j))
293
394
 
294
395
  arr.sort(key=lambda x: x[0])
295
396
  if len(arr) > 0:
296
- # bug: 离该subject 最近的 object 可能跨越了其它的 subject 。比如 [this subect] [some sbuject] [the nearest objec of subject]
397
+ """
398
+ bug: 离该subject 最近的 object 可能跨越了其它的 subject。
399
+ 比如 [this subect] [some sbuject] [the nearest object of subject]
400
+ """
297
401
  if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
298
402
 
299
403
  candidates.append(arr[0][1])
@@ -308,7 +412,7 @@ class MagicModel:
308
412
  map(
309
413
  lambda x: 1 if x else 0,
310
414
  bbox_relative_pos(
311
- all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]
415
+ all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
312
416
  ),
313
417
  )
314
418
  )
@@ -318,7 +422,7 @@ class MagicModel:
318
422
  continue
319
423
 
320
424
  if (
321
- all_bboxes[k]["category_id"] != object_category_id
425
+ all_bboxes[k]['category_id'] != object_category_id
322
426
  or k in used
323
427
  or k in seen
324
428
  or dis[j][k] == MAX_DIS_OF_POINT
@@ -327,17 +431,19 @@ class MagicModel:
327
431
  continue
328
432
 
329
433
  is_nearest = True
330
- for l in range(i + 1, N):
331
- if l in (j, k) or l in used or l in seen:
434
+ for ni in range(i + 1, N):
435
+ if ni in (j, k) or ni in used or ni in seen:
332
436
  continue
333
437
 
334
- if not float_gt(dis[l][k], dis[j][k]):
438
+ if not float_gt(dis[ni][k], dis[j][k]):
335
439
  is_nearest = False
336
440
  break
337
441
 
338
442
  if is_nearest:
339
443
  nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
340
- n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
444
+ n_dis = bbox_distance(
445
+ all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
446
+ )
341
447
  if float_gt(dis[i][j], n_dis):
342
448
  continue
343
449
  tmp.append(k)
@@ -350,7 +456,7 @@ class MagicModel:
350
456
  # 已经获取到某个 figure 下所有的最靠近的 captions,以及最靠近这些 captions 的 captions 。
351
457
  # 先扩一下 bbox,
352
458
  ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
353
- ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]
459
+ ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
354
460
 
355
461
  # 分成了 4 个截取空间,需要计算落在每个截取空间下 objects 合并后占据的矩形面积
356
462
  caption_poses = [
@@ -366,17 +472,17 @@ class MagicModel:
366
472
  for idx in seen:
367
473
  if (
368
474
  calculate_overlap_area_in_bbox1_area_ratio(
369
- all_bboxes[idx]["bbox"], bbox
475
+ all_bboxes[idx]['bbox'], bbox
370
476
  )
371
477
  > CAPATION_OVERLAP_AREA_RATIO
372
478
  ):
373
479
  embed_arr.append(idx)
374
480
 
375
481
  if len(embed_arr) > 0:
376
- embed_x0 = min([all_bboxes[idx]["bbox"][0] for idx in embed_arr])
377
- embed_y0 = min([all_bboxes[idx]["bbox"][1] for idx in embed_arr])
378
- embed_x1 = max([all_bboxes[idx]["bbox"][2] for idx in embed_arr])
379
- embed_y1 = max([all_bboxes[idx]["bbox"][3] for idx in embed_arr])
482
+ embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
483
+ embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
484
+ embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
485
+ embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
380
486
  caption_areas.append(
381
487
  int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
382
488
  )
@@ -391,7 +497,7 @@ class MagicModel:
391
497
  for j in seen:
392
498
  if (
393
499
  calculate_overlap_area_in_bbox1_area_ratio(
394
- all_bboxes[j]["bbox"], caption_bbox
500
+ all_bboxes[j]['bbox'], caption_bbox
395
501
  )
396
502
  > CAPATION_OVERLAP_AREA_RATIO
397
503
  ):
@@ -400,30 +506,30 @@ class MagicModel:
400
506
 
401
507
  for i in sorted(subject_object_relation_map.keys()):
402
508
  result = {
403
- "subject_body": all_bboxes[i]["bbox"],
404
- "all": all_bboxes[i]["bbox"],
405
- "score": all_bboxes[i]["score"],
509
+ 'subject_body': all_bboxes[i]['bbox'],
510
+ 'all': all_bboxes[i]['bbox'],
511
+ 'score': all_bboxes[i]['score'],
406
512
  }
407
513
 
408
514
  if len(subject_object_relation_map[i]) > 0:
409
515
  x0 = min(
410
- [all_bboxes[j]["bbox"][0] for j in subject_object_relation_map[i]]
516
+ [all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
411
517
  )
412
518
  y0 = min(
413
- [all_bboxes[j]["bbox"][1] for j in subject_object_relation_map[i]]
519
+ [all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
414
520
  )
415
521
  x1 = max(
416
- [all_bboxes[j]["bbox"][2] for j in subject_object_relation_map[i]]
522
+ [all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
417
523
  )
418
524
  y1 = max(
419
- [all_bboxes[j]["bbox"][3] for j in subject_object_relation_map[i]]
525
+ [all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
420
526
  )
421
- result["object_body"] = [x0, y0, x1, y1]
422
- result["all"] = [
423
- min(x0, all_bboxes[i]["bbox"][0]),
424
- min(y0, all_bboxes[i]["bbox"][1]),
425
- max(x1, all_bboxes[i]["bbox"][2]),
426
- max(y1, all_bboxes[i]["bbox"][3]),
527
+ result['object_body'] = [x0, y0, x1, y1]
528
+ result['all'] = [
529
+ min(x0, all_bboxes[i]['bbox'][0]),
530
+ min(y0, all_bboxes[i]['bbox'][1]),
531
+ max(x1, all_bboxes[i]['bbox'][2]),
532
+ max(y1, all_bboxes[i]['bbox'][3]),
427
533
  ]
428
534
  ret.append(result)
429
535
 
@@ -432,7 +538,7 @@ class MagicModel:
432
538
  for i in subject_object_relation_map.keys():
433
539
  for j in subject_object_relation_map[i]:
434
540
  total_subject_object_dis += bbox_distance(
435
- all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
541
+ all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
436
542
  )
437
543
 
438
544
  # 计算未匹配的 subject 和 object 的距离(非精确版)
@@ -444,12 +550,12 @@ class MagicModel:
444
550
  ]
445
551
  )
446
552
  for i in range(N):
447
- if all_bboxes[i]["category_id"] != object_category_id or i in used:
553
+ if all_bboxes[i]['category_id'] != object_category_id or i in used:
448
554
  continue
449
555
  candidates = []
450
556
  for j in range(N):
451
557
  if (
452
- all_bboxes[j]["category_id"] != subject_category_id
558
+ all_bboxes[j]['category_id'] != subject_category_id
453
559
  or j in with_caption_subject
454
560
  ):
455
561
  continue
@@ -461,18 +567,28 @@ class MagicModel:
461
567
  return ret, total_subject_object_dis
462
568
 
463
569
  def get_imgs(self, page_no: int):
464
- figure_captions, _ = self.__tie_up_category_by_distance(
465
- page_no, 3, 4
570
+ with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
571
+ with_footnotes, _ = self.__tie_up_category_by_distance(
572
+ page_no, 3, CategoryId.ImageFootnote
466
573
  )
467
- return [
468
- {
469
- "bbox": record["all"],
470
- "img_body_bbox": record["subject_body"],
471
- "img_caption_bbox": record.get("object_body", None),
472
- "score": record["score"],
574
+ ret = []
575
+ N, M = len(with_captions), len(with_footnotes)
576
+ assert N == M
577
+ for i in range(N):
578
+ record = {
579
+ 'score': with_captions[i]['score'],
580
+ 'img_caption_bbox': with_captions[i].get('object_body', None),
581
+ 'img_body_bbox': with_captions[i]['subject_body'],
582
+ 'img_footnote_bbox': with_footnotes[i].get('object_body', None),
473
583
  }
474
- for record in figure_captions
475
- ]
584
+
585
+ x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
586
+ y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
587
+ x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
588
+ y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
589
+ record['bbox'] = [x0, y0, x1, y1]
590
+ ret.append(record)
591
+ return ret
476
592
 
477
593
  def get_tables(
478
594
  self, page_no: int
@@ -484,26 +600,26 @@ class MagicModel:
484
600
  assert N == M
485
601
  for i in range(N):
486
602
  record = {
487
- "score": with_captions[i]["score"],
488
- "table_caption_bbox": with_captions[i].get("object_body", None),
489
- "table_body_bbox": with_captions[i]["subject_body"],
490
- "table_footnote_bbox": with_footnotes[i].get("object_body", None),
603
+ 'score': with_captions[i]['score'],
604
+ 'table_caption_bbox': with_captions[i].get('object_body', None),
605
+ 'table_body_bbox': with_captions[i]['subject_body'],
606
+ 'table_footnote_bbox': with_footnotes[i].get('object_body', None),
491
607
  }
492
608
 
493
- x0 = min(with_captions[i]["all"][0], with_footnotes[i]["all"][0])
494
- y0 = min(with_captions[i]["all"][1], with_footnotes[i]["all"][1])
495
- x1 = max(with_captions[i]["all"][2], with_footnotes[i]["all"][2])
496
- y1 = max(with_captions[i]["all"][3], with_footnotes[i]["all"][3])
497
- record["bbox"] = [x0, y0, x1, y1]
609
+ x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
610
+ y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
611
+ x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
612
+ y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
613
+ record['bbox'] = [x0, y0, x1, y1]
498
614
  ret.append(record)
499
615
  return ret
500
616
 
501
617
  def get_equations(self, page_no: int) -> list: # 有坐标,也有字
502
618
  inline_equations = self.__get_blocks_by_type(
503
- ModelBlockTypeEnum.EMBEDDING.value, page_no, ["latex"]
619
+ ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
504
620
  )
505
621
  interline_equations = self.__get_blocks_by_type(
506
- ModelBlockTypeEnum.ISOLATED.value, page_no, ["latex"]
622
+ ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
507
623
  )
508
624
  interline_equations_blocks = self.__get_blocks_by_type(
509
625
  ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
@@ -525,17 +641,18 @@ class MagicModel:
525
641
  def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标
526
642
  text_spans = []
527
643
  model_page_info = self.__model_list[page_no]
528
- layout_dets = model_page_info["layout_dets"]
644
+ layout_dets = model_page_info['layout_dets']
529
645
  for layout_det in layout_dets:
530
- if layout_det["category_id"] == "15":
646
+ if layout_det['category_id'] == '15':
531
647
  span = {
532
- "bbox": layout_det["bbox"],
533
- "content": layout_det["text"],
648
+ 'bbox': layout_det['bbox'],
649
+ 'content': layout_det['text'],
534
650
  }
535
651
  text_spans.append(span)
536
652
  return text_spans
537
653
 
538
654
  def get_all_spans(self, page_no: int) -> list:
655
+
539
656
  def remove_duplicate_spans(spans):
540
657
  new_spans = []
541
658
  for span in spans:
@@ -545,7 +662,7 @@ class MagicModel:
545
662
 
546
663
  all_spans = []
547
664
  model_page_info = self.__model_list[page_no]
548
- layout_dets = model_page_info["layout_dets"]
665
+ layout_dets = model_page_info['layout_dets']
549
666
  allow_category_id_list = [3, 5, 13, 14, 15]
550
667
  """当成span拼接的"""
551
668
  # 3: 'image', # 图片
@@ -554,29 +671,29 @@ class MagicModel:
554
671
  # 14: 'interline_equation', # 行间公式
555
672
  # 15: 'text', # ocr识别文本
556
673
  for layout_det in layout_dets:
557
- category_id = layout_det["category_id"]
674
+ category_id = layout_det['category_id']
558
675
  if category_id in allow_category_id_list:
559
- span = {"bbox": layout_det["bbox"], "score": layout_det["score"]}
676
+ span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
560
677
  if category_id == 3:
561
- span["type"] = ContentType.Image
678
+ span['type'] = ContentType.Image
562
679
  elif category_id == 5:
563
680
  # 获取table模型结果
564
- latex = layout_det.get("latex", None)
565
- html = layout_det.get("html", None)
681
+ latex = layout_det.get('latex', None)
682
+ html = layout_det.get('html', None)
566
683
  if latex:
567
- span["latex"] = latex
684
+ span['latex'] = latex
568
685
  elif html:
569
- span["html"] = html
570
- span["type"] = ContentType.Table
686
+ span['html'] = html
687
+ span['type'] = ContentType.Table
571
688
  elif category_id == 13:
572
- span["content"] = layout_det["latex"]
573
- span["type"] = ContentType.InlineEquation
689
+ span['content'] = layout_det['latex']
690
+ span['type'] = ContentType.InlineEquation
574
691
  elif category_id == 14:
575
- span["content"] = layout_det["latex"]
576
- span["type"] = ContentType.InterlineEquation
692
+ span['content'] = layout_det['latex']
693
+ span['type'] = ContentType.InterlineEquation
577
694
  elif category_id == 15:
578
- span["content"] = layout_det["text"]
579
- span["type"] = ContentType.Text
695
+ span['content'] = layout_det['text']
696
+ span['type'] = ContentType.Text
580
697
  all_spans.append(span)
581
698
  return remove_duplicate_spans(all_spans)
582
699
 
@@ -593,19 +710,19 @@ class MagicModel:
593
710
  ) -> list:
594
711
  blocks = []
595
712
  for page_dict in self.__model_list:
596
- layout_dets = page_dict.get("layout_dets", [])
597
- page_info = page_dict.get("page_info", {})
598
- page_number = page_info.get("page_no", -1)
713
+ layout_dets = page_dict.get('layout_dets', [])
714
+ page_info = page_dict.get('page_info', {})
715
+ page_number = page_info.get('page_no', -1)
599
716
  if page_no != page_number:
600
717
  continue
601
718
  for item in layout_dets:
602
- category_id = item.get("category_id", -1)
603
- bbox = item.get("bbox", None)
719
+ category_id = item.get('category_id', -1)
720
+ bbox = item.get('bbox', None)
604
721
 
605
722
  if category_id == type:
606
723
  block = {
607
- "bbox": bbox,
608
- "score": item.get("score"),
724
+ 'bbox': bbox,
725
+ 'score': item.get('score'),
609
726
  }
610
727
  for col in extra_col:
611
728
  block[col] = item.get(col, None)
@@ -616,28 +733,28 @@ class MagicModel:
616
733
  return self.__model_list[page_no]
617
734
 
618
735
 
619
- if __name__ == "__main__":
620
- drw = DiskReaderWriter(r"D:/project/20231108code-clean")
736
+ if __name__ == '__main__':
737
+ drw = DiskReaderWriter(r'D:/project/20231108code-clean')
621
738
  if 0:
622
- pdf_file_path = r"linshixuqiu\19983-00.pdf"
623
- model_file_path = r"linshixuqiu\19983-00_new.json"
739
+ pdf_file_path = r'linshixuqiu\19983-00.pdf'
740
+ model_file_path = r'linshixuqiu\19983-00_new.json'
624
741
  pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
625
742
  model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
626
743
  model_list = json.loads(model_json_txt)
627
- write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
628
- img_bucket_path = "imgs"
744
+ write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
745
+ img_bucket_path = 'imgs'
629
746
  img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
630
- pdf_docs = fitz.open("pdf", pdf_bytes)
747
+ pdf_docs = fitz.open('pdf', pdf_bytes)
631
748
  magic_model = MagicModel(model_list, pdf_docs)
632
749
 
633
750
  if 1:
634
751
  model_list = json.loads(
635
- drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json")
752
+ drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
636
753
  )
637
754
  pdf_bytes = drw.read(
638
- "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN
755
+ '/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
639
756
  )
640
- pdf_docs = fitz.open("pdf", pdf_bytes)
757
+ pdf_docs = fitz.open('pdf', pdf_bytes)
641
758
  magic_model = MagicModel(model_list, pdf_docs)
642
759
  for i in range(7):
643
760
  print(magic_model.get_imgs(i))