magic-pdf 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +169 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +227 -161
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +68 -26
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,50 +1,38 @@
1
1
  import json
2
- import math
3
2
 
4
- from magic_pdf.libs.commons import fitz
5
- from loguru import logger
6
-
7
- from magic_pdf.libs.commons import join_path
3
+ from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
4
+ bbox_relative_pos, calculate_iou,
5
+ calculate_overlap_area_in_bbox1_area_ratio)
6
+ from magic_pdf.libs.commons import fitz, join_path
8
7
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
9
- from magic_pdf.libs.ocr_content_type import ContentType
10
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
11
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
12
8
  from magic_pdf.libs.local_math import float_gt
13
- from magic_pdf.libs.boxbase import (
14
- _is_in,
15
- bbox_relative_pos,
16
- bbox_distance,
17
- _is_part_overlap,
18
- calculate_overlap_area_in_bbox1_area_ratio,
19
- calculate_iou,
20
- )
21
9
  from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
10
+ from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
11
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
12
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
22
13
 
23
14
  CAPATION_OVERLAP_AREA_RATIO = 0.6
24
15
 
25
16
 
26
17
  class MagicModel:
27
- """
28
- 每个函数没有得到元素的时候返回空list
29
-
30
- """
18
+ """每个函数没有得到元素的时候返回空list."""
31
19
 
32
20
  def __fix_axis(self):
33
21
  for model_page_info in self.__model_list:
34
22
  need_remove_list = []
35
- page_no = model_page_info["page_info"]["page_no"]
23
+ page_no = model_page_info['page_info']['page_no']
36
24
  horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
37
25
  model_page_info, self.__docs[page_no]
38
26
  )
39
- layout_dets = model_page_info["layout_dets"]
27
+ layout_dets = model_page_info['layout_dets']
40
28
  for layout_det in layout_dets:
41
29
 
42
- if layout_det.get("bbox") is not None:
30
+ if layout_det.get('bbox') is not None:
43
31
  # 兼容直接输出bbox的模型数据,如paddle
44
- x0, y0, x1, y1 = layout_det["bbox"]
32
+ x0, y0, x1, y1 = layout_det['bbox']
45
33
  else:
46
34
  # 兼容直接输出poly的模型数据,如xxx
47
- x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
35
+ x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
48
36
 
49
37
  bbox = [
50
38
  int(x0 / horizontal_scale_ratio),
@@ -52,7 +40,7 @@ class MagicModel:
52
40
  int(x1 / horizontal_scale_ratio),
53
41
  int(y1 / vertical_scale_ratio),
54
42
  ]
55
- layout_det["bbox"] = bbox
43
+ layout_det['bbox'] = bbox
56
44
  # 删除高度或者宽度小于等于0的spans
57
45
  if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
58
46
  need_remove_list.append(layout_det)
@@ -62,9 +50,9 @@ class MagicModel:
62
50
  def __fix_by_remove_low_confidence(self):
63
51
  for model_page_info in self.__model_list:
64
52
  need_remove_list = []
65
- layout_dets = model_page_info["layout_dets"]
53
+ layout_dets = model_page_info['layout_dets']
66
54
  for layout_det in layout_dets:
67
- if layout_det["score"] <= 0.05:
55
+ if layout_det['score'] <= 0.05:
68
56
  need_remove_list.append(layout_det)
69
57
  else:
70
58
  continue
@@ -74,12 +62,12 @@ class MagicModel:
74
62
  def __fix_by_remove_high_iou_and_low_confidence(self):
75
63
  for model_page_info in self.__model_list:
76
64
  need_remove_list = []
77
- layout_dets = model_page_info["layout_dets"]
65
+ layout_dets = model_page_info['layout_dets']
78
66
  for layout_det1 in layout_dets:
79
67
  for layout_det2 in layout_dets:
80
68
  if layout_det1 == layout_det2:
81
69
  continue
82
- if layout_det1["category_id"] in [
70
+ if layout_det1['category_id'] in [
83
71
  0,
84
72
  1,
85
73
  2,
@@ -90,12 +78,12 @@ class MagicModel:
90
78
  7,
91
79
  8,
92
80
  9,
93
- ] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
81
+ ] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
94
82
  if (
95
- calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
83
+ calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
96
84
  > 0.9
97
85
  ):
98
- if layout_det1["score"] < layout_det2["score"]:
86
+ if layout_det1['score'] < layout_det2['score']:
99
87
  layout_det_need_remove = layout_det1
100
88
  else:
101
89
  layout_det_need_remove = layout_det2
@@ -118,6 +106,67 @@ class MagicModel:
118
106
  self.__fix_by_remove_low_confidence()
119
107
  """删除高iou(>0.9)数据中置信度较低的那个"""
120
108
  self.__fix_by_remove_high_iou_and_low_confidence()
109
+ self.__fix_footnote()
110
+
111
def __fix_footnote(self):
    """Relabel footnotes that sit closer to a figure than to any table.

    Category ids used here: 3 = figure, 5 = table, 7 = footnote.

    For every page, each footnote (category 7) is compared against all
    figures and all tables on the same page.  A footnote whose nearest
    qualifying figure is strictly closer than its nearest qualifying table
    has its ``category_id`` rewritten in place to
    ``CategoryId.ImageFootnote``.  All other footnotes are left untouched.

    A figure/table only qualifies for a footnote when ``bbox_relative_pos``
    reports at most one truthy position flag for the pair (presumably this
    excludes diagonal placements -- confirm against ``bbox_relative_pos``).

    Returns:
        None.  ``layout_dets`` entries are mutated in place.
    """
    no_match = float('inf')

    def nearest_distance(footnote, candidates):
        # Smallest bbox distance from ``footnote`` to any qualifying
        # candidate; ``no_match`` when there is no qualifying candidate.
        nearest = no_match
        for candidate in candidates:
            flags = bbox_relative_pos(footnote['bbox'], candidate['bbox'])
            if sum(1 for flag in flags if flag) > 1:
                continue
            nearest = min(
                nearest, bbox_distance(candidate['bbox'], footnote['bbox'])
            )
        return nearest

    for model_page_info in self.__model_list:
        footnotes = []
        figures = []
        tables = []
        for obj in model_page_info['layout_dets']:
            category_id = obj['category_id']
            if category_id == 7:
                footnotes.append(obj)
            elif category_id == 3:
                figures.append(obj)
            elif category_id == 5:
                tables.append(obj)

        # Nothing can be relabelled without at least one footnote and one
        # figure.  The check runs after collection so it skips the page.
        if not footnotes or not figures:
            continue

        for footnote in footnotes:
            figure_dis = nearest_distance(footnote, figures)
            table_dis = nearest_distance(footnote, tables)
            # A footnote with no qualifying figure keeps ``no_match`` as its
            # figure distance, so the comparison is False and the footnote
            # is left as-is instead of raising KeyError on a missing entry.
            if table_dis > figure_dis:
                footnote['category_id'] = CategoryId.ImageFootnote
121
170
 
122
171
  def __reduct_overlap(self, bboxes):
123
172
  N = len(bboxes)
@@ -126,76 +175,77 @@ class MagicModel:
126
175
  for j in range(N):
127
176
  if i == j:
128
177
  continue
129
- if _is_in(bboxes[i]["bbox"], bboxes[j]["bbox"]):
178
+ if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
130
179
  keep[i] = False
131
-
132
180
  return [bboxes[i] for i in range(N) if keep[i]]
133
181
 
134
182
  def __tie_up_category_by_distance(
135
183
  self, page_no, subject_category_id, object_category_id
136
184
  ):
137
- """
138
- 假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object 只能属于一个 subject
139
- """
185
+ """假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object),每个 object
186
+ 只能属于一个 subject."""
140
187
  ret = []
141
188
  MAX_DIS_OF_POINT = 10**9 + 7
189
+ """
190
+ subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。
191
+ 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
192
+ 再求出筛选出的 subjects 和 object 的最短距离
193
+ """
142
194
 
143
- # subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
144
- # 再求出筛选出的 subjects 和 object 的最短距离!
145
195
  def may_find_other_nearest_bbox(subject_idx, object_idx):
146
- ret = float("inf")
196
+ ret = float('inf')
147
197
 
148
198
  x0 = min(
149
- all_bboxes[subject_idx]["bbox"][0], all_bboxes[object_idx]["bbox"][0]
199
+ all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
150
200
  )
151
201
  y0 = min(
152
- all_bboxes[subject_idx]["bbox"][1], all_bboxes[object_idx]["bbox"][1]
202
+ all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
153
203
  )
154
204
  x1 = max(
155
- all_bboxes[subject_idx]["bbox"][2], all_bboxes[object_idx]["bbox"][2]
205
+ all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
156
206
  )
157
207
  y1 = max(
158
- all_bboxes[subject_idx]["bbox"][3], all_bboxes[object_idx]["bbox"][3]
208
+ all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
159
209
  )
160
210
 
161
211
  object_area = abs(
162
- all_bboxes[object_idx]["bbox"][2] - all_bboxes[object_idx]["bbox"][0]
212
+ all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
163
213
  ) * abs(
164
- all_bboxes[object_idx]["bbox"][3] - all_bboxes[object_idx]["bbox"][1]
214
+ all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
165
215
  )
166
216
 
167
217
  for i in range(len(all_bboxes)):
168
218
  if (
169
219
  i == subject_idx
170
- or all_bboxes[i]["category_id"] != subject_category_id
220
+ or all_bboxes[i]['category_id'] != subject_category_id
171
221
  ):
172
222
  continue
173
- if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]["bbox"]) or _is_in(
174
- all_bboxes[i]["bbox"], [x0, y0, x1, y1]
223
+ if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
224
+ all_bboxes[i]['bbox'], [x0, y0, x1, y1]
175
225
  ):
176
226
 
177
227
  i_area = abs(
178
- all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
179
- ) * abs(all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1])
228
+ all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
229
+ ) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
180
230
  if i_area >= object_area:
181
- ret = min(float("inf"), dis[i][object_idx])
231
+ ret = min(float('inf'), dis[i][object_idx])
182
232
 
183
233
  return ret
184
234
 
185
235
  def expand_bbbox(idxes):
186
- x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
187
- y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
188
- x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
189
- y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
236
+ x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
237
+ y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
238
+ x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
239
+ y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
190
240
  return min(x0s), min(y0s), max(x1s), max(y1s)
191
241
 
192
242
  subjects = self.__reduct_overlap(
193
243
  list(
194
244
  map(
195
- lambda x: {"bbox": x["bbox"], "score": x["score"]},
245
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
196
246
  filter(
197
- lambda x: x["category_id"] == subject_category_id,
198
- self.__model_list[page_no]["layout_dets"],
247
+ lambda x: x['category_id'] == subject_category_id,
248
+ self.__model_list[page_no]['layout_dets'],
199
249
  ),
200
250
  )
201
251
  )
@@ -204,10 +254,10 @@ class MagicModel:
204
254
  objects = self.__reduct_overlap(
205
255
  list(
206
256
  map(
207
- lambda x: {"bbox": x["bbox"], "score": x["score"]},
257
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
208
258
  filter(
209
- lambda x: x["category_id"] == object_category_id,
210
- self.__model_list[page_no]["layout_dets"],
259
+ lambda x: x['category_id'] == object_category_id,
260
+ self.__model_list[page_no]['layout_dets'],
211
261
  ),
212
262
  )
213
263
  )
@@ -215,7 +265,7 @@ class MagicModel:
215
265
  subject_object_relation_map = {}
216
266
 
217
267
  subjects.sort(
218
- key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2
268
+ key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
219
269
  ) # get the distance !
220
270
 
221
271
  all_bboxes = []
@@ -223,18 +273,18 @@ class MagicModel:
223
273
  for v in subjects:
224
274
  all_bboxes.append(
225
275
  {
226
- "category_id": subject_category_id,
227
- "bbox": v["bbox"],
228
- "score": v["score"],
276
+ 'category_id': subject_category_id,
277
+ 'bbox': v['bbox'],
278
+ 'score': v['score'],
229
279
  }
230
280
  )
231
281
 
232
282
  for v in objects:
233
283
  all_bboxes.append(
234
284
  {
235
- "category_id": object_category_id,
236
- "bbox": v["bbox"],
237
- "score": v["score"],
285
+ 'category_id': object_category_id,
286
+ 'bbox': v['bbox'],
287
+ 'score': v['score'],
238
288
  }
239
289
  )
240
290
 
@@ -244,18 +294,18 @@ class MagicModel:
244
294
  for i in range(N):
245
295
  for j in range(i):
246
296
  if (
247
- all_bboxes[i]["category_id"] == subject_category_id
248
- and all_bboxes[j]["category_id"] == subject_category_id
297
+ all_bboxes[i]['category_id'] == subject_category_id
298
+ and all_bboxes[j]['category_id'] == subject_category_id
249
299
  ):
250
300
  continue
251
301
 
252
- dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
302
+ dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
253
303
  dis[j][i] = dis[i][j]
254
304
 
255
305
  used = set()
256
306
  for i in range(N):
257
307
  # 求第 i 个 subject 所关联的 object
258
- if all_bboxes[i]["category_id"] != subject_category_id:
308
+ if all_bboxes[i]['category_id'] != subject_category_id:
259
309
  continue
260
310
  seen = set()
261
311
  candidates = []
@@ -267,7 +317,7 @@ class MagicModel:
267
317
  map(
268
318
  lambda x: 1 if x else 0,
269
319
  bbox_relative_pos(
270
- all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
320
+ all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
271
321
  ),
272
322
  )
273
323
  )
@@ -275,25 +325,28 @@ class MagicModel:
275
325
  if pos_flag_count > 1:
276
326
  continue
277
327
  if (
278
- all_bboxes[j]["category_id"] != object_category_id
328
+ all_bboxes[j]['category_id'] != object_category_id
279
329
  or j in used
280
330
  or dis[i][j] == MAX_DIS_OF_POINT
281
331
  ):
282
332
  continue
283
333
  left, right, _, _ = bbox_relative_pos(
284
- all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
334
+ all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
285
335
  ) # 由 pos_flag_count 相关逻辑保证本段逻辑准确性
286
336
  if left or right:
287
- one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
337
+ one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
288
338
  else:
289
- one_way_dis = all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1]
339
+ one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
290
340
  if dis[i][j] > one_way_dis:
291
341
  continue
292
342
  arr.append((dis[i][j], j))
293
343
 
294
344
  arr.sort(key=lambda x: x[0])
295
345
  if len(arr) > 0:
296
- # bug: 离该subject 最近的 object 可能跨越了其它的 subject 。比如 [this subect] [some sbuject] [the nearest objec of subject]
346
+ """
347
+ bug: 离该subject 最近的 object 可能跨越了其它的 subject。
348
+ 比如 [this subect] [some sbuject] [the nearest object of subject]
349
+ """
297
350
  if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
298
351
 
299
352
  candidates.append(arr[0][1])
@@ -308,7 +361,7 @@ class MagicModel:
308
361
  map(
309
362
  lambda x: 1 if x else 0,
310
363
  bbox_relative_pos(
311
- all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]
364
+ all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
312
365
  ),
313
366
  )
314
367
  )
@@ -318,7 +371,7 @@ class MagicModel:
318
371
  continue
319
372
 
320
373
  if (
321
- all_bboxes[k]["category_id"] != object_category_id
374
+ all_bboxes[k]['category_id'] != object_category_id
322
375
  or k in used
323
376
  or k in seen
324
377
  or dis[j][k] == MAX_DIS_OF_POINT
@@ -327,17 +380,19 @@ class MagicModel:
327
380
  continue
328
381
 
329
382
  is_nearest = True
330
- for l in range(i + 1, N):
331
- if l in (j, k) or l in used or l in seen:
383
+ for ni in range(i + 1, N):
384
+ if ni in (j, k) or ni in used or ni in seen:
332
385
  continue
333
386
 
334
- if not float_gt(dis[l][k], dis[j][k]):
387
+ if not float_gt(dis[ni][k], dis[j][k]):
335
388
  is_nearest = False
336
389
  break
337
390
 
338
391
  if is_nearest:
339
392
  nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
340
- n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
393
+ n_dis = bbox_distance(
394
+ all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
395
+ )
341
396
  if float_gt(dis[i][j], n_dis):
342
397
  continue
343
398
  tmp.append(k)
@@ -350,7 +405,7 @@ class MagicModel:
350
405
  # 已经获取到某个 figure 下所有的最靠近的 captions,以及最靠近这些 captions 的 captions 。
351
406
  # 先扩一下 bbox,
352
407
  ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
353
- ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]
408
+ ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
354
409
 
355
410
  # 分成了 4 个截取空间,需要计算落在每个截取空间下 objects 合并后占据的矩形面积
356
411
  caption_poses = [
@@ -366,17 +421,17 @@ class MagicModel:
366
421
  for idx in seen:
367
422
  if (
368
423
  calculate_overlap_area_in_bbox1_area_ratio(
369
- all_bboxes[idx]["bbox"], bbox
424
+ all_bboxes[idx]['bbox'], bbox
370
425
  )
371
426
  > CAPATION_OVERLAP_AREA_RATIO
372
427
  ):
373
428
  embed_arr.append(idx)
374
429
 
375
430
  if len(embed_arr) > 0:
376
- embed_x0 = min([all_bboxes[idx]["bbox"][0] for idx in embed_arr])
377
- embed_y0 = min([all_bboxes[idx]["bbox"][1] for idx in embed_arr])
378
- embed_x1 = max([all_bboxes[idx]["bbox"][2] for idx in embed_arr])
379
- embed_y1 = max([all_bboxes[idx]["bbox"][3] for idx in embed_arr])
431
+ embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
432
+ embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
433
+ embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
434
+ embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
380
435
  caption_areas.append(
381
436
  int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
382
437
  )
@@ -391,7 +446,7 @@ class MagicModel:
391
446
  for j in seen:
392
447
  if (
393
448
  calculate_overlap_area_in_bbox1_area_ratio(
394
- all_bboxes[j]["bbox"], caption_bbox
449
+ all_bboxes[j]['bbox'], caption_bbox
395
450
  )
396
451
  > CAPATION_OVERLAP_AREA_RATIO
397
452
  ):
@@ -400,30 +455,30 @@ class MagicModel:
400
455
 
401
456
  for i in sorted(subject_object_relation_map.keys()):
402
457
  result = {
403
- "subject_body": all_bboxes[i]["bbox"],
404
- "all": all_bboxes[i]["bbox"],
405
- "score": all_bboxes[i]["score"],
458
+ 'subject_body': all_bboxes[i]['bbox'],
459
+ 'all': all_bboxes[i]['bbox'],
460
+ 'score': all_bboxes[i]['score'],
406
461
  }
407
462
 
408
463
  if len(subject_object_relation_map[i]) > 0:
409
464
  x0 = min(
410
- [all_bboxes[j]["bbox"][0] for j in subject_object_relation_map[i]]
465
+ [all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
411
466
  )
412
467
  y0 = min(
413
- [all_bboxes[j]["bbox"][1] for j in subject_object_relation_map[i]]
468
+ [all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
414
469
  )
415
470
  x1 = max(
416
- [all_bboxes[j]["bbox"][2] for j in subject_object_relation_map[i]]
471
+ [all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
417
472
  )
418
473
  y1 = max(
419
- [all_bboxes[j]["bbox"][3] for j in subject_object_relation_map[i]]
474
+ [all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
420
475
  )
421
- result["object_body"] = [x0, y0, x1, y1]
422
- result["all"] = [
423
- min(x0, all_bboxes[i]["bbox"][0]),
424
- min(y0, all_bboxes[i]["bbox"][1]),
425
- max(x1, all_bboxes[i]["bbox"][2]),
426
- max(y1, all_bboxes[i]["bbox"][3]),
476
+ result['object_body'] = [x0, y0, x1, y1]
477
+ result['all'] = [
478
+ min(x0, all_bboxes[i]['bbox'][0]),
479
+ min(y0, all_bboxes[i]['bbox'][1]),
480
+ max(x1, all_bboxes[i]['bbox'][2]),
481
+ max(y1, all_bboxes[i]['bbox'][3]),
427
482
  ]
428
483
  ret.append(result)
429
484
 
@@ -432,7 +487,7 @@ class MagicModel:
432
487
  for i in subject_object_relation_map.keys():
433
488
  for j in subject_object_relation_map[i]:
434
489
  total_subject_object_dis += bbox_distance(
435
- all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
490
+ all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
436
491
  )
437
492
 
438
493
  # 计算未匹配的 subject 和 object 的距离(非精确版)
@@ -444,12 +499,12 @@ class MagicModel:
444
499
  ]
445
500
  )
446
501
  for i in range(N):
447
- if all_bboxes[i]["category_id"] != object_category_id or i in used:
502
+ if all_bboxes[i]['category_id'] != object_category_id or i in used:
448
503
  continue
449
504
  candidates = []
450
505
  for j in range(N):
451
506
  if (
452
- all_bboxes[j]["category_id"] != subject_category_id
507
+ all_bboxes[j]['category_id'] != subject_category_id
453
508
  or j in with_caption_subject
454
509
  ):
455
510
  continue
@@ -461,18 +516,28 @@ class MagicModel:
461
516
  return ret, total_subject_object_dis
462
517
 
463
518
  def get_imgs(self, page_no: int):
464
- figure_captions, _ = self.__tie_up_category_by_distance(
465
- page_no, 3, 4
519
+ with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
520
+ with_footnotes, _ = self.__tie_up_category_by_distance(
521
+ page_no, 3, CategoryId.ImageFootnote
466
522
  )
467
- return [
468
- {
469
- "bbox": record["all"],
470
- "img_body_bbox": record["subject_body"],
471
- "img_caption_bbox": record.get("object_body", None),
472
- "score": record["score"],
523
+ ret = []
524
+ N, M = len(with_captions), len(with_footnotes)
525
+ assert N == M
526
+ for i in range(N):
527
+ record = {
528
+ 'score': with_captions[i]['score'],
529
+ 'img_caption_bbox': with_captions[i].get('object_body', None),
530
+ 'img_body_bbox': with_captions[i]['subject_body'],
531
+ 'img_footnote_bbox': with_footnotes[i].get('object_body', None),
473
532
  }
474
- for record in figure_captions
475
- ]
533
+
534
+ x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
535
+ y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
536
+ x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
537
+ y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
538
+ record['bbox'] = [x0, y0, x1, y1]
539
+ ret.append(record)
540
+ return ret
476
541
 
477
542
  def get_tables(
478
543
  self, page_no: int
@@ -484,26 +549,26 @@ class MagicModel:
484
549
  assert N == M
485
550
  for i in range(N):
486
551
  record = {
487
- "score": with_captions[i]["score"],
488
- "table_caption_bbox": with_captions[i].get("object_body", None),
489
- "table_body_bbox": with_captions[i]["subject_body"],
490
- "table_footnote_bbox": with_footnotes[i].get("object_body", None),
552
+ 'score': with_captions[i]['score'],
553
+ 'table_caption_bbox': with_captions[i].get('object_body', None),
554
+ 'table_body_bbox': with_captions[i]['subject_body'],
555
+ 'table_footnote_bbox': with_footnotes[i].get('object_body', None),
491
556
  }
492
557
 
493
- x0 = min(with_captions[i]["all"][0], with_footnotes[i]["all"][0])
494
- y0 = min(with_captions[i]["all"][1], with_footnotes[i]["all"][1])
495
- x1 = max(with_captions[i]["all"][2], with_footnotes[i]["all"][2])
496
- y1 = max(with_captions[i]["all"][3], with_footnotes[i]["all"][3])
497
- record["bbox"] = [x0, y0, x1, y1]
558
+ x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
559
+ y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
560
+ x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
561
+ y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
562
+ record['bbox'] = [x0, y0, x1, y1]
498
563
  ret.append(record)
499
564
  return ret
500
565
 
501
566
  def get_equations(self, page_no: int) -> list: # 有坐标,也有字
502
567
  inline_equations = self.__get_blocks_by_type(
503
- ModelBlockTypeEnum.EMBEDDING.value, page_no, ["latex"]
568
+ ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
504
569
  )
505
570
  interline_equations = self.__get_blocks_by_type(
506
- ModelBlockTypeEnum.ISOLATED.value, page_no, ["latex"]
571
+ ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
507
572
  )
508
573
  interline_equations_blocks = self.__get_blocks_by_type(
509
574
  ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
@@ -525,17 +590,18 @@ class MagicModel:
525
590
  def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标
526
591
  text_spans = []
527
592
  model_page_info = self.__model_list[page_no]
528
- layout_dets = model_page_info["layout_dets"]
593
+ layout_dets = model_page_info['layout_dets']
529
594
  for layout_det in layout_dets:
530
- if layout_det["category_id"] == "15":
595
+ if layout_det['category_id'] == '15':
531
596
  span = {
532
- "bbox": layout_det["bbox"],
533
- "content": layout_det["text"],
597
+ 'bbox': layout_det['bbox'],
598
+ 'content': layout_det['text'],
534
599
  }
535
600
  text_spans.append(span)
536
601
  return text_spans
537
602
 
538
603
  def get_all_spans(self, page_no: int) -> list:
604
+
539
605
  def remove_duplicate_spans(spans):
540
606
  new_spans = []
541
607
  for span in spans:
@@ -545,7 +611,7 @@ class MagicModel:
545
611
 
546
612
  all_spans = []
547
613
  model_page_info = self.__model_list[page_no]
548
- layout_dets = model_page_info["layout_dets"]
614
+ layout_dets = model_page_info['layout_dets']
549
615
  allow_category_id_list = [3, 5, 13, 14, 15]
550
616
  """当成span拼接的"""
551
617
  # 3: 'image', # 图片
@@ -554,11 +620,11 @@ class MagicModel:
554
620
  # 14: 'interline_equation', # 行间公式
555
621
  # 15: 'text', # ocr识别文本
556
622
  for layout_det in layout_dets:
557
- category_id = layout_det["category_id"]
623
+ category_id = layout_det['category_id']
558
624
  if category_id in allow_category_id_list:
559
- span = {"bbox": layout_det["bbox"], "score": layout_det["score"]}
625
+ span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
560
626
  if category_id == 3:
561
- span["type"] = ContentType.Image
627
+ span['type'] = ContentType.Image
562
628
  elif category_id == 5:
563
629
  # 获取table模型结果
564
630
  latex = layout_det.get("latex", None)
@@ -569,14 +635,14 @@ class MagicModel:
569
635
  span["html"] = html
570
636
  span["type"] = ContentType.Table
571
637
  elif category_id == 13:
572
- span["content"] = layout_det["latex"]
573
- span["type"] = ContentType.InlineEquation
638
+ span['content'] = layout_det['latex']
639
+ span['type'] = ContentType.InlineEquation
574
640
  elif category_id == 14:
575
- span["content"] = layout_det["latex"]
576
- span["type"] = ContentType.InterlineEquation
641
+ span['content'] = layout_det['latex']
642
+ span['type'] = ContentType.InterlineEquation
577
643
  elif category_id == 15:
578
- span["content"] = layout_det["text"]
579
- span["type"] = ContentType.Text
644
+ span['content'] = layout_det['text']
645
+ span['type'] = ContentType.Text
580
646
  all_spans.append(span)
581
647
  return remove_duplicate_spans(all_spans)
582
648
 
@@ -593,19 +659,19 @@ class MagicModel:
593
659
  ) -> list:
594
660
  blocks = []
595
661
  for page_dict in self.__model_list:
596
- layout_dets = page_dict.get("layout_dets", [])
597
- page_info = page_dict.get("page_info", {})
598
- page_number = page_info.get("page_no", -1)
662
+ layout_dets = page_dict.get('layout_dets', [])
663
+ page_info = page_dict.get('page_info', {})
664
+ page_number = page_info.get('page_no', -1)
599
665
  if page_no != page_number:
600
666
  continue
601
667
  for item in layout_dets:
602
- category_id = item.get("category_id", -1)
603
- bbox = item.get("bbox", None)
668
+ category_id = item.get('category_id', -1)
669
+ bbox = item.get('bbox', None)
604
670
 
605
671
  if category_id == type:
606
672
  block = {
607
- "bbox": bbox,
608
- "score": item.get("score"),
673
+ 'bbox': bbox,
674
+ 'score': item.get('score'),
609
675
  }
610
676
  for col in extra_col:
611
677
  block[col] = item.get(col, None)
@@ -616,28 +682,28 @@ class MagicModel:
616
682
  return self.__model_list[page_no]
617
683
 
618
684
 
619
- if __name__ == "__main__":
620
- drw = DiskReaderWriter(r"D:/project/20231108code-clean")
685
+ if __name__ == '__main__':
686
+ drw = DiskReaderWriter(r'D:/project/20231108code-clean')
621
687
  if 0:
622
- pdf_file_path = r"linshixuqiu\19983-00.pdf"
623
- model_file_path = r"linshixuqiu\19983-00_new.json"
688
+ pdf_file_path = r'linshixuqiu\19983-00.pdf'
689
+ model_file_path = r'linshixuqiu\19983-00_new.json'
624
690
  pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
625
691
  model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
626
692
  model_list = json.loads(model_json_txt)
627
- write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
628
- img_bucket_path = "imgs"
693
+ write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
694
+ img_bucket_path = 'imgs'
629
695
  img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
630
- pdf_docs = fitz.open("pdf", pdf_bytes)
696
+ pdf_docs = fitz.open('pdf', pdf_bytes)
631
697
  magic_model = MagicModel(model_list, pdf_docs)
632
698
 
633
699
  if 1:
634
700
  model_list = json.loads(
635
- drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json")
701
+ drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
636
702
  )
637
703
  pdf_bytes = drw.read(
638
- "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN
704
+ '/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
639
705
  )
640
- pdf_docs = fitz.open("pdf", pdf_bytes)
706
+ pdf_docs = fitz.open('pdf', pdf_bytes)
641
707
  magic_model = MagicModel(model_list, pdf_docs)
642
708
  for i in range(7):
643
709
  print(magic_model.get_imgs(i))