magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62):
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +5 -5
  12. magic_pdf/libs/draw_bbox.py +3 -2
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +122 -76
  18. magic_pdf/model/sub_modules/model_init.py +40 -35
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  21. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  22. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  23. magic_pdf/para/para_split.py +411 -248
  24. magic_pdf/para/para_split_v2.py +352 -182
  25. magic_pdf/para/para_split_v3.py +110 -53
  26. magic_pdf/pdf_parse_by_ocr.py +2 -0
  27. magic_pdf/pdf_parse_by_txt.py +2 -0
  28. magic_pdf/pdf_parse_union_core.py +174 -100
  29. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  30. magic_pdf/pipe/AbsPipe.py +28 -44
  31. magic_pdf/pipe/OCRPipe.py +5 -5
  32. magic_pdf/pipe/TXTPipe.py +5 -6
  33. magic_pdf/pipe/UNIPipe.py +24 -25
  34. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  35. magic_pdf/pre_proc/cut_image.py +9 -11
  36. magic_pdf/pre_proc/equations_replace.py +203 -212
  37. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  38. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  39. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  40. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  41. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  42. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  43. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  44. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  45. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  46. magic_pdf/spark/spark_api.py +15 -17
  47. magic_pdf/tools/cli.py +3 -4
  48. magic_pdf/tools/cli_dev.py +6 -9
  49. magic_pdf/tools/common.py +26 -36
  50. magic_pdf/user_api.py +29 -38
  51. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
  53. magic_pdf/libs/Constants.py +0 -55
  54. magic_pdf/libs/MakeContentConfig.py +0 -11
  55. magic_pdf/libs/drop_reason.py +0 -27
  56. magic_pdf/libs/drop_tag.py +0 -19
  57. magic_pdf/para/para_pipeline.py +0 -297
  58. magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  59. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  60. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,60 +1,181 @@
1
- from loguru import logger
2
1
 
3
- from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
4
- calculate_iou, calculate_vertical_projection_overlap_ratio
5
- from magic_pdf.libs.drop_tag import DropTag
6
- from magic_pdf.libs.ocr_content_type import BlockType
7
- from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
8
-
9
-
10
- def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
11
- title_blocks, interline_equation_blocks, page_w, page_h):
2
+ from magic_pdf.config.ocr_content_type import BlockType
3
+ from magic_pdf.libs.boxbase import (
4
+ calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
5
+ calculate_vertical_projection_overlap_ratio,
6
+ get_minbox_if_overlap_by_ratio)
7
+ from magic_pdf.pre_proc.remove_bbox_overlap import \
8
+ remove_overlap_between_bbox_for_block
9
+
10
+
11
+ def ocr_prepare_bboxes_for_layout_split(
12
+ img_blocks,
13
+ table_blocks,
14
+ discarded_blocks,
15
+ text_blocks,
16
+ title_blocks,
17
+ interline_equation_blocks,
18
+ page_w,
19
+ page_h,
20
+ ):
12
21
  all_bboxes = []
13
22
  all_discarded_blocks = []
14
23
  for image in img_blocks:
15
24
  x0, y0, x1, y1 = image['bbox']
16
- all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]])
25
+ all_bboxes.append(
26
+ [
27
+ x0,
28
+ y0,
29
+ x1,
30
+ y1,
31
+ None,
32
+ None,
33
+ None,
34
+ BlockType.Image,
35
+ None,
36
+ None,
37
+ None,
38
+ None,
39
+ image['score'],
40
+ ]
41
+ )
17
42
 
18
43
  for table in table_blocks:
19
44
  x0, y0, x1, y1 = table['bbox']
20
- all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]])
45
+ all_bboxes.append(
46
+ [
47
+ x0,
48
+ y0,
49
+ x1,
50
+ y1,
51
+ None,
52
+ None,
53
+ None,
54
+ BlockType.Table,
55
+ None,
56
+ None,
57
+ None,
58
+ None,
59
+ table['score'],
60
+ ]
61
+ )
21
62
 
22
63
  for text in text_blocks:
23
64
  x0, y0, x1, y1 = text['bbox']
24
- all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]])
65
+ all_bboxes.append(
66
+ [
67
+ x0,
68
+ y0,
69
+ x1,
70
+ y1,
71
+ None,
72
+ None,
73
+ None,
74
+ BlockType.Text,
75
+ None,
76
+ None,
77
+ None,
78
+ None,
79
+ text['score'],
80
+ ]
81
+ )
25
82
 
26
83
  for title in title_blocks:
27
84
  x0, y0, x1, y1 = title['bbox']
28
- all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]])
85
+ all_bboxes.append(
86
+ [
87
+ x0,
88
+ y0,
89
+ x1,
90
+ y1,
91
+ None,
92
+ None,
93
+ None,
94
+ BlockType.Title,
95
+ None,
96
+ None,
97
+ None,
98
+ None,
99
+ title['score'],
100
+ ]
101
+ )
29
102
 
30
103
  for interline_equation in interline_equation_blocks:
31
104
  x0, y0, x1, y1 = interline_equation['bbox']
32
- all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]])
33
-
34
- '''block嵌套问题解决'''
35
- '''文本框与标题框重叠,优先信任文本框'''
105
+ all_bboxes.append(
106
+ [
107
+ x0,
108
+ y0,
109
+ x1,
110
+ y1,
111
+ None,
112
+ None,
113
+ None,
114
+ BlockType.InterlineEquation,
115
+ None,
116
+ None,
117
+ None,
118
+ None,
119
+ interline_equation['score'],
120
+ ]
121
+ )
122
+
123
+ """block嵌套问题解决"""
124
+ """文本框与标题框重叠,优先信任文本框"""
36
125
  all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
37
- '''任何框体与舍弃框重叠,优先信任舍弃框'''
126
+ """任何框体与舍弃框重叠,优先信任舍弃框"""
38
127
  all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
39
128
 
40
129
  # interline_equation 与title或text框冲突的情况,分两种情况处理
41
- '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
130
+ """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
42
131
  all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
43
- '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
132
+ """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
44
133
  # 通过后续大框套小框逻辑删除
45
134
 
46
- '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
135
+ """discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)"""
47
136
  for discarded in discarded_blocks:
48
137
  x0, y0, x1, y1 = discarded['bbox']
49
- all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
138
+ all_discarded_blocks.append(
139
+ [
140
+ x0,
141
+ y0,
142
+ x1,
143
+ y1,
144
+ None,
145
+ None,
146
+ None,
147
+ BlockType.Discarded,
148
+ None,
149
+ None,
150
+ None,
151
+ None,
152
+ discarded['score'],
153
+ ]
154
+ )
50
155
  # 将footnote加入到all_bboxes中,用来计算layout
51
156
  if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
52
- all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]])
53
-
54
- '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
157
+ all_bboxes.append(
158
+ [
159
+ x0,
160
+ y0,
161
+ x1,
162
+ y1,
163
+ None,
164
+ None,
165
+ None,
166
+ BlockType.Footnote,
167
+ None,
168
+ None,
169
+ None,
170
+ None,
171
+ discarded['score'],
172
+ ]
173
+ )
174
+
175
+ """经过以上处理后,还存在大框套小框的情况,则删除小框"""
55
176
  all_bboxes = remove_overlaps_min_blocks(all_bboxes)
56
177
  all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
57
- '''将剩余的bbox做分离处理,防止后面分layout时出错'''
178
+ """将剩余的bbox做分离处理,防止后面分layout时出错"""
58
179
  all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
59
180
 
60
181
  return all_bboxes, all_discarded_blocks, drop_reasons
@@ -64,18 +185,64 @@ def add_bboxes(blocks, block_type, bboxes):
64
185
  for block in blocks:
65
186
  x0, y0, x1, y1 = block['bbox']
66
187
  if block_type in [
67
- BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
68
- BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
188
+ BlockType.ImageBody,
189
+ BlockType.ImageCaption,
190
+ BlockType.ImageFootnote,
191
+ BlockType.TableBody,
192
+ BlockType.TableCaption,
193
+ BlockType.TableFootnote,
69
194
  ]:
70
- bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]])
195
+ bboxes.append(
196
+ [
197
+ x0,
198
+ y0,
199
+ x1,
200
+ y1,
201
+ None,
202
+ None,
203
+ None,
204
+ block_type,
205
+ None,
206
+ None,
207
+ None,
208
+ None,
209
+ block['score'],
210
+ block['group_id'],
211
+ ]
212
+ )
71
213
  else:
72
- bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])
214
+ bboxes.append(
215
+ [
216
+ x0,
217
+ y0,
218
+ x1,
219
+ y1,
220
+ None,
221
+ None,
222
+ None,
223
+ block_type,
224
+ None,
225
+ None,
226
+ None,
227
+ None,
228
+ block['score'],
229
+ ]
230
+ )
73
231
 
74
232
 
75
233
  def ocr_prepare_bboxes_for_layout_split_v2(
76
- img_body_blocks, img_caption_blocks, img_footnote_blocks,
77
- table_body_blocks, table_caption_blocks, table_footnote_blocks,
78
- discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
234
+ img_body_blocks,
235
+ img_caption_blocks,
236
+ img_footnote_blocks,
237
+ table_body_blocks,
238
+ table_caption_blocks,
239
+ table_footnote_blocks,
240
+ discarded_blocks,
241
+ text_blocks,
242
+ title_blocks,
243
+ interline_equation_blocks,
244
+ page_w,
245
+ page_h,
79
246
  ):
80
247
  all_bboxes = []
81
248
 
@@ -89,40 +256,40 @@ def ocr_prepare_bboxes_for_layout_split_v2(
89
256
  add_bboxes(title_blocks, BlockType.Title, all_bboxes)
90
257
  add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
91
258
 
92
- '''block嵌套问题解决'''
93
- '''文本框与标题框重叠,优先信任文本框'''
259
+ """block嵌套问题解决"""
260
+ """文本框与标题框重叠,优先信任文本框"""
94
261
  all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
95
- '''任何框体与舍弃框重叠,优先信任舍弃框'''
262
+ """任何框体与舍弃框重叠,优先信任舍弃框"""
96
263
  all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
97
264
 
98
265
  # interline_equation 与title或text框冲突的情况,分两种情况处理
99
- '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
266
+ """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
100
267
  all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
101
- '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
268
+ """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
102
269
  # 通过后续大框套小框逻辑删除
103
270
 
104
- '''discarded_blocks'''
271
+ """discarded_blocks"""
105
272
  all_discarded_blocks = []
106
273
  add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
107
274
 
108
- '''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的'''
275
+ """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
109
276
  footnote_blocks = []
110
277
  for discarded in discarded_blocks:
111
278
  x0, y0, x1, y1 = discarded['bbox']
112
279
  if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
113
280
  footnote_blocks.append([x0, y0, x1, y1])
114
281
 
115
- '''移除在footnote下面的任何框'''
282
+ """移除在footnote下面的任何框"""
116
283
  need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
117
284
  if len(need_remove_blocks) > 0:
118
285
  for block in need_remove_blocks:
119
286
  all_bboxes.remove(block)
120
287
  all_discarded_blocks.append(block)
121
288
 
122
- '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
289
+ """经过以上处理后,还存在大框套小框的情况,则删除小框"""
123
290
  all_bboxes = remove_overlaps_min_blocks(all_bboxes)
124
291
  all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
125
- '''将剩余的bbox做分离处理,防止后面分layout时出错'''
292
+ """将剩余的bbox做分离处理,防止后面分layout时出错"""
126
293
  all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
127
294
 
128
295
  return all_bboxes, all_discarded_blocks
@@ -135,7 +302,13 @@ def find_blocks_under_footnote(all_bboxes, footnote_blocks):
135
302
  for footnote_bbox in footnote_blocks:
136
303
  footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
137
304
  # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
138
- if block_y0 >= footnote_y1 and calculate_vertical_projection_overlap_ratio((block_x0, block_y0, block_x1, block_y1), footnote_bbox) >= 0.8:
305
+ if (
306
+ block_y0 >= footnote_y1
307
+ and calculate_vertical_projection_overlap_ratio(
308
+ (block_x0, block_y0, block_x1, block_y1), footnote_bbox
309
+ )
310
+ >= 0.8
311
+ ):
139
312
  if block not in need_remove_blocks:
140
313
  need_remove_blocks.append(block)
141
314
  break
@@ -203,7 +376,12 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
203
376
  for block in all_bboxes:
204
377
  for discarded_block in discarded_blocks:
205
378
  block_bbox = block[:4]
206
- if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
379
+ if (
380
+ calculate_overlap_area_in_bbox1_area_ratio(
381
+ block_bbox, discarded_block['bbox']
382
+ )
383
+ > 0.6
384
+ ):
207
385
  if block not in need_remove:
208
386
  need_remove.append(block)
209
387
  break
@@ -223,10 +401,18 @@ def remove_overlaps_min_blocks(all_bboxes):
223
401
  if block1 != block2:
224
402
  block1_bbox = block1[:4]
225
403
  block2_bbox = block2[:4]
226
- overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
404
+ overlap_box = get_minbox_if_overlap_by_ratio(
405
+ block1_bbox, block2_bbox, 0.8
406
+ )
227
407
  if overlap_box is not None:
228
- block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
229
- if block_to_remove is not None and block_to_remove not in need_remove:
408
+ block_to_remove = next(
409
+ (block for block in all_bboxes if block[:4] == overlap_box),
410
+ None,
411
+ )
412
+ if (
413
+ block_to_remove is not None
414
+ and block_to_remove not in need_remove
415
+ ):
230
416
  large_block = block1 if block1 != block_to_remove else block2
231
417
  x1, y1, x2, y2 = large_block[:4]
232
418
  sx1, sy1, sx2, sy2 = block_to_remove[:4]
@@ -1,8 +1,8 @@
1
+ from magic_pdf.config.drop_tag import DropTag
2
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
1
3
  from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
2
4
  _is_in_or_part_overlap_with_area_ratio,
3
5
  calculate_overlap_area_in_bbox1_area_ratio)
4
- from magic_pdf.libs.drop_tag import DropTag
5
- from magic_pdf.libs.ocr_content_type import BlockType, ContentType
6
6
 
7
7
 
8
8
  # 将每一个line中的span从左到右排序
@@ -24,7 +24,7 @@ def line_sort_spans_by_left_to_right(lines):
24
24
  return line_objects
25
25
 
26
26
 
27
- def merge_spans_to_line(spans):
27
+ def merge_spans_to_line(spans, threshold=0.6):
28
28
  if len(spans) == 0:
29
29
  return []
30
30
  else:
@@ -49,7 +49,7 @@ def merge_spans_to_line(spans):
49
49
  continue
50
50
 
51
51
  # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
52
- if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5):
52
+ if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
53
53
  current_line.append(span)
54
54
  else:
55
55
  # 否则,开始新行
@@ -157,7 +157,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
157
157
  BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
158
158
  BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
159
159
  ]:
160
- block_dict["group_id"] = block[-1]
160
+ block_dict['group_id'] = block[-1]
161
161
  block_spans = []
162
162
  for span in spans:
163
163
  span_bbox = span['bbox']