magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +16 -22
  2. magic_pdf/filter/pdf_meta_scan.py +5 -19
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_check.py +52 -25
  7. magic_pdf/libs/pdf_image_tools.py +2 -1
  8. magic_pdf/libs/version.py +1 -1
  9. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  10. magic_pdf/model/magic_model.py +0 -30
  11. magic_pdf/model/pp_structure_v2.py +23 -3
  12. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
  13. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
  14. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
  15. magic_pdf/para/para_split_v3.py +21 -7
  16. magic_pdf/pdf_parse_union_core_v2.py +134 -146
  17. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  18. magic_pdf/pre_proc/cut_image.py +0 -37
  19. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  20. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  21. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  22. magic_pdf/rw/S3ReaderWriter.py +1 -1
  23. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
  24. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
  25. magic_pdf/dict2md/mkcontent.py +0 -438
  26. magic_pdf/layout/__init__.py +0 -0
  27. magic_pdf/layout/bbox_sort.py +0 -681
  28. magic_pdf/layout/layout_det_utils.py +0 -182
  29. magic_pdf/layout/layout_sort.py +0 -921
  30. magic_pdf/layout/layout_spiler_recog.py +0 -101
  31. magic_pdf/layout/mcol_sort.py +0 -336
  32. magic_pdf/libs/calc_span_stats.py +0 -239
  33. magic_pdf/libs/detect_language_from_model.py +0 -21
  34. magic_pdf/libs/nlp_utils.py +0 -203
  35. magic_pdf/libs/textbase.py +0 -33
  36. magic_pdf/libs/vis_utils.py +0 -308
  37. magic_pdf/para/block_continuation_processor.py +0 -562
  38. magic_pdf/para/block_termination_processor.py +0 -480
  39. magic_pdf/para/commons.py +0 -222
  40. magic_pdf/para/denoise.py +0 -246
  41. magic_pdf/para/draw.py +0 -121
  42. magic_pdf/para/exceptions.py +0 -198
  43. magic_pdf/para/layout_match_processor.py +0 -40
  44. magic_pdf/para/para_split.py +0 -807
  45. magic_pdf/para/para_split_v2.py +0 -959
  46. magic_pdf/para/raw_processor.py +0 -207
  47. magic_pdf/para/stats.py +0 -268
  48. magic_pdf/para/title_processor.py +0 -1014
  49. magic_pdf/pdf_parse_union_core.py +0 -345
  50. magic_pdf/post_proc/__init__.py +0 -0
  51. magic_pdf/post_proc/detect_para.py +0 -3472
  52. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  53. magic_pdf/post_proc/remove_footnote.py +0 -153
  54. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  55. magic_pdf/pre_proc/detect_equation.py +0 -134
  56. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  57. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  58. magic_pdf/pre_proc/detect_footnote.py +0 -170
  59. magic_pdf/pre_proc/detect_header.py +0 -64
  60. magic_pdf/pre_proc/detect_images.py +0 -647
  61. magic_pdf/pre_proc/detect_page_number.py +0 -64
  62. magic_pdf/pre_proc/detect_tables.py +0 -62
  63. magic_pdf/pre_proc/equations_replace.py +0 -550
  64. magic_pdf/pre_proc/fix_image.py +0 -244
  65. magic_pdf/pre_proc/fix_table.py +0 -270
  66. magic_pdf/pre_proc/main_text_font.py +0 -23
  67. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  68. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  69. magic_pdf/pre_proc/post_layout_split.py +0 -0
  70. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  71. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  72. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  73. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  74. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  75. magic_pdf/pre_proc/statistics.py +0 -12
  76. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
  77. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
  78. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
  79. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -1,184 +1,11 @@
1
-
2
1
  from magic_pdf.config.ocr_content_type import BlockType
3
2
  from magic_pdf.libs.boxbase import (
4
- calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
3
+ calculate_iou,
4
+ calculate_overlap_area_in_bbox1_area_ratio,
5
5
  calculate_vertical_projection_overlap_ratio,
6
- get_minbox_if_overlap_by_ratio)
7
- from magic_pdf.pre_proc.remove_bbox_overlap import \
8
- remove_overlap_between_bbox_for_block
9
-
10
-
11
- def ocr_prepare_bboxes_for_layout_split(
12
- img_blocks,
13
- table_blocks,
14
- discarded_blocks,
15
- text_blocks,
16
- title_blocks,
17
- interline_equation_blocks,
18
- page_w,
19
- page_h,
20
- ):
21
- all_bboxes = []
22
- all_discarded_blocks = []
23
- for image in img_blocks:
24
- x0, y0, x1, y1 = image['bbox']
25
- all_bboxes.append(
26
- [
27
- x0,
28
- y0,
29
- x1,
30
- y1,
31
- None,
32
- None,
33
- None,
34
- BlockType.Image,
35
- None,
36
- None,
37
- None,
38
- None,
39
- image['score'],
40
- ]
41
- )
42
-
43
- for table in table_blocks:
44
- x0, y0, x1, y1 = table['bbox']
45
- all_bboxes.append(
46
- [
47
- x0,
48
- y0,
49
- x1,
50
- y1,
51
- None,
52
- None,
53
- None,
54
- BlockType.Table,
55
- None,
56
- None,
57
- None,
58
- None,
59
- table['score'],
60
- ]
61
- )
62
-
63
- for text in text_blocks:
64
- x0, y0, x1, y1 = text['bbox']
65
- all_bboxes.append(
66
- [
67
- x0,
68
- y0,
69
- x1,
70
- y1,
71
- None,
72
- None,
73
- None,
74
- BlockType.Text,
75
- None,
76
- None,
77
- None,
78
- None,
79
- text['score'],
80
- ]
81
- )
82
-
83
- for title in title_blocks:
84
- x0, y0, x1, y1 = title['bbox']
85
- all_bboxes.append(
86
- [
87
- x0,
88
- y0,
89
- x1,
90
- y1,
91
- None,
92
- None,
93
- None,
94
- BlockType.Title,
95
- None,
96
- None,
97
- None,
98
- None,
99
- title['score'],
100
- ]
101
- )
102
-
103
- for interline_equation in interline_equation_blocks:
104
- x0, y0, x1, y1 = interline_equation['bbox']
105
- all_bboxes.append(
106
- [
107
- x0,
108
- y0,
109
- x1,
110
- y1,
111
- None,
112
- None,
113
- None,
114
- BlockType.InterlineEquation,
115
- None,
116
- None,
117
- None,
118
- None,
119
- interline_equation['score'],
120
- ]
121
- )
122
-
123
- """block嵌套问题解决"""
124
- """文本框与标题框重叠,优先信任文本框"""
125
- all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
126
- """任何框体与舍弃框重叠,优先信任舍弃框"""
127
- all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
128
-
129
- # interline_equation 与title或text框冲突的情况,分两种情况处理
130
- """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
131
- all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
132
- """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
133
- # 通过后续大框套小框逻辑删除
134
-
135
- """discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)"""
136
- for discarded in discarded_blocks:
137
- x0, y0, x1, y1 = discarded['bbox']
138
- all_discarded_blocks.append(
139
- [
140
- x0,
141
- y0,
142
- x1,
143
- y1,
144
- None,
145
- None,
146
- None,
147
- BlockType.Discarded,
148
- None,
149
- None,
150
- None,
151
- None,
152
- discarded['score'],
153
- ]
154
- )
155
- # 将footnote加入到all_bboxes中,用来计算layout
156
- if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
157
- all_bboxes.append(
158
- [
159
- x0,
160
- y0,
161
- x1,
162
- y1,
163
- None,
164
- None,
165
- None,
166
- BlockType.Footnote,
167
- None,
168
- None,
169
- None,
170
- None,
171
- discarded['score'],
172
- ]
173
- )
174
-
175
- """经过以上处理后,还存在大框套小框的情况,则删除小框"""
176
- all_bboxes = remove_overlaps_min_blocks(all_bboxes)
177
- all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
178
- """将剩余的bbox做分离处理,防止后面分layout时出错"""
179
- all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
180
-
181
- return all_bboxes, all_discarded_blocks, drop_reasons
6
+ get_minbox_if_overlap_by_ratio
7
+ )
8
+ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
182
9
 
183
10
 
184
11
  def add_bboxes(blocks, block_type, bboxes):
@@ -1,8 +1,5 @@
1
- from magic_pdf.config.drop_tag import DropTag
2
1
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
3
- from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
4
- _is_in_or_part_overlap_with_area_ratio,
5
- calculate_overlap_area_in_bbox1_area_ratio)
2
+ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
6
3
 
7
4
 
8
5
  # 将每一个line中的span从左到右排序
@@ -63,86 +60,6 @@ def merge_spans_to_line(spans, threshold=0.6):
63
60
  return lines
64
61
 
65
62
 
66
- def merge_spans_to_line_by_layout(spans, layout_bboxes):
67
- lines = []
68
- new_spans = []
69
- dropped_spans = []
70
- for item in layout_bboxes:
71
- layout_bbox = item['layout_bbox']
72
- # 遍历spans,将每个span放入对应的layout中
73
- layout_sapns = []
74
- for span in spans:
75
- if calculate_overlap_area_in_bbox1_area_ratio(
76
- span['bbox'], layout_bbox) > 0.6:
77
- layout_sapns.append(span)
78
- # 如果layout_sapns不为空,则放入new_spans中
79
- if len(layout_sapns) > 0:
80
- new_spans.append(layout_sapns)
81
- # 从spans删除已经放入layout_sapns中的span
82
- for layout_sapn in layout_sapns:
83
- spans.remove(layout_sapn)
84
-
85
- if len(new_spans) > 0:
86
- for layout_sapns in new_spans:
87
- layout_lines = merge_spans_to_line(layout_sapns)
88
- lines.extend(layout_lines)
89
-
90
- # 对line中的span进行排序
91
- lines = line_sort_spans_by_left_to_right(lines)
92
-
93
- for span in spans:
94
- span['tag'] = DropTag.NOT_IN_LAYOUT
95
- dropped_spans.append(span)
96
-
97
- return lines, dropped_spans
98
-
99
-
100
- def merge_lines_to_block(lines):
101
- # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
102
- blocks = []
103
- for line in lines:
104
- blocks.append({
105
- 'bbox': line['bbox'],
106
- 'lines': [line],
107
- })
108
- return blocks
109
-
110
-
111
- def sort_blocks_by_layout(all_bboxes, layout_bboxes):
112
- new_blocks = []
113
- sort_blocks = []
114
- for item in layout_bboxes:
115
- layout_bbox = item['layout_bbox']
116
-
117
- # 遍历blocks,将每个blocks放入对应的layout中
118
- layout_blocks = []
119
- for block in all_bboxes:
120
- # 如果是footnote则跳过
121
- if block[7] == BlockType.Footnote:
122
- continue
123
- block_bbox = block[:4]
124
- if calculate_overlap_area_in_bbox1_area_ratio(
125
- block_bbox, layout_bbox) > 0.8:
126
- layout_blocks.append(block)
127
-
128
- # 如果layout_blocks不为空,则放入new_blocks中
129
- if len(layout_blocks) > 0:
130
- new_blocks.append(layout_blocks)
131
- # 从all_bboxes删除已经放入layout_blocks中的block
132
- for layout_block in layout_blocks:
133
- all_bboxes.remove(layout_block)
134
-
135
- # 如果new_blocks不为空,则对new_blocks中每个block进行排序
136
- if len(new_blocks) > 0:
137
- for bboxes_in_layout_block in new_blocks:
138
- bboxes_in_layout_block.sort(
139
- key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
140
- sort_blocks.extend(bboxes_in_layout_block)
141
-
142
- # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
143
- return sort_blocks
144
-
145
-
146
63
  def fill_spans_in_blocks(blocks, spans, radio):
147
64
  """将allspans中的span按位置关系,放入blocks中."""
148
65
  block_with_spans = []
@@ -164,14 +81,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
164
81
  if calculate_overlap_area_in_bbox1_area_ratio(
165
82
  span_bbox, block_bbox) > radio:
166
83
  block_spans.append(span)
167
- '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
168
- # displayed_list = []
169
- # text_inline_lines = []
170
- # modify_y_axis(block_spans, displayed_list, text_inline_lines)
171
- '''模型识别错误的行间公式, type类型转换成行内公式'''
172
- # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
173
- '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
174
- # block_spans = remove_overlap_between_bbox_for_span(block_spans)
175
84
 
176
85
  block_dict['spans'] = block_spans
177
86
  block_with_spans.append(block_dict)
@@ -184,32 +93,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
184
93
  return block_with_spans, spans
185
94
 
186
95
 
187
- def fix_block_spans(block_with_spans, img_blocks, table_blocks):
188
- """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
189
- 需要将caption和footnote的text_span放入相应img_block和table_block内的
190
- caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
191
- fix_blocks = []
192
- for block in block_with_spans:
193
- block_type = block['type']
194
-
195
- if block_type == BlockType.Image:
196
- block = fix_image_block(block, img_blocks)
197
- elif block_type == BlockType.Table:
198
- block = fix_table_block(block, table_blocks)
199
- elif block_type in [BlockType.Text, BlockType.Title]:
200
- block = fix_text_block(block)
201
- elif block_type == BlockType.InterlineEquation:
202
- block = fix_interline_block(block)
203
- else:
204
- continue
205
- fix_blocks.append(block)
206
- return fix_blocks
207
-
208
-
209
96
  def fix_block_spans_v2(block_with_spans):
210
- """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
211
- 需要将caption和footnote的text_span放入相应img_block和table_block内的
212
- caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
213
97
  fix_blocks = []
214
98
  for block in block_with_spans:
215
99
  block_type = block['type']
@@ -235,113 +119,6 @@ def fix_discarded_block(discarded_block_with_spans):
235
119
  return fix_discarded_blocks
236
120
 
237
121
 
238
- def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
239
- block_spans = []
240
- # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
241
- for span in spans:
242
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
243
- block_bbox) > 0.6:
244
- block_spans.append(span)
245
- block_lines = merge_spans_to_line(block_spans)
246
- # 对line中的span进行排序
247
- sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
248
- block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
249
- return block, block_spans
250
-
251
-
252
- def make_body_block(span: dict, block_bbox: list, block_type: str):
253
- # 创建body_block
254
- body_line = {
255
- 'bbox': block_bbox,
256
- 'spans': [span],
257
- }
258
- body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
259
- return body_block
260
-
261
-
262
- def fix_image_block(block, img_blocks):
263
- block['blocks'] = []
264
- # 遍历img_blocks,找到与当前block匹配的img_block
265
- for img_block in img_blocks:
266
- if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
267
- img_block['bbox'], 0.95):
268
-
269
- # 创建img_body_block
270
- for span in block['spans']:
271
- if span['type'] == ContentType.Image and img_block[
272
- 'img_body_bbox'] == span['bbox']:
273
- # 创建img_body_block
274
- img_body_block = make_body_block(
275
- span, img_block['img_body_bbox'], BlockType.ImageBody)
276
- block['blocks'].append(img_body_block)
277
-
278
- # 从spans中移除img_body_block中已经放入的span
279
- block['spans'].remove(span)
280
- break
281
-
282
- # 根据list长度,判断img_block中是否有img_caption
283
- if img_block['img_caption_bbox'] is not None:
284
- img_caption_block, img_caption_spans = merge_spans_to_block(
285
- block['spans'], img_block['img_caption_bbox'],
286
- BlockType.ImageCaption)
287
- block['blocks'].append(img_caption_block)
288
-
289
- if img_block['img_footnote_bbox'] is not None:
290
- img_footnote_block, img_footnote_spans = merge_spans_to_block(
291
- block['spans'], img_block['img_footnote_bbox'],
292
- BlockType.ImageFootnote)
293
- block['blocks'].append(img_footnote_block)
294
- break
295
- del block['spans']
296
- return block
297
-
298
-
299
- def fix_table_block(block, table_blocks):
300
- block['blocks'] = []
301
- # 遍历table_blocks,找到与当前block匹配的table_block
302
- for table_block in table_blocks:
303
- if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
304
- table_block['bbox'], 0.95):
305
-
306
- # 创建table_body_block
307
- for span in block['spans']:
308
- if span['type'] == ContentType.Table and table_block[
309
- 'table_body_bbox'] == span['bbox']:
310
- # 创建table_body_block
311
- table_body_block = make_body_block(
312
- span, table_block['table_body_bbox'],
313
- BlockType.TableBody)
314
- block['blocks'].append(table_body_block)
315
-
316
- # 从spans中移除img_body_block中已经放入的span
317
- block['spans'].remove(span)
318
- break
319
-
320
- # 根据list长度,判断table_block中是否有caption
321
- if table_block['table_caption_bbox'] is not None:
322
- table_caption_block, table_caption_spans = merge_spans_to_block(
323
- block['spans'], table_block['table_caption_bbox'],
324
- BlockType.TableCaption)
325
- block['blocks'].append(table_caption_block)
326
-
327
- # 如果table_caption_block_spans不为空
328
- if len(table_caption_spans) > 0:
329
- # 一些span已经放入了caption_block中,需要从block['spans']中删除
330
- for span in table_caption_spans:
331
- block['spans'].remove(span)
332
-
333
- # 根据list长度,判断table_block中是否有table_note
334
- if table_block['table_footnote_bbox'] is not None:
335
- table_footnote_block, table_footnote_spans = merge_spans_to_block(
336
- block['spans'], table_block['table_footnote_bbox'],
337
- BlockType.TableFootnote)
338
- block['blocks'].append(table_footnote_block)
339
-
340
- break
341
- del block['spans']
342
- return block
343
-
344
-
345
122
  def fix_text_block(block):
346
123
  # 文本block中的公式span都应该转换成行内type
347
124
  for span in block['spans']: