magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76):
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,5 @@
1
- from magic_pdf.config.drop_tag import DropTag
2
1
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
3
- from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
4
- _is_in_or_part_overlap_with_area_ratio,
5
- calculate_overlap_area_in_bbox1_area_ratio)
2
+ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
6
3
 
7
4
 
8
5
  # 将每一个line中的span从左到右排序
@@ -63,86 +60,6 @@ def merge_spans_to_line(spans, threshold=0.6):
63
60
  return lines
64
61
 
65
62
 
66
- def merge_spans_to_line_by_layout(spans, layout_bboxes):
67
- lines = []
68
- new_spans = []
69
- dropped_spans = []
70
- for item in layout_bboxes:
71
- layout_bbox = item['layout_bbox']
72
- # 遍历spans,将每个span放入对应的layout中
73
- layout_sapns = []
74
- for span in spans:
75
- if calculate_overlap_area_in_bbox1_area_ratio(
76
- span['bbox'], layout_bbox) > 0.6:
77
- layout_sapns.append(span)
78
- # 如果layout_sapns不为空,则放入new_spans中
79
- if len(layout_sapns) > 0:
80
- new_spans.append(layout_sapns)
81
- # 从spans删除已经放入layout_sapns中的span
82
- for layout_sapn in layout_sapns:
83
- spans.remove(layout_sapn)
84
-
85
- if len(new_spans) > 0:
86
- for layout_sapns in new_spans:
87
- layout_lines = merge_spans_to_line(layout_sapns)
88
- lines.extend(layout_lines)
89
-
90
- # 对line中的span进行排序
91
- lines = line_sort_spans_by_left_to_right(lines)
92
-
93
- for span in spans:
94
- span['tag'] = DropTag.NOT_IN_LAYOUT
95
- dropped_spans.append(span)
96
-
97
- return lines, dropped_spans
98
-
99
-
100
- def merge_lines_to_block(lines):
101
- # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
102
- blocks = []
103
- for line in lines:
104
- blocks.append({
105
- 'bbox': line['bbox'],
106
- 'lines': [line],
107
- })
108
- return blocks
109
-
110
-
111
- def sort_blocks_by_layout(all_bboxes, layout_bboxes):
112
- new_blocks = []
113
- sort_blocks = []
114
- for item in layout_bboxes:
115
- layout_bbox = item['layout_bbox']
116
-
117
- # 遍历blocks,将每个blocks放入对应的layout中
118
- layout_blocks = []
119
- for block in all_bboxes:
120
- # 如果是footnote则跳过
121
- if block[7] == BlockType.Footnote:
122
- continue
123
- block_bbox = block[:4]
124
- if calculate_overlap_area_in_bbox1_area_ratio(
125
- block_bbox, layout_bbox) > 0.8:
126
- layout_blocks.append(block)
127
-
128
- # 如果layout_blocks不为空,则放入new_blocks中
129
- if len(layout_blocks) > 0:
130
- new_blocks.append(layout_blocks)
131
- # 从all_bboxes删除已经放入layout_blocks中的block
132
- for layout_block in layout_blocks:
133
- all_bboxes.remove(layout_block)
134
-
135
- # 如果new_blocks不为空,则对new_blocks中每个block进行排序
136
- if len(new_blocks) > 0:
137
- for bboxes_in_layout_block in new_blocks:
138
- bboxes_in_layout_block.sort(
139
- key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
140
- sort_blocks.extend(bboxes_in_layout_block)
141
-
142
- # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
143
- return sort_blocks
144
-
145
-
146
63
  def fill_spans_in_blocks(blocks, spans, radio):
147
64
  """将allspans中的span按位置关系,放入blocks中."""
148
65
  block_with_spans = []
@@ -164,14 +81,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
164
81
  if calculate_overlap_area_in_bbox1_area_ratio(
165
82
  span_bbox, block_bbox) > radio:
166
83
  block_spans.append(span)
167
- '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
168
- # displayed_list = []
169
- # text_inline_lines = []
170
- # modify_y_axis(block_spans, displayed_list, text_inline_lines)
171
- '''模型识别错误的行间公式, type类型转换成行内公式'''
172
- # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
173
- '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
174
- # block_spans = remove_overlap_between_bbox_for_span(block_spans)
175
84
 
176
85
  block_dict['spans'] = block_spans
177
86
  block_with_spans.append(block_dict)
@@ -184,32 +93,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
184
93
  return block_with_spans, spans
185
94
 
186
95
 
187
- def fix_block_spans(block_with_spans, img_blocks, table_blocks):
188
- """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
189
- 需要将caption和footnote的text_span放入相应img_block和table_block内的
190
- caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
191
- fix_blocks = []
192
- for block in block_with_spans:
193
- block_type = block['type']
194
-
195
- if block_type == BlockType.Image:
196
- block = fix_image_block(block, img_blocks)
197
- elif block_type == BlockType.Table:
198
- block = fix_table_block(block, table_blocks)
199
- elif block_type in [BlockType.Text, BlockType.Title]:
200
- block = fix_text_block(block)
201
- elif block_type == BlockType.InterlineEquation:
202
- block = fix_interline_block(block)
203
- else:
204
- continue
205
- fix_blocks.append(block)
206
- return fix_blocks
207
-
208
-
209
96
  def fix_block_spans_v2(block_with_spans):
210
- """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
211
- 需要将caption和footnote的text_span放入相应img_block和table_block内的
212
- caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
213
97
  fix_blocks = []
214
98
  for block in block_with_spans:
215
99
  block_type = block['type']
@@ -235,113 +119,6 @@ def fix_discarded_block(discarded_block_with_spans):
235
119
  return fix_discarded_blocks
236
120
 
237
121
 
238
- def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
239
- block_spans = []
240
- # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
241
- for span in spans:
242
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
243
- block_bbox) > 0.6:
244
- block_spans.append(span)
245
- block_lines = merge_spans_to_line(block_spans)
246
- # 对line中的span进行排序
247
- sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
248
- block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
249
- return block, block_spans
250
-
251
-
252
- def make_body_block(span: dict, block_bbox: list, block_type: str):
253
- # 创建body_block
254
- body_line = {
255
- 'bbox': block_bbox,
256
- 'spans': [span],
257
- }
258
- body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
259
- return body_block
260
-
261
-
262
- def fix_image_block(block, img_blocks):
263
- block['blocks'] = []
264
- # 遍历img_blocks,找到与当前block匹配的img_block
265
- for img_block in img_blocks:
266
- if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
267
- img_block['bbox'], 0.95):
268
-
269
- # 创建img_body_block
270
- for span in block['spans']:
271
- if span['type'] == ContentType.Image and img_block[
272
- 'img_body_bbox'] == span['bbox']:
273
- # 创建img_body_block
274
- img_body_block = make_body_block(
275
- span, img_block['img_body_bbox'], BlockType.ImageBody)
276
- block['blocks'].append(img_body_block)
277
-
278
- # 从spans中移除img_body_block中已经放入的span
279
- block['spans'].remove(span)
280
- break
281
-
282
- # 根据list长度,判断img_block中是否有img_caption
283
- if img_block['img_caption_bbox'] is not None:
284
- img_caption_block, img_caption_spans = merge_spans_to_block(
285
- block['spans'], img_block['img_caption_bbox'],
286
- BlockType.ImageCaption)
287
- block['blocks'].append(img_caption_block)
288
-
289
- if img_block['img_footnote_bbox'] is not None:
290
- img_footnote_block, img_footnote_spans = merge_spans_to_block(
291
- block['spans'], img_block['img_footnote_bbox'],
292
- BlockType.ImageFootnote)
293
- block['blocks'].append(img_footnote_block)
294
- break
295
- del block['spans']
296
- return block
297
-
298
-
299
- def fix_table_block(block, table_blocks):
300
- block['blocks'] = []
301
- # 遍历table_blocks,找到与当前block匹配的table_block
302
- for table_block in table_blocks:
303
- if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
304
- table_block['bbox'], 0.95):
305
-
306
- # 创建table_body_block
307
- for span in block['spans']:
308
- if span['type'] == ContentType.Table and table_block[
309
- 'table_body_bbox'] == span['bbox']:
310
- # 创建table_body_block
311
- table_body_block = make_body_block(
312
- span, table_block['table_body_bbox'],
313
- BlockType.TableBody)
314
- block['blocks'].append(table_body_block)
315
-
316
- # 从spans中移除img_body_block中已经放入的span
317
- block['spans'].remove(span)
318
- break
319
-
320
- # 根据list长度,判断table_block中是否有caption
321
- if table_block['table_caption_bbox'] is not None:
322
- table_caption_block, table_caption_spans = merge_spans_to_block(
323
- block['spans'], table_block['table_caption_bbox'],
324
- BlockType.TableCaption)
325
- block['blocks'].append(table_caption_block)
326
-
327
- # 如果table_caption_block_spans不为空
328
- if len(table_caption_spans) > 0:
329
- # 一些span已经放入了caption_block中,需要从block['spans']中删除
330
- for span in table_caption_spans:
331
- block['spans'].remove(span)
332
-
333
- # 根据list长度,判断table_block中是否有table_note
334
- if table_block['table_footnote_bbox'] is not None:
335
- table_footnote_block, table_footnote_spans = merge_spans_to_block(
336
- block['spans'], table_block['table_footnote_bbox'],
337
- BlockType.TableFootnote)
338
- block['blocks'].append(table_footnote_block)
339
-
340
- break
341
- del block['spans']
342
- return block
343
-
344
-
345
122
  def fix_text_block(block):
346
123
  # 文本block中的公式span都应该转换成行内type
347
124
  for span in block['spans']:
@@ -1,10 +1,7 @@
1
1
 
2
2
  from magic_pdf.config.drop_tag import DropTag
3
- from magic_pdf.config.ocr_content_type import BlockType, ContentType
4
- from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
5
- calculate_iou,
6
- calculate_overlap_area_in_bbox1_area_ratio,
7
- get_minbox_if_overlap_by_ratio)
3
+ from magic_pdf.config.ocr_content_type import BlockType
4
+ from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
8
5
 
9
6
 
10
7
  def remove_overlaps_low_confidence_spans(spans):
@@ -59,253 +56,6 @@ def remove_overlaps_min_spans(spans):
59
56
  return spans, dropped_spans
60
57
 
61
58
 
62
- def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
63
- # 遍历spans, 判断是否在removed_span_block_bboxes中
64
- # 如果是, 则删除该span 否则, 保留该span
65
- need_remove_spans = []
66
- for span in spans:
67
- for removed_bbox in need_remove_spans_bboxes:
68
- if (
69
- calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
70
- > 0.5
71
- ):
72
- if span not in need_remove_spans:
73
- need_remove_spans.append(span)
74
- break
75
-
76
- if len(need_remove_spans) > 0:
77
- for span in need_remove_spans:
78
- spans.remove(span)
79
-
80
- return spans
81
-
82
-
83
- def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
84
- dropped_spans = []
85
- for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
86
- # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
87
- need_remove_spans = []
88
- for span in spans:
89
- # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
90
- for removed_bbox in removed_bboxes:
91
- if (
92
- calculate_overlap_area_in_bbox1_area_ratio(
93
- span['bbox'], removed_bbox
94
- )
95
- > 0.5
96
- ):
97
- need_remove_spans.append(span)
98
- break
99
- # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
100
- elif (
101
- drop_tag == DropTag.FOOTNOTE
102
- and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
103
- and removed_bbox[0]
104
- < (span['bbox'][0] + span['bbox'][2]) / 2
105
- < removed_bbox[2]
106
- ):
107
- need_remove_spans.append(span)
108
- break
109
-
110
- for span in need_remove_spans:
111
- spans.remove(span)
112
- span['tag'] = drop_tag
113
- dropped_spans.append(span)
114
-
115
- return spans, dropped_spans
116
-
117
-
118
- def adjust_bbox_for_standalone_block(spans):
119
- # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
120
- for sb_span in spans:
121
- if sb_span['type'] in [
122
- ContentType.InterlineEquation,
123
- ContentType.Image,
124
- ContentType.Table,
125
- ]:
126
- for text_span in spans:
127
- if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
128
- # 判断span2的纵向高度是否被span所覆盖
129
- if (
130
- sb_span['bbox'][1] < text_span['bbox'][1]
131
- and sb_span['bbox'][3] > text_span['bbox'][3]
132
- ):
133
- # 判断span2是否在span左边
134
- if text_span['bbox'][0] < sb_span['bbox'][0]:
135
- # 调整span的y0和span2的y0一致
136
- sb_span['bbox'][1] = text_span['bbox'][1]
137
- return spans
138
-
139
-
140
- def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
141
- # displayed_list = []
142
- # 如果spans为空,则不处理
143
- if len(spans) == 0:
144
- pass
145
- else:
146
- spans.sort(key=lambda span: span['bbox'][1])
147
-
148
- lines = []
149
- current_line = [spans[0]]
150
- if spans[0]['type'] in [
151
- ContentType.InterlineEquation,
152
- ContentType.Image,
153
- ContentType.Table,
154
- ]:
155
- displayed_list.append(spans[0])
156
-
157
- line_first_y0 = spans[0]['bbox'][1]
158
- line_first_y = spans[0]['bbox'][3]
159
- # 用于给行间公式搜索
160
- # text_inline_lines = []
161
- for span in spans[1:]:
162
- # if span.get("content","") == "78.":
163
- # print("debug")
164
- # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
165
- # image和table类型,同上
166
- if span['type'] in [
167
- ContentType.InterlineEquation,
168
- ContentType.Image,
169
- ContentType.Table,
170
- ] or any(
171
- s['type']
172
- in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
173
- for s in current_line
174
- ):
175
- # 传入
176
- if span['type'] in [
177
- ContentType.InterlineEquation,
178
- ContentType.Image,
179
- ContentType.Table,
180
- ]:
181
- displayed_list.append(span)
182
- # 则开始新行
183
- lines.append(current_line)
184
- if len(current_line) > 1 or current_line[0]['type'] in [
185
- ContentType.Text,
186
- ContentType.InlineEquation,
187
- ]:
188
- text_inline_lines.append(
189
- (current_line, (line_first_y0, line_first_y))
190
- )
191
- current_line = [span]
192
- line_first_y0 = span['bbox'][1]
193
- line_first_y = span['bbox'][3]
194
- continue
195
-
196
- # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
197
- if __is_overlaps_y_exceeds_threshold(
198
- span['bbox'], current_line[-1]['bbox']
199
- ):
200
- if span['type'] == 'text':
201
- line_first_y0 = span['bbox'][1]
202
- line_first_y = span['bbox'][3]
203
- current_line.append(span)
204
-
205
- else:
206
- # 否则,开始新行
207
- lines.append(current_line)
208
- text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
209
- current_line = [span]
210
- line_first_y0 = span['bbox'][1]
211
- line_first_y = span['bbox'][3]
212
-
213
- # 添加最后一行
214
- if current_line:
215
- lines.append(current_line)
216
- if len(current_line) > 1 or current_line[0]['type'] in [
217
- ContentType.Text,
218
- ContentType.InlineEquation,
219
- ]:
220
- text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
221
- for line in text_inline_lines:
222
- # 按照x0坐标排序
223
- current_line = line[0]
224
- current_line.sort(key=lambda span: span['bbox'][0])
225
-
226
- # 调整每一个文字行内bbox统一
227
- for line in text_inline_lines:
228
- current_line, (line_first_y0, line_first_y) = line
229
- for span in current_line:
230
- span['bbox'][1] = line_first_y0
231
- span['bbox'][3] = line_first_y
232
-
233
- # return spans, displayed_list, text_inline_lines
234
-
235
-
236
- def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
237
- # 错误行间公式转行内公式
238
- j = 0
239
- for i in range(len(displayed_list)):
240
- # if i == 8:
241
- # print("debug")
242
- span = displayed_list[i]
243
- span_y0, span_y = span['bbox'][1], span['bbox'][3]
244
-
245
- while j < len(text_inline_lines):
246
- text_line = text_inline_lines[j]
247
- y0, y1 = text_line[1]
248
- if (
249
- span_y0 < y0 < span_y
250
- or span_y0 < y1 < span_y
251
- or span_y0 < y0
252
- and span_y > y1
253
- ) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
254
- # 调整公式类型
255
- if span['type'] == ContentType.InterlineEquation:
256
- # 最后一行是行间公式
257
- if j + 1 >= len(text_inline_lines):
258
- span['type'] = ContentType.InlineEquation
259
- span['bbox'][1] = y0
260
- span['bbox'][3] = y1
261
- else:
262
- # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
263
- y0_next, y1_next = text_inline_lines[j + 1][1]
264
- if (
265
- not __is_overlaps_y_exceeds_threshold(
266
- span['bbox'], (0, y0_next, 0, y1_next)
267
- )
268
- and 3 * (y1 - y0) > span_y - span_y0
269
- ):
270
- span['type'] = ContentType.InlineEquation
271
- span['bbox'][1] = y0
272
- span['bbox'][3] = y1
273
- break
274
- elif (
275
- span_y < y0
276
- or span_y0 < y0 < span_y
277
- and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
278
- ):
279
- break
280
- else:
281
- j += 1
282
-
283
- return spans
284
-
285
-
286
- def get_qa_need_list(blocks):
287
- # 创建 images, tables, interline_equations, inline_equations 的副本
288
- images = []
289
- tables = []
290
- interline_equations = []
291
- inline_equations = []
292
-
293
- for block in blocks:
294
- for line in block['lines']:
295
- for span in line['spans']:
296
- if span['type'] == ContentType.Image:
297
- images.append(span)
298
- elif span['type'] == ContentType.Table:
299
- tables.append(span)
300
- elif span['type'] == ContentType.InlineEquation:
301
- inline_equations.append(span)
302
- elif span['type'] == ContentType.InterlineEquation:
303
- interline_equations.append(span)
304
- else:
305
- continue
306
- return images, tables, interline_equations, inline_equations
307
-
308
-
309
59
  def get_qa_need_list_v2(blocks):
310
60
  # 创建 images, tables, interline_equations, inline_equations 的副本
311
61
  images = []
@@ -1,5 +1,5 @@
1
1
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
2
- from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
2
+ from magic_pdf.libs.commons import parse_bucket_key, join_path
3
3
  import boto3
4
4
  from loguru import logger
5
5
  from botocore.config import Config
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.0
3
+ Version: 0.10.2
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -320,88 +320,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
320
320
 
321
321
  ### Command Line
322
322
 
323
- ```bash
324
- magic-pdf --help
325
- Usage: magic-pdf [OPTIONS]
326
-
327
- Options:
328
- -v, --version display the version and exit
329
- -p, --path PATH local pdf filepath or directory [required]
330
- -o, --output-dir PATH output local directory [required]
331
- -m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr
332
- technique to extract information from pdf. txt:
333
- suitable for the text-based pdf only and
334
- outperform ocr. auto: automatically choose the
335
- best method for parsing pdf from ocr and txt.
336
- without method specified, auto will be used by
337
- default.
338
- -l, --lang TEXT Input the languages in the pdf (if known) to
339
- improve OCR accuracy. Optional. You should
340
- input "Abbreviation" with language form url: ht
341
- tps://paddlepaddle.github.io/PaddleOCR/latest/en
342
- /ppocr/blog/multi_languages.html#5-support-languages-
343
- and-abbreviations
344
- -d, --debug BOOLEAN Enables detailed debugging information during
345
- the execution of the CLI commands.
346
- -s, --start INTEGER The starting page for PDF parsing, beginning
347
- from 0.
348
- -e, --end INTEGER The ending page for PDF parsing, beginning from
349
- 0.
350
- --help Show this message and exit.
351
-
352
-
353
- ## show version
354
- magic-pdf -v
355
-
356
- ## command line example
357
- magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
358
- ```
323
+ [Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/command_line.html)
359
324
 
360
- `{some_pdf}` can be a single PDF file or a directory containing multiple PDFs.
361
- The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
362
-
363
- ```text
364
- ├── some_pdf.md # markdown file
365
- ├── images # directory for storing images
366
- ├── some_pdf_layout.pdf # layout diagram (Include layout reading order)
367
- ├── some_pdf_middle.json # MinerU intermediate processing result
368
- ├── some_pdf_model.json # model inference result
369
- ├── some_pdf_origin.pdf # original PDF file
370
- ├── some_pdf_spans.pdf # smallest granularity bbox position information diagram
371
- └── some_pdf_content_list.json # Rich text JSON arranged in reading order
372
- ```
373
325
  > [!TIP]
374
326
  > For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
375
327
 
376
328
  ### API
377
329
 
378
- Processing files from local disk
379
-
380
- ```python
381
- image_writer = DiskReaderWriter(local_image_dir)
382
- image_dir = str(os.path.basename(local_image_dir))
383
- jso_useful_key = {"_pdf_type": "", "model_list": []}
384
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
385
- pipe.pipe_classify()
386
- pipe.pipe_analyze()
387
- pipe.pipe_parse()
388
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
389
- ```
390
-
391
- Processing files from object storage
392
-
393
- ```python
394
- s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
395
- image_dir = "s3://img_bucket/"
396
- s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
397
- pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
398
- jso_useful_key = {"_pdf_type": "", "model_list": []}
399
- pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
400
- pipe.pipe_classify()
401
- pipe.pipe_analyze()
402
- pipe.pipe_parse()
403
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
404
- ```
330
+ [Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html)
405
331
 
406
332
  For detailed implementation, refer to:
407
333