magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -5,19 +5,18 @@ import time
5
5
  from typing import List
6
6
 
7
7
  import torch
8
+ import fitz
8
9
  from loguru import logger
9
10
 
10
- from magic_pdf.config.drop_reason import DropReason
11
11
  from magic_pdf.config.enums import SupportedPdfParseMethod
12
12
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
13
13
  from magic_pdf.data.dataset import Dataset, PageableData
14
14
  from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
15
15
  from magic_pdf.libs.clean_memory import clean_memory
16
- from magic_pdf.libs.commons import fitz, get_delta_time
17
16
  from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
18
17
  from magic_pdf.libs.convert_utils import dict_to_list
19
18
  from magic_pdf.libs.hash_utils import compute_md5
20
- from magic_pdf.libs.local_math import float_equal
19
+
21
20
  from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
22
21
  from magic_pdf.model.magic_model import MagicModel
23
22
 
@@ -34,13 +33,11 @@ except ImportError:
34
33
  from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
35
34
 
36
35
  from magic_pdf.para.para_split_v3 import para_split
37
- from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
36
+
38
37
  from magic_pdf.pre_proc.construct_page_dict import \
39
38
  ocr_construct_page_component_v2
40
39
  from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
41
- from magic_pdf.pre_proc.equations_replace import (
42
- combine_chars_to_pymudict, remove_chars_in_text_blocks,
43
- replace_equations_in_textblock)
40
+
44
41
  from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
45
42
  ocr_prepare_bboxes_for_layout_split_v2
46
43
  from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
@@ -49,26 +46,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
49
46
  from magic_pdf.pre_proc.ocr_span_list_modify import (
50
47
  get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
51
48
  remove_overlaps_min_spans)
52
- from magic_pdf.pre_proc.resolve_bbox_conflict import \
53
- check_useful_block_horizontal_overlap
54
-
55
-
56
- def remove_horizontal_overlap_block_which_smaller(all_bboxes):
57
- useful_blocks = []
58
- for bbox in all_bboxes:
59
- useful_blocks.append({'bbox': bbox[:4]})
60
- is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
61
- check_useful_block_horizontal_overlap(useful_blocks)
62
- )
63
- if is_useful_block_horz_overlap:
64
- logger.warning(
65
- f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
66
- ) # noqa: E501
67
- for bbox in all_bboxes.copy():
68
- if smaller_bbox == bbox[:4]:
69
- all_bboxes.remove(bbox)
70
-
71
- return is_useful_block_horz_overlap, all_bboxes
72
49
 
73
50
 
74
51
  def __replace_STX_ETX(text_str: str):
@@ -89,29 +66,26 @@ def __replace_STX_ETX(text_str: str):
89
66
 
90
67
 
91
68
  def chars_to_content(span):
92
- # # 先给chars按char['bbox']的x坐标排序
93
- # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
94
-
69
+ # 检查span中的char是否为空
70
+ if len(span['chars']) == 0:
71
+ span['content'] = ''
72
+ else:
95
73
  # 先给chars按char['bbox']的中心点的x坐标排序
96
74
  span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
97
- content = ''
98
75
 
99
76
  # 求char的平均宽度
100
- if len(span['chars']) == 0:
101
- span['content'] = content
102
- del span['chars']
103
- return
104
- else:
105
- char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
106
- char_avg_width = char_width_sum / len(span['chars'])
77
+ char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
78
+ char_avg_width = char_width_sum / len(span['chars'])
107
79
 
80
+ content = ''
108
81
  for char in span['chars']:
109
82
  # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
110
83
  if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
111
84
  content += ' '
112
85
  content += char['c']
113
86
  span['content'] = __replace_STX_ETX(content)
114
- del span['chars']
87
+
88
+ del span['chars']
115
89
 
116
90
 
117
91
  LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
@@ -128,8 +102,13 @@ def fill_char_in_spans(spans, all_chars):
128
102
  span['chars'].append(char)
129
103
  break
130
104
 
105
+ empty_spans = []
106
+
131
107
  for span in spans:
132
108
  chars_to_content(span)
109
+ if len(span['content']) == 0:
110
+ empty_spans.append(span)
111
+ return empty_spans
133
112
 
134
113
 
135
114
  # 使用鲁棒性更强的中心点坐标判断
@@ -162,48 +141,79 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
162
141
 
163
142
  def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
164
143
 
144
+ text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
145
+
146
+ # @todo: 拿到char之后把倾斜角度较大的先删一遍
147
+ all_pymu_chars = []
148
+ for block in text_blocks_raw:
149
+ for line in block['lines']:
150
+ for span in line['spans']:
151
+ all_pymu_chars.extend(span['chars'])
152
+
153
+ # 计算所有sapn的高度的中位数
154
+ span_height_list = []
155
+ for span in spans:
156
+ if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
157
+ continue
158
+ span_height = span['bbox'][3] - span['bbox'][1]
159
+ span['height'] = span_height
160
+ span_height_list.append(span_height)
161
+ if len(span_height_list) == 0:
162
+ return spans
163
+ else:
164
+ median_span_height = statistics.median(span_height_list)
165
+
165
166
  useful_spans = []
166
167
  unuseful_spans = []
168
+ # 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值
169
+ vertical_spans = []
167
170
  for span in spans:
168
- for block in all_bboxes:
171
+ if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
172
+ continue
173
+ for block in all_bboxes + all_discarded_blocks:
169
174
  if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
170
175
  continue
171
- else:
172
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
173
- useful_spans.append(span)
174
- break
175
- for block in all_discarded_blocks:
176
176
  if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
177
- unuseful_spans.append(span)
177
+ if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
178
+ vertical_spans.append(span)
179
+ elif block in all_bboxes:
180
+ useful_spans.append(span)
181
+ else:
182
+ unuseful_spans.append(span)
183
+
184
+ del span['height']
185
+
178
186
  break
179
187
 
180
- text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
188
+ """垂直的span框直接用pymu的line进行填充"""
189
+ if len(vertical_spans) > 0:
190
+ text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
191
+ all_pymu_lines = []
192
+ for block in text_blocks:
193
+ for line in block['lines']:
194
+ all_pymu_lines.append(line)
181
195
 
182
- # @todo: 拿到char之后把倾斜角度较大的先删一遍
183
- all_pymu_chars = []
184
- for block in text_blocks:
185
- for line in block['lines']:
186
- for span in line['spans']:
187
- all_pymu_chars.extend(span['chars'])
196
+ for pymu_line in all_pymu_lines:
197
+ for span in vertical_spans:
198
+ if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
199
+ for pymu_span in pymu_line['spans']:
200
+ span['content'] += pymu_span['text']
201
+ break
188
202
 
189
- new_spans = []
203
+ for span in vertical_spans:
204
+ if len(span['content']) == 0:
205
+ spans.remove(span)
190
206
 
191
- for span in useful_spans:
192
- if span['type'] in [ContentType.Text]:
193
- span['chars'] = []
194
- new_spans.append(span)
207
+ """水平的span框如果没有char则用ocr进行填充"""
208
+ new_spans = []
195
209
 
196
- for span in unuseful_spans:
210
+ for span in useful_spans + unuseful_spans:
197
211
  if span['type'] in [ContentType.Text]:
198
212
  span['chars'] = []
199
213
  new_spans.append(span)
200
214
 
201
- fill_char_in_spans(new_spans, all_pymu_chars)
215
+ empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
202
216
 
203
- empty_spans = []
204
- for span in new_spans:
205
- if len(span['content']) == 0:
206
- empty_spans.append(span)
207
217
  if len(empty_spans) > 0:
208
218
 
209
219
  # 初始化ocr模型
@@ -216,55 +226,21 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
216
226
  )
217
227
 
218
228
  for span in empty_spans:
219
- spans.remove(span)
220
- # 对span的bbox截图
229
+ # 对span的bbox截图再ocr
221
230
  span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
222
231
  ocr_res = ocr_model.ocr(span_img, det=False)
223
- # logger.info(f"ocr_res: {ocr_res}")
224
- # logger.info(f"empty_span: {span}")
225
232
  if ocr_res and len(ocr_res) > 0:
226
233
  if len(ocr_res[0]) > 0:
227
234
  ocr_text, ocr_score = ocr_res[0][0]
228
235
  if ocr_score > 0.5 and len(ocr_text) > 0:
229
- span['content'] = ocr_text
230
- spans.append(span)
236
+ span['content'] = ocr_text
237
+ span['score'] = ocr_score
238
+ else:
239
+ spans.remove(span)
231
240
 
232
241
  return spans
233
242
 
234
243
 
235
- def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
236
- text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
237
- char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
238
- 'blocks'
239
- ]
240
- text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
241
- text_blocks = replace_equations_in_textblock(
242
- text_blocks, inline_equations, interline_equations
243
- )
244
- text_blocks = remove_citation_marker(text_blocks)
245
- text_blocks = remove_chars_in_text_blocks(text_blocks)
246
- spans = []
247
- for v in text_blocks:
248
- for line in v['lines']:
249
- for span in line['spans']:
250
- bbox = span['bbox']
251
- if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
252
- continue
253
- if span.get('type') not in (
254
- ContentType.InlineEquation,
255
- ContentType.InterlineEquation,
256
- ):
257
- spans.append(
258
- {
259
- 'bbox': list(span['bbox']),
260
- 'content': __replace_STX_ETX(span['text']),
261
- 'type': ContentType.Text,
262
- 'score': 1.0,
263
- }
264
- )
265
- return spans
266
-
267
-
268
244
  def replace_text_span(pymu_spans, ocr_spans):
269
245
  return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
270
246
 
@@ -682,6 +658,23 @@ def parse_page_core(
682
658
  """顺便删除大水印并保留abandon的span"""
683
659
  spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
684
660
 
661
+ """删除重叠spans中置信度较低的那些"""
662
+ spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
663
+ """删除重叠spans中较小的那些"""
664
+ spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
665
+
666
+ """根据parse_mode,构造spans,主要是文本类的字符填充"""
667
+ if parse_mode == SupportedPdfParseMethod.TXT:
668
+
669
+ """使用新版本的混合ocr方案"""
670
+ spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
671
+
672
+ elif parse_mode == SupportedPdfParseMethod.OCR:
673
+ pass
674
+ else:
675
+ raise Exception('parse_mode must be txt or ocr')
676
+
677
+
685
678
  """先处理不需要排版的discarded_blocks"""
686
679
  discarded_block_with_spans, spans = fill_spans_in_blocks(
687
680
  all_discarded_blocks, spans, 0.4
@@ -706,26 +699,6 @@ def parse_page_core(
706
699
  drop_reason,
707
700
  )
708
701
 
709
- """删除重叠spans中置信度较低的那些"""
710
- spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
711
- """删除重叠spans中较小的那些"""
712
- spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
713
-
714
- """根据parse_mode,构造spans,主要是文本类的字符填充"""
715
- if parse_mode == SupportedPdfParseMethod.TXT:
716
-
717
- """之前的公式替换方案"""
718
- # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
719
- # spans = replace_text_span(pymu_spans, spans)
720
-
721
- """ocr 中文本类的 span 用 pymu spans 替换!"""
722
- spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
723
-
724
- elif parse_mode == SupportedPdfParseMethod.OCR:
725
- pass
726
- else:
727
- raise Exception('parse_mode must be txt or ocr')
728
-
729
702
  """对image和table截图"""
730
703
  spans = ocr_cut_image_and_table(
731
704
  spans, page_doc, page_id, pdf_bytes_md5, imageWriter
@@ -811,7 +784,7 @@ def pdf_parse_union(
811
784
  if debug_mode:
812
785
  time_now = time.time()
813
786
  logger.info(
814
- f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
787
+ f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
815
788
  )
816
789
  start_time = time_now
817
790
 
@@ -1,58 +1,3 @@
1
- def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
2
- interline_eq_info, raw_pymu_blocks,
3
- removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
4
- layout_tree,
5
- page_w, page_h, footnote_bboxes_tmp):
6
- """
7
-
8
- """
9
- return_dict = {}
10
-
11
- return_dict['para_blocks'] = {}
12
- return_dict['preproc_blocks'] = text_blocks_preproc
13
- return_dict['images'] = image_info
14
- return_dict['tables'] = table_info
15
- return_dict['interline_equations'] = interline_eq_info
16
- return_dict['inline_equations'] = inline_eq_info
17
- return_dict['layout_bboxes'] = layout_bboxes
18
- return_dict['pymu_raw_blocks'] = raw_pymu_blocks
19
- return_dict['global_statistic'] = {}
20
-
21
- return_dict['droped_text_block'] = removed_text_blocks
22
- return_dict['droped_image_block'] = removed_image_blocks
23
- return_dict['droped_table_block'] = []
24
- return_dict['image_backup'] = images_backup
25
- return_dict['table_backup'] = []
26
- return_dict['page_idx'] = page_id
27
- return_dict['page_size'] = [page_w, page_h]
28
- return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
29
- return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
30
-
31
- return return_dict
32
-
33
-
34
- def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
35
- images, tables, interline_equations, inline_equations,
36
- dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
37
- need_remove_spans_bboxes_dict):
38
- return_dict = {
39
- 'preproc_blocks': blocks,
40
- 'layout_bboxes': layout_bboxes,
41
- 'page_idx': page_id,
42
- 'page_size': [page_w, page_h],
43
- '_layout_tree': layout_tree,
44
- 'images': images,
45
- 'tables': tables,
46
- 'interline_equations': interline_equations,
47
- 'inline_equations': inline_equations,
48
- 'droped_text_block': dropped_text_block,
49
- 'droped_image_block': dropped_image_block,
50
- 'droped_table_block': dropped_table_block,
51
- 'dropped_equation_block': dropped_equation_block,
52
- 'droped_bboxes': need_remove_spans_bboxes_dict,
53
- }
54
- return return_dict
55
-
56
1
 
57
2
  def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
58
3
  images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
@@ -25,43 +25,6 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
25
25
  return spans
26
26
 
27
27
 
28
- def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
29
- image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
30
- equation_inline_bboxes: list,
31
- equation_interline_bboxes: list, imageWriter) -> dict:
32
- """返回一个dict, key为bbox, 值是图片地址."""
33
- image_info = []
34
- image_backup_info = []
35
- table_info = []
36
- inline_eq_info = []
37
- interline_eq_info = []
38
-
39
- # 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
40
-
41
- def return_path(type):
42
- return join_path(pdf_bytes_md5, type)
43
-
44
- for bbox in image_bboxes:
45
- if not check_img_bbox(bbox):
46
- continue
47
- image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
48
- image_info.append({'bbox': bbox, 'image_path': image_path})
49
-
50
- for bbox in images_overlap_backup:
51
- if not check_img_bbox(bbox):
52
- continue
53
- image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
54
- image_backup_info.append({'bbox': bbox, 'image_path': image_path})
55
-
56
- for bbox in table_bboxes:
57
- if not check_img_bbox(bbox):
58
- continue
59
- image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
60
- table_info.append({'bbox': bbox, 'image_path': image_path})
61
-
62
- return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
63
-
64
-
65
28
  def check_img_bbox(bbox) -> bool:
66
29
  if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
67
30
  logger.warning(f'image_bboxes: 错误的box, {bbox}')
@@ -1,184 +1,11 @@
1
-
2
1
  from magic_pdf.config.ocr_content_type import BlockType
3
2
  from magic_pdf.libs.boxbase import (
4
- calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
3
+ calculate_iou,
4
+ calculate_overlap_area_in_bbox1_area_ratio,
5
5
  calculate_vertical_projection_overlap_ratio,
6
- get_minbox_if_overlap_by_ratio)
7
- from magic_pdf.pre_proc.remove_bbox_overlap import \
8
- remove_overlap_between_bbox_for_block
9
-
10
-
11
- def ocr_prepare_bboxes_for_layout_split(
12
- img_blocks,
13
- table_blocks,
14
- discarded_blocks,
15
- text_blocks,
16
- title_blocks,
17
- interline_equation_blocks,
18
- page_w,
19
- page_h,
20
- ):
21
- all_bboxes = []
22
- all_discarded_blocks = []
23
- for image in img_blocks:
24
- x0, y0, x1, y1 = image['bbox']
25
- all_bboxes.append(
26
- [
27
- x0,
28
- y0,
29
- x1,
30
- y1,
31
- None,
32
- None,
33
- None,
34
- BlockType.Image,
35
- None,
36
- None,
37
- None,
38
- None,
39
- image['score'],
40
- ]
41
- )
42
-
43
- for table in table_blocks:
44
- x0, y0, x1, y1 = table['bbox']
45
- all_bboxes.append(
46
- [
47
- x0,
48
- y0,
49
- x1,
50
- y1,
51
- None,
52
- None,
53
- None,
54
- BlockType.Table,
55
- None,
56
- None,
57
- None,
58
- None,
59
- table['score'],
60
- ]
61
- )
62
-
63
- for text in text_blocks:
64
- x0, y0, x1, y1 = text['bbox']
65
- all_bboxes.append(
66
- [
67
- x0,
68
- y0,
69
- x1,
70
- y1,
71
- None,
72
- None,
73
- None,
74
- BlockType.Text,
75
- None,
76
- None,
77
- None,
78
- None,
79
- text['score'],
80
- ]
81
- )
82
-
83
- for title in title_blocks:
84
- x0, y0, x1, y1 = title['bbox']
85
- all_bboxes.append(
86
- [
87
- x0,
88
- y0,
89
- x1,
90
- y1,
91
- None,
92
- None,
93
- None,
94
- BlockType.Title,
95
- None,
96
- None,
97
- None,
98
- None,
99
- title['score'],
100
- ]
101
- )
102
-
103
- for interline_equation in interline_equation_blocks:
104
- x0, y0, x1, y1 = interline_equation['bbox']
105
- all_bboxes.append(
106
- [
107
- x0,
108
- y0,
109
- x1,
110
- y1,
111
- None,
112
- None,
113
- None,
114
- BlockType.InterlineEquation,
115
- None,
116
- None,
117
- None,
118
- None,
119
- interline_equation['score'],
120
- ]
121
- )
122
-
123
- """block嵌套问题解决"""
124
- """文本框与标题框重叠,优先信任文本框"""
125
- all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
126
- """任何框体与舍弃框重叠,优先信任舍弃框"""
127
- all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
128
-
129
- # interline_equation 与title或text框冲突的情况,分两种情况处理
130
- """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
131
- all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
132
- """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
133
- # 通过后续大框套小框逻辑删除
134
-
135
- """discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)"""
136
- for discarded in discarded_blocks:
137
- x0, y0, x1, y1 = discarded['bbox']
138
- all_discarded_blocks.append(
139
- [
140
- x0,
141
- y0,
142
- x1,
143
- y1,
144
- None,
145
- None,
146
- None,
147
- BlockType.Discarded,
148
- None,
149
- None,
150
- None,
151
- None,
152
- discarded['score'],
153
- ]
154
- )
155
- # 将footnote加入到all_bboxes中,用来计算layout
156
- if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
157
- all_bboxes.append(
158
- [
159
- x0,
160
- y0,
161
- x1,
162
- y1,
163
- None,
164
- None,
165
- None,
166
- BlockType.Footnote,
167
- None,
168
- None,
169
- None,
170
- None,
171
- discarded['score'],
172
- ]
173
- )
174
-
175
- """经过以上处理后,还存在大框套小框的情况,则删除小框"""
176
- all_bboxes = remove_overlaps_min_blocks(all_bboxes)
177
- all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
178
- """将剩余的bbox做分离处理,防止后面分layout时出错"""
179
- all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
180
-
181
- return all_bboxes, all_discarded_blocks, drop_reasons
6
+ get_minbox_if_overlap_by_ratio
7
+ )
8
+ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
182
9
 
183
10
 
184
11
  def add_bboxes(blocks, block_type, bboxes):