magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +16 -22
  2. magic_pdf/filter/pdf_meta_scan.py +5 -19
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_check.py +52 -25
  7. magic_pdf/libs/pdf_image_tools.py +2 -1
  8. magic_pdf/libs/version.py +1 -1
  9. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  10. magic_pdf/model/magic_model.py +0 -30
  11. magic_pdf/model/pp_structure_v2.py +23 -3
  12. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
  13. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
  14. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
  15. magic_pdf/para/para_split_v3.py +21 -7
  16. magic_pdf/pdf_parse_union_core_v2.py +134 -146
  17. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  18. magic_pdf/pre_proc/cut_image.py +0 -37
  19. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  20. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  21. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  22. magic_pdf/rw/S3ReaderWriter.py +1 -1
  23. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
  24. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
  25. magic_pdf/dict2md/mkcontent.py +0 -438
  26. magic_pdf/layout/__init__.py +0 -0
  27. magic_pdf/layout/bbox_sort.py +0 -681
  28. magic_pdf/layout/layout_det_utils.py +0 -182
  29. magic_pdf/layout/layout_sort.py +0 -921
  30. magic_pdf/layout/layout_spiler_recog.py +0 -101
  31. magic_pdf/layout/mcol_sort.py +0 -336
  32. magic_pdf/libs/calc_span_stats.py +0 -239
  33. magic_pdf/libs/detect_language_from_model.py +0 -21
  34. magic_pdf/libs/nlp_utils.py +0 -203
  35. magic_pdf/libs/textbase.py +0 -33
  36. magic_pdf/libs/vis_utils.py +0 -308
  37. magic_pdf/para/block_continuation_processor.py +0 -562
  38. magic_pdf/para/block_termination_processor.py +0 -480
  39. magic_pdf/para/commons.py +0 -222
  40. magic_pdf/para/denoise.py +0 -246
  41. magic_pdf/para/draw.py +0 -121
  42. magic_pdf/para/exceptions.py +0 -198
  43. magic_pdf/para/layout_match_processor.py +0 -40
  44. magic_pdf/para/para_split.py +0 -807
  45. magic_pdf/para/para_split_v2.py +0 -959
  46. magic_pdf/para/raw_processor.py +0 -207
  47. magic_pdf/para/stats.py +0 -268
  48. magic_pdf/para/title_processor.py +0 -1014
  49. magic_pdf/pdf_parse_union_core.py +0 -345
  50. magic_pdf/post_proc/__init__.py +0 -0
  51. magic_pdf/post_proc/detect_para.py +0 -3472
  52. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  53. magic_pdf/post_proc/remove_footnote.py +0 -153
  54. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  55. magic_pdf/pre_proc/detect_equation.py +0 -134
  56. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  57. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  58. magic_pdf/pre_proc/detect_footnote.py +0 -170
  59. magic_pdf/pre_proc/detect_header.py +0 -64
  60. magic_pdf/pre_proc/detect_images.py +0 -647
  61. magic_pdf/pre_proc/detect_page_number.py +0 -64
  62. magic_pdf/pre_proc/detect_tables.py +0 -62
  63. magic_pdf/pre_proc/equations_replace.py +0 -550
  64. magic_pdf/pre_proc/fix_image.py +0 -244
  65. magic_pdf/pre_proc/fix_table.py +0 -270
  66. magic_pdf/pre_proc/main_text_font.py +0 -23
  67. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  68. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  69. magic_pdf/pre_proc/post_layout_split.py +0 -0
  70. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  71. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  72. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  73. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  74. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  75. magic_pdf/pre_proc/statistics.py +0 -12
  76. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
  77. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
  78. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
  79. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,17 @@
1
1
  import copy
2
2
  import time
3
-
4
3
  import cv2
5
4
  import numpy as np
5
+
6
6
  from paddleocr import PaddleOCR
7
- from paddleocr.paddleocr import check_img, logger
8
- from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
9
- from paddleocr.tools.infer.predict_system import sorted_boxes
10
- from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
7
+ from ppocr.utils.logging import get_logger
8
+ from ppocr.utils.utility import alpha_to_color, binarize_img
9
+ from tools.infer.predict_system import sorted_boxes
10
+ from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
11
+
12
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
11
13
 
12
- from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes
14
+ logger = get_logger()
13
15
 
14
16
 
15
17
  class ModifiedPaddleOCR(PaddleOCR):
@@ -63,7 +65,7 @@ class ModifiedPaddleOCR(PaddleOCR):
63
65
 
64
66
  if det and rec:
65
67
  ocr_res = []
66
- for idx, img in enumerate(imgs):
68
+ for img in imgs:
67
69
  img = preprocess_image(img)
68
70
  dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
69
71
  if not dt_boxes and not rec_res:
@@ -75,7 +77,7 @@ class ModifiedPaddleOCR(PaddleOCR):
75
77
  return ocr_res
76
78
  elif det and not rec:
77
79
  ocr_res = []
78
- for idx, img in enumerate(imgs):
80
+ for img in imgs:
79
81
  img = preprocess_image(img)
80
82
  dt_boxes, elapse = self.text_detector(img)
81
83
  if dt_boxes is None:
@@ -96,7 +98,7 @@ class ModifiedPaddleOCR(PaddleOCR):
96
98
  else:
97
99
  ocr_res = []
98
100
  cls_res = []
99
- for idx, img in enumerate(imgs):
101
+ for img in imgs:
100
102
  if not isinstance(img, list):
101
103
  img = preprocess_image(img)
102
104
  img = [img]
@@ -2,8 +2,8 @@ import os
2
2
 
3
3
  import cv2
4
4
  import numpy as np
5
- from paddleocr.ppstructure.table.predict_table import TableSystem
6
- from paddleocr.ppstructure.utility import init_args
5
+ from ppstructure.table.predict_table import TableSystem
6
+ from ppstructure.utility import init_args
7
7
  from PIL import Image
8
8
 
9
9
  from magic_pdf.config.constants import * # noqa: F403
@@ -1,7 +1,10 @@
1
1
  import copy
2
2
 
3
+ from loguru import logger
4
+
3
5
  from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
4
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
+ from magic_pdf.libs.language import detect_lang
5
8
 
6
9
  LINE_STOP_FLAG = (
7
10
  '.',
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
125
128
 
126
129
  # 添加所有文本,包括空行,保持与block['lines']长度一致
127
130
  lines_text_list.append(line_text)
131
+ block_text = ''.join(lines_text_list)
132
+ block_lang = detect_lang(block_text)
133
+ # logger.info(f"block_lang: {block_lang}")
128
134
 
129
135
  # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
130
136
  if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
136
142
  if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
137
143
  right_close_num += 1
138
144
  else:
139
- # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
140
- # block宽的阈值可以小些,block窄的阈值要大
141
-
142
- if block_weight_radio >= 0.5:
145
+ # 类中文没有超长单词的情况,可以用统一的阈值
146
+ if block_lang in ['zh', 'ja', 'ko']:
143
147
  closed_area = 0.26 * block_weight
144
148
  else:
145
- closed_area = 0.36 * block_weight
149
+ # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
150
+ # block宽的阈值可以小些,block窄的阈值要大
151
+ if block_weight_radio >= 0.5:
152
+ closed_area = 0.26 * block_weight
153
+ else:
154
+ closed_area = 0.36 * block_weight
146
155
  if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
147
156
  right_not_close_num += 1
148
157
 
@@ -271,13 +280,18 @@ def __merge_2_text_blocks(block1, block2):
271
280
  first_span = first_line['spans'][0]
272
281
  if len(first_span['content']) > 0:
273
282
  span_start_with_num = first_span['content'][0].isdigit()
283
+ span_start_with_big_char = first_span['content'][0].isupper()
274
284
  if (
275
- abs(block2['bbox_fs'][2] - last_line['bbox'][2])
276
- < line_height
285
+ # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
286
+ abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
287
+ # 上一个block的最后一个span不是以特定符号结尾
277
288
  and not last_span['content'].endswith(LINE_STOP_FLAG)
278
289
  # 两个block宽度差距超过2倍也不合并
279
290
  and abs(block1_weight - block2_weight) < min_block_weight
291
+ # 下一个block的第一个字符是数字
280
292
  and not span_start_with_num
293
+ # 下一个block的第一个字符是大写字母
294
+ and not span_start_with_big_char
281
295
  ):
282
296
  if block1['page_num'] != block2['page_num']:
283
297
  for line in block1['lines']:
@@ -5,19 +5,18 @@ import time
5
5
  from typing import List
6
6
 
7
7
  import torch
8
+ import fitz
8
9
  from loguru import logger
9
10
 
10
- from magic_pdf.config.drop_reason import DropReason
11
11
  from magic_pdf.config.enums import SupportedPdfParseMethod
12
12
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
13
13
  from magic_pdf.data.dataset import Dataset, PageableData
14
14
  from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
15
15
  from magic_pdf.libs.clean_memory import clean_memory
16
- from magic_pdf.libs.commons import fitz, get_delta_time
17
16
  from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
18
17
  from magic_pdf.libs.convert_utils import dict_to_list
19
18
  from magic_pdf.libs.hash_utils import compute_md5
20
- from magic_pdf.libs.local_math import float_equal
19
+
21
20
  from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
22
21
  from magic_pdf.model.magic_model import MagicModel
23
22
 
@@ -31,44 +30,14 @@ try:
31
30
  torchtext.disable_torchtext_deprecation_warning()
32
31
  except ImportError:
33
32
  pass
34
- from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
35
33
 
34
+ from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
36
35
  from magic_pdf.para.para_split_v3 import para_split
37
- from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
38
- from magic_pdf.pre_proc.construct_page_dict import \
39
- ocr_construct_page_component_v2
36
+ from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
40
37
  from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
41
- from magic_pdf.pre_proc.equations_replace import (
42
- combine_chars_to_pymudict, remove_chars_in_text_blocks,
43
- replace_equations_in_textblock)
44
- from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
45
- ocr_prepare_bboxes_for_layout_split_v2
46
- from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
47
- fix_block_spans_v2,
48
- fix_discarded_block)
49
- from magic_pdf.pre_proc.ocr_span_list_modify import (
50
- get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
51
- remove_overlaps_min_spans)
52
- from magic_pdf.pre_proc.resolve_bbox_conflict import \
53
- check_useful_block_horizontal_overlap
54
-
55
-
56
- def remove_horizontal_overlap_block_which_smaller(all_bboxes):
57
- useful_blocks = []
58
- for bbox in all_bboxes:
59
- useful_blocks.append({'bbox': bbox[:4]})
60
- is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
61
- check_useful_block_horizontal_overlap(useful_blocks)
62
- )
63
- if is_useful_block_horz_overlap:
64
- logger.warning(
65
- f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
66
- ) # noqa: E501
67
- for bbox in all_bboxes.copy():
68
- if smaller_bbox == bbox[:4]:
69
- all_bboxes.remove(bbox)
70
-
71
- return is_useful_block_horz_overlap, all_bboxes
38
+ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
39
+ from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
40
+ from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
72
41
 
73
42
 
74
43
  def __replace_STX_ETX(text_str: str):
@@ -88,52 +57,67 @@ def __replace_STX_ETX(text_str: str):
88
57
  return text_str
89
58
 
90
59
 
91
- def chars_to_content(span):
92
- # # 先给chars按char['bbox']的x坐标排序
93
- # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
60
+ def __replace_0xfffd(text_str: str):
61
+ """Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
62
+ if text_str:
63
+ s = text_str.replace('\ufffd', " ")
64
+ return s
65
+ return text_str
94
66
 
67
+ def chars_to_content(span):
68
+ # 检查span中的char是否为空
69
+ if len(span['chars']) == 0:
70
+ pass
71
+ # span['content'] = ''
72
+ else:
95
73
  # 先给chars按char['bbox']的中心点的x坐标排序
96
74
  span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
97
- content = ''
98
75
 
99
76
  # 求char的平均宽度
100
- if len(span['chars']) == 0:
101
- span['content'] = content
102
- del span['chars']
103
- return
104
- else:
105
- char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
106
- char_avg_width = char_width_sum / len(span['chars'])
77
+ char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
78
+ char_avg_width = char_width_sum / len(span['chars'])
107
79
 
80
+ content = ''
108
81
  for char in span['chars']:
109
82
  # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
110
83
  if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
111
84
  content += ' '
112
85
  content += char['c']
113
- span['content'] = __replace_STX_ETX(content)
114
- del span['chars']
86
+
87
+ span['content'] = __replace_0xfffd(content)
88
+
89
+ del span['chars']
115
90
 
116
91
 
117
92
  LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
93
+ LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
94
+
95
+
118
96
  def fill_char_in_spans(spans, all_chars):
119
97
 
98
+ # 简单从上到下排一下序
99
+ spans = sorted(spans, key=lambda x: x['bbox'][1])
100
+
120
101
  for char in all_chars:
121
102
  for span in spans:
122
- # 判断char是否属于LINE_STOP_FLAG
123
- if char['c'] in LINE_STOP_FLAG:
124
- char_is_line_stop_flag = True
125
- else:
126
- char_is_line_stop_flag = False
127
- if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
103
+ if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
128
104
  span['chars'].append(char)
129
105
  break
130
106
 
107
+ empty_spans = []
108
+
131
109
  for span in spans:
132
110
  chars_to_content(span)
111
+ # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
112
+ if len(span['content']) * span['height'] < span['width'] * 0.5:
113
+ # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
114
+ empty_spans.append(span)
115
+ del span['height'], span['width']
116
+ return empty_spans
133
117
 
134
118
 
135
119
  # 使用鲁棒性更强的中心点坐标判断
136
- def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
120
+ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
137
121
  char_center_x = (char_bbox[0] + char_bbox[2]) / 2
138
122
  char_center_y = (char_bbox[1] + char_bbox[3]) / 2
139
123
  span_center_y = (span_bbox[1] + span_bbox[3]) / 2
@@ -142,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
142
126
  if (
143
127
  span_bbox[0] < char_center_x < span_bbox[2]
144
128
  and span_bbox[1] < char_center_y < span_bbox[3]
145
- and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度
129
+ and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
146
130
  ):
147
131
  return True
148
132
  else:
149
133
  # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
150
134
  # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
151
- if char_is_line_stop_flag:
135
+ if char in LINE_STOP_FLAG:
152
136
  if (
153
137
  (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
154
138
  and char_center_x > span_bbox[0]
155
139
  and span_bbox[1] < char_center_y < span_bbox[3]
156
- and abs(char_center_y - span_center_y) < span_height / 4
140
+ and abs(char_center_y - span_center_y) < span_height * span_height_radio
141
+ ):
142
+ return True
143
+ elif char in LINE_START_FLAG:
144
+ if (
145
+ span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
146
+ and char_center_x < span_bbox[2]
147
+ and span_bbox[1] < char_center_y < span_bbox[3]
148
+ and abs(char_center_y - span_center_y) < span_height * span_height_radio
157
149
  ):
158
150
  return True
159
151
  else:
@@ -162,48 +154,80 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
162
154
 
163
155
  def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
164
156
 
157
+ text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
158
+
159
+ all_pymu_chars = []
160
+ for block in text_blocks_raw:
161
+ for line in block['lines']:
162
+ cosine, sine = line['dir']
163
+ if abs (cosine) < 0.9 or abs(sine) > 0.1:
164
+ continue
165
+ for span in line['spans']:
166
+ all_pymu_chars.extend(span['chars'])
167
+
168
+ # 计算所有sapn的高度的中位数
169
+ span_height_list = []
170
+ for span in spans:
171
+ if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
172
+ continue
173
+ span_height = span['bbox'][3] - span['bbox'][1]
174
+ span['height'] = span_height
175
+ span['width'] = span['bbox'][2] - span['bbox'][0]
176
+ span_height_list.append(span_height)
177
+ if len(span_height_list) == 0:
178
+ return spans
179
+ else:
180
+ median_span_height = statistics.median(span_height_list)
181
+
165
182
  useful_spans = []
166
183
  unuseful_spans = []
184
+ # 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值
185
+ vertical_spans = []
167
186
  for span in spans:
168
- for block in all_bboxes:
187
+ if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
188
+ continue
189
+ for block in all_bboxes + all_discarded_blocks:
169
190
  if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
170
191
  continue
171
- else:
172
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
173
- useful_spans.append(span)
174
- break
175
- for block in all_discarded_blocks:
176
192
  if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
177
- unuseful_spans.append(span)
193
+ if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
194
+ vertical_spans.append(span)
195
+ elif block in all_bboxes:
196
+ useful_spans.append(span)
197
+ else:
198
+ unuseful_spans.append(span)
199
+
178
200
  break
179
201
 
180
- text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
202
+ """垂直的span框直接用pymu的line进行填充"""
203
+ if len(vertical_spans) > 0:
204
+ text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
205
+ all_pymu_lines = []
206
+ for block in text_blocks:
207
+ for line in block['lines']:
208
+ all_pymu_lines.append(line)
181
209
 
182
- # @todo: 拿到char之后把倾斜角度较大的先删一遍
183
- all_pymu_chars = []
184
- for block in text_blocks:
185
- for line in block['lines']:
186
- for span in line['spans']:
187
- all_pymu_chars.extend(span['chars'])
210
+ for pymu_line in all_pymu_lines:
211
+ for span in vertical_spans:
212
+ if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
213
+ for pymu_span in pymu_line['spans']:
214
+ span['content'] += pymu_span['text']
215
+ break
188
216
 
189
- new_spans = []
217
+ for span in vertical_spans:
218
+ if len(span['content']) == 0:
219
+ spans.remove(span)
190
220
 
191
- for span in useful_spans:
192
- if span['type'] in [ContentType.Text]:
193
- span['chars'] = []
194
- new_spans.append(span)
221
+ """水平的span框如果没有char则用ocr进行填充"""
222
+ new_spans = []
195
223
 
196
- for span in unuseful_spans:
224
+ for span in useful_spans + unuseful_spans:
197
225
  if span['type'] in [ContentType.Text]:
198
226
  span['chars'] = []
199
227
  new_spans.append(span)
200
228
 
201
- fill_char_in_spans(new_spans, all_pymu_chars)
229
+ empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
202
230
 
203
- empty_spans = []
204
- for span in new_spans:
205
- if len(span['content']) == 0:
206
- empty_spans.append(span)
207
231
  if len(empty_spans) > 0:
208
232
 
209
233
  # 初始化ocr模型
@@ -216,52 +240,19 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
216
240
  )
217
241
 
218
242
  for span in empty_spans:
219
- spans.remove(span)
220
- # 对span的bbox截图
243
+ # 对span的bbox截图再ocr
221
244
  span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
222
245
  ocr_res = ocr_model.ocr(span_img, det=False)
223
- # logger.info(f"ocr_res: {ocr_res}")
224
- # logger.info(f"empty_span: {span}")
225
246
  if ocr_res and len(ocr_res) > 0:
226
247
  if len(ocr_res[0]) > 0:
227
248
  ocr_text, ocr_score = ocr_res[0][0]
249
+ # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
228
250
  if ocr_score > 0.5 and len(ocr_text) > 0:
229
- span['content'] = ocr_text
230
- spans.append(span)
231
-
232
- return spans
251
+ span['content'] = ocr_text
252
+ span['score'] = ocr_score
253
+ else:
254
+ spans.remove(span)
233
255
 
234
-
235
- def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
236
- text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
237
- char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
238
- 'blocks'
239
- ]
240
- text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
241
- text_blocks = replace_equations_in_textblock(
242
- text_blocks, inline_equations, interline_equations
243
- )
244
- text_blocks = remove_citation_marker(text_blocks)
245
- text_blocks = remove_chars_in_text_blocks(text_blocks)
246
- spans = []
247
- for v in text_blocks:
248
- for line in v['lines']:
249
- for span in line['spans']:
250
- bbox = span['bbox']
251
- if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
252
- continue
253
- if span.get('type') not in (
254
- ContentType.InlineEquation,
255
- ContentType.InterlineEquation,
256
- ):
257
- spans.append(
258
- {
259
- 'bbox': list(span['bbox']),
260
- 'content': __replace_STX_ETX(span['text']),
261
- 'type': ContentType.Text,
262
- 'score': 1.0,
263
- }
264
- )
265
256
  return spans
266
257
 
267
258
 
@@ -682,6 +673,23 @@ def parse_page_core(
682
673
  """顺便删除大水印并保留abandon的span"""
683
674
  spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
684
675
 
676
+ """删除重叠spans中置信度较低的那些"""
677
+ spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
678
+ """删除重叠spans中较小的那些"""
679
+ spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
680
+
681
+ """根据parse_mode,构造spans,主要是文本类的字符填充"""
682
+ if parse_mode == SupportedPdfParseMethod.TXT:
683
+
684
+ """使用新版本的混合ocr方案"""
685
+ spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
686
+
687
+ elif parse_mode == SupportedPdfParseMethod.OCR:
688
+ pass
689
+ else:
690
+ raise Exception('parse_mode must be txt or ocr')
691
+
692
+
685
693
  """先处理不需要排版的discarded_blocks"""
686
694
  discarded_block_with_spans, spans = fill_spans_in_blocks(
687
695
  all_discarded_blocks, spans, 0.4
@@ -706,26 +714,6 @@ def parse_page_core(
706
714
  drop_reason,
707
715
  )
708
716
 
709
- """删除重叠spans中置信度较低的那些"""
710
- spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
711
- """删除重叠spans中较小的那些"""
712
- spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
713
-
714
- """根据parse_mode,构造spans,主要是文本类的字符填充"""
715
- if parse_mode == SupportedPdfParseMethod.TXT:
716
-
717
- """之前的公式替换方案"""
718
- # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
719
- # spans = replace_text_span(pymu_spans, spans)
720
-
721
- """ocr 中文本类的 span 用 pymu spans 替换!"""
722
- spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
723
-
724
- elif parse_mode == SupportedPdfParseMethod.OCR:
725
- pass
726
- else:
727
- raise Exception('parse_mode must be txt or ocr')
728
-
729
717
  """对image和table截图"""
730
718
  spans = ocr_cut_image_and_table(
731
719
  spans, page_doc, page_id, pdf_bytes_md5, imageWriter
@@ -811,7 +799,7 @@ def pdf_parse_union(
811
799
  if debug_mode:
812
800
  time_now = time.time()
813
801
  logger.info(
814
- f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
802
+ f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
815
803
  )
816
804
  start_time = time_now
817
805
 
@@ -1,58 +1,3 @@
1
- def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
2
- interline_eq_info, raw_pymu_blocks,
3
- removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
4
- layout_tree,
5
- page_w, page_h, footnote_bboxes_tmp):
6
- """
7
-
8
- """
9
- return_dict = {}
10
-
11
- return_dict['para_blocks'] = {}
12
- return_dict['preproc_blocks'] = text_blocks_preproc
13
- return_dict['images'] = image_info
14
- return_dict['tables'] = table_info
15
- return_dict['interline_equations'] = interline_eq_info
16
- return_dict['inline_equations'] = inline_eq_info
17
- return_dict['layout_bboxes'] = layout_bboxes
18
- return_dict['pymu_raw_blocks'] = raw_pymu_blocks
19
- return_dict['global_statistic'] = {}
20
-
21
- return_dict['droped_text_block'] = removed_text_blocks
22
- return_dict['droped_image_block'] = removed_image_blocks
23
- return_dict['droped_table_block'] = []
24
- return_dict['image_backup'] = images_backup
25
- return_dict['table_backup'] = []
26
- return_dict['page_idx'] = page_id
27
- return_dict['page_size'] = [page_w, page_h]
28
- return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
29
- return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
30
-
31
- return return_dict
32
-
33
-
34
- def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
35
- images, tables, interline_equations, inline_equations,
36
- dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
37
- need_remove_spans_bboxes_dict):
38
- return_dict = {
39
- 'preproc_blocks': blocks,
40
- 'layout_bboxes': layout_bboxes,
41
- 'page_idx': page_id,
42
- 'page_size': [page_w, page_h],
43
- '_layout_tree': layout_tree,
44
- 'images': images,
45
- 'tables': tables,
46
- 'interline_equations': interline_equations,
47
- 'inline_equations': inline_equations,
48
- 'droped_text_block': dropped_text_block,
49
- 'droped_image_block': dropped_image_block,
50
- 'droped_table_block': dropped_table_block,
51
- 'dropped_equation_block': dropped_equation_block,
52
- 'droped_bboxes': need_remove_spans_bboxes_dict,
53
- }
54
- return return_dict
55
-
56
1
 
57
2
  def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
58
3
  images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
@@ -25,43 +25,6 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
25
25
  return spans
26
26
 
27
27
 
28
- def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
29
- image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
30
- equation_inline_bboxes: list,
31
- equation_interline_bboxes: list, imageWriter) -> dict:
32
- """返回一个dict, key为bbox, 值是图片地址."""
33
- image_info = []
34
- image_backup_info = []
35
- table_info = []
36
- inline_eq_info = []
37
- interline_eq_info = []
38
-
39
- # 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
40
-
41
- def return_path(type):
42
- return join_path(pdf_bytes_md5, type)
43
-
44
- for bbox in image_bboxes:
45
- if not check_img_bbox(bbox):
46
- continue
47
- image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
48
- image_info.append({'bbox': bbox, 'image_path': image_path})
49
-
50
- for bbox in images_overlap_backup:
51
- if not check_img_bbox(bbox):
52
- continue
53
- image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
54
- image_backup_info.append({'bbox': bbox, 'image_path': image_path})
55
-
56
- for bbox in table_bboxes:
57
- if not check_img_bbox(bbox):
58
- continue
59
- image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
60
- table_info.append({'bbox': bbox, 'image_path': image_path})
61
-
62
- return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
63
-
64
-
65
28
  def check_img_bbox(bbox) -> bool:
66
29
  if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
67
30
  logger.warning(f'image_bboxes: 错误的box, {bbox}')