magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110):
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,30 @@
1
1
  import copy
2
2
 
3
- from loguru import logger
4
-
5
- from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
6
- from magic_pdf.libs.ocr_content_type import BlockType, ContentType
7
-
8
- LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';')
3
+ from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
4
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
5
+
6
+ LINE_STOP_FLAG = (
7
+ '.',
8
+ '!',
9
+ '?',
10
+ '。',
11
+ '!',
12
+ '?',
13
+ ')',
14
+ ')',
15
+ '"',
16
+ '”',
17
+ ':',
18
+ ':',
19
+ ';',
20
+ ';',
21
+ )
9
22
  LIST_END_FLAG = ('.', '。', ';', ';')
10
23
 
11
24
 
12
25
  class ListLineTag:
13
- IS_LIST_START_LINE = "is_list_start_line"
14
- IS_LIST_END_LINE = "is_list_end_line"
26
+ IS_LIST_START_LINE = 'is_list_start_line'
27
+ IS_LIST_END_LINE = 'is_list_end_line'
15
28
 
16
29
 
17
30
  def __process_blocks(blocks):
@@ -27,12 +40,14 @@ def __process_blocks(blocks):
27
40
 
28
41
  # 如果当前块是 text 类型
29
42
  if current_block['type'] == 'text':
30
- current_block["bbox_fs"] = copy.deepcopy(current_block["bbox"])
31
- if 'lines' in current_block and len(current_block["lines"]) > 0:
32
- current_block['bbox_fs'] = [min([line['bbox'][0] for line in current_block['lines']]),
33
- min([line['bbox'][1] for line in current_block['lines']]),
34
- max([line['bbox'][2] for line in current_block['lines']]),
35
- max([line['bbox'][3] for line in current_block['lines']])]
43
+ current_block['bbox_fs'] = copy.deepcopy(current_block['bbox'])
44
+ if 'lines' in current_block and len(current_block['lines']) > 0:
45
+ current_block['bbox_fs'] = [
46
+ min([line['bbox'][0] for line in current_block['lines']]),
47
+ min([line['bbox'][1] for line in current_block['lines']]),
48
+ max([line['bbox'][2] for line in current_block['lines']]),
49
+ max([line['bbox'][3] for line in current_block['lines']]),
50
+ ]
36
51
  current_group.append(current_block)
37
52
 
38
53
  # 检查下一个块是否存在
@@ -64,6 +79,7 @@ def __is_list_or_index_block(block):
64
79
  line_height = first_line['bbox'][3] - first_line['bbox'][1]
65
80
  block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
66
81
  block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
82
+ page_weight, page_height = block['page_size']
67
83
 
68
84
  left_close_num = 0
69
85
  left_not_close_num = 0
@@ -75,40 +91,45 @@ def __is_list_or_index_block(block):
75
91
  multiple_para_flag = False
76
92
  last_line = block['lines'][-1]
77
93
 
94
+ if page_weight == 0:
95
+ block_weight_radio = 0
96
+ else:
97
+ block_weight_radio = block_weight / page_weight
98
+ # logger.info(f"block_weight_radio: {block_weight_radio}")
99
+
78
100
  # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
79
- if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
80
- # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
81
- abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 and
82
- block['bbox_fs'][2] - last_line['bbox'][2] > line_height
101
+ if (
102
+ first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2
103
+ and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2
104
+ and block['bbox_fs'][2] - last_line['bbox'][2] > line_height
83
105
  ):
84
106
  multiple_para_flag = True
85
107
 
86
108
  for line in block['lines']:
87
-
88
109
  line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
89
110
  block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
90
111
  if (
91
- line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
92
- block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
112
+ line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
113
+ and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
93
114
  ):
94
115
  external_sides_not_close_num += 1
95
116
  if abs(line_mid_x - block_mid_x) < line_height / 2:
96
117
  center_close_num += 1
97
118
 
98
- line_text = ""
119
+ line_text = ''
99
120
 
100
121
  for span in line['spans']:
101
122
  span_type = span['type']
102
123
  if span_type == ContentType.Text:
103
124
  line_text += span['content'].strip()
104
125
 
126
+ # 添加所有文本,包括空行,保持与block['lines']长度一致
105
127
  lines_text_list.append(line_text)
106
128
 
107
129
  # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
108
130
  if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
109
131
  left_close_num += 1
110
132
  elif line['bbox'][0] - block['bbox_fs'][0] > line_height:
111
- # logger.info(f"{line_text}, {block['bbox_fs']}, {line['bbox']}")
112
133
  left_not_close_num += 1
113
134
 
114
135
  # 计算右侧是否顶格
@@ -116,8 +137,12 @@ def __is_list_or_index_block(block):
116
137
  right_close_num += 1
117
138
  else:
118
139
  # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
119
- closed_area = 0.26 * block_weight
120
- # closed_area = 5 * line_height
140
+ # block宽的阈值可以小些,block窄的阈值要大
141
+
142
+ if block_weight_radio >= 0.5:
143
+ closed_area = 0.26 * block_weight
144
+ else:
145
+ closed_area = 0.36 * block_weight
121
146
  if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
122
147
  right_not_close_num += 1
123
148
 
@@ -128,6 +153,7 @@ def __is_list_or_index_block(block):
128
153
  num_start_count = 0
129
154
  num_end_count = 0
130
155
  flag_end_count = 0
156
+
131
157
  if len(lines_text_list) > 0:
132
158
  for line_text in lines_text_list:
133
159
  if len(line_text) > 0:
@@ -138,16 +164,19 @@ def __is_list_or_index_block(block):
138
164
  if line_text[-1].isdigit():
139
165
  num_end_count += 1
140
166
 
167
+ if (
168
+ num_start_count / len(lines_text_list) >= 0.8
169
+ or num_end_count / len(lines_text_list) >= 0.8
170
+ ):
171
+ line_num_flag = True
141
172
  if flag_end_count / len(lines_text_list) >= 0.8:
142
173
  line_end_flag = True
143
174
 
144
- if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8:
145
- line_num_flag = True
146
-
147
175
  # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
148
- if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
149
- and line_num_flag
150
- ):
176
+ if (
177
+ left_close_num / len(block['lines']) >= 0.8
178
+ or right_close_num / len(block['lines']) >= 0.8
179
+ ) and line_num_flag:
151
180
  for line in block['lines']:
152
181
  line[ListLineTag.IS_LIST_START_LINE] = True
153
182
  return BlockType.Index
@@ -155,17 +184,21 @@ def __is_list_or_index_block(block):
155
184
  # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
156
185
  # 补充条件block的长宽比有要求
157
186
  elif (
158
- external_sides_not_close_num >= 2 and
159
- center_close_num == len(block['lines']) and
160
- external_sides_not_close_num / len(block['lines']) >= 0.5 and
161
- block_height / block_weight > 0.4
187
+ external_sides_not_close_num >= 2
188
+ and center_close_num == len(block['lines'])
189
+ and external_sides_not_close_num / len(block['lines']) >= 0.5
190
+ and block_height / block_weight > 0.4
162
191
  ):
163
192
  for line in block['lines']:
164
193
  line[ListLineTag.IS_LIST_START_LINE] = True
165
194
  return BlockType.List
166
195
 
167
- elif left_close_num >= 2 and (
168
- right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
196
+ elif (
197
+ left_close_num >= 2
198
+ and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2)
199
+ and not multiple_para_flag
200
+ # and block_weight_radio > 0.27
201
+ ):
169
202
  # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
170
203
  if left_close_num / len(block['lines']) > 0.8:
171
204
  # 这种是每个item只有一行,且左边都贴边的短item list
@@ -176,10 +209,15 @@ def __is_list_or_index_block(block):
176
209
  # 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item
177
210
  elif line_end_flag:
178
211
  for i, line in enumerate(block['lines']):
179
- if lines_text_list[i][-1] in LIST_END_FLAG:
212
+ if (
213
+ len(lines_text_list[i]) > 0
214
+ and lines_text_list[i][-1] in LIST_END_FLAG
215
+ ):
180
216
  line[ListLineTag.IS_LIST_END_LINE] = True
181
217
  if i + 1 < len(block['lines']):
182
- block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
218
+ block['lines'][i + 1][
219
+ ListLineTag.IS_LIST_START_LINE
220
+ ] = True
183
221
  # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
184
222
  else:
185
223
  line_start_flag = False
@@ -187,17 +225,21 @@ def __is_list_or_index_block(block):
187
225
  if line_start_flag:
188
226
  line[ListLineTag.IS_LIST_START_LINE] = True
189
227
  line_start_flag = False
190
- # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
191
- if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
228
+
229
+ if (
230
+ abs(block['bbox_fs'][2] - line['bbox'][2])
231
+ > 0.1 * block_weight
232
+ ):
192
233
  line[ListLineTag.IS_LIST_END_LINE] = True
193
234
  line_start_flag = True
194
- # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致
195
- elif num_start_count >= 2 and num_start_count == flag_end_count: # 简单一点先不考虑左侧不贴边的情况
235
+ # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致
236
+ elif num_start_count >= 2 and num_start_count == flag_end_count:
196
237
  for i, line in enumerate(block['lines']):
197
- if lines_text_list[i][0].isdigit():
198
- line[ListLineTag.IS_LIST_START_LINE] = True
199
- if lines_text_list[i][-1] in LIST_END_FLAG:
200
- line[ListLineTag.IS_LIST_END_LINE] = True
238
+ if len(lines_text_list[i]) > 0:
239
+ if lines_text_list[i][0].isdigit():
240
+ line[ListLineTag.IS_LIST_START_LINE] = True
241
+ if lines_text_list[i][-1] in LIST_END_FLAG:
242
+ line[ListLineTag.IS_LIST_END_LINE] = True
201
243
  else:
202
244
  # 正常有缩进的list处理
203
245
  for line in block['lines']:
@@ -225,18 +267,25 @@ def __merge_2_text_blocks(block1, block2):
225
267
  if len(last_line['spans']) > 0:
226
268
  last_span = last_line['spans'][-1]
227
269
  line_height = last_line['bbox'][3] - last_line['bbox'][1]
228
- if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height and
229
- not last_span['content'].endswith(LINE_STOP_FLAG) and
230
- # 两个block宽度差距超过2倍也不合并
231
- abs(block1_weight - block2_weight) < min_block_weight
232
- ):
233
- if block1['page_num'] != block2['page_num']:
234
- for line in block1['lines']:
235
- for span in line['spans']:
236
- span[CROSS_PAGE] = True
237
- block2['lines'].extend(block1['lines'])
238
- block1['lines'] = []
239
- block1[LINES_DELETED] = True
270
+ if len(first_line['spans']) > 0:
271
+ first_span = first_line['spans'][0]
272
+ if len(first_span['content']) > 0:
273
+ span_start_with_num = first_span['content'][0].isdigit()
274
+ if (
275
+ abs(block2['bbox_fs'][2] - last_line['bbox'][2])
276
+ < line_height
277
+ and not last_span['content'].endswith(LINE_STOP_FLAG)
278
+ # 两个block宽度差距超过2倍也不合并
279
+ and abs(block1_weight - block2_weight) < min_block_weight
280
+ and not span_start_with_num
281
+ ):
282
+ if block1['page_num'] != block2['page_num']:
283
+ for line in block1['lines']:
284
+ for span in line['spans']:
285
+ span[CROSS_PAGE] = True
286
+ block2['lines'].extend(block1['lines'])
287
+ block1['lines'] = []
288
+ block1[LINES_DELETED] = True
240
289
 
241
290
  return block1, block2
242
291
 
@@ -265,7 +314,6 @@ def __is_list_group(text_blocks_group):
265
314
  def __para_merge_page(blocks):
266
315
  page_text_blocks_groups = __process_blocks(blocks)
267
316
  for text_blocks_group in page_text_blocks_groups:
268
-
269
317
  if len(text_blocks_group) > 0:
270
318
  # 需要先在合并前对所有block判断是否为list or index block
271
319
  for block in text_blocks_group:
@@ -274,7 +322,6 @@ def __para_merge_page(blocks):
274
322
  # logger.info(f"{block['type']}:{block}")
275
323
 
276
324
  if len(text_blocks_group) > 1:
277
-
278
325
  # 在合并前判断这个group 是否是一个 list group
279
326
  is_list_group = __is_list_group(text_blocks_group)
280
327
 
@@ -286,11 +333,18 @@ def __para_merge_page(blocks):
286
333
  if i - 1 >= 0:
287
334
  prev_block = text_blocks_group[i - 1]
288
335
 
289
- if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group:
336
+ if (
337
+ current_block['type'] == 'text'
338
+ and prev_block['type'] == 'text'
339
+ and not is_list_group
340
+ ):
290
341
  __merge_2_text_blocks(current_block, prev_block)
291
342
  elif (
292
- (current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or
293
- (current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index)
343
+ current_block['type'] == BlockType.List
344
+ and prev_block['type'] == BlockType.List
345
+ ) or (
346
+ current_block['type'] == BlockType.Index
347
+ and prev_block['type'] == BlockType.Index
294
348
  ):
295
349
  __merge_2_list_blocks(current_block, prev_block)
296
350
 
@@ -298,12 +352,13 @@ def __para_merge_page(blocks):
298
352
  continue
299
353
 
300
354
 
301
- def para_split(pdf_info_dict, debug_mode=False):
355
+ def para_split(pdf_info_dict):
302
356
  all_blocks = []
303
357
  for page_num, page in pdf_info_dict.items():
304
358
  blocks = copy.deepcopy(page['preproc_blocks'])
305
359
  for block in blocks:
306
360
  block['page_num'] = page_num
361
+ block['page_size'] = page['page_size']
307
362
  all_blocks.extend(blocks)
308
363
 
309
364
  __para_merge_page(all_blocks)
@@ -319,4 +374,4 @@ if __name__ == '__main__':
319
374
  # 调用函数
320
375
  groups = __process_blocks(input_blocks)
321
376
  for group_index, group in enumerate(groups):
322
- print(f"Group {group_index}: {group}")
377
+ print(f'Group {group_index}: {group}')
@@ -9,6 +9,7 @@ def parse_pdf_by_ocr(pdf_bytes,
9
9
  start_page_id=0,
10
10
  end_page_id=None,
11
11
  debug_mode=False,
12
+ lang=None,
12
13
  ):
13
14
  dataset = PymuDocDataset(pdf_bytes)
14
15
  return pdf_parse_union(dataset,
@@ -18,4 +19,5 @@ def parse_pdf_by_ocr(pdf_bytes,
18
19
  start_page_id=start_page_id,
19
20
  end_page_id=end_page_id,
20
21
  debug_mode=debug_mode,
22
+ lang=lang,
21
23
  )
@@ -10,6 +10,7 @@ def parse_pdf_by_txt(
10
10
  start_page_id=0,
11
11
  end_page_id=None,
12
12
  debug_mode=False,
13
+ lang=None,
13
14
  ):
14
15
  dataset = PymuDocDataset(pdf_bytes)
15
16
  return pdf_parse_union(dataset,
@@ -19,4 +20,5 @@ def parse_pdf_by_txt(
19
20
  start_page_id=start_page_id,
20
21
  end_page_id=end_page_id,
21
22
  debug_mode=debug_mode,
23
+ lang=lang,
22
24
  )