magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +5 -5
  12. magic_pdf/libs/draw_bbox.py +3 -2
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +122 -76
  18. magic_pdf/model/sub_modules/model_init.py +40 -35
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  21. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  22. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  23. magic_pdf/para/para_split.py +411 -248
  24. magic_pdf/para/para_split_v2.py +352 -182
  25. magic_pdf/para/para_split_v3.py +110 -53
  26. magic_pdf/pdf_parse_by_ocr.py +2 -0
  27. magic_pdf/pdf_parse_by_txt.py +2 -0
  28. magic_pdf/pdf_parse_union_core.py +174 -100
  29. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  30. magic_pdf/pipe/AbsPipe.py +28 -44
  31. magic_pdf/pipe/OCRPipe.py +5 -5
  32. magic_pdf/pipe/TXTPipe.py +5 -6
  33. magic_pdf/pipe/UNIPipe.py +24 -25
  34. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  35. magic_pdf/pre_proc/cut_image.py +9 -11
  36. magic_pdf/pre_proc/equations_replace.py +203 -212
  37. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  38. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  39. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  40. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  41. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  42. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  43. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  44. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  45. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  46. magic_pdf/spark/spark_api.py +15 -17
  47. magic_pdf/tools/cli.py +3 -4
  48. magic_pdf/tools/cli_dev.py +6 -9
  49. magic_pdf/tools/common.py +26 -36
  50. magic_pdf/user_api.py +29 -38
  51. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
  53. magic_pdf/libs/Constants.py +0 -55
  54. magic_pdf/libs/MakeContentConfig.py +0 -11
  55. magic_pdf/libs/drop_reason.py +0 -27
  56. magic_pdf/libs/drop_tag.py +0 -19
  57. magic_pdf/para/para_pipeline.py +0 -297
  58. magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  59. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  60. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,30 @@
1
1
  import copy
2
2
 
3
- from loguru import logger
4
-
5
- from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
6
- from magic_pdf.libs.ocr_content_type import BlockType, ContentType
7
-
8
- LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';')
3
+ from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
4
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
5
+
6
+ LINE_STOP_FLAG = (
7
+ '.',
8
+ '!',
9
+ '?',
10
+ '。',
11
+ '!',
12
+ '?',
13
+ ')',
14
+ ')',
15
+ '"',
16
+ '”',
17
+ ':',
18
+ ':',
19
+ ';',
20
+ ';',
21
+ )
9
22
  LIST_END_FLAG = ('.', '。', ';', ';')
10
23
 
11
24
 
12
25
  class ListLineTag:
13
- IS_LIST_START_LINE = "is_list_start_line"
14
- IS_LIST_END_LINE = "is_list_end_line"
26
+ IS_LIST_START_LINE = 'is_list_start_line'
27
+ IS_LIST_END_LINE = 'is_list_end_line'
15
28
 
16
29
 
17
30
  def __process_blocks(blocks):
@@ -27,12 +40,14 @@ def __process_blocks(blocks):
27
40
 
28
41
  # 如果当前块是 text 类型
29
42
  if current_block['type'] == 'text':
30
- current_block["bbox_fs"] = copy.deepcopy(current_block["bbox"])
31
- if 'lines' in current_block and len(current_block["lines"]) > 0:
32
- current_block['bbox_fs'] = [min([line['bbox'][0] for line in current_block['lines']]),
33
- min([line['bbox'][1] for line in current_block['lines']]),
34
- max([line['bbox'][2] for line in current_block['lines']]),
35
- max([line['bbox'][3] for line in current_block['lines']])]
43
+ current_block['bbox_fs'] = copy.deepcopy(current_block['bbox'])
44
+ if 'lines' in current_block and len(current_block['lines']) > 0:
45
+ current_block['bbox_fs'] = [
46
+ min([line['bbox'][0] for line in current_block['lines']]),
47
+ min([line['bbox'][1] for line in current_block['lines']]),
48
+ max([line['bbox'][2] for line in current_block['lines']]),
49
+ max([line['bbox'][3] for line in current_block['lines']]),
50
+ ]
36
51
  current_group.append(current_block)
37
52
 
38
53
  # 检查下一个块是否存在
@@ -64,6 +79,7 @@ def __is_list_or_index_block(block):
64
79
  line_height = first_line['bbox'][3] - first_line['bbox'][1]
65
80
  block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
66
81
  block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
82
+ page_weight, page_height = block['page_size']
67
83
 
68
84
  left_close_num = 0
69
85
  left_not_close_num = 0
@@ -75,10 +91,17 @@ def __is_list_or_index_block(block):
75
91
  multiple_para_flag = False
76
92
  last_line = block['lines'][-1]
77
93
 
94
+ if page_weight == 0:
95
+ block_weight_radio = 0
96
+ else:
97
+ block_weight_radio = block_weight / page_weight
98
+ # logger.info(f"block_weight_radio: {block_weight_radio}")
99
+
78
100
  # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
79
- if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
80
- abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 and
81
- block['bbox_fs'][2] - last_line['bbox'][2] > line_height
101
+ if (
102
+ first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2
103
+ and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2
104
+ and block['bbox_fs'][2] - last_line['bbox'][2] > line_height
82
105
  ):
83
106
  multiple_para_flag = True
84
107
 
@@ -86,14 +109,14 @@ def __is_list_or_index_block(block):
86
109
  line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
87
110
  block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
88
111
  if (
89
- line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
90
- block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
112
+ line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
113
+ and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
91
114
  ):
92
115
  external_sides_not_close_num += 1
93
116
  if abs(line_mid_x - block_mid_x) < line_height / 2:
94
117
  center_close_num += 1
95
118
 
96
- line_text = ""
119
+ line_text = ''
97
120
 
98
121
  for span in line['spans']:
99
122
  span_type = span['type']
@@ -114,7 +137,12 @@ def __is_list_or_index_block(block):
114
137
  right_close_num += 1
115
138
  else:
116
139
  # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
117
- closed_area = 0.26 * block_weight
140
+ # block宽的阈值可以小些,block窄的阈值要大
141
+
142
+ if block_weight_radio >= 0.5:
143
+ closed_area = 0.26 * block_weight
144
+ else:
145
+ closed_area = 0.36 * block_weight
118
146
  if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
119
147
  right_not_close_num += 1
120
148
 
@@ -136,15 +164,19 @@ def __is_list_or_index_block(block):
136
164
  if line_text[-1].isdigit():
137
165
  num_end_count += 1
138
166
 
139
- if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8:
167
+ if (
168
+ num_start_count / len(lines_text_list) >= 0.8
169
+ or num_end_count / len(lines_text_list) >= 0.8
170
+ ):
140
171
  line_num_flag = True
141
172
  if flag_end_count / len(lines_text_list) >= 0.8:
142
173
  line_end_flag = True
143
174
 
144
175
  # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
145
- if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
146
- and line_num_flag
147
- ):
176
+ if (
177
+ left_close_num / len(block['lines']) >= 0.8
178
+ or right_close_num / len(block['lines']) >= 0.8
179
+ ) and line_num_flag:
148
180
  for line in block['lines']:
149
181
  line[ListLineTag.IS_LIST_START_LINE] = True
150
182
  return BlockType.Index
@@ -152,17 +184,21 @@ def __is_list_or_index_block(block):
152
184
  # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
153
185
  # 补充条件block的长宽比有要求
154
186
  elif (
155
- external_sides_not_close_num >= 2 and
156
- center_close_num == len(block['lines']) and
157
- external_sides_not_close_num / len(block['lines']) >= 0.5 and
158
- block_height / block_weight > 0.4
187
+ external_sides_not_close_num >= 2
188
+ and center_close_num == len(block['lines'])
189
+ and external_sides_not_close_num / len(block['lines']) >= 0.5
190
+ and block_height / block_weight > 0.4
159
191
  ):
160
192
  for line in block['lines']:
161
193
  line[ListLineTag.IS_LIST_START_LINE] = True
162
194
  return BlockType.List
163
195
 
164
- elif left_close_num >= 2 and (
165
- right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
196
+ elif (
197
+ left_close_num >= 2
198
+ and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2)
199
+ and not multiple_para_flag
200
+ # and block_weight_radio > 0.27
201
+ ):
166
202
  # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
167
203
  if left_close_num / len(block['lines']) > 0.8:
168
204
  # 这种是每个item只有一行,且左边都贴边的短item list
@@ -173,10 +209,15 @@ def __is_list_or_index_block(block):
173
209
  # 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item
174
210
  elif line_end_flag:
175
211
  for i, line in enumerate(block['lines']):
176
- if len(lines_text_list[i]) > 0 and lines_text_list[i][-1] in LIST_END_FLAG:
212
+ if (
213
+ len(lines_text_list[i]) > 0
214
+ and lines_text_list[i][-1] in LIST_END_FLAG
215
+ ):
177
216
  line[ListLineTag.IS_LIST_END_LINE] = True
178
217
  if i + 1 < len(block['lines']):
179
- block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
218
+ block['lines'][i + 1][
219
+ ListLineTag.IS_LIST_START_LINE
220
+ ] = True
180
221
  # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
181
222
  else:
182
223
  line_start_flag = False
@@ -185,7 +226,10 @@ def __is_list_or_index_block(block):
185
226
  line[ListLineTag.IS_LIST_START_LINE] = True
186
227
  line_start_flag = False
187
228
 
188
- if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
229
+ if (
230
+ abs(block['bbox_fs'][2] - line['bbox'][2])
231
+ > 0.1 * block_weight
232
+ ):
189
233
  line[ListLineTag.IS_LIST_END_LINE] = True
190
234
  line_start_flag = True
191
235
  # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致
@@ -223,18 +267,25 @@ def __merge_2_text_blocks(block1, block2):
223
267
  if len(last_line['spans']) > 0:
224
268
  last_span = last_line['spans'][-1]
225
269
  line_height = last_line['bbox'][3] - last_line['bbox'][1]
226
- if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height and
227
- not last_span['content'].endswith(LINE_STOP_FLAG) and
228
- # 两个block宽度差距超过2倍也不合并
229
- abs(block1_weight - block2_weight) < min_block_weight
230
- ):
231
- if block1['page_num'] != block2['page_num']:
232
- for line in block1['lines']:
233
- for span in line['spans']:
234
- span[CROSS_PAGE] = True
235
- block2['lines'].extend(block1['lines'])
236
- block1['lines'] = []
237
- block1[LINES_DELETED] = True
270
+ if len(first_line['spans']) > 0:
271
+ first_span = first_line['spans'][0]
272
+ if len(first_span['content']) > 0:
273
+ span_start_with_num = first_span['content'][0].isdigit()
274
+ if (
275
+ abs(block2['bbox_fs'][2] - last_line['bbox'][2])
276
+ < line_height
277
+ and not last_span['content'].endswith(LINE_STOP_FLAG)
278
+ # 两个block宽度差距超过2倍也不合并
279
+ and abs(block1_weight - block2_weight) < min_block_weight
280
+ and not span_start_with_num
281
+ ):
282
+ if block1['page_num'] != block2['page_num']:
283
+ for line in block1['lines']:
284
+ for span in line['spans']:
285
+ span[CROSS_PAGE] = True
286
+ block2['lines'].extend(block1['lines'])
287
+ block1['lines'] = []
288
+ block1[LINES_DELETED] = True
238
289
 
239
290
  return block1, block2
240
291
 
@@ -263,7 +314,6 @@ def __is_list_group(text_blocks_group):
263
314
  def __para_merge_page(blocks):
264
315
  page_text_blocks_groups = __process_blocks(blocks)
265
316
  for text_blocks_group in page_text_blocks_groups:
266
-
267
317
  if len(text_blocks_group) > 0:
268
318
  # 需要先在合并前对所有block判断是否为list or index block
269
319
  for block in text_blocks_group:
@@ -272,7 +322,6 @@ def __para_merge_page(blocks):
272
322
  # logger.info(f"{block['type']}:{block}")
273
323
 
274
324
  if len(text_blocks_group) > 1:
275
-
276
325
  # 在合并前判断这个group 是否是一个 list group
277
326
  is_list_group = __is_list_group(text_blocks_group)
278
327
 
@@ -284,11 +333,18 @@ def __para_merge_page(blocks):
284
333
  if i - 1 >= 0:
285
334
  prev_block = text_blocks_group[i - 1]
286
335
 
287
- if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group:
336
+ if (
337
+ current_block['type'] == 'text'
338
+ and prev_block['type'] == 'text'
339
+ and not is_list_group
340
+ ):
288
341
  __merge_2_text_blocks(current_block, prev_block)
289
342
  elif (
290
- (current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or
291
- (current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index)
343
+ current_block['type'] == BlockType.List
344
+ and prev_block['type'] == BlockType.List
345
+ ) or (
346
+ current_block['type'] == BlockType.Index
347
+ and prev_block['type'] == BlockType.Index
292
348
  ):
293
349
  __merge_2_list_blocks(current_block, prev_block)
294
350
 
@@ -296,12 +352,13 @@ def __para_merge_page(blocks):
296
352
  continue
297
353
 
298
354
 
299
- def para_split(pdf_info_dict, debug_mode=False):
355
+ def para_split(pdf_info_dict):
300
356
  all_blocks = []
301
357
  for page_num, page in pdf_info_dict.items():
302
358
  blocks = copy.deepcopy(page['preproc_blocks'])
303
359
  for block in blocks:
304
360
  block['page_num'] = page_num
361
+ block['page_size'] = page['page_size']
305
362
  all_blocks.extend(blocks)
306
363
 
307
364
  __para_merge_page(all_blocks)
@@ -317,4 +374,4 @@ if __name__ == '__main__':
317
374
  # 调用函数
318
375
  groups = __process_blocks(input_blocks)
319
376
  for group_index, group in enumerate(groups):
320
- print(f"Group {group_index}: {group}")
377
+ print(f'Group {group_index}: {group}')
@@ -9,6 +9,7 @@ def parse_pdf_by_ocr(pdf_bytes,
9
9
  start_page_id=0,
10
10
  end_page_id=None,
11
11
  debug_mode=False,
12
+ lang=None,
12
13
  ):
13
14
  dataset = PymuDocDataset(pdf_bytes)
14
15
  return pdf_parse_union(dataset,
@@ -18,4 +19,5 @@ def parse_pdf_by_ocr(pdf_bytes,
18
19
  start_page_id=start_page_id,
19
20
  end_page_id=end_page_id,
20
21
  debug_mode=debug_mode,
22
+ lang=lang,
21
23
  )
@@ -10,6 +10,7 @@ def parse_pdf_by_txt(
10
10
  start_page_id=0,
11
11
  end_page_id=None,
12
12
  debug_mode=False,
13
+ lang=None,
13
14
  ):
14
15
  dataset = PymuDocDataset(pdf_bytes)
15
16
  return pdf_parse_union(dataset,
@@ -19,4 +20,5 @@ def parse_pdf_by_txt(
19
20
  start_page_id=start_page_id,
20
21
  end_page_id=end_page_id,
21
22
  debug_mode=debug_mode,
23
+ lang=lang,
22
24
  )