magic_pdf-0.9.2-py3-none-any.whl → magic_pdf-0.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110):
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -7,18 +7,32 @@ from typing import List
7
7
  import torch
8
8
  from loguru import logger
9
9
 
10
+ from magic_pdf.config.drop_reason import DropReason
10
11
  from magic_pdf.config.enums import SupportedPdfParseMethod
12
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
11
13
  from magic_pdf.data.dataset import Dataset, PageableData
12
14
  from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
13
15
  from magic_pdf.libs.clean_memory import clean_memory
14
16
  from magic_pdf.libs.commons import fitz, get_delta_time
15
17
  from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
16
18
  from magic_pdf.libs.convert_utils import dict_to_list
17
- from magic_pdf.libs.drop_reason import DropReason
18
19
  from magic_pdf.libs.hash_utils import compute_md5
19
20
  from magic_pdf.libs.local_math import float_equal
20
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
21
+ from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
21
22
  from magic_pdf.model.magic_model import MagicModel
23
+
24
+ os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
25
+ os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
26
+
27
+ try:
28
+ import torchtext
29
+
30
+ if torchtext.__version__ >= "0.18.0":
31
+ torchtext.disable_torchtext_deprecation_warning()
32
+ except ImportError:
33
+ pass
34
+ from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
35
+
22
36
  from magic_pdf.para.para_split_v3 import para_split
23
37
  from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
24
38
  from magic_pdf.pre_proc.construct_page_dict import \
@@ -30,8 +44,8 @@ from magic_pdf.pre_proc.equations_replace import (
30
44
  from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
31
45
  ocr_prepare_bboxes_for_layout_split_v2
32
46
  from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
33
- fix_block_spans,
34
- fix_discarded_block, fix_block_spans_v2)
47
+ fix_block_spans_v2,
48
+ fix_discarded_block)
35
49
  from magic_pdf.pre_proc.ocr_span_list_modify import (
36
50
  get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
37
51
  remove_overlaps_min_spans)
@@ -74,7 +88,151 @@ def __replace_STX_ETX(text_str: str):
74
88
  return text_str
75
89
 
76
90
 
77
- def txt_spans_extract(pdf_page, inline_equations, interline_equations):
91
+ def chars_to_content(span):
92
+ # # 先给chars按char['bbox']的x坐标排序
93
+ # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
94
+
95
+ # 先给chars按char['bbox']的中心点的x坐标排序
96
+ span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
97
+ content = ''
98
+
99
+ # 求char的平均宽度
100
+ if len(span['chars']) == 0:
101
+ span['content'] = content
102
+ del span['chars']
103
+ return
104
+ else:
105
+ char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
106
+ char_avg_width = char_width_sum / len(span['chars'])
107
+
108
+ for char in span['chars']:
109
+ # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
110
+ if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
111
+ content += ' '
112
+ content += char['c']
113
+ span['content'] = __replace_STX_ETX(content)
114
+ del span['chars']
115
+
116
+
117
+ LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
118
+ def fill_char_in_spans(spans, all_chars):
119
+
120
+ for char in all_chars:
121
+ for span in spans:
122
+ # 判断char是否属于LINE_STOP_FLAG
123
+ if char['c'] in LINE_STOP_FLAG:
124
+ char_is_line_stop_flag = True
125
+ else:
126
+ char_is_line_stop_flag = False
127
+ if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
128
+ span['chars'].append(char)
129
+ break
130
+
131
+ for span in spans:
132
+ chars_to_content(span)
133
+
134
+
135
+ # 使用鲁棒性更强的中心点坐标判断
136
+ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
137
+ char_center_x = (char_bbox[0] + char_bbox[2]) / 2
138
+ char_center_y = (char_bbox[1] + char_bbox[3]) / 2
139
+ span_center_y = (span_bbox[1] + span_bbox[3]) / 2
140
+ span_height = span_bbox[3] - span_bbox[1]
141
+
142
+ if (
143
+ span_bbox[0] < char_center_x < span_bbox[2]
144
+ and span_bbox[1] < char_center_y < span_bbox[3]
145
+ and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度
146
+ ):
147
+ return True
148
+ else:
149
+ # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
150
+ # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
151
+ if char_is_line_stop_flag:
152
+ if (
153
+ (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
154
+ and char_center_x > span_bbox[0]
155
+ and span_bbox[1] < char_center_y < span_bbox[3]
156
+ and abs(char_center_y - span_center_y) < span_height / 4
157
+ ):
158
+ return True
159
+ else:
160
+ return False
161
+
162
+
163
+ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
164
+
165
+ useful_spans = []
166
+ unuseful_spans = []
167
+ for span in spans:
168
+ for block in all_bboxes:
169
+ if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
170
+ continue
171
+ else:
172
+ if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
173
+ useful_spans.append(span)
174
+ break
175
+ for block in all_discarded_blocks:
176
+ if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
177
+ unuseful_spans.append(span)
178
+ break
179
+
180
+ text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
181
+
182
+ # @todo: 拿到char之后把倾斜角度较大的先删一遍
183
+ all_pymu_chars = []
184
+ for block in text_blocks:
185
+ for line in block['lines']:
186
+ for span in line['spans']:
187
+ all_pymu_chars.extend(span['chars'])
188
+
189
+ new_spans = []
190
+
191
+ for span in useful_spans:
192
+ if span['type'] in [ContentType.Text]:
193
+ span['chars'] = []
194
+ new_spans.append(span)
195
+
196
+ for span in unuseful_spans:
197
+ if span['type'] in [ContentType.Text]:
198
+ span['chars'] = []
199
+ new_spans.append(span)
200
+
201
+ fill_char_in_spans(new_spans, all_pymu_chars)
202
+
203
+ empty_spans = []
204
+ for span in new_spans:
205
+ if len(span['content']) == 0:
206
+ empty_spans.append(span)
207
+ if len(empty_spans) > 0:
208
+
209
+ # 初始化ocr模型
210
+ atom_model_manager = AtomModelSingleton()
211
+ ocr_model = atom_model_manager.get_atom_model(
212
+ atom_model_name="ocr",
213
+ ocr_show_log=False,
214
+ det_db_box_thresh=0.3,
215
+ lang=lang
216
+ )
217
+
218
+ for span in empty_spans:
219
+ spans.remove(span)
220
+ # 对span的bbox截图
221
+ span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
222
+ ocr_res = ocr_model.ocr(span_img, det=False)
223
+ # logger.info(f"ocr_res: {ocr_res}")
224
+ # logger.info(f"empty_span: {span}")
225
+ if ocr_res and len(ocr_res) > 0:
226
+ if len(ocr_res[0]) > 0:
227
+ ocr_text, ocr_score = ocr_res[0][0]
228
+ if ocr_score > 0.5 and len(ocr_text) > 0:
229
+ span['content'] = ocr_text
230
+ spans.append(span)
231
+
232
+ return spans
233
+
234
+
235
+ def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
78
236
  text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
79
237
  char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
80
238
  'blocks'
@@ -164,8 +322,8 @@ class ModelSingleton:
164
322
 
165
323
 
166
324
  def do_predict(boxes: List[List[int]], model) -> List[int]:
167
- from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
168
- prepare_inputs)
325
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
326
+ boxes2inputs, parse_logits, prepare_inputs)
169
327
 
170
328
  inputs = boxes2inputs(boxes)
171
329
  inputs = prepare_inputs(inputs, model)
@@ -174,23 +332,59 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
174
332
 
175
333
 
176
334
  def cal_block_index(fix_blocks, sorted_bboxes):
177
- for block in fix_blocks:
178
335
 
179
- line_index_list = []
180
- if len(block['lines']) == 0:
181
- block['index'] = sorted_bboxes.index(block['bbox'])
182
- else:
336
+ if sorted_bboxes is not None:
337
+ # 使用layoutreader排序
338
+ for block in fix_blocks:
339
+ line_index_list = []
340
+ if len(block['lines']) == 0:
341
+ block['index'] = sorted_bboxes.index(block['bbox'])
342
+ else:
343
+ for line in block['lines']:
344
+ line['index'] = sorted_bboxes.index(line['bbox'])
345
+ line_index_list.append(line['index'])
346
+ median_value = statistics.median(line_index_list)
347
+ block['index'] = median_value
348
+
349
+ # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
350
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
351
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
352
+ block['lines'] = copy.deepcopy(block['real_lines'])
353
+ del block['real_lines']
354
+ else:
355
+ # 使用xycut排序
356
+ block_bboxes = []
357
+ for block in fix_blocks:
358
+ block_bboxes.append(block['bbox'])
359
+
360
+ # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
361
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
362
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
363
+ block['lines'] = copy.deepcopy(block['real_lines'])
364
+ del block['real_lines']
365
+
366
+ import numpy as np
367
+
368
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
369
+ recursive_xy_cut
370
+
371
+ random_boxes = np.array(block_bboxes)
372
+ np.random.shuffle(random_boxes)
373
+ res = []
374
+ recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
375
+ assert len(res) == len(block_bboxes)
376
+ sorted_boxes = random_boxes[np.array(res)].tolist()
377
+
378
+ for i, block in enumerate(fix_blocks):
379
+ block['index'] = sorted_boxes.index(block['bbox'])
380
+
381
+ # 生成line index
382
+ sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
383
+ line_inedx = 1
384
+ for block in sorted_blocks:
183
385
  for line in block['lines']:
184
- line['index'] = sorted_bboxes.index(line['bbox'])
185
- line_index_list.append(line['index'])
186
- median_value = statistics.median(line_index_list)
187
- block['index'] = median_value
188
-
189
- # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
190
- if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
191
- block['virtual_lines'] = copy.deepcopy(block['lines'])
192
- block['lines'] = copy.deepcopy(block['real_lines'])
193
- del block['real_lines']
386
+ line['index'] = line_inedx
387
+ line_inedx += 1
194
388
 
195
389
  return fix_blocks
196
390
 
@@ -257,13 +451,16 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
257
451
  page_line_list.append(bbox)
258
452
  elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
259
453
  bbox = block['bbox']
260
- block["real_lines"] = copy.deepcopy(block['lines'])
454
+ block['real_lines'] = copy.deepcopy(block['lines'])
261
455
  lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
262
456
  block['lines'] = []
263
457
  for line in lines:
264
458
  block['lines'].append({'bbox': line, 'spans': []})
265
459
  page_line_list.extend(lines)
266
460
 
461
+ if len(page_line_list) > 200: # layoutreader最高支持512line
462
+ return None
463
+
267
464
  # 使用layoutreader排序
268
465
  x_scale = 1000.0 / page_w
269
466
  y_scale = 1000.0 / page_h
@@ -425,18 +622,16 @@ def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
425
622
 
426
623
 
427
624
  def parse_page_core(
428
- page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
625
+ page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
429
626
  ):
430
627
  need_drop = False
431
628
  drop_reason = []
432
629
 
433
630
  """从magic_model对象中获取后面会用到的区块信息"""
434
- # img_blocks = magic_model.get_imgs(page_id)
435
- # table_blocks = magic_model.get_tables(page_id)
436
-
437
631
  img_groups = magic_model.get_imgs_v2(page_id)
438
632
  table_groups = magic_model.get_tables_v2(page_id)
439
633
 
634
+ """对image和table的区块分组"""
440
635
  img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
441
636
  img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
442
637
  )
@@ -480,38 +675,20 @@ def parse_page_core(
480
675
  page_h,
481
676
  )
482
677
 
678
+ """获取所有的spans信息"""
483
679
  spans = magic_model.get_all_spans(page_id)
484
680
 
485
- """根据parse_mode,构造spans"""
486
- if parse_mode == SupportedPdfParseMethod.TXT:
487
- """ocr 中文本类的 span 用 pymu spans 替换!"""
488
- pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
489
- spans = replace_text_span(pymu_spans, spans)
490
- elif parse_mode == SupportedPdfParseMethod.OCR:
491
- pass
492
- else:
493
- raise Exception('parse_mode must be txt or ocr')
494
-
495
681
  """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
496
682
  """顺便删除大水印并保留abandon的span"""
497
683
  spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
498
684
 
499
- """删除重叠spans中置信度较低的那些"""
500
- spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
501
- """删除重叠spans中较小的那些"""
502
- spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
503
- """对image和table截图"""
504
- spans = ocr_cut_image_and_table(
505
- spans, page_doc, page_id, pdf_bytes_md5, imageWriter
506
- )
507
-
508
685
  """先处理不需要排版的discarded_blocks"""
509
686
  discarded_block_with_spans, spans = fill_spans_in_blocks(
510
687
  all_discarded_blocks, spans, 0.4
511
688
  )
512
689
  fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
513
690
 
514
- """如果当前页面没有bbox则跳过"""
691
+ """如果当前页面没有有效的bbox则跳过"""
515
692
  if len(all_bboxes) == 0:
516
693
  logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
517
694
  return ocr_construct_page_component_v2(
@@ -529,7 +706,32 @@ def parse_page_core(
529
706
  drop_reason,
530
707
  )
531
708
 
532
- """将span填入blocks中"""
709
+ """删除重叠spans中置信度较低的那些"""
710
+ spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
711
+ """删除重叠spans中较小的那些"""
712
+ spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
713
+
714
+ """根据parse_mode,构造spans,主要是文本类的字符填充"""
715
+ if parse_mode == SupportedPdfParseMethod.TXT:
716
+
717
+ """之前的公式替换方案"""
718
+ # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
719
+ # spans = replace_text_span(pymu_spans, spans)
720
+
721
+ """ocr 中文本类的 span 用 pymu spans 替换!"""
722
+ spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
723
+
724
+ elif parse_mode == SupportedPdfParseMethod.OCR:
725
+ pass
726
+ else:
727
+ raise Exception('parse_mode must be txt or ocr')
728
+
729
+ """对image和table截图"""
730
+ spans = ocr_cut_image_and_table(
731
+ spans, page_doc, page_id, pdf_bytes_md5, imageWriter
732
+ )
733
+
734
+ """span填充进block"""
533
735
  block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
534
736
 
535
737
  """对block进行fix操作"""
@@ -579,6 +781,7 @@ def pdf_parse_union(
579
781
  start_page_id=0,
580
782
  end_page_id=None,
581
783
  debug_mode=False,
784
+ lang=None,
582
785
  ):
583
786
  pdf_bytes_md5 = compute_md5(dataset.data_bits())
584
787
 
@@ -615,7 +818,7 @@ def pdf_parse_union(
615
818
  """解析pdf中的每一页"""
616
819
  if start_page_id <= page_id <= end_page_id:
617
820
  page_info = parse_page_core(
618
- page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
821
+ page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
619
822
  )
620
823
  else:
621
824
  page_info = page.get_page_info()
@@ -627,7 +830,7 @@ def pdf_parse_union(
627
830
  pdf_info_dict[f'page_{page_id}'] = page_info
628
831
 
629
832
  """分段"""
630
- para_split(pdf_info_dict, debug_mode=debug_mode)
833
+ para_split(pdf_info_dict)
631
834
 
632
835
  """dict转list"""
633
836
  pdf_info_list = dict_to_list(pdf_info_dict)
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -1,22 +1,20 @@
1
1
  from abc import ABC, abstractmethod
2
2
 
3
+ from magic_pdf.config.drop_reason import DropReason
4
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
5
+ from magic_pdf.data.data_reader_writer import DataWriter
3
6
  from magic_pdf.dict2md.ocr_mkcontent import union_make
4
7
  from magic_pdf.filter.pdf_classify_by_type import classify
5
8
  from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
6
- from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
7
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
8
- from magic_pdf.libs.drop_reason import DropReason
9
9
  from magic_pdf.libs.json_compressor import JsonCompressor
10
10
 
11
11
 
12
12
  class AbsPipe(ABC):
13
- """
14
- txt和ocr处理的抽象类
15
- """
16
- PIP_OCR = "ocr"
17
- PIP_TXT = "txt"
13
+ """txt和ocr处理的抽象类."""
14
+ PIP_OCR = 'ocr'
15
+ PIP_TXT = 'txt'
18
16
 
19
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
17
+ def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
20
18
  start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
21
19
  self.pdf_bytes = pdf_bytes
22
20
  self.model_list = model_list
@@ -29,29 +27,23 @@ class AbsPipe(ABC):
29
27
  self.layout_model = layout_model
30
28
  self.formula_enable = formula_enable
31
29
  self.table_enable = table_enable
32
-
30
+
33
31
  def get_compress_pdf_mid_data(self):
34
32
  return JsonCompressor.compress_json(self.pdf_mid_data)
35
33
 
36
34
  @abstractmethod
37
35
  def pipe_classify(self):
38
- """
39
- 有状态的分类
40
- """
36
+ """有状态的分类."""
41
37
  raise NotImplementedError
42
38
 
43
39
  @abstractmethod
44
40
  def pipe_analyze(self):
45
- """
46
- 有状态的跑模型分析
47
- """
41
+ """有状态的跑模型分析."""
48
42
  raise NotImplementedError
49
43
 
50
44
  @abstractmethod
51
45
  def pipe_parse(self):
52
- """
53
- 有状态的解析
54
- """
46
+ """有状态的解析."""
55
47
  raise NotImplementedError
56
48
 
57
49
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
@@ -64,27 +56,25 @@ class AbsPipe(ABC):
64
56
 
65
57
  @staticmethod
66
58
  def classify(pdf_bytes: bytes) -> str:
67
- """
68
- 根据pdf的元数据,判断是文本pdf,还是ocr pdf
69
- """
59
+ """根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
70
60
  pdf_meta = pdf_meta_scan(pdf_bytes)
71
- if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常
61
+ if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
72
62
  raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
73
63
  else:
74
- is_encrypted = pdf_meta["is_encrypted"]
75
- is_needs_password = pdf_meta["is_needs_password"]
64
+ is_encrypted = pdf_meta['is_encrypted']
65
+ is_needs_password = pdf_meta['is_needs_password']
76
66
  if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
77
- raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")
67
+ raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
78
68
  else:
79
69
  is_text_pdf, results = classify(
80
- pdf_meta["total_page"],
81
- pdf_meta["page_width_pts"],
82
- pdf_meta["page_height_pts"],
83
- pdf_meta["image_info_per_page"],
84
- pdf_meta["text_len_per_page"],
85
- pdf_meta["imgs_per_page"],
86
- pdf_meta["text_layout_per_page"],
87
- pdf_meta["invalid_chars"],
70
+ pdf_meta['total_page'],
71
+ pdf_meta['page_width_pts'],
72
+ pdf_meta['page_height_pts'],
73
+ pdf_meta['image_info_per_page'],
74
+ pdf_meta['text_len_per_page'],
75
+ pdf_meta['imgs_per_page'],
76
+ pdf_meta['text_layout_per_page'],
77
+ pdf_meta['invalid_chars'],
88
78
  )
89
79
  if is_text_pdf:
90
80
  return AbsPipe.PIP_TXT
@@ -93,22 +83,16 @@ class AbsPipe(ABC):
93
83
 
94
84
  @staticmethod
95
85
  def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
96
- """
97
- 根据pdf类型,生成统一格式content_list
98
- """
86
+ """根据pdf类型,生成统一格式content_list."""
99
87
  pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
100
- pdf_info_list = pdf_mid_data["pdf_info"]
88
+ pdf_info_list = pdf_mid_data['pdf_info']
101
89
  content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
102
90
  return content_list
103
91
 
104
92
  @staticmethod
105
93
  def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
106
- """
107
- 根据pdf类型,markdown
108
- """
94
+ """根据pdf类型,markdown."""
109
95
  pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
110
- pdf_info_list = pdf_mid_data["pdf_info"]
96
+ pdf_info_list = pdf_mid_data['pdf_info']
111
97
  md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
112
98
  return md_content
113
-
114
-
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from loguru import logger
2
2
 
3
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
3
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
4
+ from magic_pdf.data.data_reader_writer import DataWriter
4
5
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
5
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
6
6
  from magic_pdf.pipe.AbsPipe import AbsPipe
7
7
  from magic_pdf.user_api import parse_ocr_pdf
8
8
 
9
9
 
10
10
  class OCRPipe(AbsPipe):
11
11
 
12
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
12
+ def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
13
  start_page_id=0, end_page_id=None, lang=None,
14
14
  layout_model=None, formula_enable=None, table_enable=None):
15
15
  super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
@@ -32,10 +32,10 @@ class OCRPipe(AbsPipe):
32
32
 
33
33
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
34
34
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
35
- logger.info("ocr_pipe mk content list finished")
35
+ logger.info('ocr_pipe mk content list finished')
36
36
  return result
37
37
 
38
38
  def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
39
39
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
40
- logger.info(f"ocr_pipe mk {md_make_mode} finished")
40
+ logger.info(f'ocr_pipe mk {md_make_mode} finished')
41
41
  return result
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -1,16 +1,15 @@
1
1
  from loguru import logger
2
2
 
3
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
3
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
4
+ from magic_pdf.data.data_reader_writer import DataWriter
4
5
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
5
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
6
- from magic_pdf.libs.json_compressor import JsonCompressor
7
6
  from magic_pdf.pipe.AbsPipe import AbsPipe
8
7
  from magic_pdf.user_api import parse_txt_pdf
9
8
 
10
9
 
11
10
  class TXTPipe(AbsPipe):
12
11
 
13
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
12
+ def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
14
13
  start_page_id=0, end_page_id=None, lang=None,
15
14
  layout_model=None, formula_enable=None, table_enable=None):
16
15
  super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
@@ -33,10 +32,10 @@ class TXTPipe(AbsPipe):
33
32
 
34
33
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
35
34
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
36
- logger.info("txt_pipe mk content list finished")
35
+ logger.info('txt_pipe mk content list finished')
37
36
  return result
38
37
 
39
38
  def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
40
39
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
41
- logger.info(f"txt_pipe mk {md_make_mode} finished")
40
+ logger.info(f'txt_pipe mk {md_make_mode} finished')
42
41
  return result