magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +12 -12
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +6 -6
- magic_pdf/libs/draw_bbox.py +13 -6
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +142 -351
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +149 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +121 -66
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +253 -50
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +70 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
- magic_pdf-0.10.0.dist-info/RECORD +198 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -53
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf-0.9.2.dist-info/RECORD +0 -178
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/pdf_parse_union_core_v2.py
CHANGED
@@ -7,18 +7,32 @@ from typing import List
 import torch
 from loguru import logger
 
+from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
-from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.local_math import float_equal
-from magic_pdf.libs.
+from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel
+
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
+
+try:
+    import torchtext
+
+    if torchtext.__version__ >= "0.18.0":
+        torchtext.disable_torchtext_deprecation_warning()
+except ImportError:
+    pass
+from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
+
 from magic_pdf.para.para_split_v3 import para_split
 from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
 from magic_pdf.pre_proc.construct_page_dict import \
@@ -30,8 +44,8 @@ from magic_pdf.pre_proc.equations_replace import (
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
     ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
-
-                                               fix_discarded_block
+                                               fix_block_spans_v2,
+                                               fix_discarded_block)
 from magic_pdf.pre_proc.ocr_span_list_modify import (
     get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
     remove_overlaps_min_spans)
@@ -74,7 +88,151 @@ def __replace_STX_ETX(text_str: str):
     return text_str
 
 
-def
+def chars_to_content(span):
+    # # 先给chars按char['bbox']的x坐标排序
+    # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
+
+    # 先给chars按char['bbox']的中心点的x坐标排序
+    span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
+    content = ''
+
+    # 求char的平均宽度
+    if len(span['chars']) == 0:
+        span['content'] = content
+        del span['chars']
+        return
+    else:
+        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
+        char_avg_width = char_width_sum / len(span['chars'])
+
+    for char in span['chars']:
+        # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
+        if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
+            content += ' '
+        content += char['c']
+    span['content'] = __replace_STX_ETX(content)
+    del span['chars']
+
+
+LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
+def fill_char_in_spans(spans, all_chars):
+
+    for char in all_chars:
+        for span in spans:
+            # 判断char是否属于LINE_STOP_FLAG
+            if char['c'] in LINE_STOP_FLAG:
+                char_is_line_stop_flag = True
+            else:
+                char_is_line_stop_flag = False
+            if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
+                span['chars'].append(char)
+                break
+
+    for span in spans:
+        chars_to_content(span)
+
+
+# 使用鲁棒性更强的中心点坐标判断
+def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
+    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
+    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
+    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
+    span_height = span_bbox[3] - span_bbox[1]
+
+    if (
+        span_bbox[0] < char_center_x < span_bbox[2]
+        and span_bbox[1] < char_center_y < span_bbox[3]
+        and abs(char_center_y - span_center_y) < span_height / 4  # 字符的中轴和span的中轴高度差不能超过1/4span高度
+    ):
+        return True
+    else:
+        # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
+        # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
+        if char_is_line_stop_flag:
+            if (
+                (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
+                and char_center_x > span_bbox[0]
+                and span_bbox[1] < char_center_y < span_bbox[3]
+                and abs(char_center_y - span_center_y) < span_height / 4
+            ):
+                return True
+        else:
+            return False
+
+
+def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
+
+    useful_spans = []
+    unuseful_spans = []
+    for span in spans:
+        for block in all_bboxes:
+            if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
+                continue
+            else:
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
+                    useful_spans.append(span)
+                    break
+        for block in all_discarded_blocks:
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
+                unuseful_spans.append(span)
+                break
+
+    text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+
+    # @todo: 拿到char之后把倾斜角度较大的先删一遍
+    all_pymu_chars = []
+    for block in text_blocks:
+        for line in block['lines']:
+            for span in line['spans']:
+                all_pymu_chars.extend(span['chars'])
+
+    new_spans = []
+
+    for span in useful_spans:
+        if span['type'] in [ContentType.Text]:
+            span['chars'] = []
+            new_spans.append(span)
+
+    for span in unuseful_spans:
+        if span['type'] in [ContentType.Text]:
+            span['chars'] = []
+            new_spans.append(span)
+
+    fill_char_in_spans(new_spans, all_pymu_chars)
+
+    empty_spans = []
+    for span in new_spans:
+        if len(span['content']) == 0:
+            empty_spans.append(span)
+    if len(empty_spans) > 0:
+
+        # 初始化ocr模型
+        atom_model_manager = AtomModelSingleton()
+        ocr_model = atom_model_manager.get_atom_model(
+            atom_model_name="ocr",
+            ocr_show_log=False,
+            det_db_box_thresh=0.3,
+            lang=lang
+        )
+
+        for span in empty_spans:
+            spans.remove(span)
+            # 对span的bbox截图
+            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
+            ocr_res = ocr_model.ocr(span_img, det=False)
+            # logger.info(f"ocr_res: {ocr_res}")
+            # logger.info(f"empty_span: {span}")
+            if ocr_res and len(ocr_res) > 0:
+                if len(ocr_res[0]) > 0:
+                    ocr_text, ocr_score = ocr_res[0][0]
+                    if ocr_score > 0.5 and len(ocr_text) > 0:
+                        span['content'] = ocr_text
+                        spans.append(span)
+
+    return spans
+
+
+def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
     char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
         'blocks'
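The new char-to-span assembly above replaces whole-span text extraction: PyMuPDF `rawdict` chars are assigned to model-detected spans by a centre-point test, then concatenated, inserting a space wherever the horizontal gap between neighbouring chars exceeds the average char width. Below is a minimal, self-contained sketch of that heuristic on made-up bboxes; it is an illustration only, not the package's own helpers.

```python
# Toy illustration of the centre-point span test and the gap-based space
# insertion used in the hunk above. Bboxes are (x0, y0, x1, y1); data is made up.
def char_in_span(char_bbox, span_bbox):
    cx = (char_bbox[0] + char_bbox[2]) / 2
    cy = (char_bbox[1] + char_bbox[3]) / 2
    span_cy = (span_bbox[1] + span_bbox[3]) / 2
    span_h = span_bbox[3] - span_bbox[1]
    return (span_bbox[0] < cx < span_bbox[2]
            and span_bbox[1] < cy < span_bbox[3]
            and abs(cy - span_cy) < span_h / 4)   # centres must be vertically close

def join_chars(chars):
    chars = sorted(chars, key=lambda c: (c['bbox'][0] + c['bbox'][2]) / 2)
    avg_w = sum(c['bbox'][2] - c['bbox'][0] for c in chars) / len(chars)
    out = ''
    for prev, cur in zip([None] + chars[:-1], chars):
        if prev and cur['bbox'][0] - prev['bbox'][2] > avg_w:
            out += ' '   # gap wider than one average char -> word break
        out += cur['c']
    return out

span = {'bbox': (0, 0, 50, 10)}
chars = [{'bbox': (0, 1, 8, 9), 'c': 'a'},
         {'bbox': (9, 1, 17, 9), 'c': 'b'},
         {'bbox': (30, 1, 38, 9), 'c': 'c'}]   # large gap before 'c'
kept = [c for c in chars if char_in_span(c['bbox'], span['bbox'])]
print(join_chars(kept))  # -> "ab c"
```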
@@ -164,8 +322,8 @@ class ModelSingleton:
 
 
 def do_predict(boxes: List[List[int]], model) -> List[int]:
-    from magic_pdf.model.
-
+    from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
+        boxes2inputs, parse_logits, prepare_inputs)
 
     inputs = boxes2inputs(boxes)
     inputs = prepare_inputs(inputs, model)
@@ -174,23 +332,59 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
 
 
 def cal_block_index(fix_blocks, sorted_bboxes):
-    for block in fix_blocks:
 
-
-
-
-
+    if sorted_bboxes is not None:
+        # 使用layoutreader排序
+        for block in fix_blocks:
+            line_index_list = []
+            if len(block['lines']) == 0:
+                block['index'] = sorted_bboxes.index(block['bbox'])
+            else:
+                for line in block['lines']:
+                    line['index'] = sorted_bboxes.index(line['bbox'])
+                    line_index_list.append(line['index'])
+                median_value = statistics.median(line_index_list)
+                block['index'] = median_value
+
+            # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
+            if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
+                block['virtual_lines'] = copy.deepcopy(block['lines'])
+                block['lines'] = copy.deepcopy(block['real_lines'])
+                del block['real_lines']
+    else:
+        # 使用xycut排序
+        block_bboxes = []
+        for block in fix_blocks:
+            block_bboxes.append(block['bbox'])
+
+            # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
+            if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
+                block['virtual_lines'] = copy.deepcopy(block['lines'])
+                block['lines'] = copy.deepcopy(block['real_lines'])
+                del block['real_lines']
+
+        import numpy as np
+
+        from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
+            recursive_xy_cut
+
+        random_boxes = np.array(block_bboxes)
+        np.random.shuffle(random_boxes)
+        res = []
+        recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
+        assert len(res) == len(block_bboxes)
+        sorted_boxes = random_boxes[np.array(res)].tolist()
+
+        for i, block in enumerate(fix_blocks):
+            block['index'] = sorted_boxes.index(block['bbox'])
+
+        # 生成line index
+        sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
+        line_inedx = 1
+        for block in sorted_blocks:
             for line in block['lines']:
-                line['index'] =
-
-                median_value = statistics.median(line_index_list)
-                block['index'] = median_value
-
-            # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
-            if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
-                block['virtual_lines'] = copy.deepcopy(block['lines'])
-                block['lines'] = copy.deepcopy(block['real_lines'])
-                del block['real_lines']
+                line['index'] = line_inedx
+                line_inedx += 1
 
     return fix_blocks
 
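When a layoutreader ordering is available, each block's reading-order index now becomes the median of its lines' indices; without one, the code falls back to an XY-cut over the block bboxes. A toy sketch of the median step (standard library only, with made-up line indices):

```python
import statistics

# Toy blocks whose lines already carry reading-order indices from a sorter.
blocks = [
    {'name': 'body',    'line_indices': [3, 4, 5, 9]},   # 9 is one mis-ordered line
    {'name': 'caption', 'line_indices': [6, 7]},
]
for block in blocks:
    # The median is robust against a single outlier line inside the block.
    block['index'] = statistics.median(block['line_indices'])

print(sorted(blocks, key=lambda b: b['index']))  # body (4.5) sorts before caption (6.5)
```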
@@ -257,13 +451,16 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
             page_line_list.append(bbox)
         elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
             bbox = block['bbox']
-            block[
+            block['real_lines'] = copy.deepcopy(block['lines'])
             lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
             block['lines'] = []
             for line in lines:
                 block['lines'].append({'bbox': line, 'spans': []})
             page_line_list.extend(lines)
 
+    if len(page_line_list) > 200:  # layoutreader最高支持512line
+        return None
+
     # 使用layoutreader排序
     x_scale = 1000.0 / page_w
     y_scale = 1000.0 / page_h
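Pages with more than 200 candidate lines now skip the model sorter (the comment notes layoutreader's 512-line ceiling), and the remaining lines are normalised to the 0-1000 grid the model expects via the `x_scale`/`y_scale` factors shown above. A minimal sketch of that normalisation on a made-up page size, not taken from the package:

```python
# Toy sketch of the 0-1000 normalisation done before calling layoutreader.
# Page size and line bbox are made up; the scale factors mirror
# x_scale = 1000 / page_w and y_scale = 1000 / page_h from the hunk above.
page_w, page_h = 612, 792            # US-letter page in PDF points
x_scale, y_scale = 1000.0 / page_w, 1000.0 / page_h

line_bbox = (72, 90, 540, 102)       # x0, y0, x1, y1 in points
scaled = (round(line_bbox[0] * x_scale), round(line_bbox[1] * y_scale),
          round(line_bbox[2] * x_scale), round(line_bbox[3] * y_scale))
print(scaled)                        # roughly (118, 114, 882, 129)
```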
@@ -425,18 +622,16 @@ def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
 
 
 def parse_page_core(
-    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
+    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
 ):
     need_drop = False
     drop_reason = []
 
     """从magic_model对象中获取后面会用到的区块信息"""
-    # img_blocks = magic_model.get_imgs(page_id)
-    # table_blocks = magic_model.get_tables(page_id)
-
     img_groups = magic_model.get_imgs_v2(page_id)
     table_groups = magic_model.get_tables_v2(page_id)
 
+    """对image和table的区块分组"""
     img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
         img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
     )
@@ -480,38 +675,20 @@ def parse_page_core(
         page_h,
     )
 
+    """获取所有的spans信息"""
     spans = magic_model.get_all_spans(page_id)
 
-    """根据parse_mode,构造spans"""
-    if parse_mode == SupportedPdfParseMethod.TXT:
-        """ocr 中文本类的 span 用 pymu spans 替换!"""
-        pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
-        spans = replace_text_span(pymu_spans, spans)
-    elif parse_mode == SupportedPdfParseMethod.OCR:
-        pass
-    else:
-        raise Exception('parse_mode must be txt or ocr')
-
     """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
     """顺便删除大水印并保留abandon的span"""
     spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
 
-    """删除重叠spans中置信度较低的那些"""
-    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    """删除重叠spans中较小的那些"""
-    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
-    """对image和table截图"""
-    spans = ocr_cut_image_and_table(
-        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
-    )
-
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
     )
     fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
 
-    """
+    """如果当前页面没有有效的bbox则跳过"""
     if len(all_bboxes) == 0:
         logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
         return ocr_construct_page_component_v2(
@@ -529,7 +706,32 @@ def parse_page_core(
             drop_reason,
         )
 
-    """
+    """删除重叠spans中置信度较低的那些"""
+    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
+    """删除重叠spans中较小的那些"""
+    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
+
+    """根据parse_mode,构造spans,主要是文本类的字符填充"""
+    if parse_mode == SupportedPdfParseMethod.TXT:
+
+        """之前的公式替换方案"""
+        # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
+        # spans = replace_text_span(pymu_spans, spans)
+
+        """ocr 中文本类的 span 用 pymu spans 替换!"""
+        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
+
+    elif parse_mode == SupportedPdfParseMethod.OCR:
+        pass
+    else:
+        raise Exception('parse_mode must be txt or ocr')
+
+    """对image和table截图"""
+    spans = ocr_cut_image_and_table(
+        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
+    )
+
+    """span填充进block"""
     block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
 
     """对block进行fix操作"""
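The page flow for TXT-mode PDFs is reordered here: overlapping spans are pruned first, then text spans are filled char-by-char via `txt_spans_extract_v2`, and only afterwards are image/table spans cropped. Spans that end up with no text are re-recognised from a crop of their bbox and kept only when the OCR score clears 0.5. A toy sketch of that decision logic with a stubbed OCR call (the stub returns the same `ocr_res[0][0] -> (text, score)` shape the diff unpacks; nothing here is the package's own API):

```python
# Toy sketch of the empty-span OCR fallback added above, with a fake OCR stub.
def fake_ocr(_bbox):
    return [[('Figure 3: pipeline overview', 0.87)]]   # (text, confidence)

spans = [{'bbox': (10, 10, 200, 24), 'content': ''},
         {'bbox': (10, 30, 200, 44), 'content': 'already filled'}]

for span in [s for s in spans if not s['content']]:
    ocr_res = fake_ocr(span['bbox'])        # the real code crops the page image first
    if ocr_res and ocr_res[0]:
        text, score = ocr_res[0][0]
        if score > 0.5 and text:            # keep only confident, non-empty results
            span['content'] = text

print([s['content'] for s in spans])
```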
@@ -579,6 +781,7 @@ def pdf_parse_union(
     start_page_id=0,
     end_page_id=None,
     debug_mode=False,
+    lang=None,
 ):
     pdf_bytes_md5 = compute_md5(dataset.data_bits())
 
@@ -615,7 +818,7 @@ def pdf_parse_union(
         """解析pdf中的每一页"""
         if start_page_id <= page_id <= end_page_id:
             page_info = parse_page_core(
-                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
+                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
             )
         else:
             page_info = page.get_page_info()
@@ -627,7 +830,7 @@ def pdf_parse_union(
         pdf_info_dict[f'page_{page_id}'] = page_info
 
     """分段"""
-    para_split(pdf_info_dict
+    para_split(pdf_info_dict)
 
     """dict转list"""
     pdf_info_list = dict_to_list(pdf_info_dict)
magic_pdf/pipe/AbsPipe.py
CHANGED
@@ -1,22 +1,20 @@
 from abc import ABC, abstractmethod
 
+from magic_pdf.config.drop_reason import DropReason
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
 from magic_pdf.dict2md.ocr_mkcontent import union_make
 from magic_pdf.filter.pdf_classify_by_type import classify
 from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
-from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
 
 
 class AbsPipe(ABC):
-    """
-
-
-    PIP_OCR = "ocr"
-    PIP_TXT = "txt"
+    """txt和ocr处理的抽象类."""
+    PIP_OCR = 'ocr'
+    PIP_TXT = 'txt'
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer:
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
@@ -29,29 +27,23 @@ class AbsPipe(ABC):
         self.layout_model = layout_model
         self.formula_enable = formula_enable
         self.table_enable = table_enable
-
+
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
 
     @abstractmethod
     def pipe_classify(self):
-        """
-        有状态的分类
-        """
+        """有状态的分类."""
         raise NotImplementedError
 
     @abstractmethod
     def pipe_analyze(self):
-        """
-        有状态的跑模型分析
-        """
+        """有状态的跑模型分析."""
         raise NotImplementedError
 
     @abstractmethod
     def pipe_parse(self):
-        """
-        有状态的解析
-        """
+        """有状态的解析."""
         raise NotImplementedError
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
@@ -64,27 +56,25 @@ class AbsPipe(ABC):
 
     @staticmethod
     def classify(pdf_bytes: bytes) -> str:
-        """
-        根据pdf的元数据,判断是文本pdf,还是ocr pdf
-        """
+        """根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
         pdf_meta = pdf_meta_scan(pdf_bytes)
-        if pdf_meta.get(
+        if pdf_meta.get('_need_drop', False):  # 如果返回了需要丢弃的标志,则抛出异常
             raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
         else:
-            is_encrypted = pdf_meta[
-            is_needs_password = pdf_meta[
+            is_encrypted = pdf_meta['is_encrypted']
+            is_needs_password = pdf_meta['is_needs_password']
             if is_encrypted or is_needs_password:  # 加密的,需要密码的,没有页面的,都不处理
-                raise Exception(f
+                raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
             else:
                 is_text_pdf, results = classify(
-                    pdf_meta[
-                    pdf_meta[
-                    pdf_meta[
-                    pdf_meta[
-                    pdf_meta[
-                    pdf_meta[
-                    pdf_meta[
-                    pdf_meta[
+                    pdf_meta['total_page'],
+                    pdf_meta['page_width_pts'],
+                    pdf_meta['page_height_pts'],
+                    pdf_meta['image_info_per_page'],
+                    pdf_meta['text_len_per_page'],
+                    pdf_meta['imgs_per_page'],
+                    pdf_meta['text_layout_per_page'],
+                    pdf_meta['invalid_chars'],
                 )
                 if is_text_pdf:
                     return AbsPipe.PIP_TXT
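`AbsPipe.classify` scans the PDF metadata and routes a document to the text pipeline (`PIP_TXT`, `'txt'`) or the OCR pipeline (`PIP_OCR`, `'ocr'`), raising when the metadata scan flags the file for dropping or the PDF is encrypted. A minimal usage sketch, assuming magic-pdf 0.10.0 is installed and a local `demo.pdf` exists (both assumptions, not part of the diff):

```python
# Hedged usage sketch: classify a PDF before picking a pipeline.
from magic_pdf.pipe.AbsPipe import AbsPipe

with open('demo.pdf', 'rb') as f:   # demo.pdf is a placeholder path
    pdf_bytes = f.read()

pipe_type = AbsPipe.classify(pdf_bytes)   # 'txt' for text-layer PDFs, otherwise 'ocr'
print(pipe_type)
```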
@@ -93,22 +83,16 @@ class AbsPipe(ABC):
 
     @staticmethod
     def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
-        """
-        根据pdf类型,生成统一格式content_list
-        """
+        """根据pdf类型,生成统一格式content_list."""
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
-        pdf_info_list = pdf_mid_data[
+        pdf_info_list = pdf_mid_data['pdf_info']
         content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
         return content_list
 
     @staticmethod
     def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
-        """
-        根据pdf类型,markdown
-        """
+        """根据pdf类型,markdown."""
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
-        pdf_info_list = pdf_mid_data[
+        pdf_info_list = pdf_mid_data['pdf_info']
         md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
         return md_content
-
-
magic_pdf/pipe/OCRPipe.py
CHANGED
@@ -1,15 +1,15 @@
 from loguru import logger
 
-from magic_pdf.
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_ocr_pdf
 
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer:
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None,
                  layout_model=None, formula_enable=None, table_enable=None):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
@@ -32,10 +32,10 @@ class OCRPipe(AbsPipe):
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
-        logger.info(
+        logger.info('ocr_pipe mk content list finished')
         return result
 
     def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
         result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
-        logger.info(f
+        logger.info(f'ocr_pipe mk {md_make_mode} finished')
         return result
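The pipe constructors now type `image_writer` as a `DataWriter` from `magic_pdf.data.data_reader_writer` instead of the old `AbsReaderWriter`. A hedged construction sketch follows; the `FileBasedDataWriter` name is an assumption about what that module exports, the paths and empty `model_list` are placeholders, and only the constructor signature and `pipe_mk_markdown` call are taken from the diff itself:

```python
# Hedged sketch of driving OCRPipe with the new DataWriter-based signature.
# FileBasedDataWriter is assumed to exist in magic_pdf.data.data_reader_writer;
# demo.pdf and ./output/images are placeholder paths.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.pipe.OCRPipe import OCRPipe

with open('demo.pdf', 'rb') as f:
    pdf_bytes = f.read()

image_writer = FileBasedDataWriter('./output/images')
pipe = OCRPipe(pdf_bytes, model_list=[], image_writer=image_writer,
               is_debug=False, lang='en')   # lang is forwarded to the parse stage
pipe.pipe_analyze()                         # run the layout/OCR models
pipe.pipe_parse()
md = pipe.pipe_mk_markdown('images')
```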
magic_pdf/pipe/TXTPipe.py
CHANGED
@@ -1,16 +1,15 @@
 from loguru import logger
 
-from magic_pdf.
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_txt_pdf
 
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer:
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None,
                  layout_model=None, formula_enable=None, table_enable=None):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
@@ -33,10 +32,10 @@ class TXTPipe(AbsPipe):
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
-        logger.info(
+        logger.info('txt_pipe mk content list finished')
         return result
 
     def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
         result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
-        logger.info(f
+        logger.info(f'txt_pipe mk {md_make_mode} finished')
         return result