magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -7,18 +7,32 @@ from typing import List
|
|
7
7
|
import torch
|
8
8
|
from loguru import logger
|
9
9
|
|
10
|
+
from magic_pdf.config.drop_reason import DropReason
|
10
11
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
12
|
+
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
11
13
|
from magic_pdf.data.dataset import Dataset, PageableData
|
12
14
|
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
13
15
|
from magic_pdf.libs.clean_memory import clean_memory
|
14
16
|
from magic_pdf.libs.commons import fitz, get_delta_time
|
15
17
|
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
|
16
18
|
from magic_pdf.libs.convert_utils import dict_to_list
|
17
|
-
from magic_pdf.libs.drop_reason import DropReason
|
18
19
|
from magic_pdf.libs.hash_utils import compute_md5
|
19
20
|
from magic_pdf.libs.local_math import float_equal
|
20
|
-
from magic_pdf.libs.
|
21
|
+
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
21
22
|
from magic_pdf.model.magic_model import MagicModel
|
23
|
+
|
24
|
+
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
25
|
+
os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
|
26
|
+
|
27
|
+
# torchtext is optional: when it is installed, silence its deprecation
# warning before the model sub-modules are imported.
try:
    import torchtext
except ImportError:
    pass
else:
    # Feature-detect the helper instead of comparing version strings:
    # a string comparison such as '0.9.0' >= '0.18.0' is lexicographic and
    # evaluates to True, which would call a function that older releases
    # do not provide and raise an AttributeError that is not caught here.
    if hasattr(torchtext, 'disable_torchtext_deprecation_warning'):
        torchtext.disable_torchtext_deprecation_warning()
|
34
|
+
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
35
|
+
|
22
36
|
from magic_pdf.para.para_split_v3 import para_split
|
23
37
|
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
|
24
38
|
from magic_pdf.pre_proc.construct_page_dict import \
|
@@ -30,8 +44,8 @@ from magic_pdf.pre_proc.equations_replace import (
|
|
30
44
|
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
|
31
45
|
ocr_prepare_bboxes_for_layout_split_v2
|
32
46
|
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
33
|
-
|
34
|
-
|
47
|
+
fix_block_spans_v2,
|
48
|
+
fix_discarded_block)
|
35
49
|
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
36
50
|
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
37
51
|
remove_overlaps_min_spans)
|
@@ -74,7 +88,151 @@ def __replace_STX_ETX(text_str: str):
|
|
74
88
|
return text_str
|
75
89
|
|
76
90
|
|
77
|
-
def
|
91
|
+
def chars_to_content(span):
    """Build ``span['content']`` from ``span['chars']`` and drop the char list.

    The characters are ordered left to right by the x coordinate of their
    bbox centre and concatenated. A single space is inserted between two
    neighbouring characters when the horizontal gap between them is larger
    than the average character width of the span. The joined text is passed
    through ``__replace_STX_ETX`` before being stored.

    Args:
        span: dict with a ``'chars'`` list; every char is a dict with a
            ``'bbox'`` (x0, y0, x1, y1) and a ``'c'`` string. The dict is
            modified in place: ``'content'`` is set and ``'chars'`` removed.

    Returns:
        None.
    """
    # Sort by the centre x coordinate of each char's bbox.
    chars = sorted(span['chars'], key=lambda ch: (ch['bbox'][0] + ch['bbox'][2]) / 2)

    if not chars:
        span['content'] = ''
        del span['chars']
        return

    char_avg_width = sum(ch['bbox'][2] - ch['bbox'][0] for ch in chars) / len(chars)

    pieces = []
    prev_x1 = None  # right edge of the previous char; None for the first char
    for ch in chars:
        # Track the predecessor directly rather than via list.index(): index()
        # is O(n) per char, returns the first *equal* dict (wrong for
        # duplicated glyphs) and wraps to the last char for the first one.
        if prev_x1 is not None and ch['bbox'][0] - prev_x1 > char_avg_width:
            pieces.append(' ')
        pieces.append(ch['c'])
        prev_x1 = ch['bbox'][2]

    span['content'] = __replace_STX_ETX(''.join(pieces))
    del span['chars']
|
115
|
+
|
116
|
+
|
117
|
+
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
118
|
+
def fill_char_in_spans(spans, all_chars):
    """Assign page characters to spans, then build the text of every span.

    Each char in ``all_chars`` is appended to the ``'chars'`` list of the
    first span (in ``spans`` order) for which ``calculate_char_in_span``
    accepts it; chars accepted by no span are ignored. Afterwards
    ``chars_to_content`` is called once for every span.

    Args:
        spans: list of span dicts, each with a ``'bbox'`` and a ``'chars'`` list.
        all_chars: iterable of char dicts with ``'bbox'`` and ``'c'`` keys.

    Returns:
        None. The span dicts are modified in place.
    """
    if not spans:
        # Nothing can receive a char and there is no content to build.
        return

    for char in all_chars:
        # Whether the char is a line-ending punctuation mark depends only on
        # the char, so evaluate it once instead of once per candidate span.
        char_is_line_stop_flag = char['c'] in LINE_STOP_FLAG
        for span in spans:
            if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
                span['chars'].append(char)
                break

    for span in spans:
        chars_to_content(span)
|
133
|
+
|
134
|
+
|
135
|
+
# Membership is decided on the char's centre point, which is more robust
# than comparing the raw bbox edges.
def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
    """Return whether a char belongs to a span.

    A char belongs to the span when its centre lies strictly inside the span
    bbox and its vertical centre is less than a quarter of the span height
    away from the span's vertical centre.

    A char flagged as line-ending punctuation gets a second chance when its
    centre is not horizontally inside the span: it is also accepted when its
    left edge lies within one span height of the span's right edge (and still
    left of that edge), its centre is to the right of the span's left edge,
    and the same vertical conditions hold.

    Args:
        char_bbox: (x0, y0, x1, y1) of the char.
        span_bbox: (x0, y0, x1, y1) of the span.
        char_is_line_stop_flag: True when the char is a line-ending mark.

    Returns:
        bool: True when the char is assigned to the span, otherwise False.
    """
    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
    span_height = span_bbox[3] - span_bbox[1]

    # Both acceptance rules share the same vertical requirement: the char's
    # mid-line must be inside the span and within 1/4 span height of its axis.
    vertically_aligned = (
        span_bbox[1] < char_center_y < span_bbox[3]
        and abs(char_center_y - span_center_y) < span_height / 4
    )
    if not vertically_aligned:
        return False

    if span_bbox[0] < char_center_x < span_bbox[2]:
        return True

    if char_is_line_stop_flag:
        # Give trailing punctuation a chance to join the span: its left edge
        # must be close to (and left of) the span's right edge.
        return (
            (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
            and char_center_x > span_bbox[0]
        )

    # Always return a real bool rather than falling through with None.
    return False
|
161
|
+
|
162
|
+
|
163
|
+
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
    """Fill text spans with characters read from the PDF page.

    Spans that overlap a layout block (other than image body, table body or
    interline equation blocks) or a discarded block by more than half of
    their own area are selected. The selected spans of type
    ``ContentType.Text`` receive the characters returned by
    ``pdf_page.get_text('rawdict')`` via ``fill_char_in_spans``. Spans that
    end up with empty content are removed from ``spans``, recognised with
    the OCR model, and appended back only when OCR returns non-empty text
    with a score above 0.5.

    Args:
        pdf_page: page object providing ``get_text``.
        spans: list of span dicts; modified in place and returned.
        all_bboxes: layout blocks; ``block[0:4]`` is used as the bbox and
            ``block[7]`` as the block type.
        all_discarded_blocks: discarded blocks; ``block[0:4]`` is the bbox.
        lang: language passed to the OCR model.

    Returns:
        The same ``spans`` list after filling and OCR fallback.
    """
    skipped_block_types = [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]
    layout_blocks = [block for block in all_bboxes if block[7] not in skipped_block_types]

    def _overlaps_any(span, blocks):
        """Return True when more than half of the span lies inside any block."""
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5
            for block in blocks
        )

    useful_spans = []
    unuseful_spans = []
    for span in spans:
        # A span is collected at most once. Collecting it in both lists would
        # put it into new_spans twice, and the second chars_to_content call
        # would fail because 'chars' has already been removed.
        if _overlaps_any(span, layout_blocks):
            useful_spans.append(span)
        elif _overlaps_any(span, all_discarded_blocks):
            unuseful_spans.append(span)

    text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']

    # TODO: drop chars with a large rotation angle once they are collected.
    all_pymu_chars = []
    for block in text_blocks:
        for line in block['lines']:
            for pymu_span in line['spans']:
                all_pymu_chars.extend(pymu_span['chars'])

    # Spans from layout blocks come first so they win when a char could be
    # assigned to more than one span.
    new_spans = []
    for span in useful_spans + unuseful_spans:
        if span['type'] == ContentType.Text:
            span['chars'] = []
            new_spans.append(span)

    fill_char_in_spans(new_spans, all_pymu_chars)

    empty_spans = [span for span in new_spans if len(span['content']) == 0]
    if empty_spans:
        # Initialise the OCR model only when there is something to recognise.
        atom_model_manager = AtomModelSingleton()
        ocr_model = atom_model_manager.get_atom_model(
            atom_model_name="ocr",
            ocr_show_log=False,
            det_db_box_thresh=0.3,
            lang=lang
        )

        for span in empty_spans:
            spans.remove(span)
            # Crop the span's bbox from the page and run recognition only.
            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
            ocr_res = ocr_model.ocr(span_img, det=False)
            if ocr_res and len(ocr_res[0]) > 0:
                ocr_text, ocr_score = ocr_res[0][0]
                if ocr_score > 0.5 and len(ocr_text) > 0:
                    span['content'] = ocr_text
                    spans.append(span)

    return spans
|
233
|
+
|
234
|
+
|
235
|
+
def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
|
78
236
|
text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
79
237
|
char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
|
80
238
|
'blocks'
|
@@ -164,8 +322,8 @@ class ModelSingleton:
|
|
164
322
|
|
165
323
|
|
166
324
|
def do_predict(boxes: List[List[int]], model) -> List[int]:
|
167
|
-
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
|
168
|
-
|
325
|
+
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
|
326
|
+
boxes2inputs, parse_logits, prepare_inputs)
|
169
327
|
|
170
328
|
inputs = boxes2inputs(boxes)
|
171
329
|
inputs = prepare_inputs(inputs, model)
|
@@ -206,7 +364,9 @@ def cal_block_index(fix_blocks, sorted_bboxes):
|
|
206
364
|
del block['real_lines']
|
207
365
|
|
208
366
|
import numpy as np
|
209
|
-
|
367
|
+
|
368
|
+
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
|
369
|
+
recursive_xy_cut
|
210
370
|
|
211
371
|
random_boxes = np.array(block_bboxes)
|
212
372
|
np.random.shuffle(random_boxes)
|
@@ -291,7 +451,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
|
291
451
|
page_line_list.append(bbox)
|
292
452
|
elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
293
453
|
bbox = block['bbox']
|
294
|
-
block[
|
454
|
+
block['real_lines'] = copy.deepcopy(block['lines'])
|
295
455
|
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
|
296
456
|
block['lines'] = []
|
297
457
|
for line in lines:
|
@@ -462,18 +622,16 @@ def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
|
|
462
622
|
|
463
623
|
|
464
624
|
def parse_page_core(
|
465
|
-
page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
|
625
|
+
page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
|
466
626
|
):
|
467
627
|
need_drop = False
|
468
628
|
drop_reason = []
|
469
629
|
|
470
630
|
"""从magic_model对象中获取后面会用到的区块信息"""
|
471
|
-
# img_blocks = magic_model.get_imgs(page_id)
|
472
|
-
# table_blocks = magic_model.get_tables(page_id)
|
473
|
-
|
474
631
|
img_groups = magic_model.get_imgs_v2(page_id)
|
475
632
|
table_groups = magic_model.get_tables_v2(page_id)
|
476
633
|
|
634
|
+
"""对image和table的区块分组"""
|
477
635
|
img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
|
478
636
|
img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
|
479
637
|
)
|
@@ -517,38 +675,20 @@ def parse_page_core(
|
|
517
675
|
page_h,
|
518
676
|
)
|
519
677
|
|
678
|
+
"""获取所有的spans信息"""
|
520
679
|
spans = magic_model.get_all_spans(page_id)
|
521
680
|
|
522
|
-
"""根据parse_mode,构造spans"""
|
523
|
-
if parse_mode == SupportedPdfParseMethod.TXT:
|
524
|
-
"""ocr 中文本类的 span 用 pymu spans 替换!"""
|
525
|
-
pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
|
526
|
-
spans = replace_text_span(pymu_spans, spans)
|
527
|
-
elif parse_mode == SupportedPdfParseMethod.OCR:
|
528
|
-
pass
|
529
|
-
else:
|
530
|
-
raise Exception('parse_mode must be txt or ocr')
|
531
|
-
|
532
681
|
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
|
533
682
|
"""顺便删除大水印并保留abandon的span"""
|
534
683
|
spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
|
535
684
|
|
536
|
-
"""删除重叠spans中置信度较低的那些"""
|
537
|
-
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|
538
|
-
"""删除重叠spans中较小的那些"""
|
539
|
-
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
540
|
-
"""对image和table截图"""
|
541
|
-
spans = ocr_cut_image_and_table(
|
542
|
-
spans, page_doc, page_id, pdf_bytes_md5, imageWriter
|
543
|
-
)
|
544
|
-
|
545
685
|
"""先处理不需要排版的discarded_blocks"""
|
546
686
|
discarded_block_with_spans, spans = fill_spans_in_blocks(
|
547
687
|
all_discarded_blocks, spans, 0.4
|
548
688
|
)
|
549
689
|
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
|
550
690
|
|
551
|
-
"""
|
691
|
+
"""如果当前页面没有有效的bbox则跳过"""
|
552
692
|
if len(all_bboxes) == 0:
|
553
693
|
logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
|
554
694
|
return ocr_construct_page_component_v2(
|
@@ -566,7 +706,32 @@ def parse_page_core(
|
|
566
706
|
drop_reason,
|
567
707
|
)
|
568
708
|
|
569
|
-
"""
|
709
|
+
"""删除重叠spans中置信度较低的那些"""
|
710
|
+
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|
711
|
+
"""删除重叠spans中较小的那些"""
|
712
|
+
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
713
|
+
|
714
|
+
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
715
|
+
if parse_mode == SupportedPdfParseMethod.TXT:
|
716
|
+
|
717
|
+
"""之前的公式替换方案"""
|
718
|
+
# pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
|
719
|
+
# spans = replace_text_span(pymu_spans, spans)
|
720
|
+
|
721
|
+
"""ocr 中文本类的 span 用 pymu spans 替换!"""
|
722
|
+
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
723
|
+
|
724
|
+
elif parse_mode == SupportedPdfParseMethod.OCR:
|
725
|
+
pass
|
726
|
+
else:
|
727
|
+
raise Exception('parse_mode must be txt or ocr')
|
728
|
+
|
729
|
+
"""对image和table截图"""
|
730
|
+
spans = ocr_cut_image_and_table(
|
731
|
+
spans, page_doc, page_id, pdf_bytes_md5, imageWriter
|
732
|
+
)
|
733
|
+
|
734
|
+
"""span填充进block"""
|
570
735
|
block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
|
571
736
|
|
572
737
|
"""对block进行fix操作"""
|
@@ -616,6 +781,7 @@ def pdf_parse_union(
|
|
616
781
|
start_page_id=0,
|
617
782
|
end_page_id=None,
|
618
783
|
debug_mode=False,
|
784
|
+
lang=None,
|
619
785
|
):
|
620
786
|
pdf_bytes_md5 = compute_md5(dataset.data_bits())
|
621
787
|
|
@@ -652,7 +818,7 @@ def pdf_parse_union(
|
|
652
818
|
"""解析pdf中的每一页"""
|
653
819
|
if start_page_id <= page_id <= end_page_id:
|
654
820
|
page_info = parse_page_core(
|
655
|
-
page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
|
821
|
+
page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
|
656
822
|
)
|
657
823
|
else:
|
658
824
|
page_info = page.get_page_info()
|
@@ -664,7 +830,7 @@ def pdf_parse_union(
|
|
664
830
|
pdf_info_dict[f'page_{page_id}'] = page_info
|
665
831
|
|
666
832
|
"""分段"""
|
667
|
-
para_split(pdf_info_dict
|
833
|
+
para_split(pdf_info_dict)
|
668
834
|
|
669
835
|
"""dict转list"""
|
670
836
|
pdf_info_list = dict_to_list(pdf_info_dict)
|
magic_pdf/pipe/AbsPipe.py
CHANGED
@@ -1,22 +1,20 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
2
|
|
3
|
+
from magic_pdf.config.drop_reason import DropReason
|
4
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
5
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
3
6
|
from magic_pdf.dict2md.ocr_mkcontent import union_make
|
4
7
|
from magic_pdf.filter.pdf_classify_by_type import classify
|
5
8
|
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
|
6
|
-
from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
|
7
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
8
|
-
from magic_pdf.libs.drop_reason import DropReason
|
9
9
|
from magic_pdf.libs.json_compressor import JsonCompressor
|
10
10
|
|
11
11
|
|
12
12
|
class AbsPipe(ABC):
|
13
|
-
"""
|
14
|
-
|
15
|
-
|
16
|
-
PIP_OCR = "ocr"
|
17
|
-
PIP_TXT = "txt"
|
13
|
+
"""txt和ocr处理的抽象类."""
|
14
|
+
PIP_OCR = 'ocr'
|
15
|
+
PIP_TXT = 'txt'
|
18
16
|
|
19
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer:
|
17
|
+
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
20
18
|
start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
|
21
19
|
self.pdf_bytes = pdf_bytes
|
22
20
|
self.model_list = model_list
|
@@ -29,29 +27,23 @@ class AbsPipe(ABC):
|
|
29
27
|
self.layout_model = layout_model
|
30
28
|
self.formula_enable = formula_enable
|
31
29
|
self.table_enable = table_enable
|
32
|
-
|
30
|
+
|
33
31
|
def get_compress_pdf_mid_data(self):
|
34
32
|
return JsonCompressor.compress_json(self.pdf_mid_data)
|
35
33
|
|
36
34
|
@abstractmethod
|
37
35
|
def pipe_classify(self):
|
38
|
-
"""
|
39
|
-
有状态的分类
|
40
|
-
"""
|
36
|
+
"""有状态的分类."""
|
41
37
|
raise NotImplementedError
|
42
38
|
|
43
39
|
@abstractmethod
|
44
40
|
def pipe_analyze(self):
|
45
|
-
"""
|
46
|
-
有状态的跑模型分析
|
47
|
-
"""
|
41
|
+
"""有状态的跑模型分析."""
|
48
42
|
raise NotImplementedError
|
49
43
|
|
50
44
|
@abstractmethod
|
51
45
|
def pipe_parse(self):
|
52
|
-
"""
|
53
|
-
有状态的解析
|
54
|
-
"""
|
46
|
+
"""有状态的解析."""
|
55
47
|
raise NotImplementedError
|
56
48
|
|
57
49
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
@@ -64,27 +56,25 @@ class AbsPipe(ABC):
|
|
64
56
|
|
65
57
|
@staticmethod
|
66
58
|
def classify(pdf_bytes: bytes) -> str:
|
67
|
-
"""
|
68
|
-
根据pdf的元数据,判断是文本pdf,还是ocr pdf
|
69
|
-
"""
|
59
|
+
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
|
70
60
|
pdf_meta = pdf_meta_scan(pdf_bytes)
|
71
|
-
if pdf_meta.get(
|
61
|
+
if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
|
72
62
|
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
|
73
63
|
else:
|
74
|
-
is_encrypted = pdf_meta[
|
75
|
-
is_needs_password = pdf_meta[
|
64
|
+
is_encrypted = pdf_meta['is_encrypted']
|
65
|
+
is_needs_password = pdf_meta['is_needs_password']
|
76
66
|
if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
|
77
|
-
raise Exception(f
|
67
|
+
raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
|
78
68
|
else:
|
79
69
|
is_text_pdf, results = classify(
|
80
|
-
pdf_meta[
|
81
|
-
pdf_meta[
|
82
|
-
pdf_meta[
|
83
|
-
pdf_meta[
|
84
|
-
pdf_meta[
|
85
|
-
pdf_meta[
|
86
|
-
pdf_meta[
|
87
|
-
pdf_meta[
|
70
|
+
pdf_meta['total_page'],
|
71
|
+
pdf_meta['page_width_pts'],
|
72
|
+
pdf_meta['page_height_pts'],
|
73
|
+
pdf_meta['image_info_per_page'],
|
74
|
+
pdf_meta['text_len_per_page'],
|
75
|
+
pdf_meta['imgs_per_page'],
|
76
|
+
pdf_meta['text_layout_per_page'],
|
77
|
+
pdf_meta['invalid_chars'],
|
88
78
|
)
|
89
79
|
if is_text_pdf:
|
90
80
|
return AbsPipe.PIP_TXT
|
@@ -93,22 +83,16 @@ class AbsPipe(ABC):
|
|
93
83
|
|
94
84
|
@staticmethod
|
95
85
|
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
|
96
|
-
"""
|
97
|
-
根据pdf类型,生成统一格式content_list
|
98
|
-
"""
|
86
|
+
"""根据pdf类型,生成统一格式content_list."""
|
99
87
|
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
100
|
-
pdf_info_list = pdf_mid_data[
|
88
|
+
pdf_info_list = pdf_mid_data['pdf_info']
|
101
89
|
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
|
102
90
|
return content_list
|
103
91
|
|
104
92
|
@staticmethod
|
105
93
|
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
|
106
|
-
"""
|
107
|
-
根据pdf类型,markdown
|
108
|
-
"""
|
94
|
+
"""根据pdf类型,markdown."""
|
109
95
|
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
110
|
-
pdf_info_list = pdf_mid_data[
|
96
|
+
pdf_info_list = pdf_mid_data['pdf_info']
|
111
97
|
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
|
112
98
|
return md_content
|
113
|
-
|
114
|
-
|
magic_pdf/pipe/OCRPipe.py
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
from loguru import logger
|
2
2
|
|
3
|
-
from magic_pdf.
|
3
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
4
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
4
5
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
5
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
6
6
|
from magic_pdf.pipe.AbsPipe import AbsPipe
|
7
7
|
from magic_pdf.user_api import parse_ocr_pdf
|
8
8
|
|
9
9
|
|
10
10
|
class OCRPipe(AbsPipe):
|
11
11
|
|
12
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer:
|
12
|
+
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
13
13
|
start_page_id=0, end_page_id=None, lang=None,
|
14
14
|
layout_model=None, formula_enable=None, table_enable=None):
|
15
15
|
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
@@ -32,10 +32,10 @@ class OCRPipe(AbsPipe):
|
|
32
32
|
|
33
33
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
34
34
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
35
|
-
logger.info(
|
35
|
+
logger.info('ocr_pipe mk content list finished')
|
36
36
|
return result
|
37
37
|
|
38
38
|
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
39
39
|
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
40
|
-
logger.info(f
|
40
|
+
logger.info(f'ocr_pipe mk {md_make_mode} finished')
|
41
41
|
return result
|
magic_pdf/pipe/TXTPipe.py
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
from loguru import logger
|
2
2
|
|
3
|
-
from magic_pdf.
|
3
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
4
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
4
5
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
5
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
6
|
-
from magic_pdf.libs.json_compressor import JsonCompressor
|
7
6
|
from magic_pdf.pipe.AbsPipe import AbsPipe
|
8
7
|
from magic_pdf.user_api import parse_txt_pdf
|
9
8
|
|
10
9
|
|
11
10
|
class TXTPipe(AbsPipe):
|
12
11
|
|
13
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer:
|
12
|
+
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
14
13
|
start_page_id=0, end_page_id=None, lang=None,
|
15
14
|
layout_model=None, formula_enable=None, table_enable=None):
|
16
15
|
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
@@ -33,10 +32,10 @@ class TXTPipe(AbsPipe):
|
|
33
32
|
|
34
33
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
35
34
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
36
|
-
logger.info(
|
35
|
+
logger.info('txt_pipe mk content list finished')
|
37
36
|
return result
|
38
37
|
|
39
38
|
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
40
39
|
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
41
|
-
logger.info(f
|
40
|
+
logger.info(f'txt_pipe mk {md_make_mode} finished')
|
42
41
|
return result
|
magic_pdf/pipe/UNIPipe.py
CHANGED
@@ -2,22 +2,21 @@ import json
|
|
2
2
|
|
3
3
|
from loguru import logger
|
4
4
|
|
5
|
-
from magic_pdf.
|
6
|
-
from magic_pdf.
|
7
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
8
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
5
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
9
7
|
from magic_pdf.libs.commons import join_path
|
8
|
+
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
10
9
|
from magic_pdf.pipe.AbsPipe import AbsPipe
|
11
|
-
from magic_pdf.user_api import
|
10
|
+
from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
|
12
11
|
|
13
12
|
|
14
13
|
class UNIPipe(AbsPipe):
|
15
14
|
|
16
|
-
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer:
|
15
|
+
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
|
17
16
|
start_page_id=0, end_page_id=None, lang=None,
|
18
17
|
layout_model=None, formula_enable=None, table_enable=None):
|
19
|
-
self.pdf_type = jso_useful_key[
|
20
|
-
super().__init__(pdf_bytes, jso_useful_key[
|
18
|
+
self.pdf_type = jso_useful_key['_pdf_type']
|
19
|
+
super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
|
21
20
|
lang, layout_model, formula_enable, table_enable)
|
22
21
|
if len(self.model_list) == 0:
|
23
22
|
self.input_model_is_empty = True
|
@@ -54,27 +53,28 @@ class UNIPipe(AbsPipe):
|
|
54
53
|
|
55
54
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
|
56
55
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
57
|
-
logger.info(
|
56
|
+
logger.info('uni_pipe mk content list finished')
|
58
57
|
return result
|
59
58
|
|
60
59
|
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
61
60
|
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
62
|
-
logger.info(f
|
61
|
+
logger.info(f'uni_pipe mk {md_make_mode} finished')
|
63
62
|
return result
|
64
63
|
|
65
64
|
|
66
65
|
if __name__ == '__main__':
|
67
66
|
# 测试
|
68
|
-
|
67
|
+
from magic_pdf.data.data_reader_writer import DataReader
|
68
|
+
drw = DataReader(r'D:/project/20231108code-clean')
|
69
69
|
|
70
|
-
pdf_file_path = r
|
71
|
-
model_file_path = r
|
72
|
-
pdf_bytes = drw.read(pdf_file_path
|
73
|
-
model_json_txt = drw.read(model_file_path
|
70
|
+
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
71
|
+
model_file_path = r'linshixuqiu\19983-00.json'
|
72
|
+
pdf_bytes = drw.read(pdf_file_path)
|
73
|
+
model_json_txt = drw.read(model_file_path).decode()
|
74
74
|
model_list = json.loads(model_json_txt)
|
75
|
-
write_path = r
|
76
|
-
img_bucket_path =
|
77
|
-
img_writer =
|
75
|
+
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
76
|
+
img_bucket_path = 'imgs'
|
77
|
+
img_writer = DataWriter(join_path(write_path, img_bucket_path))
|
78
78
|
|
79
79
|
# pdf_type = UNIPipe.classify(pdf_bytes)
|
80
80
|
# jso_useful_key = {
|
@@ -83,8 +83,8 @@ if __name__ == '__main__':
|
|
83
83
|
# }
|
84
84
|
|
85
85
|
jso_useful_key = {
|
86
|
-
|
87
|
-
|
86
|
+
'_pdf_type': '',
|
87
|
+
'model_list': model_list
|
88
88
|
}
|
89
89
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
|
90
90
|
pipe.pipe_classify()
|
@@ -92,8 +92,7 @@ if __name__ == '__main__':
|
|
92
92
|
md_content = pipe.pipe_mk_markdown(img_bucket_path)
|
93
93
|
content_list = pipe.pipe_mk_uni_format(img_bucket_path)
|
94
94
|
|
95
|
-
md_writer =
|
96
|
-
md_writer.
|
97
|
-
md_writer.
|
98
|
-
|
99
|
-
md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
|
95
|
+
md_writer = DataWriter(write_path)
|
96
|
+
md_writer.write_string('19983-00.md', md_content)
|
97
|
+
md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
|
98
|
+
md_writer.write_string('19983-00.txt', str(content_list))
|