magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +143 -0
- magic_pdf/data/data_reader_writer/s3.py +73 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +6 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +19 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +106 -244
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +35 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +331 -15
- magic_pdf/model/pdf_extract_kit.py +170 -83
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +40 -16
- magic_pdf/model/ppTableModel.py +8 -6
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +322 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +18 -8
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/LICENSE.md +1 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/METADATA +124 -78
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/RECORD +57 -33
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/top_level.txt +0 -0
magic_pdf/pipe/TXTPipe.py
CHANGED
@@ -11,19 +11,25 @@ from magic_pdf.user_api import parse_txt_pdf
|
|
11
11
|
class TXTPipe(AbsPipe):
|
12
12
|
|
13
13
|
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
|
14
|
-
start_page_id=0, end_page_id=None
|
15
|
-
|
14
|
+
start_page_id=0, end_page_id=None, lang=None,
|
15
|
+
layout_model=None, formula_enable=None, table_enable=None):
|
16
|
+
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
17
|
+
layout_model, formula_enable, table_enable)
|
16
18
|
|
17
19
|
def pipe_classify(self):
|
18
20
|
pass
|
19
21
|
|
20
22
|
def pipe_analyze(self):
|
21
23
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
|
22
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
24
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
25
|
+
lang=self.lang, layout_model=self.layout_model,
|
26
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
23
27
|
|
24
28
|
def pipe_parse(self):
|
25
29
|
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
|
26
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
30
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
31
|
+
lang=self.lang, layout_model=self.layout_model,
|
32
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
27
33
|
|
28
34
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
29
35
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
magic_pdf/pipe/UNIPipe.py
CHANGED
@@ -14,9 +14,11 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
|
|
14
14
|
class UNIPipe(AbsPipe):
|
15
15
|
|
16
16
|
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
|
17
|
-
start_page_id=0, end_page_id=None
|
17
|
+
start_page_id=0, end_page_id=None, lang=None,
|
18
|
+
layout_model=None, formula_enable=None, table_enable=None):
|
18
19
|
self.pdf_type = jso_useful_key["_pdf_type"]
|
19
|
-
super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id
|
20
|
+
super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id,
|
21
|
+
lang, layout_model, formula_enable, table_enable)
|
20
22
|
if len(self.model_list) == 0:
|
21
23
|
self.input_model_is_empty = True
|
22
24
|
else:
|
@@ -28,22 +30,29 @@ class UNIPipe(AbsPipe):
|
|
28
30
|
def pipe_analyze(self):
|
29
31
|
if self.pdf_type == self.PIP_TXT:
|
30
32
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
|
31
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
33
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
34
|
+
lang=self.lang, layout_model=self.layout_model,
|
35
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
32
36
|
elif self.pdf_type == self.PIP_OCR:
|
33
37
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
|
34
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
38
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
39
|
+
lang=self.lang, layout_model=self.layout_model,
|
40
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
35
41
|
|
36
42
|
def pipe_parse(self):
|
37
43
|
if self.pdf_type == self.PIP_TXT:
|
38
44
|
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
39
45
|
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
|
40
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
46
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
47
|
+
lang=self.lang, layout_model=self.layout_model,
|
48
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
41
49
|
elif self.pdf_type == self.PIP_OCR:
|
42
50
|
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
|
43
51
|
is_debug=self.is_debug,
|
44
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
52
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
53
|
+
lang=self.lang)
|
45
54
|
|
46
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.
|
55
|
+
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
|
47
56
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
48
57
|
logger.info("uni_pipe mk content list finished")
|
49
58
|
return result
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from loguru import logger
|
2
2
|
|
3
3
|
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
|
4
|
-
calculate_iou
|
4
|
+
calculate_iou, calculate_vertical_projection_overlap_ratio
|
5
5
|
from magic_pdf.libs.drop_tag import DropTag
|
6
6
|
from magic_pdf.libs.ocr_content_type import BlockType
|
7
7
|
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
|
@@ -60,6 +60,88 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
|
|
60
60
|
return all_bboxes, all_discarded_blocks, drop_reasons
|
61
61
|
|
62
62
|
|
63
|
+
def add_bboxes(blocks, block_type, bboxes):
    """Append one layout row per block to ``bboxes`` (mutated in place).

    Every row has the shape
    ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]``.
    For image/table body, caption and footnote types the block's
    ``group_id`` is appended as one extra trailing element.

    Args:
        blocks: iterable of dicts providing ``'bbox'`` (four values) and
            ``'score'``; ``'group_id'`` is also read for image/table types.
        block_type: type tag stored at index 7 of every row.
        bboxes: list that receives the rows.

    Returns:
        None.
    """
    for item in blocks:
        left, top, right, bottom = item['bbox']
        # Evaluated per block, so an empty ``blocks`` never touches BlockType.
        carries_group = block_type in (
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote,
        )
        row = [left, top, right, bottom, None, None, None, block_type,
               None, None, None, None, item["score"]]
        if carries_group:
            row.append(item["group_id"])
        bboxes.append(row)
|
73
|
+
|
74
|
+
|
75
|
+
def ocr_prepare_bboxes_for_layout_split_v2(
    img_body_blocks, img_caption_blocks, img_footnote_blocks,
    table_body_blocks, table_caption_blocks, table_footnote_blocks,
    discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
):
    """Gather the typed layout boxes of a page and resolve conflicts between them.

    All block lists are converted to rows through ``add_bboxes`` and then
    passed through a fixed sequence of overlap-resolution helpers. Boxes
    found below a footnote-like discarded box are moved from the kept list
    to the discarded list.

    Args:
        img_body_blocks, img_caption_blocks, img_footnote_blocks,
        table_body_blocks, table_caption_blocks, table_footnote_blocks,
        text_blocks, title_blocks, interline_equation_blocks: block dicts
            for the respective type, in the form ``add_bboxes`` expects.
        discarded_blocks: block dicts for discarded regions; each must
            provide ``'bbox'``.
        page_w: page width, used by the footnote heuristic.
        page_h: page height, used by the footnote heuristic.

    Returns:
        Tuple ``(all_bboxes, all_discarded_blocks)``.
    """
    all_bboxes = []

    typed_sources = (
        (img_body_blocks, BlockType.ImageBody),
        (img_caption_blocks, BlockType.ImageCaption),
        (img_footnote_blocks, BlockType.ImageFootnote),
        (table_body_blocks, BlockType.TableBody),
        (table_caption_blocks, BlockType.TableCaption),
        (table_footnote_blocks, BlockType.TableFootnote),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for source_blocks, source_type in typed_sources:
        add_bboxes(source_blocks, source_type, all_bboxes)

    # --- Resolve nested / overlapping blocks ---
    # A text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equations clashing with title/text boxes come in two flavours:
    # 1) IoU with a text-type box is close to 1 -> trust the interline equation.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2) The equation lies inside a much larger text box -> trust the text box.
    #    Nothing is done here; the "large box contains small box" pass below
    #    removes the equation box.

    # --- Discarded blocks ---
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

    # Footnote heuristic: a discarded box wider than a third of the page,
    # taller than 10, whose top edge is in the lower half of the page.
    footnote_blocks = [
        [x0, y0, x1, y1]
        for x0, y0, x1, y1 in (entry['bbox'] for entry in discarded_blocks)
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2)
    ]

    # Everything located under a footnote is moved to the discarded list.
    # The helper returns a separate list, so mutating all_bboxes here is safe.
    for under_footnote in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(under_footnote)
        all_discarded_blocks.append(under_footnote)

    # If a large box still contains a smaller one, drop the smaller one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining boxes so the later layout split does not fail.
    # The reported drop reasons are not used by this function.
    all_bboxes, _drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks
|
129
|
+
|
130
|
+
|
131
|
+
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Return the rows of ``all_bboxes`` that sit below one of the footnote boxes.

    A row qualifies when its top edge (``y0``) is greater than or equal to a
    footnote's bottom edge (``y1``) and
    ``calculate_vertical_projection_overlap_ratio(row_bbox, footnote_bbox)``
    is at least 0.8.

    Args:
        all_bboxes: rows whose first four items are ``x0, y0, x1, y1``.
        footnote_blocks: sequences of exactly four values ``x0, y0, x1, y1``.

    Returns:
        A new list holding each qualifying row once, in input order.
    """
    flagged = []
    for candidate in all_bboxes:
        left, top, right, bottom = candidate[:4]
        candidate_rect = (left, top, right, bottom)
        for footnote_bbox in footnote_blocks:
            _fx0, _fy0, _fx1, footnote_bottom = footnote_bbox
            # Position test comes first so the overlap helper is only called
            # for rows that start at or below the footnote's bottom edge.
            starts_below = top >= footnote_bottom
            if starts_below and calculate_vertical_projection_overlap_ratio(candidate_rect, footnote_bbox) >= 0.8:
                if candidate not in flagged:
                    flagged.append(candidate)
                    break
    return flagged
|
143
|
+
|
144
|
+
|
63
145
|
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
|
64
146
|
# 先提取所有text和interline block
|
65
147
|
text_blocks = []
|
@@ -49,8 +49,7 @@ def merge_spans_to_line(spans):
|
|
49
49
|
continue
|
50
50
|
|
51
51
|
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
52
|
-
if __is_overlaps_y_exceeds_threshold(span['bbox'],
|
53
|
-
current_line[-1]['bbox']):
|
52
|
+
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5):
|
54
53
|
current_line.append(span)
|
55
54
|
else:
|
56
55
|
# 否则,开始新行
|
@@ -154,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
|
154
153
|
'type': block_type,
|
155
154
|
'bbox': block_bbox,
|
156
155
|
}
|
156
|
+
if block_type in [
|
157
|
+
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
|
158
|
+
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
|
159
|
+
]:
|
160
|
+
block_dict["group_id"] = block[-1]
|
157
161
|
block_spans = []
|
158
162
|
for span in spans:
|
159
163
|
span_bbox = span['bbox']
|
@@ -202,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
|
|
202
206
|
return fix_blocks
|
203
207
|
|
204
208
|
|
209
|
+
def fix_block_spans_v2(block_with_spans):
    """Route every block through the fixer that matches its type.

    * Text, Title, ImageCaption, ImageFootnote, TableCaption and
      TableFootnote blocks are passed to ``fix_text_block``.
    * InterlineEquation, ImageBody and TableBody blocks are passed to
      ``fix_interline_block``.
    * Blocks of any other type are left out of the result.

    Args:
        block_with_spans: iterable of block dicts, each with a ``'type'`` key.

    Returns:
        List of the fixer return values, in input order.
    """
    repaired = []
    for entry in block_with_spans:
        kind = entry['type']

        if kind in (
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote,
        ):
            handler = fix_text_block
        elif kind in (BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody):
            handler = fix_interline_block
        else:
            # Unhandled block types are dropped.
            continue

        repaired.append(handler(entry))
    return repaired
|
228
|
+
|
229
|
+
|
205
230
|
def fix_discarded_block(discarded_block_with_spans):
|
206
231
|
fix_discarded_blocks = []
|
207
232
|
for block in discarded_block_with_spans:
|
@@ -2,13 +2,13 @@ model:
|
|
2
2
|
arch: unimernet
|
3
3
|
model_type: unimernet
|
4
4
|
model_config:
|
5
|
-
model_name: ./models
|
6
|
-
max_seq_len:
|
7
|
-
|
5
|
+
model_name: ./models/unimernet_base
|
6
|
+
max_seq_len: 1536
|
7
|
+
|
8
8
|
load_pretrained: True
|
9
|
-
pretrained: ./models/pytorch_model.
|
9
|
+
pretrained: './models/unimernet_base/pytorch_model.pth'
|
10
10
|
tokenizer_config:
|
11
|
-
path: ./models
|
11
|
+
path: ./models/unimernet_base
|
12
12
|
|
13
13
|
datasets:
|
14
14
|
formula_rec_eval:
|
@@ -18,7 +18,7 @@ datasets:
|
|
18
18
|
image_size:
|
19
19
|
- 192
|
20
20
|
- 672
|
21
|
-
|
21
|
+
|
22
22
|
run:
|
23
23
|
runner: runner_iter
|
24
24
|
task: unimernet_train
|
@@ -43,4 +43,4 @@ run:
|
|
43
43
|
distributed_type: ddp # or fsdp when train llm
|
44
44
|
|
45
45
|
generate_cfg:
|
46
|
-
temperature: 0.0
|
46
|
+
temperature: 0.0
|
@@ -1,15 +1,7 @@
|
|
1
|
-
config:
|
2
|
-
device: cpu
|
3
|
-
layout: True
|
4
|
-
formula: True
|
5
|
-
table_config:
|
6
|
-
model: TableMaster
|
7
|
-
is_table_recog_enable: False
|
8
|
-
max_time: 400
|
9
|
-
|
10
1
|
weights:
|
11
|
-
|
12
|
-
|
13
|
-
|
2
|
+
layoutlmv3: Layout/LayoutLMv3/model_final.pth
|
3
|
+
doclayout_yolo: Layout/YOLO/doclayout_yolo_ft.pt
|
4
|
+
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
|
5
|
+
unimernet_small: MFR/unimernet_small
|
14
6
|
struct_eqtable: TabRec/StructEqTable
|
15
|
-
|
7
|
+
tablemaster: TabRec/TableMaster
|
magic_pdf/tools/cli.py
CHANGED
@@ -44,6 +44,18 @@ auto: automatically choose the best method for parsing pdf from ocr and txt.
|
|
44
44
|
without method specified, auto will be used by default.""",
|
45
45
|
default='auto',
|
46
46
|
)
|
47
|
+
@click.option(
|
48
|
+
'-l',
|
49
|
+
'--lang',
|
50
|
+
'lang',
|
51
|
+
type=str,
|
52
|
+
help="""
|
53
|
+
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
|
54
|
+
You should input "Abbreviation" with language form url:
|
55
|
+
https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
|
56
|
+
""",
|
57
|
+
default=None,
|
58
|
+
)
|
47
59
|
@click.option(
|
48
60
|
'-d',
|
49
61
|
'--debug',
|
@@ -68,7 +80,7 @@ without method specified, auto will be used by default.""",
|
|
68
80
|
help='The ending page for PDF parsing, beginning from 0.',
|
69
81
|
default=None,
|
70
82
|
)
|
71
|
-
def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
|
83
|
+
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
72
84
|
model_config.__use_inside_model__ = True
|
73
85
|
model_config.__model_mode__ = 'full'
|
74
86
|
os.makedirs(output_dir, exist_ok=True)
|
@@ -90,6 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
|
|
90
102
|
debug_able,
|
91
103
|
start_page_id=start_page_id,
|
92
104
|
end_page_id=end_page_id,
|
105
|
+
lang=lang
|
93
106
|
)
|
94
107
|
|
95
108
|
except Exception as e:
|
magic_pdf/tools/common.py
CHANGED
@@ -6,8 +6,8 @@ import click
|
|
6
6
|
from loguru import logger
|
7
7
|
|
8
8
|
import magic_pdf.model as model_config
|
9
|
-
from magic_pdf.libs.draw_bbox import (draw_layout_bbox,
|
10
|
-
|
9
|
+
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
|
10
|
+
draw_model_bbox, draw_span_bbox)
|
11
11
|
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
12
12
|
from magic_pdf.pipe.OCRPipe import OCRPipe
|
13
13
|
from magic_pdf.pipe.TXTPipe import TXTPipe
|
@@ -39,16 +39,21 @@ def do_parse(
|
|
39
39
|
f_dump_middle_json=True,
|
40
40
|
f_dump_model_json=True,
|
41
41
|
f_dump_orig_pdf=True,
|
42
|
-
f_dump_content_list=
|
42
|
+
f_dump_content_list=True,
|
43
43
|
f_make_md_mode=MakeMode.MM_MD,
|
44
44
|
f_draw_model_bbox=False,
|
45
|
+
f_draw_line_sort_bbox=False,
|
45
46
|
start_page_id=0,
|
46
47
|
end_page_id=None,
|
48
|
+
lang=None,
|
49
|
+
layout_model=None,
|
50
|
+
formula_enable=None,
|
51
|
+
table_enable=None,
|
47
52
|
):
|
48
53
|
if debug_able:
|
49
54
|
logger.warning('debug mode is on')
|
50
|
-
f_dump_content_list = True
|
51
55
|
f_draw_model_bbox = True
|
56
|
+
f_draw_line_sort_bbox = True
|
52
57
|
|
53
58
|
orig_model_list = copy.deepcopy(model_list)
|
54
59
|
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
@@ -61,13 +66,16 @@ def do_parse(
|
|
61
66
|
if parse_method == 'auto':
|
62
67
|
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
63
68
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
64
|
-
start_page_id=start_page_id, end_page_id=end_page_id
|
69
|
+
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
|
70
|
+
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
65
71
|
elif parse_method == 'txt':
|
66
72
|
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
67
|
-
start_page_id=start_page_id, end_page_id=end_page_id
|
73
|
+
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
|
74
|
+
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
68
75
|
elif parse_method == 'ocr':
|
69
76
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
70
|
-
start_page_id=start_page_id, end_page_id=end_page_id
|
77
|
+
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
|
78
|
+
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
71
79
|
else:
|
72
80
|
logger.error('unknown parse method')
|
73
81
|
exit(1)
|
@@ -89,7 +97,9 @@ def do_parse(
|
|
89
97
|
if f_draw_span_bbox:
|
90
98
|
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
|
91
99
|
if f_draw_model_bbox:
|
92
|
-
|
100
|
+
draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
|
101
|
+
if f_draw_line_sort_bbox:
|
102
|
+
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
|
93
103
|
|
94
104
|
md_content = pipe.pipe_mk_markdown(image_dir,
|
95
105
|
drop_mode=DropMode.NONE,
|
magic_pdf/user_api.py
CHANGED
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"
|
|
26
26
|
|
27
27
|
|
28
28
|
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
29
|
-
start_page_id=0, end_page_id=None,
|
29
|
+
start_page_id=0, end_page_id=None, lang=None,
|
30
30
|
*args, **kwargs):
|
31
31
|
"""
|
32
32
|
解析文本类pdf
|
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
44
44
|
|
45
45
|
pdf_info_dict["_version_name"] = __version__
|
46
46
|
|
47
|
+
if lang is not None:
|
48
|
+
pdf_info_dict["_lang"] = lang
|
49
|
+
|
47
50
|
return pdf_info_dict
|
48
51
|
|
49
52
|
|
50
53
|
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
51
|
-
start_page_id=0, end_page_id=None,
|
54
|
+
start_page_id=0, end_page_id=None, lang=None,
|
52
55
|
*args, **kwargs):
|
53
56
|
"""
|
54
57
|
解析ocr类pdf
|
@@ -66,12 +69,15 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
66
69
|
|
67
70
|
pdf_info_dict["_version_name"] = __version__
|
68
71
|
|
72
|
+
if lang is not None:
|
73
|
+
pdf_info_dict["_lang"] = lang
|
74
|
+
|
69
75
|
return pdf_info_dict
|
70
76
|
|
71
77
|
|
72
78
|
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
73
79
|
input_model_is_empty: bool = False,
|
74
|
-
start_page_id=0, end_page_id=None,
|
80
|
+
start_page_id=0, end_page_id=None, lang=None,
|
75
81
|
*args, **kwargs):
|
76
82
|
"""
|
77
83
|
ocr和文本混合的pdf,全部解析出来
|
@@ -95,9 +101,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
95
101
|
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
|
96
102
|
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
|
97
103
|
if input_model_is_empty:
|
98
|
-
|
99
|
-
|
100
|
-
|
104
|
+
layout_model = kwargs.get("layout_model", None)
|
105
|
+
formula_enable = kwargs.get("formula_enable", None)
|
106
|
+
table_enable = kwargs.get("table_enable", None)
|
107
|
+
pdf_models = doc_analyze(
|
108
|
+
pdf_bytes,
|
109
|
+
ocr=True,
|
110
|
+
start_page_id=start_page_id,
|
111
|
+
end_page_id=end_page_id,
|
112
|
+
lang=lang,
|
113
|
+
layout_model=layout_model,
|
114
|
+
formula_enable=formula_enable,
|
115
|
+
table_enable=table_enable,
|
116
|
+
)
|
101
117
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
102
118
|
if pdf_info_dict is None:
|
103
119
|
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
|
@@ -108,4 +124,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
108
124
|
|
109
125
|
pdf_info_dict["_version_name"] = __version__
|
110
126
|
|
127
|
+
if lang is not None:
|
128
|
+
pdf_info_dict["_lang"] = lang
|
129
|
+
|
111
130
|
return pdf_info_dict
|
File without changes
|