magic-pdf 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +5 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/dataset.py +175 -4
- magic_pdf/dict2md/ocr_mkcontent.py +2 -2
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/draw_bbox.py +11 -10
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +124 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +119 -60
- magic_pdf/model/operators.py +190 -0
- magic_pdf/model/pdf_extract_kit.py +20 -1
- magic_pdf/model/sub_modules/model_init.py +13 -3
- magic_pdf/model/sub_modules/model_utils.py +11 -5
- magic_pdf/pdf_parse_by_ocr.py +4 -5
- magic_pdf/pdf_parse_by_txt.py +4 -5
- magic_pdf/pdf_parse_union_core_v2.py +10 -11
- magic_pdf/pipe/AbsPipe.py +3 -2
- magic_pdf/pipe/OCRPipe.py +54 -15
- magic_pdf/pipe/TXTPipe.py +5 -4
- magic_pdf/pipe/UNIPipe.py +82 -30
- magic_pdf/pipe/operators.py +138 -0
- magic_pdf/tools/common.py +108 -59
- magic_pdf/user_api.py +47 -24
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/METADATA +7 -4
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/RECORD +31 -29
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/top_level.txt +0 -0
magic_pdf/pdf_parse_union_core_v2.py CHANGED
@@ -4,8 +4,8 @@ import statistics
 import time
 from typing import List
 
-import torch
 import fitz
+import torch
 from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel
 
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
-os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
-
 try:
     import torchtext
 
-    if torchtext.__version__ >=
+    if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
     pass
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
 
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
+
 
 def __replace_STX_ETX(text_str: str):
     """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
@@ -233,7 +232,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     # 初始化ocr模型
     atom_model_manager = AtomModelSingleton()
     ocr_model = atom_model_manager.get_atom_model(
-        atom_model_name=
+        atom_model_name='ocr',
         ocr_show_log=False,
         det_db_box_thresh=0.3,
         lang=lang
@@ -241,7 +240,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
 
     for span in empty_spans:
         # 对span的bbox截图再ocr
-        span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode=
+        span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
         ocr_res = ocr_model.ocr(span_img, det=False)
         if ocr_res and len(ocr_res) > 0:
             if len(ocr_res[0]) > 0:
@@ -681,7 +680,7 @@ def parse_page_core(
     """根据parse_mode,构造spans,主要是文本类的字符填充"""
     if parse_mode == SupportedPdfParseMethod.TXT:
 
-        """使用新版本的混合ocr
+        """使用新版本的混合ocr方案."""
        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
 
     elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +688,6 @@ def parse_page_core(
     else:
         raise Exception('parse_mode must be txt or ocr')
 
-
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -762,8 +760,8 @@ def parse_page_core(
 
 
 def pdf_parse_union(
-    dataset: Dataset,
     model_list,
+    dataset: Dataset,
     imageWriter,
     parse_mode,
     start_page_id=0,
@@ -771,6 +769,7 @@ def pdf_parse_union(
     debug_mode=False,
     lang=None,
 ):
+
     pdf_bytes_md5 = compute_md5(dataset.data_bits())
 
     """初始化空的pdf_info_dict"""
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.dict2md.ocr_mkcontent import union_make
 from magic_pdf.filter.pdf_classify_by_type import classify
 from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
@@ -14,9 +15,9 @@ class AbsPipe(ABC):
     PIP_OCR = 'ocr'
     PIP_TXT = 'txt'
 
-    def __init__(self,
+    def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
-        self.
+        self.dataset = Dataset
         self.model_list = model_list
         self.image_writer = image_writer
         self.pdf_mid_data = None  # 未压缩
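
The AbsPipe change above replaces the old raw-bytes constructor argument with a Dataset. A minimal sketch of what callers now pass in, assuming PymuDocDataset is the concrete Dataset implementation exported by magic_pdf.data.dataset (it does not appear in this diff):

# Sketch only: PymuDocDataset is assumed, not shown in this diff.
from magic_pdf.data.dataset import PymuDocDataset

with open('demo.pdf', 'rb') as f:
    pdf_bytes = f.read()

ds = PymuDocDataset(pdf_bytes)
# The pipes consume this wrapper; dataset.data_bits() hands the raw bytes back,
# as in the compute_md5(dataset.data_bits()) call in pdf_parse_union above.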
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -2,40 +2,79 @@ from loguru import logger
 
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_ocr_pdf
 
 
 class OCRPipe(AbsPipe):
-
-
-
-
-
-
+    def __init__(
+        self,
+        dataset: Dataset,
+        model_list: list,
+        image_writer: DataWriter,
+        is_debug: bool = False,
+        start_page_id=0,
+        end_page_id=None,
+        lang=None,
+        layout_model=None,
+        formula_enable=None,
+        table_enable=None,
+    ):
+        super().__init__(
+            dataset,
+            model_list,
+            image_writer,
+            is_debug,
+            start_page_id,
+            end_page_id,
+            lang,
+            layout_model,
+            formula_enable,
+            table_enable,
+        )
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.
-
-
-
+        self.infer_res = doc_analyze(
+            self.dataset,
+            ocr=True,
+            start_page_id=self.start_page_id,
+            end_page_id=self.end_page_id,
+            lang=self.lang,
+            layout_model=self.layout_model,
+            formula_enable=self.formula_enable,
+            table_enable=self.table_enable,
+        )
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_ocr_pdf(
-
-
-
+        self.pdf_mid_data = parse_ocr_pdf(
+            self.dataset,
+            self.infer_res,
+            self.image_writer,
+            is_debug=self.is_debug,
+            start_page_id=self.start_page_id,
+            end_page_id=self.end_page_id,
+            lang=self.lang,
+            layout_model=self.layout_model,
+            formula_enable=self.formula_enable,
+            table_enable=self.table_enable,
+        )
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
         logger.info('ocr_pipe mk content list finished')
         return result
 
-    def pipe_mk_markdown(
+    def pipe_mk_markdown(
+        self,
+        img_parent_path: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
         result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
         logger.info(f'ocr_pipe mk {md_make_mode} finished')
         return result
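
Driving the reworked OCRPipe end to end would look roughly like this. This is a sketch only, not taken from the package, assuming PymuDocDataset and FileBasedDataWriter as the concrete Dataset and DataWriter implementations (neither appears in this diff):

# Sketch under the assumptions named above; argument order follows the new signatures.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter  # assumed concrete writer
from magic_pdf.data.dataset import PymuDocDataset                  # assumed concrete dataset
from magic_pdf.pipe.OCRPipe import OCRPipe

with open('demo.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

img_writer = FileBasedDataWriter('output/images')
pipe = OCRPipe(ds, [], img_writer, is_debug=False, lang='en')

pipe.pipe_analyze()  # stores doc_analyze(self.dataset, ocr=True, ...) in pipe.infer_res
pipe.pipe_parse()    # stores parse_ocr_pdf(self.dataset, self.infer_res, ...) in pipe.pdf_mid_data
md = pipe.pipe_mk_markdown('images')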
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -2,6 +2,7 @@ from loguru import logger
 
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_txt_pdf
@@ -9,23 +10,23 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self,
+    def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None,
                  layout_model=None, formula_enable=None, table_enable=None):
-        super().__init__(
+        super().__init__(dataset, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
                          layout_model, formula_enable, table_enable)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.
+        self.model_list = doc_analyze(self.dataset, ocr=False,
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                       lang=self.lang, layout_model=self.layout_model,
                                       formula_enable=self.formula_enable, table_enable=self.table_enable)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_txt_pdf(self.
+        self.pdf_mid_data = parse_txt_pdf(self.dataset, self.model_list, self.image_writer, is_debug=self.is_debug,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                           lang=self.lang, layout_model=self.layout_model,
                                           formula_enable=self.formula_enable, table_enable=self.table_enable)
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -4,6 +4,7 @@ from loguru import logger
 
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.commons import join_path
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -12,12 +13,32 @@ from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        dataset: Dataset,
+        jso_useful_key: dict,
+        image_writer: DataWriter,
+        is_debug: bool = False,
+        start_page_id=0,
+        end_page_id=None,
+        lang=None,
+        layout_model=None,
+        formula_enable=None,
+        table_enable=None,
+    ):
         self.pdf_type = jso_useful_key['_pdf_type']
-        super().__init__(
-
+        super().__init__(
+            dataset,
+            jso_useful_key['model_list'],
+            image_writer,
+            is_debug,
+            start_page_id,
+            end_page_id,
+            lang,
+            layout_model,
+            formula_enable,
+            table_enable,
+        )
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -28,35 +49,66 @@ class UNIPipe(AbsPipe):
 
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
-            self.model_list = doc_analyze(
-
-
-
+            self.model_list = doc_analyze(
+                self.dataset,
+                ocr=False,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable,
+            )
         elif self.pdf_type == self.PIP_OCR:
-            self.model_list = doc_analyze(
-
-
-
+            self.model_list = doc_analyze(
+                self.dataset,
+                ocr=True,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable,
+            )
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
-            self.pdf_mid_data = parse_union_pdf(
-
-
-
-
+            self.pdf_mid_data = parse_union_pdf(
+                self.dataset,
+                self.model_list,
+                self.image_writer,
+                is_debug=self.is_debug,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable,
+            )
         elif self.pdf_type == self.PIP_OCR:
-            self.pdf_mid_data = parse_ocr_pdf(
-
-
-
-
-
+            self.pdf_mid_data = parse_ocr_pdf(
+                self.dataset,
+                self.model_list,
+                self.image_writer,
+                is_debug=self.is_debug,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+            )
+
+    def pipe_mk_uni_format(
+        self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON
+    ):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
         logger.info('uni_pipe mk content list finished')
         return result
 
-    def pipe_mk_markdown(
+    def pipe_mk_markdown(
+        self,
+        img_parent_path: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
         result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
         logger.info(f'uni_pipe mk {md_make_mode} finished')
         return result
@@ -65,6 +117,7 @@ class UNIPipe(AbsPipe):
 if __name__ == '__main__':
     # 测试
     from magic_pdf.data.data_reader_writer import DataReader
+
     drw = DataReader(r'D:/project/20231108code-clean')
 
     pdf_file_path = r'linshixuqiu\19983-00.pdf'
@@ -82,10 +135,7 @@ if __name__ == '__main__':
     # "model_list": model_list
     # }
 
-    jso_useful_key = {
-        '_pdf_type': '',
-        'model_list': model_list
-    }
+    jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
     pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
     pipe.pipe_classify()
     pipe.pipe_parse()
@@ -94,5 +144,7 @@ if __name__ == '__main__':
 
     md_writer = DataWriter(write_path)
     md_writer.write_string('19983-00.md', md_content)
-    md_writer.write_string(
+    md_writer.write_string(
+        '19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
+    )
     md_writer.write_string('19983-00.txt', str(content_list))
magic_pdf/pipe/operators.py ADDED
@@ -0,0 +1,138 @@
+import json
+import os
+from typing import Callable
+import copy
+
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.dict2md.ocr_mkcontent import union_make
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
+                                      draw_span_bbox)
+from magic_pdf.libs.json_compressor import JsonCompressor
+
+
+class PipeResult:
+    def __init__(self, pipe_res, dataset: Dataset):
+        """Initialized.
+
+        Args:
+            pipe_res (list[dict]): the pipeline processed result of model inference result
+            dataset (Dataset): the dataset associated with pipe_res
+        """
+        self._pipe_res = pipe_res
+        self._dataset = dataset
+
+    def dump_md(
+        self,
+        writer: DataWriter,
+        file_path: str,
+        img_dir_or_bucket_prefix: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
+        """Dump The Markdown.
+
+        Args:
+            writer (DataWriter): File writer handle
+            file_path (str): The file location of markdown
+            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        md_content = union_make(
+            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
+        )
+        writer.write_string(file_path, md_content)
+
+    def dump_content_list(
+        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
+    ):
+        """Dump Content List.
+
+        Args:
+            writer (DataWriter): File writer handle
+            file_path (str): The file location of content list
+            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        content_list = union_make(
+            pdf_info_list,
+            MakeMode.STANDARD_FORMAT,
+            DropMode.NONE,
+            image_dir_or_bucket_prefix,
+        )
+        writer.write_string(
+            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
+        )
+
+    def dump_middle_json(self, writer: DataWriter, file_path: str):
+        """Dump the result of pipeline.
+
+        Args:
+            writer (DataWriter): File writer handler
+            file_path (str): The file location of middle json
+        """
+        writer.write_string(
+            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
+        )
+
+    def draw_layout(self, file_path: str) -> None:
+        """Draw the layout.
+
+        Args:
+            file_path (str): The file location of layout result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_span(self, file_path: str):
+        """Draw the Span.
+
+        Args:
+            file_path (str): The file location of span result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_line_sort(self, file_path: str):
+        """Draw line sort.
+
+        Args:
+            file_path (str): The file location of line sort result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def get_compress_pdf_mid_data(self):
+        """Compress the pipeline result.
+
+        Returns:
+            str: compress the pipeline result and return
+        """
+        return JsonCompressor.compress_json(self.pdf_mid_data)
+
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+
+        Args:
+            proc (Callable): invoke proc as follows:
+            proc(pipeline_result, *args, **kwargs)
+
+        Returns:
+            Any: return the result generated by proc
+        """
+        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)