magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +7 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +188 -5
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +16 -15
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +19 -22
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +35 -5
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +43 -7
- magic_pdf/model/sub_modules/model_utils.py +17 -5
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/operators/models.py +154 -0
- magic_pdf/operators/pipes.py +191 -0
- magic_pdf/pdf_parse_union_core_v2.py +77 -27
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +120 -61
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -23
- magic_pdf/pdf_parse_by_txt.py +0 -24
- magic_pdf/pipe/AbsPipe.py +0 -98
- magic_pdf/pipe/OCRPipe.py +0 -41
- magic_pdf/pipe/TXTPipe.py +0 -41
- magic_pdf/pipe/UNIPipe.py +0 -98
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -121
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
|
|
1
1
|
import copy
|
2
|
+
import platform
|
2
3
|
import time
|
3
4
|
import cv2
|
4
5
|
import numpy as np
|
6
|
+
import torch
|
5
7
|
|
6
8
|
from paddleocr import PaddleOCR
|
7
9
|
from ppocr.utils.logging import get_logger
|
@@ -9,12 +11,25 @@ from ppocr.utils.utility import alpha_to_color, binarize_img
|
|
9
11
|
from tools.infer.predict_system import sorted_boxes
|
10
12
|
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
|
11
13
|
|
12
|
-
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
|
14
|
+
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img, \
|
15
|
+
ONNXModelSingleton
|
13
16
|
|
14
17
|
logger = get_logger()
|
15
18
|
|
16
19
|
|
17
20
|
class ModifiedPaddleOCR(PaddleOCR):
|
21
|
+
def __init__(self, *args, **kwargs):
|
22
|
+
|
23
|
+
super().__init__(*args, **kwargs)
|
24
|
+
self.lang = kwargs.get('lang', 'ch')
|
25
|
+
# 在cpu架构为arm且不支持cuda时调用onnx、
|
26
|
+
if not torch.cuda.is_available() and platform.machine() in ['arm64', 'aarch64']:
|
27
|
+
self.use_onnx = True
|
28
|
+
onnx_model_manager = ONNXModelSingleton()
|
29
|
+
self.additional_ocr = onnx_model_manager.get_onnx_model(**kwargs)
|
30
|
+
else:
|
31
|
+
self.use_onnx = False
|
32
|
+
|
18
33
|
def ocr(self,
|
19
34
|
img,
|
20
35
|
det=True,
|
@@ -79,7 +94,10 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
79
94
|
ocr_res = []
|
80
95
|
for img in imgs:
|
81
96
|
img = preprocess_image(img)
|
82
|
-
|
97
|
+
if self.lang in ['ch'] and self.use_onnx:
|
98
|
+
dt_boxes, elapse = self.additional_ocr.text_detector(img)
|
99
|
+
else:
|
100
|
+
dt_boxes, elapse = self.text_detector(img)
|
83
101
|
if dt_boxes is None:
|
84
102
|
ocr_res.append(None)
|
85
103
|
continue
|
@@ -106,7 +124,10 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
106
124
|
img, cls_res_tmp, elapse = self.text_classifier(img)
|
107
125
|
if not rec:
|
108
126
|
cls_res.append(cls_res_tmp)
|
109
|
-
|
127
|
+
if self.lang in ['ch'] and self.use_onnx:
|
128
|
+
rec_res, elapse = self.additional_ocr.text_recognizer(img)
|
129
|
+
else:
|
130
|
+
rec_res, elapse = self.text_recognizer(img)
|
110
131
|
ocr_res.append(rec_res)
|
111
132
|
if not rec:
|
112
133
|
return cls_res
|
@@ -121,7 +142,10 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
121
142
|
|
122
143
|
start = time.time()
|
123
144
|
ori_im = img.copy()
|
124
|
-
|
145
|
+
if self.lang in ['ch'] and self.use_onnx:
|
146
|
+
dt_boxes, elapse = self.additional_ocr.text_detector(img)
|
147
|
+
else:
|
148
|
+
dt_boxes, elapse = self.text_detector(img)
|
125
149
|
time_dict['det'] = elapse
|
126
150
|
|
127
151
|
if dt_boxes is None:
|
@@ -159,8 +183,10 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
159
183
|
time_dict['cls'] = elapse
|
160
184
|
logger.debug("cls num : {}, elapsed : {}".format(
|
161
185
|
len(img_crop_list), elapse))
|
162
|
-
|
163
|
-
|
186
|
+
if self.lang in ['ch'] and self.use_onnx:
|
187
|
+
rec_res, elapse = self.additional_ocr.text_recognizer(img_crop_list)
|
188
|
+
else:
|
189
|
+
rec_res, elapse = self.text_recognizer(img_crop_list)
|
164
190
|
time_dict['rec'] = elapse
|
165
191
|
logger.debug("rec_res num : {}, elapsed : {}".format(
|
166
192
|
len(rec_res), elapse))
|
@@ -1,16 +1,51 @@
|
|
1
|
+
import cv2
|
1
2
|
import numpy as np
|
3
|
+
import torch
|
4
|
+
from loguru import logger
|
2
5
|
from rapid_table import RapidTable
|
3
|
-
from rapidocr_paddle import RapidOCR
|
4
6
|
|
5
7
|
|
6
8
|
class RapidTableModel(object):
|
7
|
-
def __init__(self):
|
9
|
+
def __init__(self, ocr_engine):
|
8
10
|
self.table_model = RapidTable()
|
9
|
-
|
11
|
+
# if ocr_engine is None:
|
12
|
+
# self.ocr_model_name = "RapidOCR"
|
13
|
+
# if torch.cuda.is_available():
|
14
|
+
# from rapidocr_paddle import RapidOCR
|
15
|
+
# self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
|
16
|
+
# else:
|
17
|
+
# from rapidocr_onnxruntime import RapidOCR
|
18
|
+
# self.ocr_engine = RapidOCR()
|
19
|
+
# else:
|
20
|
+
# self.ocr_model_name = "PaddleOCR"
|
21
|
+
# self.ocr_engine = ocr_engine
|
22
|
+
|
23
|
+
self.ocr_model_name = "RapidOCR"
|
24
|
+
if torch.cuda.is_available():
|
25
|
+
from rapidocr_paddle import RapidOCR
|
26
|
+
self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
|
27
|
+
else:
|
28
|
+
from rapidocr_onnxruntime import RapidOCR
|
29
|
+
self.ocr_engine = RapidOCR()
|
10
30
|
|
11
31
|
def predict(self, image):
|
12
|
-
|
13
|
-
if
|
32
|
+
|
33
|
+
if self.ocr_model_name == "RapidOCR":
|
34
|
+
ocr_result, _ = self.ocr_engine(np.asarray(image))
|
35
|
+
elif self.ocr_model_name == "PaddleOCR":
|
36
|
+
bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
|
37
|
+
ocr_result = self.ocr_engine.ocr(bgr_image)[0]
|
38
|
+
if ocr_result:
|
39
|
+
ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
|
40
|
+
len(item) == 2 and isinstance(item[1], tuple)]
|
41
|
+
else:
|
42
|
+
ocr_result = None
|
43
|
+
else:
|
44
|
+
logger.error("OCR model not supported")
|
45
|
+
ocr_result = None
|
46
|
+
|
47
|
+
if ocr_result:
|
48
|
+
html_code, table_cell_bboxes, elapse = self.table_model(np.asarray(image), ocr_result)
|
49
|
+
return html_code, table_cell_bboxes, elapse
|
50
|
+
else:
|
14
51
|
return None, None, None
|
15
|
-
html_code, table_cell_bboxes, elapse = self.table_model(np.asarray(image), ocr_result)
|
16
|
-
return html_code, table_cell_bboxes, elapse
|
@@ -0,0 +1,94 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Callable
|
3
|
+
|
4
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
5
|
+
from magic_pdf.data.dataset import Dataset
|
6
|
+
from magic_pdf.operators.pipes import PipeResult
|
7
|
+
|
8
|
+
|
9
|
+
class InferenceResultBase(ABC):
|
10
|
+
|
11
|
+
@abstractmethod
|
12
|
+
def __init__(self, inference_results: list, dataset: Dataset):
|
13
|
+
"""Initialized method.
|
14
|
+
|
15
|
+
Args:
|
16
|
+
inference_results (list): the inference result generated by model
|
17
|
+
dataset (Dataset): the dataset related with model inference result
|
18
|
+
"""
|
19
|
+
pass
|
20
|
+
|
21
|
+
@abstractmethod
|
22
|
+
def draw_model(self, file_path: str) -> None:
|
23
|
+
"""Draw model inference result.
|
24
|
+
|
25
|
+
Args:
|
26
|
+
file_path (str): the output file path
|
27
|
+
"""
|
28
|
+
pass
|
29
|
+
|
30
|
+
@abstractmethod
|
31
|
+
def dump_model(self, writer: DataWriter, file_path: str):
|
32
|
+
"""Dump model inference result to file.
|
33
|
+
|
34
|
+
Args:
|
35
|
+
writer (DataWriter): writer handle
|
36
|
+
file_path (str): the location of target file
|
37
|
+
"""
|
38
|
+
pass
|
39
|
+
|
40
|
+
@abstractmethod
|
41
|
+
def get_infer_res(self):
|
42
|
+
"""Get the inference result.
|
43
|
+
|
44
|
+
Returns:
|
45
|
+
list: the inference result generated by model
|
46
|
+
"""
|
47
|
+
pass
|
48
|
+
|
49
|
+
@abstractmethod
|
50
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
51
|
+
"""Apply callable method which.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
proc (Callable): invoke proc as follows:
|
55
|
+
proc(inference_result, *args, **kwargs)
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
Any: return the result generated by proc
|
59
|
+
"""
|
60
|
+
pass
|
61
|
+
|
62
|
+
def pipe_txt_mode(
|
63
|
+
self,
|
64
|
+
imageWriter: DataWriter,
|
65
|
+
start_page_id=0,
|
66
|
+
end_page_id=None,
|
67
|
+
debug_mode=False,
|
68
|
+
lang=None,
|
69
|
+
) -> PipeResult:
|
70
|
+
"""Post-proc the model inference result, Extract the text using the
|
71
|
+
third library, such as `pymupdf`
|
72
|
+
|
73
|
+
Args:
|
74
|
+
imageWriter (DataWriter): the image writer handle
|
75
|
+
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
76
|
+
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
77
|
+
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
78
|
+
lang (str, optional): Defaults to None.
|
79
|
+
|
80
|
+
Returns:
|
81
|
+
PipeResult: the result
|
82
|
+
"""
|
83
|
+
pass
|
84
|
+
|
85
|
+
@abstractmethod
|
86
|
+
def pipe_ocr_mode(
|
87
|
+
self,
|
88
|
+
imageWriter: DataWriter,
|
89
|
+
start_page_id=0,
|
90
|
+
end_page_id=None,
|
91
|
+
debug_mode=False,
|
92
|
+
lang=None,
|
93
|
+
) -> PipeResult:
|
94
|
+
pass
|
@@ -0,0 +1,154 @@
|
|
1
|
+
import copy
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
from typing import Callable
|
5
|
+
|
6
|
+
from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
|
7
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
8
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
9
|
+
from magic_pdf.data.dataset import Dataset
|
10
|
+
from magic_pdf.libs.draw_bbox import draw_model_bbox
|
11
|
+
from magic_pdf.libs.version import __version__
|
12
|
+
from magic_pdf.operators.pipes import PipeResult
|
13
|
+
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
|
14
|
+
from magic_pdf.operators import InferenceResultBase
|
15
|
+
|
16
|
+
class InferenceResult(InferenceResultBase):
|
17
|
+
def __init__(self, inference_results: list, dataset: Dataset):
|
18
|
+
"""Initialized method.
|
19
|
+
|
20
|
+
Args:
|
21
|
+
inference_results (list): the inference result generated by model
|
22
|
+
dataset (Dataset): the dataset related with model inference result
|
23
|
+
"""
|
24
|
+
self._infer_res = inference_results
|
25
|
+
self._dataset = dataset
|
26
|
+
|
27
|
+
def draw_model(self, file_path: str) -> None:
|
28
|
+
"""Draw model inference result.
|
29
|
+
|
30
|
+
Args:
|
31
|
+
file_path (str): the output file path
|
32
|
+
"""
|
33
|
+
dir_name = os.path.dirname(file_path)
|
34
|
+
base_name = os.path.basename(file_path)
|
35
|
+
if not os.path.exists(dir_name):
|
36
|
+
os.makedirs(dir_name, exist_ok=True)
|
37
|
+
draw_model_bbox(
|
38
|
+
copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
|
39
|
+
)
|
40
|
+
|
41
|
+
def dump_model(self, writer: DataWriter, file_path: str):
|
42
|
+
"""Dump model inference result to file.
|
43
|
+
|
44
|
+
Args:
|
45
|
+
writer (DataWriter): writer handle
|
46
|
+
file_path (str): the location of target file
|
47
|
+
"""
|
48
|
+
writer.write_string(
|
49
|
+
file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
|
50
|
+
)
|
51
|
+
|
52
|
+
def get_infer_res(self):
|
53
|
+
"""Get the inference result.
|
54
|
+
|
55
|
+
Returns:
|
56
|
+
list: the inference result generated by model
|
57
|
+
"""
|
58
|
+
return self._infer_res
|
59
|
+
|
60
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
61
|
+
"""Apply callable method which.
|
62
|
+
|
63
|
+
Args:
|
64
|
+
proc (Callable): invoke proc as follows:
|
65
|
+
proc(inference_result, *args, **kwargs)
|
66
|
+
|
67
|
+
Returns:
|
68
|
+
Any: return the result generated by proc
|
69
|
+
"""
|
70
|
+
return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
|
71
|
+
|
72
|
+
def pipe_txt_mode(
|
73
|
+
self,
|
74
|
+
imageWriter: DataWriter,
|
75
|
+
start_page_id=0,
|
76
|
+
end_page_id=None,
|
77
|
+
debug_mode=False,
|
78
|
+
lang=None,
|
79
|
+
) -> PipeResult:
|
80
|
+
"""Post-proc the model inference result, Extract the text using the
|
81
|
+
third library, such as `pymupdf`
|
82
|
+
|
83
|
+
Args:
|
84
|
+
imageWriter (DataWriter): the image writer handle
|
85
|
+
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
86
|
+
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
87
|
+
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
88
|
+
lang (str, optional): Defaults to None.
|
89
|
+
|
90
|
+
Returns:
|
91
|
+
PipeResult: the result
|
92
|
+
"""
|
93
|
+
|
94
|
+
def proc(*args, **kwargs) -> PipeResult:
|
95
|
+
res = pdf_parse_union(*args, **kwargs)
|
96
|
+
res['_parse_type'] = PARSE_TYPE_TXT
|
97
|
+
res['_version_name'] = __version__
|
98
|
+
if 'lang' in kwargs and kwargs['lang'] is not None:
|
99
|
+
res['lang'] = kwargs['lang']
|
100
|
+
return PipeResult(res, self._dataset)
|
101
|
+
|
102
|
+
res = self.apply(
|
103
|
+
proc,
|
104
|
+
self._dataset,
|
105
|
+
imageWriter,
|
106
|
+
SupportedPdfParseMethod.TXT,
|
107
|
+
start_page_id=start_page_id,
|
108
|
+
end_page_id=end_page_id,
|
109
|
+
debug_mode=debug_mode,
|
110
|
+
lang=lang,
|
111
|
+
)
|
112
|
+
return res
|
113
|
+
|
114
|
+
def pipe_ocr_mode(
|
115
|
+
self,
|
116
|
+
imageWriter: DataWriter,
|
117
|
+
start_page_id=0,
|
118
|
+
end_page_id=None,
|
119
|
+
debug_mode=False,
|
120
|
+
lang=None,
|
121
|
+
) -> PipeResult:
|
122
|
+
"""Post-proc the model inference result, Extract the text using `OCR`
|
123
|
+
technical.
|
124
|
+
|
125
|
+
Args:
|
126
|
+
imageWriter (DataWriter): the image writer handle
|
127
|
+
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
|
128
|
+
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
|
129
|
+
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
|
130
|
+
lang (str, optional): Defaults to None.
|
131
|
+
|
132
|
+
Returns:
|
133
|
+
PipeResult: the result
|
134
|
+
"""
|
135
|
+
|
136
|
+
def proc(*args, **kwargs) -> PipeResult:
|
137
|
+
res = pdf_parse_union(*args, **kwargs)
|
138
|
+
res['_parse_type'] = PARSE_TYPE_OCR
|
139
|
+
res['_version_name'] = __version__
|
140
|
+
if 'lang' in kwargs and kwargs['lang'] is not None:
|
141
|
+
res['lang'] = kwargs['lang']
|
142
|
+
return PipeResult(res, self._dataset)
|
143
|
+
|
144
|
+
res = self.apply(
|
145
|
+
proc,
|
146
|
+
self._dataset,
|
147
|
+
imageWriter,
|
148
|
+
SupportedPdfParseMethod.OCR,
|
149
|
+
start_page_id=start_page_id,
|
150
|
+
end_page_id=end_page_id,
|
151
|
+
debug_mode=debug_mode,
|
152
|
+
lang=lang,
|
153
|
+
)
|
154
|
+
return res
|
@@ -0,0 +1,191 @@
|
|
1
|
+
import copy
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
from typing import Callable
|
5
|
+
|
6
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
7
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
8
|
+
from magic_pdf.data.dataset import Dataset
|
9
|
+
from magic_pdf.dict2md.ocr_mkcontent import union_make
|
10
|
+
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
|
11
|
+
draw_span_bbox)
|
12
|
+
from magic_pdf.libs.json_compressor import JsonCompressor
|
13
|
+
|
14
|
+
|
15
|
+
class PipeResult:
|
16
|
+
def __init__(self, pipe_res, dataset: Dataset):
|
17
|
+
"""Initialized.
|
18
|
+
|
19
|
+
Args:
|
20
|
+
pipe_res (list[dict]): the pipeline processed result of model inference result
|
21
|
+
dataset (Dataset): the dataset associated with pipe_res
|
22
|
+
"""
|
23
|
+
self._pipe_res = pipe_res
|
24
|
+
self._dataset = dataset
|
25
|
+
|
26
|
+
def get_markdown(
|
27
|
+
self,
|
28
|
+
img_dir_or_bucket_prefix: str,
|
29
|
+
drop_mode=DropMode.NONE,
|
30
|
+
md_make_mode=MakeMode.MM_MD,
|
31
|
+
) -> str:
|
32
|
+
"""Get markdown content.
|
33
|
+
|
34
|
+
Args:
|
35
|
+
img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
|
36
|
+
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
|
37
|
+
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
|
38
|
+
|
39
|
+
Returns:
|
40
|
+
str: return markdown content
|
41
|
+
"""
|
42
|
+
pdf_info_list = self._pipe_res['pdf_info']
|
43
|
+
md_content = union_make(
|
44
|
+
pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
|
45
|
+
)
|
46
|
+
return md_content
|
47
|
+
|
48
|
+
def dump_md(
|
49
|
+
self,
|
50
|
+
writer: DataWriter,
|
51
|
+
file_path: str,
|
52
|
+
img_dir_or_bucket_prefix: str,
|
53
|
+
drop_mode=DropMode.NONE,
|
54
|
+
md_make_mode=MakeMode.MM_MD,
|
55
|
+
):
|
56
|
+
"""Dump The Markdown.
|
57
|
+
|
58
|
+
Args:
|
59
|
+
writer (DataWriter): File writer handle
|
60
|
+
file_path (str): The file location of markdown
|
61
|
+
img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
|
62
|
+
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
|
63
|
+
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
|
64
|
+
"""
|
65
|
+
|
66
|
+
md_content = self.get_markdown(
|
67
|
+
img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
|
68
|
+
)
|
69
|
+
writer.write_string(file_path, md_content)
|
70
|
+
|
71
|
+
def get_content_list(
|
72
|
+
self,
|
73
|
+
image_dir_or_bucket_prefix: str,
|
74
|
+
drop_mode=DropMode.NONE,
|
75
|
+
) -> str:
|
76
|
+
"""Get Content List.
|
77
|
+
|
78
|
+
Args:
|
79
|
+
image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
|
80
|
+
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
|
81
|
+
|
82
|
+
Returns:
|
83
|
+
str: content list content
|
84
|
+
"""
|
85
|
+
pdf_info_list = self._pipe_res['pdf_info']
|
86
|
+
content_list = union_make(
|
87
|
+
pdf_info_list,
|
88
|
+
MakeMode.STANDARD_FORMAT,
|
89
|
+
drop_mode,
|
90
|
+
image_dir_or_bucket_prefix,
|
91
|
+
)
|
92
|
+
return content_list
|
93
|
+
|
94
|
+
def dump_content_list(
|
95
|
+
self,
|
96
|
+
writer: DataWriter,
|
97
|
+
file_path: str,
|
98
|
+
image_dir_or_bucket_prefix: str,
|
99
|
+
drop_mode=DropMode.NONE,
|
100
|
+
):
|
101
|
+
"""Dump Content List.
|
102
|
+
|
103
|
+
Args:
|
104
|
+
writer (DataWriter): File writer handle
|
105
|
+
file_path (str): The file location of content list
|
106
|
+
image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
|
107
|
+
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
|
108
|
+
"""
|
109
|
+
content_list = self.get_content_list(
|
110
|
+
image_dir_or_bucket_prefix, drop_mode=drop_mode,
|
111
|
+
)
|
112
|
+
writer.write_string(
|
113
|
+
file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
|
114
|
+
)
|
115
|
+
|
116
|
+
def get_middle_json(self) -> str:
|
117
|
+
"""Get middle json.
|
118
|
+
|
119
|
+
Returns:
|
120
|
+
str: The content of middle json
|
121
|
+
"""
|
122
|
+
return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
|
123
|
+
|
124
|
+
def dump_middle_json(self, writer: DataWriter, file_path: str):
|
125
|
+
"""Dump the result of pipeline.
|
126
|
+
|
127
|
+
Args:
|
128
|
+
writer (DataWriter): File writer handler
|
129
|
+
file_path (str): The file location of middle json
|
130
|
+
"""
|
131
|
+
middle_json = self.get_middle_json()
|
132
|
+
writer.write_string(file_path, middle_json)
|
133
|
+
|
134
|
+
def draw_layout(self, file_path: str) -> None:
|
135
|
+
"""Draw the layout.
|
136
|
+
|
137
|
+
Args:
|
138
|
+
file_path (str): The file location of layout result file
|
139
|
+
"""
|
140
|
+
dir_name = os.path.dirname(file_path)
|
141
|
+
base_name = os.path.basename(file_path)
|
142
|
+
if not os.path.exists(dir_name):
|
143
|
+
os.makedirs(dir_name, exist_ok=True)
|
144
|
+
pdf_info = self._pipe_res['pdf_info']
|
145
|
+
draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
|
146
|
+
|
147
|
+
def draw_span(self, file_path: str):
|
148
|
+
"""Draw the Span.
|
149
|
+
|
150
|
+
Args:
|
151
|
+
file_path (str): The file location of span result file
|
152
|
+
"""
|
153
|
+
dir_name = os.path.dirname(file_path)
|
154
|
+
base_name = os.path.basename(file_path)
|
155
|
+
if not os.path.exists(dir_name):
|
156
|
+
os.makedirs(dir_name, exist_ok=True)
|
157
|
+
pdf_info = self._pipe_res['pdf_info']
|
158
|
+
draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
|
159
|
+
|
160
|
+
def draw_line_sort(self, file_path: str):
|
161
|
+
"""Draw line sort.
|
162
|
+
|
163
|
+
Args:
|
164
|
+
file_path (str): The file location of line sort result file
|
165
|
+
"""
|
166
|
+
dir_name = os.path.dirname(file_path)
|
167
|
+
base_name = os.path.basename(file_path)
|
168
|
+
if not os.path.exists(dir_name):
|
169
|
+
os.makedirs(dir_name, exist_ok=True)
|
170
|
+
pdf_info = self._pipe_res['pdf_info']
|
171
|
+
draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
|
172
|
+
|
173
|
+
def get_compress_pdf_mid_data(self):
|
174
|
+
"""Compress the pipeline result.
|
175
|
+
|
176
|
+
Returns:
|
177
|
+
str: compress the pipeline result and return
|
178
|
+
"""
|
179
|
+
return JsonCompressor.compress_json(self._pipe_res)
|
180
|
+
|
181
|
+
def apply(self, proc: Callable, *args, **kwargs):
|
182
|
+
"""Apply callable method which.
|
183
|
+
|
184
|
+
Args:
|
185
|
+
proc (Callable): invoke proc as follows:
|
186
|
+
proc(pipeline_result, *args, **kwargs)
|
187
|
+
|
188
|
+
Returns:
|
189
|
+
Any: return the result generated by proc
|
190
|
+
"""
|
191
|
+
return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
|