magic-pdf 0.5.13__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +18 -7
- magic_pdf/libs/config_reader.py +10 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +38 -15
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +196 -0
- magic_pdf/model/pek_sub_modules/__init__.py +0 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py +0 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py +179 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py +671 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py +476 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py +7 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py +2 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py +171 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py +124 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py +136 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py +284 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py +213 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py +7 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +24 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +60 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +1282 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +32 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +34 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +150 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py +163 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py +1236 -0
- magic_pdf/model/pek_sub_modules/post_process.py +36 -0
- magic_pdf/model/pek_sub_modules/self_modify.py +260 -0
- magic_pdf/model/pp_structure_v2.py +7 -0
- magic_pdf/pipe/AbsPipe.py +8 -14
- magic_pdf/pipe/OCRPipe.py +12 -8
- magic_pdf/pipe/TXTPipe.py +12 -8
- magic_pdf/pipe/UNIPipe.py +9 -7
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +46 -0
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +351 -0
- magic_pdf/resources/model_config/model_configs.yaml +9 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/METADATA +18 -8
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/RECORD +44 -18
- magic_pdf/model/360_layout_analysis.py +0 -8
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
def layout_rm_equation(layout_res):
    """Drop inline-equation detections (category_id == 10) from a layout result.

    Mutates ``layout_res['layout_dets']`` in place and returns *layout_res*.
    """
    kept = [det for det in layout_res['layout_dets'] if det['category_id'] != 10]
    # Slice-assign so the original list object is kept alive for any aliases.
    layout_res['layout_dets'][:] = kept
    return layout_res
|
12
|
+
|
13
|
+
|
14
|
+
def get_croped_image(image_pil, bbox):
    """Return the sub-image of *image_pil* delimited by *bbox*.

    *bbox* is an (x_min, y_min, x_max, y_max) sequence, forwarded to
    ``Image.crop`` as a 4-tuple.
    """
    x_min, y_min, x_max, y_max = bbox
    return image_pil.crop((x_min, y_min, x_max, y_max))
|
18
|
+
|
19
|
+
|
20
|
+
def latex_rm_whitespace(s: str):
    """Remove unnecessary whitespace from LaTeX code.

    First collapses spaces inside \\operatorname/\\mathrm/\\text/\\mathbf
    groups, then repeatedly squeezes whitespace between symbol/letter pairs
    until the string reaches a fixed point.
    """
    # Raw strings throughout: '\W' / '\d' inside a normal string literal are
    # invalid escape sequences and raise SyntaxWarning on modern Python.
    text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
    letter = r'[a-zA-Z]'
    noletter = r'[\W_^\d]'
    # Space-free replacements for the matched \command { ... } groups,
    # substituted back in order of appearance.
    names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
    s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
    news = s
    while True:
        s = news
        # `(?!\\ )` keeps the explicit LaTeX space "\ " intact.
        news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
        news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
        news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
        if news == s:
            break
    return s
|
@@ -0,0 +1,260 @@
|
|
1
|
+
import time
|
2
|
+
import copy
|
3
|
+
import base64
|
4
|
+
import cv2
|
5
|
+
import numpy as np
|
6
|
+
from io import BytesIO
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
from paddleocr import PaddleOCR
|
10
|
+
from paddleocr.ppocr.utils.logging import get_logger
|
11
|
+
from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
|
12
|
+
from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
|
13
|
+
logger = get_logger()
|
14
|
+
|
15
|
+
def img_decode(content: bytes):
    """Decode raw encoded-image bytes into an OpenCV image array.

    Channels are left untouched (IMREAD_UNCHANGED); returns None when the
    bytes are not a decodable image (cv2.imdecode's failure convention).
    """
    buffer = np.frombuffer(content, dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_UNCHANGED)
|
18
|
+
|
19
|
+
def check_img(img):
    """Normalize *img* (bytes / path string / ndarray) into a BGR ndarray.

    - bytes are decoded via img_decode;
    - a string is treated as a file path: GIF/PDF are handled by
      check_and_read, everything else is read from disk, with a PIL->JPEG
      re-encode fallback for formats OpenCV cannot decode directly;
    - a 2-D (grayscale) array is expanded to 3-channel BGR.

    Returns None when the image cannot be loaded.
    """
    if isinstance(img, bytes):
        img = img_decode(img)
    if isinstance(img, str):
        image_file = img
        img, flag_gif, flag_pdf = check_and_read(image_file)
        if not flag_gif and not flag_pdf:
            with open(image_file, 'rb') as f:
                img_str = f.read()
                img = img_decode(img_str)
            if img is None:
                # cv2 could not decode the bytes: round-trip through PIL as
                # JPEG, then decode with cv2 again.
                try:
                    buf = BytesIO()
                    image = BytesIO(img_str)
                    im = Image.open(image)
                    rgb = im.convert('RGB')
                    rgb.save(buf, 'jpeg')
                    buf.seek(0)
                    image_bytes = buf.read()
                    data_base64 = str(base64.b64encode(image_bytes),
                                      encoding="utf-8")
                    image_decode = base64.b64decode(data_base64)
                    img_array = np.frombuffer(image_decode, np.uint8)
                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                except Exception:
                    # Fixed: was a bare `except:`, which would also swallow
                    # KeyboardInterrupt/SystemExit.
                    logger.error("error in loading image:{}".format(image_file))
                    return None
            if img is None:
                logger.error("error in loading image:{}".format(image_file))
                return None
    if isinstance(img, np.ndarray) and len(img.shape) == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    return img
|
53
|
+
|
54
|
+
def sorted_boxes(dt_boxes):
    """
    Sort text boxes in order from top to bottom, left to right
    args:
        dt_boxes(array):detected text boxes with shape [4, 2]
    return:
        sorted boxes(array) with shape [4, 2]
    """
    count = dt_boxes.shape[0]
    # Primary order: top-left corner, top-to-bottom then left-to-right.
    ordered = list(sorted(dt_boxes, key=lambda box: (box[0][1], box[0][0])))

    # Boxes whose top edges differ by < 10px are treated as one visual line:
    # bubble a left-lying neighbour ahead of its right-lying peer.
    for i in range(count - 1):
        for j in range(i, -1, -1):
            same_line = abs(ordered[j + 1][0][1] - ordered[j][0][1]) < 10
            if same_line and ordered[j + 1][0][0] < ordered[j][0][0]:
                ordered[j], ordered[j + 1] = ordered[j + 1], ordered[j]
            else:
                break
    return ordered
|
76
|
+
|
77
|
+
|
78
|
+
def formula_in_text(mf_bbox, text_bbox):
    """Decide whether a formula bbox overlaps a text box on the same line.

    Returns ``(drop_origin, left_box, right_box)``: *drop_origin* is True when
    the text box must be replaced; *left_box*/*right_box* are the surviving
    quad fragments on either side of the formula, or None.
    """
    x1, y1, x2, y2 = mf_bbox
    x3, y3 = text_bbox[0]
    x4, y4 = text_bbox[2]
    left_box, right_box = None, None
    # Same line when the vertical-centre offset is < 20% of the text height.
    # NOTE(review): divides by abs(y4 - y3) — a zero-height text box would
    # raise ZeroDivisionError; confirm upstream guarantees y4 != y3.
    same_line = abs((y1 + y2) / 2 - (y3 + y4) / 2) / abs(y4 - y3) < 0.2
    if not same_line:
        return False, left_box, right_box

    drop_origin = False
    left_x = x1 - 1
    right_x = x2 + 1
    if x3 < x1 and x2 < x4:
        # Formula strictly inside the text box: keep both flanks.
        drop_origin = True
        left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
        right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
    if x3 < x1 and x1 <= x4 <= x2:
        # Formula covers the right end: keep the left flank only.
        drop_origin = True
        left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
    if x1 <= x3 <= x2 and x2 < x4:
        # Formula covers the left end: keep the right flank only.
        drop_origin = True
        right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
    if x1 <= x3 < x4 <= x2:
        # Formula swallows the whole text box: drop it outright.
        drop_origin = True
    return drop_origin, left_box, right_box
|
103
|
+
|
104
|
+
|
105
|
+
def update_det_boxes(dt_boxes, mfdetrec_res):
    """Split detected text boxes around detected formula regions.

    For every formula, the first same-line text box it intersects is removed
    and replaced by its left/right remnants (when present). The input list is
    modified in place and also returned.
    """
    new_dt_boxes = dt_boxes
    for mf_box in mfdetrec_res:
        for idx, text_box in enumerate(new_dt_boxes):
            hit, left_box, right_box = formula_in_text(mf_box['bbox'], text_box)
            if not hit:
                continue
            new_dt_boxes.pop(idx)
            if left_box is not None:
                new_dt_boxes.append(left_box)
            if right_box is not None:
                new_dt_boxes.append(right_box)
            # Only the first overlapping text box is handled per formula.
            break

    return new_dt_boxes
|
120
|
+
|
121
|
+
class ModifiedPaddleOCR(PaddleOCR):
    # PaddleOCR subclass whose pipeline accepts external formula-detection
    # results (mfd_res) so that text boxes overlapping formulas are split
    # before recognition (see update_det_boxes above).
    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
        """
        OCR with PaddleOCR
        args:
            img: img for OCR, support ndarray, img_path and list or ndarray
            det: use text detection or not. If False, only rec will be exec. Default is True
            rec: use text recognition or not. If False, only det will be exec. Default is True
            cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
            bin: binarize image to black and white. Default is False.
            inv: invert image colors. Default is False.
            mfd_res: formula-detection results forwarded to __call__ so text
                boxes overlapping formulas are split before recognition.
            alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
        """
        assert isinstance(img, (np.ndarray, list, str, bytes))
        if isinstance(img, list) and det == True:
            logger.error('When input a list of images, det must be false')
            # NOTE(review): exits with status 0 despite the error — mirrors
            # upstream PaddleOCR behaviour; confirm before changing.
            exit(0)
        if cls == True and self.use_angle_cls == False:
            pass
            # logger.warning(
            #     'Since the angle classifier is not initialized, it will not be used during the forward process'
            # )

        img = check_img(img)
        # for infer pdf file
        if isinstance(img, list):
            # check_and_read returns one image per PDF page; clamp to the
            # configured page_num (0 means "all pages").
            if self.page_num > len(img) or self.page_num == 0:
                self.page_num = len(img)
            imgs = img[:self.page_num]
        else:
            imgs = [img]

        def preprocess_image(_image):
            # Flatten alpha, then apply optional inversion / binarization.
            _image = alpha_to_color(_image, alpha_color)
            if inv:
                _image = cv2.bitwise_not(_image)
            if bin:
                _image = binarize_img(_image)
            return _image

        if det and rec:
            # Full pipeline: detect boxes, then recognize each crop.
            ocr_res = []
            for idx, img in enumerate(imgs):
                img = preprocess_image(img)
                dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
                if not dt_boxes and not rec_res:
                    ocr_res.append(None)
                    continue
                tmp_res = [[box.tolist(), res]
                           for box, res in zip(dt_boxes, rec_res)]
                ocr_res.append(tmp_res)
            return ocr_res
        elif det and not rec:
            # Detection only: return box coordinates per image.
            ocr_res = []
            for idx, img in enumerate(imgs):
                img = preprocess_image(img)
                dt_boxes, elapse = self.text_detector(img)
                if not dt_boxes:
                    ocr_res.append(None)
                    continue
                tmp_res = [box.tolist() for box in dt_boxes]
                ocr_res.append(tmp_res)
            return ocr_res
        else:
            # Recognition (and/or classification) only on pre-cropped images.
            ocr_res = []
            cls_res = []
            for idx, img in enumerate(imgs):
                if not isinstance(img, list):
                    img = preprocess_image(img)
                    img = [img]
                if self.use_angle_cls and cls:
                    img, cls_res_tmp, elapse = self.text_classifier(img)
                    if not rec:
                        cls_res.append(cls_res_tmp)
                rec_res, elapse = self.text_recognizer(img)
                ocr_res.append(rec_res)
            if not rec:
                return cls_res
            return ocr_res

    def __call__(self, img, cls=True, mfd_res=None):
        # Core det->(split by formula)->cls->rec pipeline for a single image.
        # Returns (filter_boxes, filter_rec_res, time_dict); boxes whose
        # recognition score falls below self.drop_score are discarded.
        time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}

        if img is None:
            logger.debug("no valid image provided")
            return None, None, time_dict

        start = time.time()
        ori_im = img.copy()
        dt_boxes, elapse = self.text_detector(img)
        time_dict['det'] = elapse

        if dt_boxes is None:
            logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
            end = time.time()
            time_dict['all'] = end - start
            return None, None, time_dict
        else:
            logger.debug("dt_boxes num : {}, elapsed : {}".format(
                len(dt_boxes), elapse))
        img_crop_list = []

        dt_boxes = sorted_boxes(dt_boxes)
        if mfd_res:
            # Split text boxes that overlap detected formulas so recognition
            # is not polluted by formula glyphs.
            bef = time.time()
            dt_boxes = update_det_boxes(dt_boxes, mfd_res)
            aft = time.time()
            logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
                len(dt_boxes), aft-bef))

        for bno in range(len(dt_boxes)):
            tmp_box = copy.deepcopy(dt_boxes[bno])
            if self.args.det_box_type == "quad":
                img_crop = get_rotate_crop_image(ori_im, tmp_box)
            else:
                img_crop = get_minarea_rect_crop(ori_im, tmp_box)
            img_crop_list.append(img_crop)
        if self.use_angle_cls and cls:
            img_crop_list, angle_list, elapse = self.text_classifier(
                img_crop_list)
            time_dict['cls'] = elapse
            logger.debug("cls num : {}, elapsed : {}".format(
                len(img_crop_list), elapse))

        rec_res, elapse = self.text_recognizer(img_crop_list)
        time_dict['rec'] = elapse
        logger.debug("rec_res num : {}, elapsed : {}".format(
            len(rec_res), elapse))
        if self.args.save_crop_res:
            self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
                                   rec_res)
        filter_boxes, filter_rec_res = [], []
        for box, rec_result in zip(dt_boxes, rec_res):
            text, score = rec_result
            if score >= self.drop_score:
                filter_boxes.append(box)
                filter_rec_res.append(rec_result)
        end = time.time()
        time_dict['all'] = end - start
        return filter_boxes, filter_rec_res, time_dict
|
@@ -22,6 +22,13 @@ class CustomPaddleModel:
|
|
22
22
|
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
|
23
23
|
|
24
24
|
def __call__(self, img):
|
25
|
+
try:
|
26
|
+
import cv2
|
27
|
+
except ImportError:
|
28
|
+
logger.error("opencv-python not installed, please install by pip.")
|
29
|
+
exit(1)
|
30
|
+
# 将RGB图片转换为BGR格式适配paddle
|
31
|
+
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
|
25
32
|
result = self.model(img)
|
26
33
|
spans = []
|
27
34
|
for line in result:
|
magic_pdf/pipe/AbsPipe.py
CHANGED
@@ -47,19 +47,13 @@ class AbsPipe(ABC):
|
|
47
47
|
"""
|
48
48
|
raise NotImplementedError
|
49
49
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
有状态的组装统一格式
|
54
|
-
"""
|
55
|
-
raise NotImplementedError
|
50
|
+
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
51
|
+
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
|
52
|
+
return content_list
|
56
53
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
有状态的组装markdown
|
61
|
-
"""
|
62
|
-
raise NotImplementedError
|
54
|
+
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
55
|
+
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
|
56
|
+
return md_content
|
63
57
|
|
64
58
|
@staticmethod
|
65
59
|
def classify(pdf_bytes: bytes) -> str:
|
@@ -101,13 +95,13 @@ class AbsPipe(ABC):
|
|
101
95
|
return content_list
|
102
96
|
|
103
97
|
@staticmethod
|
104
|
-
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
|
98
|
+
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
|
105
99
|
"""
|
106
100
|
根据pdf类型,markdown
|
107
101
|
"""
|
108
102
|
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
109
103
|
pdf_info_list = pdf_mid_data["pdf_info"]
|
110
|
-
md_content = union_make(pdf_info_list,
|
104
|
+
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
|
111
105
|
return md_content
|
112
106
|
|
113
107
|
|
magic_pdf/pipe/OCRPipe.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
from
|
1
|
+
from loguru import logger
|
2
|
+
|
3
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
2
4
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
3
5
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
4
6
|
from magic_pdf.pipe.AbsPipe import AbsPipe
|
@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf
|
|
7
9
|
|
8
10
|
class OCRPipe(AbsPipe):
|
9
11
|
|
10
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
|
12
|
+
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
|
11
13
|
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
|
12
14
|
|
13
15
|
def pipe_classify(self):
|
@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe):
|
|
20
22
|
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
|
21
23
|
|
22
24
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
25
|
+
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
26
|
+
logger.info("ocr_pipe mk content list finished")
|
27
|
+
return result
|
28
|
+
|
29
|
+
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
30
|
+
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
31
|
+
logger.info(f"ocr_pipe mk {md_make_mode} finished")
|
32
|
+
return result
|
magic_pdf/pipe/TXTPipe.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
from
|
1
|
+
from loguru import logger
|
2
|
+
|
3
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
2
4
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
3
5
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
4
6
|
from magic_pdf.libs.json_compressor import JsonCompressor
|
@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf
|
|
8
10
|
|
9
11
|
class TXTPipe(AbsPipe):
|
10
12
|
|
11
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
|
13
|
+
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
|
12
14
|
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
|
13
15
|
|
14
16
|
def pipe_classify(self):
|
@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe):
|
|
21
23
|
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
|
22
24
|
|
23
25
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
26
|
+
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
27
|
+
logger.info("txt_pipe mk content list finished")
|
28
|
+
return result
|
29
|
+
|
30
|
+
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
31
|
+
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
32
|
+
logger.info(f"txt_pipe mk {md_make_mode} finished")
|
33
|
+
return result
|
magic_pdf/pipe/UNIPipe.py
CHANGED
@@ -2,7 +2,7 @@ import json
|
|
2
2
|
|
3
3
|
from loguru import logger
|
4
4
|
|
5
|
-
from magic_pdf.libs.MakeContentConfig import DropMode
|
5
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
6
6
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
7
7
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
8
8
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe):
|
|
39
39
|
is_debug=self.is_debug)
|
40
40
|
|
41
41
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
43
|
+
logger.info("uni_pipe mk content list finished")
|
44
|
+
return result
|
45
|
+
|
46
|
+
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
47
|
+
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
48
|
+
logger.info(f"uni_pipe mk {md_make_mode} finished")
|
49
|
+
return result
|
48
50
|
|
49
51
|
|
50
52
|
if __name__ == '__main__':
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# UniMERNet formula-recognition demo configuration (model, eval dataset, run).
# NOTE(review): nesting reconstructed from a flattened diff rendering —
# confirm indentation against the upstream UniMERNet demo.yaml.
model:
  arch: unimernet
  model_type: unimernet
  model_config:
    model_name: ./models
    max_seq_len: 1024
    length_aware: False
    load_pretrained: True
    pretrained: ./models/pytorch_model.bin
  tokenizer_config:
    path: ./models

datasets:
  formula_rec_eval:
    vis_processor:
      eval:
        name: "formula_image_eval"
        image_size:
          - 192
          - 672

run:
  runner: runner_iter
  task: unimernet_train

  batch_size_train: 64
  batch_size_eval: 64
  num_workers: 1

  iters_per_inner_epoch: 2000
  max_iters: 60000

  seed: 42
  output_dir: "../output/demo"

  evaluate: True
  test_splits: [ "eval" ]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  distributed_type: ddp # or fsdp when train llm

  generate_cfg:
    temperature: 0.0
|