magic-pdf 0.5.13__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. magic_pdf/cli/magicpdf.py +18 -7
  2. magic_pdf/dict2md/ocr_mkcontent.py +2 -2
  3. magic_pdf/libs/config_reader.py +10 -0
  4. magic_pdf/libs/version.py +1 -1
  5. magic_pdf/model/__init__.py +1 -0
  6. magic_pdf/model/doc_analyze_by_custom_model.py +38 -15
  7. magic_pdf/model/model_list.py +1 -0
  8. magic_pdf/model/pdf_extract_kit.py +200 -0
  9. magic_pdf/model/pek_sub_modules/__init__.py +0 -0
  10. magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py +0 -0
  11. magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py +179 -0
  12. magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py +671 -0
  13. magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py +476 -0
  14. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py +7 -0
  15. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py +2 -0
  16. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py +171 -0
  17. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py +124 -0
  18. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py +136 -0
  19. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py +284 -0
  20. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py +213 -0
  21. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py +7 -0
  22. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +24 -0
  23. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +60 -0
  24. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +1282 -0
  25. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +32 -0
  26. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +34 -0
  27. magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +150 -0
  28. magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py +163 -0
  29. magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py +1236 -0
  30. magic_pdf/model/pek_sub_modules/post_process.py +36 -0
  31. magic_pdf/model/pek_sub_modules/self_modify.py +260 -0
  32. magic_pdf/model/pp_structure_v2.py +7 -0
  33. magic_pdf/pipe/AbsPipe.py +8 -14
  34. magic_pdf/pipe/OCRPipe.py +12 -8
  35. magic_pdf/pipe/TXTPipe.py +12 -8
  36. magic_pdf/pipe/UNIPipe.py +9 -7
  37. magic_pdf/resources/model_config/UniMERNet/demo.yaml +46 -0
  38. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +351 -0
  39. magic_pdf/resources/model_config/model_configs.yaml +9 -0
  40. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/METADATA +95 -12
  41. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/RECORD +45 -19
  42. magic_pdf/model/360_layout_analysis.py +0 -8
  43. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/LICENSE.md +0 -0
  44. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/WHEEL +0 -0
  45. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/entry_points.txt +0 -0
  46. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/top_level.txt +0 -0
magic_pdf/model/pek_sub_modules/post_process.py ADDED
@@ -0,0 +1,36 @@
+import re
+
+def layout_rm_equation(layout_res):
+    rm_idxs = []
+    for idx, ele in enumerate(layout_res['layout_dets']):
+        if ele['category_id'] == 10:
+            rm_idxs.append(idx)
+
+    for idx in rm_idxs[::-1]:
+        del layout_res['layout_dets'][idx]
+    return layout_res
+
+
+def get_croped_image(image_pil, bbox):
+    x_min, y_min, x_max, y_max = bbox
+    croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
+    return croped_img
+
+
+def latex_rm_whitespace(s: str):
+    """Remove unnecessary whitespace from LaTeX code.
+    """
+    text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
+    letter = '[a-zA-Z]'
+    noletter = '[\W_^\d]'
+    names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
+    s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
+    news = s
+    while True:
+        s = news
+        news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
+        news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
+        news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
+        if news == s:
+            break
+    return s
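
A minimal usage sketch of the two helpers added above; the sample layout_res dict and LaTeX string are hypothetical, and the import path follows the file list at the top of this diff.

    from magic_pdf.model.pek_sub_modules.post_process import layout_rm_equation, latex_rm_whitespace

    # category_id 10 is treated as a formula detection, as layout_rm_equation implies
    layout_res = {'layout_dets': [
        {'category_id': 1, 'bbox': [10, 10, 200, 40]},    # text block, kept
        {'category_id': 10, 'bbox': [50, 60, 120, 80]},   # formula block, removed
    ]}
    layout_res = layout_rm_equation(layout_res)           # only the category_id 1 block remains
    print(latex_rm_whitespace(r'\mathrm {d} x + y'))      # -> \mathrm{d}x+y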
magic_pdf/model/pek_sub_modules/self_modify.py ADDED
@@ -0,0 +1,260 @@
+import time
+import copy
+import base64
+import cv2
+import numpy as np
+from io import BytesIO
+from PIL import Image
+
+from paddleocr import PaddleOCR
+from paddleocr.ppocr.utils.logging import get_logger
+from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
+from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
+logger = get_logger()
+
+def img_decode(content: bytes):
+    np_arr = np.frombuffer(content, dtype=np.uint8)
+    return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
+
+def check_img(img):
+    if isinstance(img, bytes):
+        img = img_decode(img)
+    if isinstance(img, str):
+        image_file = img
+        img, flag_gif, flag_pdf = check_and_read(image_file)
+        if not flag_gif and not flag_pdf:
+            with open(image_file, 'rb') as f:
+                img_str = f.read()
+                img = img_decode(img_str)
+            if img is None:
+                try:
+                    buf = BytesIO()
+                    image = BytesIO(img_str)
+                    im = Image.open(image)
+                    rgb = im.convert('RGB')
+                    rgb.save(buf, 'jpeg')
+                    buf.seek(0)
+                    image_bytes = buf.read()
+                    data_base64 = str(base64.b64encode(image_bytes),
+                                      encoding="utf-8")
+                    image_decode = base64.b64decode(data_base64)
+                    img_array = np.frombuffer(image_decode, np.uint8)
+                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+                except:
+                    logger.error("error in loading image:{}".format(image_file))
+                    return None
+            if img is None:
+                logger.error("error in loading image:{}".format(image_file))
+                return None
+    if isinstance(img, np.ndarray) and len(img.shape) == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+
+    return img
+
+def sorted_boxes(dt_boxes):
+    """
+    Sort text boxes in order from top to bottom, left to right
+    args:
+        dt_boxes(array):detected text boxes with shape [4, 2]
+    return:
+        sorted boxes(array) with shape [4, 2]
+    """
+    num_boxes = dt_boxes.shape[0]
+    sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
+    _boxes = list(sorted_boxes)
+
+    for i in range(num_boxes - 1):
+        for j in range(i, -1, -1):
+            if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
+                    (_boxes[j + 1][0][0] < _boxes[j][0][0]):
+                tmp = _boxes[j]
+                _boxes[j] = _boxes[j + 1]
+                _boxes[j + 1] = tmp
+            else:
+                break
+    return _boxes
+
+
+def formula_in_text(mf_bbox, text_bbox):
+    x1, y1, x2, y2 = mf_bbox
+    x3, y3 = text_bbox[0]
+    x4, y4 = text_bbox[2]
+    left_box, right_box = None, None
+    same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2
+    if not same_line:
+        return False, left_box, right_box
+    else:
+        drop_origin = False
+        left_x = x1 - 1
+        right_x = x2 + 1
+        if x3 < x1 and x2 < x4:
+            drop_origin = True
+            left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
+            right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
+        if x3 < x1 and x1 <= x4 <= x2:
+            drop_origin = True
+            left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
+        if x1 <= x3 <= x2 and x2 < x4:
+            drop_origin = True
+            right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
+        if x1 <= x3 < x4 <= x2:
+            drop_origin = True
+        return drop_origin, left_box, right_box
+
+
+def update_det_boxes(dt_boxes, mfdetrec_res):
+    new_dt_boxes = dt_boxes
+    for mf_box in mfdetrec_res:
+        flag, left_box, right_box = False, None, None
+        for idx, text_box in enumerate(new_dt_boxes):
+            ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box)
+            if ret:
+                new_dt_boxes.pop(idx)
+                if left_box is not None:
+                    new_dt_boxes.append(left_box)
+                if right_box is not None:
+                    new_dt_boxes.append(right_box)
+                break
+
+    return new_dt_boxes
+
+class ModifiedPaddleOCR(PaddleOCR):
+    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
+        """
+        OCR with PaddleOCR
+        args:
+            img: img for OCR, support ndarray, img_path and list or ndarray
+            det: use text detection or not. If False, only rec will be exec. Default is True
+            rec: use text recognition or not. If False, only det will be exec. Default is True
+            cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+            bin: binarize image to black and white. Default is False.
+            inv: invert image colors. Default is False.
+            alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
+        """
+        assert isinstance(img, (np.ndarray, list, str, bytes))
+        if isinstance(img, list) and det == True:
+            logger.error('When input a list of images, det must be false')
+            exit(0)
+        if cls == True and self.use_angle_cls == False:
+            pass
+            # logger.warning(
+            #     'Since the angle classifier is not initialized, it will not be used during the forward process'
+            # )
+
+        img = check_img(img)
+        # for infer pdf file
+        if isinstance(img, list):
+            if self.page_num > len(img) or self.page_num == 0:
+                self.page_num = len(img)
+            imgs = img[:self.page_num]
+        else:
+            imgs = [img]
+
+        def preprocess_image(_image):
+            _image = alpha_to_color(_image, alpha_color)
+            if inv:
+                _image = cv2.bitwise_not(_image)
+            if bin:
+                _image = binarize_img(_image)
+            return _image
+
+        if det and rec:
+            ocr_res = []
+            for idx, img in enumerate(imgs):
+                img = preprocess_image(img)
+                dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
+                if not dt_boxes and not rec_res:
+                    ocr_res.append(None)
+                    continue
+                tmp_res = [[box.tolist(), res]
+                           for box, res in zip(dt_boxes, rec_res)]
+                ocr_res.append(tmp_res)
+            return ocr_res
+        elif det and not rec:
+            ocr_res = []
+            for idx, img in enumerate(imgs):
+                img = preprocess_image(img)
+                dt_boxes, elapse = self.text_detector(img)
+                if not dt_boxes:
+                    ocr_res.append(None)
+                    continue
+                tmp_res = [box.tolist() for box in dt_boxes]
+                ocr_res.append(tmp_res)
+            return ocr_res
+        else:
+            ocr_res = []
+            cls_res = []
+            for idx, img in enumerate(imgs):
+                if not isinstance(img, list):
+                    img = preprocess_image(img)
+                    img = [img]
+                if self.use_angle_cls and cls:
+                    img, cls_res_tmp, elapse = self.text_classifier(img)
+                    if not rec:
+                        cls_res.append(cls_res_tmp)
+                rec_res, elapse = self.text_recognizer(img)
+                ocr_res.append(rec_res)
+            if not rec:
+                return cls_res
+            return ocr_res
+
+    def __call__(self, img, cls=True, mfd_res=None):
+        time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
+
+        if img is None:
+            logger.debug("no valid image provided")
+            return None, None, time_dict
+
+        start = time.time()
+        ori_im = img.copy()
+        dt_boxes, elapse = self.text_detector(img)
+        time_dict['det'] = elapse
+
+        if dt_boxes is None:
+            logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+            end = time.time()
+            time_dict['all'] = end - start
+            return None, None, time_dict
+        else:
+            logger.debug("dt_boxes num : {}, elapsed : {}".format(
+                len(dt_boxes), elapse))
+        img_crop_list = []
+
+        dt_boxes = sorted_boxes(dt_boxes)
+        if mfd_res:
+            bef = time.time()
+            dt_boxes = update_det_boxes(dt_boxes, mfd_res)
+            aft = time.time()
+            logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
+                len(dt_boxes), aft-bef))
+
+        for bno in range(len(dt_boxes)):
+            tmp_box = copy.deepcopy(dt_boxes[bno])
+            if self.args.det_box_type == "quad":
+                img_crop = get_rotate_crop_image(ori_im, tmp_box)
+            else:
+                img_crop = get_minarea_rect_crop(ori_im, tmp_box)
+            img_crop_list.append(img_crop)
+        if self.use_angle_cls and cls:
+            img_crop_list, angle_list, elapse = self.text_classifier(
+                img_crop_list)
+            time_dict['cls'] = elapse
+            logger.debug("cls num : {}, elapsed : {}".format(
+                len(img_crop_list), elapse))
+
+        rec_res, elapse = self.text_recognizer(img_crop_list)
+        time_dict['rec'] = elapse
+        logger.debug("rec_res num : {}, elapsed : {}".format(
+            len(rec_res), elapse))
+        if self.args.save_crop_res:
+            self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
+                                   rec_res)
+        filter_boxes, filter_rec_res = [], []
+        for box, rec_result in zip(dt_boxes, rec_res):
+            text, score = rec_result
+            if score >= self.drop_score:
+                filter_boxes.append(box)
+                filter_rec_res.append(rec_result)
+        end = time.time()
+        time_dict['all'] = end - start
+        return filter_boxes, filter_rec_res, time_dict
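
A hedged sketch of driving ModifiedPaddleOCR with formula-detection results, as the mfd_res plumbing above suggests; the image path is hypothetical, mfd_res is assumed to be a list of dicts with a pixel-coordinate 'bbox' (the only key update_det_boxes reads), and the constructor kwargs are plain PaddleOCR options.

    import cv2
    from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR

    ocr_engine = ModifiedPaddleOCR(show_log=False)            # standard PaddleOCR init kwargs (assumed defaults)
    img = cv2.imread("page_0.png")                            # hypothetical page image (BGR ndarray)
    mfd_res = [{"bbox": [120, 340, 480, 390]}]                # formula boxes to carve out of text lines
    result = ocr_engine.ocr(img, cls=True, mfd_res=mfd_res)   # one entry per image: [[box, (text, score)], ...]
    for box, (text, score) in (result[0] or []):
        print(text, score)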
magic_pdf/model/pp_structure_v2.py CHANGED
@@ -22,6 +22,13 @@ class CustomPaddleModel:
         self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

     def __call__(self, img):
+        try:
+            import cv2
+        except ImportError:
+            logger.error("opencv-python not installed, please install by pip.")
+            exit(1)
+        # Convert the RGB image to BGR, as expected by Paddle
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
         result = self.model(img)
         spans = []
         for line in result:
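
The added lines guard the cv2 import and flip channel order, since PPStructure works on BGR input while the image handed to __call__ is RGB; a tiny illustrative check of that conversion (sample array is made up):

    import cv2
    import numpy as np

    rgb = np.zeros((32, 32, 3), dtype=np.uint8)
    rgb[..., 0] = 255                                    # pure red in RGB
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    assert np.array_equal(bgr, rgb[:, :, ::-1])          # cvtColor here just reverses the channel axis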
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -47,19 +47,13 @@ class AbsPipe(ABC):
         """
         raise NotImplementedError

-    @abstractmethod
-    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
-        """
-        Stateful assembly of the unified content list
-        """
-        raise NotImplementedError
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
+        return content_list

-    @abstractmethod
-    def pipe_mk_markdown(self, img_parent_path, drop_mode):
-        """
-        Stateful assembly of markdown
-        """
-        raise NotImplementedError
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
+        return md_content

     @staticmethod
     def classify(pdf_bytes: bytes) -> str:
@@ -101,13 +95,13 @@
         return content_list

     @staticmethod
-    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
+    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
         """
        Generate markdown according to the pdf type
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
         return md_content


magic_pdf/pipe/OCRPipe.py CHANGED
@@ -1,4 +1,6 @@
-from magic_pdf.libs.MakeContentConfig import DropMode
+from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf

 class OCRPipe(AbsPipe):

-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug)

     def pipe_classify(self):
@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return md_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("ocr_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"ocr_pipe mk {md_make_mode} finished")
+        return result
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -1,4 +1,6 @@
-from magic_pdf.libs.MakeContentConfig import DropMode
+from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf

 class TXTPipe(AbsPipe):

-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug)

     def pipe_classify(self):
@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return md_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("txt_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"txt_pipe mk {md_make_mode} finished")
+        return result
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -2,7 +2,7 @@ import json

 from loguru import logger

-from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe):
                                              is_debug=self.is_debug)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return markdown_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("uni_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"uni_pipe mk {md_make_mode} finished")
+        return result


 if __name__ == '__main__':
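
Taken together, the pipe changes hoist pipe_mk_uni_format / pipe_mk_markdown into AbsPipe and thread a new md_make_mode argument through every pipe. A hedged end-to-end sketch follows; the PDF path, image directory, DiskReaderWriter argument, and the pipe_parse step name are assumptions not shown in this diff, while OCRPipe's __init__ signature is the one above.

    from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
    from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
    from magic_pdf.pipe.OCRPipe import OCRPipe

    pdf_bytes = open("demo.pdf", "rb").read()                  # hypothetical input PDF
    image_writer = DiskReaderWriter("/tmp/magic_pdf/images")   # assumed: constructor takes an output directory
    pipe = OCRPipe(pdf_bytes, model_list=[], image_writer=image_writer, is_debug=False)
    pipe.pipe_classify()
    pipe.pipe_parse()                                          # assumed name of the parsing step
    md_content = pipe.pipe_mk_markdown("images", drop_mode=DropMode.WHOLE_PDF,
                                       md_make_mode=MakeMode.MM_MD)
    content_list = pipe.pipe_mk_uni_format("images", drop_mode=DropMode.WHOLE_PDF)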
magic_pdf/resources/model_config/UniMERNet/demo.yaml ADDED
@@ -0,0 +1,46 @@
+model:
+  arch: unimernet
+  model_type: unimernet
+  model_config:
+    model_name: ./models
+    max_seq_len: 1024
+    length_aware: False
+    load_pretrained: True
+    pretrained: ./models/pytorch_model.bin
+    tokenizer_config:
+      path: ./models
+
+datasets:
+  formula_rec_eval:
+    vis_processor:
+      eval:
+        name: "formula_image_eval"
+        image_size:
+          - 192
+          - 672
+
+run:
+  runner: runner_iter
+  task: unimernet_train
+
+  batch_size_train: 64
+  batch_size_eval: 64
+  num_workers: 1
+
+  iters_per_inner_epoch: 2000
+  max_iters: 60000
+
+  seed: 42
+  output_dir: "../output/demo"
+
+  evaluate: True
+  test_splits: [ "eval" ]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  distributed_type: ddp  # or fsdp when train llm
+
+  generate_cfg:
+    temperature: 0.0
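
A minimal sketch of consuming this UniMERNet config with PyYAML, using the install path from the file list above; only fields visible in the diff are read, and the nesting follows the reconstruction shown here.

    import yaml

    with open("magic_pdf/resources/model_config/UniMERNet/demo.yaml") as f:
        cfg = yaml.safe_load(f)

    print(cfg["model"]["arch"])          # unimernet
    print(cfg["run"]["device"])          # cuda
    print(cfg["run"]["generate_cfg"])    # {'temperature': 0.0}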