magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
1
1
  import copy
2
+ import platform
2
3
  import time
3
4
  import cv2
4
5
  import numpy as np
6
+ import torch
5
7
 
6
8
  from paddleocr import PaddleOCR
7
9
  from ppocr.utils.logging import get_logger
@@ -9,12 +11,25 @@ from ppocr.utils.utility import alpha_to_color, binarize_img
9
11
  from tools.infer.predict_system import sorted_boxes
10
12
  from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
11
13
 
12
- from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
14
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img, \
15
+ ONNXModelSingleton
13
16
 
14
17
  logger = get_logger()
15
18
 
16
19
 
17
20
  class ModifiedPaddleOCR(PaddleOCR):
21
def __init__(self, *args, **kwargs):
    """Initialise the PaddleOCR base class and select the inference backend.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor.  The ``lang`` keyword (``'ch'`` when absent) is
    also stored on the instance because the detection/recognition code
    paths consult it together with ``use_onnx``.

    Args:
        *args: forwarded to ``PaddleOCR.__init__``.
        **kwargs: forwarded to ``PaddleOCR.__init__`` and, when the ONNX
            fallback is selected, to ``ONNXModelSingleton.get_onnx_model``.
    """
    super().__init__(*args, **kwargs)
    self.lang = kwargs.get('lang', 'ch')

    # Define the attribute unconditionally so that reading it never raises
    # AttributeError on hosts where the ONNX fallback is not selected.
    self.additional_ocr = None

    # Use the ONNX models when CUDA is unavailable and the CPU architecture
    # is ARM.
    self.use_onnx = (
        not torch.cuda.is_available()
        and platform.machine() in ('arm64', 'aarch64')
    )
    if self.use_onnx:
        onnx_model_manager = ONNXModelSingleton()
        self.additional_ocr = onnx_model_manager.get_onnx_model(**kwargs)
32
+
18
33
  def ocr(self,
19
34
  img,
20
35
  det=True,
@@ -79,7 +94,10 @@ class ModifiedPaddleOCR(PaddleOCR):
79
94
  ocr_res = []
80
95
  for img in imgs:
81
96
  img = preprocess_image(img)
82
- dt_boxes, elapse = self.text_detector(img)
97
+ if self.lang in ['ch'] and self.use_onnx:
98
+ dt_boxes, elapse = self.additional_ocr.text_detector(img)
99
+ else:
100
+ dt_boxes, elapse = self.text_detector(img)
83
101
  if dt_boxes is None:
84
102
  ocr_res.append(None)
85
103
  continue
@@ -106,7 +124,10 @@ class ModifiedPaddleOCR(PaddleOCR):
106
124
  img, cls_res_tmp, elapse = self.text_classifier(img)
107
125
  if not rec:
108
126
  cls_res.append(cls_res_tmp)
109
- rec_res, elapse = self.text_recognizer(img)
127
+ if self.lang in ['ch'] and self.use_onnx:
128
+ rec_res, elapse = self.additional_ocr.text_recognizer(img)
129
+ else:
130
+ rec_res, elapse = self.text_recognizer(img)
110
131
  ocr_res.append(rec_res)
111
132
  if not rec:
112
133
  return cls_res
@@ -121,7 +142,10 @@ class ModifiedPaddleOCR(PaddleOCR):
121
142
 
122
143
  start = time.time()
123
144
  ori_im = img.copy()
124
- dt_boxes, elapse = self.text_detector(img)
145
+ if self.lang in ['ch'] and self.use_onnx:
146
+ dt_boxes, elapse = self.additional_ocr.text_detector(img)
147
+ else:
148
+ dt_boxes, elapse = self.text_detector(img)
125
149
  time_dict['det'] = elapse
126
150
 
127
151
  if dt_boxes is None:
@@ -159,8 +183,10 @@ class ModifiedPaddleOCR(PaddleOCR):
159
183
  time_dict['cls'] = elapse
160
184
  logger.debug("cls num : {}, elapsed : {}".format(
161
185
  len(img_crop_list), elapse))
162
-
163
- rec_res, elapse = self.text_recognizer(img_crop_list)
186
+ if self.lang in ['ch'] and self.use_onnx:
187
+ rec_res, elapse = self.additional_ocr.text_recognizer(img_crop_list)
188
+ else:
189
+ rec_res, elapse = self.text_recognizer(img_crop_list)
164
190
  time_dict['rec'] = elapse
165
191
  logger.debug("rec_res num : {}, elapsed : {}".format(
166
192
  len(rec_res), elapse))
@@ -1,16 +1,51 @@
1
+ import cv2
1
2
  import numpy as np
3
+ import torch
4
+ from loguru import logger
2
5
  from rapid_table import RapidTable
3
- from rapidocr_paddle import RapidOCR
4
6
 
5
7
 
6
8
class RapidTableModel(object):
    """Table recogniser that feeds OCR output into a ``RapidTable`` model."""

    def __init__(self, ocr_engine):
        """Create the table model and the OCR engine used by ``predict``.

        Args:
            ocr_engine: accepted so callers can keep passing an engine, but
                currently ignored: a RapidOCR engine is always created here
                and ``ocr_model_name`` is always ``"RapidOCR"``.
        """
        self.table_model = RapidTable()

        self.ocr_model_name = "RapidOCR"
        if torch.cuda.is_available():
            # CUDA host: use the Paddle build of RapidOCR on the GPU.
            from rapidocr_paddle import RapidOCR
            self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
        else:
            # No CUDA: use the ONNX Runtime build with its defaults.
            from rapidocr_onnxruntime import RapidOCR
            self.ocr_engine = RapidOCR()

    def predict(self, image):
        """Run OCR on ``image`` and pass the result to the table model.

        Args:
            image: anything ``np.asarray`` accepts.

        Returns:
            tuple: ``(html_code, table_cell_bboxes, elapse)`` as returned by
            the table model, or ``(None, None, None)`` when OCR produced no
            result or ``ocr_model_name`` is not a supported value.
        """
        # Convert once; the same array is used for OCR and for the table model.
        image_array = np.asarray(image)

        if self.ocr_model_name == "RapidOCR":
            ocr_result, _ = self.ocr_engine(image_array)
        elif self.ocr_model_name == "PaddleOCR":
            # The image is converted from RGB to BGR before being handed to
            # the PaddleOCR-style engine.
            bgr_image = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
            ocr_result = self.ocr_engine.ocr(bgr_image)[0]
            if ocr_result:
                # Flatten [box, (text, score)] into [box, text, score],
                # dropping entries that do not have that shape.
                ocr_result = [
                    [item[0], item[1][0], item[1][1]]
                    for item in ocr_result
                    if len(item) == 2 and isinstance(item[1], tuple)
                ]
            else:
                ocr_result = None
        else:
            logger.error("OCR model not supported")
            ocr_result = None

        if not ocr_result:
            return None, None, None

        html_code, table_cell_bboxes, elapse = self.table_model(image_array, ocr_result)
        return html_code, table_cell_bboxes, elapse
@@ -0,0 +1,94 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Callable
3
+
4
+ from magic_pdf.data.data_reader_writer import DataWriter
5
+ from magic_pdf.data.dataset import Dataset
6
+ from magic_pdf.operators.pipes import PipeResult
7
+
8
+
9
class InferenceResultBase(ABC):
    """Abstract interface for objects wrapping a model inference result."""

    @abstractmethod
    def __init__(self, inference_results: list, dataset: Dataset):
        """Initialize the instance.

        Args:
            inference_results (list): the inference result generated by model
            dataset (Dataset): the dataset related with model inference result
        """
        pass

    @abstractmethod
    def draw_model(self, file_path: str) -> None:
        """Draw model inference result.

        Args:
            file_path (str): the output file path
        """
        pass

    @abstractmethod
    def dump_model(self, writer: DataWriter, file_path: str):
        """Dump model inference result to file.

        Args:
            writer (DataWriter): writer handle
            file_path (str): the location of target file
        """
        pass

    @abstractmethod
    def get_infer_res(self):
        """Get the inference result.

        Returns:
            list: the inference result generated by model
        """
        pass

    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the inference result.

        Args:
            proc (Callable): invoke proc as follows:
                proc(inference_result, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        pass

    # Declared abstract like ``pipe_ocr_mode``: the body here returns None,
    # which does not satisfy the annotated ``PipeResult`` return type, so a
    # subclass has to provide the implementation.
    @abstractmethod
    def pipe_txt_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, extracting the text with
        a third-party library such as `pymupdf`.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the first page to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the last page to process
            debug_mode (bool, optional): Defaults to False. Will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        pass

    @abstractmethod
    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, extracting the text
        using OCR.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the first page to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the last page to process
            debug_mode (bool, optional): Defaults to False. Will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        pass
@@ -0,0 +1,154 @@
1
+ import copy
2
+ import json
3
+ import os
4
+ from typing import Callable
5
+
6
+ from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
7
+ from magic_pdf.config.enums import SupportedPdfParseMethod
8
+ from magic_pdf.data.data_reader_writer import DataWriter
9
+ from magic_pdf.data.dataset import Dataset
10
+ from magic_pdf.libs.draw_bbox import draw_model_bbox
11
+ from magic_pdf.libs.version import __version__
12
+ from magic_pdf.operators.pipes import PipeResult
13
+ from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
14
+ from magic_pdf.operators import InferenceResultBase
15
+
16
class InferenceResult(InferenceResultBase):
    """Concrete wrapper around a model inference result and its dataset."""

    def __init__(self, inference_results: list, dataset: Dataset):
        """Initialize the instance.

        Args:
            inference_results (list): the inference result generated by model
            dataset (Dataset): the dataset related with model inference result
        """
        self._infer_res = inference_results
        self._dataset = dataset

    def draw_model(self, file_path: str) -> None:
        """Draw model inference result.

        Args:
            file_path (str): the output file path
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        # ``dirname`` is '' for a bare file name and ``os.makedirs('')``
        # raises FileNotFoundError, so only create a directory when the path
        # actually names one.
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        # The drawing helper receives a deep copy so the stored result cannot
        # be modified by it.
        draw_model_bbox(
            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
        )

    def dump_model(self, writer: DataWriter, file_path: str):
        """Dump model inference result to file as indented JSON.

        Args:
            writer (DataWriter): writer handle
            file_path (str): the location of target file
        """
        writer.write_string(
            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
        )

    def get_infer_res(self):
        """Get the inference result.

        Returns:
            list: the inference result generated by model
        """
        return self._infer_res

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to a deep copy of the inference result.

        Args:
            proc (Callable): invoke proc as follows:
                proc(inference_result, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

    def _run_pipe(
        self,
        parse_method,
        parse_type,
        imageWriter: DataWriter,
        start_page_id,
        end_page_id,
        debug_mode,
        lang,
    ) -> PipeResult:
        """Run ``pdf_parse_union`` via ``apply`` and wrap the tagged result.

        Shared implementation of ``pipe_txt_mode`` and ``pipe_ocr_mode``.

        Args:
            parse_method: the ``SupportedPdfParseMethod`` member passed to
                ``pdf_parse_union``
            parse_type: the value stored under ``'_parse_type'`` in the result
            imageWriter (DataWriter): the image writer handle
            start_page_id: forwarded to ``pdf_parse_union``
            end_page_id: forwarded to ``pdf_parse_union``
            debug_mode: forwarded to ``pdf_parse_union``
            lang: forwarded to ``pdf_parse_union``; also recorded in the
                result when it is not None

        Returns:
            PipeResult: the result
        """

        def proc(*args, **kwargs) -> PipeResult:
            res = pdf_parse_union(*args, **kwargs)
            res['_parse_type'] = parse_type
            res['_version_name'] = __version__
            if kwargs.get('lang') is not None:
                res['lang'] = kwargs['lang']
            return PipeResult(res, self._dataset)

        return self.apply(
            proc,
            self._dataset,
            imageWriter,
            parse_method,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            debug_mode=debug_mode,
            lang=lang,
        )

    def pipe_txt_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, extracting the text with
        a third-party library such as `pymupdf`.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the first page to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the last page to process
            debug_mode (bool, optional): Defaults to False. Will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        return self._run_pipe(
            SupportedPdfParseMethod.TXT,
            PARSE_TYPE_TXT,
            imageWriter,
            start_page_id,
            end_page_id,
            debug_mode,
            lang,
        )

    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, extracting the text
        using OCR.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the first page to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the last page to process
            debug_mode (bool, optional): Defaults to False. Will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        return self._run_pipe(
            SupportedPdfParseMethod.OCR,
            PARSE_TYPE_OCR,
            imageWriter,
            start_page_id,
            end_page_id,
            debug_mode,
            lang,
        )
@@ -0,0 +1,191 @@
1
+ import copy
2
+ import json
3
+ import os
4
+ from typing import Callable
5
+
6
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
7
+ from magic_pdf.data.data_reader_writer import DataWriter
8
+ from magic_pdf.data.dataset import Dataset
9
+ from magic_pdf.dict2md.ocr_mkcontent import union_make
10
+ from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
11
+ draw_span_bbox)
12
+ from magic_pdf.libs.json_compressor import JsonCompressor
13
+
14
+
15
class PipeResult:
    """Wrapper around the pipeline result and the dataset it was built from."""

    def __init__(self, pipe_res, dataset: Dataset):
        """Initialize the instance.

        Args:
            pipe_res (dict): the pipeline processed result of model inference
                result; the methods of this class read its ``'pdf_info'`` entry
            dataset (Dataset): the dataset associated with pipe_res
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    @staticmethod
    def _prepare_output_dir(file_path: str):
        """Split ``file_path`` and make sure its directory exists.

        Args:
            file_path (str): the output file location

        Returns:
            tuple: ``(dir_name, base_name)`` of ``file_path``
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        # ``dirname`` is '' for a bare file name and ``os.makedirs('')``
        # raises FileNotFoundError, so only create a directory when the path
        # actually names one.
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        return dir_name, base_name

    def get_markdown(
        self,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ) -> str:
        """Get markdown content.

        Args:
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
            drop_mode (str, optional): Drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.NONE.
            md_make_mode (str, optional): The content type of the Markdown to be made. Defaults to MakeMode.MM_MD.

        Returns:
            str: return markdown content
        """
        pdf_info_list = self._pipe_res['pdf_info']
        md_content = union_make(
            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
        )
        return md_content

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Dump the Markdown.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of markdown
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
            drop_mode (str, optional): Drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.NONE.
            md_make_mode (str, optional): The content type of the Markdown to be made. Defaults to MakeMode.MM_MD.
        """
        md_content = self.get_markdown(
            img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
        )
        writer.write_string(file_path, md_content)

    def get_content_list(
        self,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ) -> list:
        """Get Content List.

        Args:
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
            drop_mode (str, optional): Drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.NONE.

        Returns:
            The value produced by ``union_make`` in ``MakeMode.STANDARD_FORMAT``
            mode. It is not a serialized string: ``dump_content_list``
            JSON-encodes it before writing.
            NOTE(review): presumably a list of content items; confirm against
            ``union_make``.
        """
        pdf_info_list = self._pipe_res['pdf_info']
        content_list = union_make(
            pdf_info_list,
            MakeMode.STANDARD_FORMAT,
            drop_mode,
            image_dir_or_bucket_prefix,
        )
        return content_list

    def dump_content_list(
        self,
        writer: DataWriter,
        file_path: str,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ):
        """Dump Content List as indented JSON.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of content list
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
            drop_mode (str, optional): Drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.NONE.
        """
        content_list = self.get_content_list(
            image_dir_or_bucket_prefix, drop_mode=drop_mode,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def get_middle_json(self) -> str:
        """Get middle json.

        Returns:
            str: The pipeline result serialized as indented JSON
        """
        return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)

    def dump_middle_json(self, writer: DataWriter, file_path: str):
        """Dump the result of pipeline.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of middle json
        """
        middle_json = self.get_middle_json()
        writer.write_string(file_path, middle_json)

    def draw_layout(self, file_path: str) -> None:
        """Draw the layout.

        Args:
            file_path (str): The file location of layout result file
        """
        dir_name, base_name = self._prepare_output_dir(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_span(self, file_path: str):
        """Draw the span.

        Args:
            file_path (str): The file location of span result file
        """
        dir_name, base_name = self._prepare_output_dir(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_line_sort(self, file_path: str):
        """Draw line sort.

        Args:
            file_path (str): The file location of line sort result file
        """
        dir_name, base_name = self._prepare_output_dir(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def get_compress_pdf_mid_data(self):
        """Compress the pipeline result.

        Returns:
            The value returned by ``JsonCompressor.compress_json`` for the
            pipeline result
        """
        return JsonCompressor.compress_json(self._pipe_res)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to a deep copy of the pipeline result.

        Args:
            proc (Callable): invoke proc as follows:
                proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)