magic-pdf 0.10.4__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/config/constants.py +5 -0
  2. magic_pdf/data/data_reader_writer/base.py +13 -1
  3. magic_pdf/data/dataset.py +175 -4
  4. magic_pdf/data/utils.py +2 -2
  5. magic_pdf/dict2md/ocr_mkcontent.py +2 -2
  6. magic_pdf/filter/__init__.py +32 -0
  7. magic_pdf/filter/pdf_meta_scan.py +3 -2
  8. magic_pdf/libs/draw_bbox.py +11 -10
  9. magic_pdf/libs/pdf_check.py +30 -30
  10. magic_pdf/libs/version.py +1 -1
  11. magic_pdf/model/__init__.py +124 -0
  12. magic_pdf/model/doc_analyze_by_custom_model.py +119 -60
  13. magic_pdf/model/operators.py +190 -0
  14. magic_pdf/model/pdf_extract_kit.py +20 -1
  15. magic_pdf/model/sub_modules/model_init.py +13 -3
  16. magic_pdf/model/sub_modules/model_utils.py +11 -5
  17. magic_pdf/para/para_split_v3.py +2 -2
  18. magic_pdf/pdf_parse_by_ocr.py +4 -5
  19. magic_pdf/pdf_parse_by_txt.py +4 -5
  20. magic_pdf/pdf_parse_union_core_v2.py +10 -11
  21. magic_pdf/pipe/AbsPipe.py +3 -2
  22. magic_pdf/pipe/OCRPipe.py +54 -15
  23. magic_pdf/pipe/TXTPipe.py +5 -4
  24. magic_pdf/pipe/UNIPipe.py +82 -30
  25. magic_pdf/pipe/operators.py +138 -0
  26. magic_pdf/pre_proc/cut_image.py +2 -2
  27. magic_pdf/tools/common.py +108 -59
  28. magic_pdf/user_api.py +47 -24
  29. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/METADATA +7 -4
  30. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/RECORD +34 -32
  31. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.6.dist-info}/top_level.txt +0 -0
@@ -4,8 +4,8 @@ import statistics
4
4
  import time
5
5
  from typing import List
6
6
 
7
- import torch
8
7
  import fitz
8
+ import torch
9
9
  from loguru import logger
10
10
 
11
11
  from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
16
16
  from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
17
17
  from magic_pdf.libs.convert_utils import dict_to_list
18
18
  from magic_pdf.libs.hash_utils import compute_md5
19
-
20
19
  from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
21
20
  from magic_pdf.model.magic_model import MagicModel
22
21
 
23
- os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
24
- os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
25
-
26
22
  try:
27
23
  import torchtext
28
24
 
29
- if torchtext.__version__ >= "0.18.0":
25
+ if torchtext.__version__ >= '0.18.0':
30
26
  torchtext.disable_torchtext_deprecation_warning()
31
27
  except ImportError:
32
28
  pass
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
39
35
  from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
40
36
  from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
41
37
 
38
+ os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
39
+ os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
40
+
42
41
 
43
42
  def __replace_STX_ETX(text_str: str):
44
43
  """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
@@ -233,7 +232,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
233
232
  # 初始化ocr模型
234
233
  atom_model_manager = AtomModelSingleton()
235
234
  ocr_model = atom_model_manager.get_atom_model(
236
- atom_model_name="ocr",
235
+ atom_model_name='ocr',
237
236
  ocr_show_log=False,
238
237
  det_db_box_thresh=0.3,
239
238
  lang=lang
@@ -241,7 +240,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
241
240
 
242
241
  for span in empty_spans:
243
242
  # 对span的bbox截图再ocr
244
- span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
243
+ span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
245
244
  ocr_res = ocr_model.ocr(span_img, det=False)
246
245
  if ocr_res and len(ocr_res) > 0:
247
246
  if len(ocr_res[0]) > 0:
@@ -681,7 +680,7 @@ def parse_page_core(
681
680
  """根据parse_mode,构造spans,主要是文本类的字符填充"""
682
681
  if parse_mode == SupportedPdfParseMethod.TXT:
683
682
 
684
- """使用新版本的混合ocr方案"""
683
+ """使用新版本的混合ocr方案."""
685
684
  spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
686
685
 
687
686
  elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +688,6 @@ def parse_page_core(
689
688
  else:
690
689
  raise Exception('parse_mode must be txt or ocr')
691
690
 
692
-
693
691
  """先处理不需要排版的discarded_blocks"""
694
692
  discarded_block_with_spans, spans = fill_spans_in_blocks(
695
693
  all_discarded_blocks, spans, 0.4
@@ -762,8 +760,8 @@ def parse_page_core(
762
760
 
763
761
 
764
762
  def pdf_parse_union(
765
- dataset: Dataset,
766
763
  model_list,
764
+ dataset: Dataset,
767
765
  imageWriter,
768
766
  parse_mode,
769
767
  start_page_id=0,
@@ -771,6 +769,7 @@ def pdf_parse_union(
771
769
  debug_mode=False,
772
770
  lang=None,
773
771
  ):
772
+
774
773
  pdf_bytes_md5 = compute_md5(dataset.data_bits())
775
774
 
776
775
  """初始化空的pdf_info_dict"""
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
3
3
  from magic_pdf.config.drop_reason import DropReason
4
4
  from magic_pdf.config.make_content_config import DropMode, MakeMode
5
5
  from magic_pdf.data.data_reader_writer import DataWriter
6
+ from magic_pdf.data.dataset import Dataset
6
7
  from magic_pdf.dict2md.ocr_mkcontent import union_make
7
8
  from magic_pdf.filter.pdf_classify_by_type import classify
8
9
  from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
@@ -14,9 +15,9 @@ class AbsPipe(ABC):
14
15
  PIP_OCR = 'ocr'
15
16
  PIP_TXT = 'txt'
16
17
 
17
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
18
+ def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
18
19
  start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
19
- self.pdf_bytes = pdf_bytes
20
+ self.dataset = Dataset
20
21
  self.model_list = model_list
21
22
  self.image_writer = image_writer
22
23
  self.pdf_mid_data = None # 未压缩
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -2,40 +2,79 @@ from loguru import logger
2
2
 
3
3
  from magic_pdf.config.make_content_config import DropMode, MakeMode
4
4
  from magic_pdf.data.data_reader_writer import DataWriter
5
+ from magic_pdf.data.dataset import Dataset
5
6
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
6
7
  from magic_pdf.pipe.AbsPipe import AbsPipe
7
8
  from magic_pdf.user_api import parse_ocr_pdf
8
9
 
9
10
 
10
11
  class OCRPipe(AbsPipe):
11
-
12
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
- start_page_id=0, end_page_id=None, lang=None,
14
- layout_model=None, formula_enable=None, table_enable=None):
15
- super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
16
- layout_model, formula_enable, table_enable)
12
+ def __init__(
13
+ self,
14
+ dataset: Dataset,
15
+ model_list: list,
16
+ image_writer: DataWriter,
17
+ is_debug: bool = False,
18
+ start_page_id=0,
19
+ end_page_id=None,
20
+ lang=None,
21
+ layout_model=None,
22
+ formula_enable=None,
23
+ table_enable=None,
24
+ ):
25
+ super().__init__(
26
+ dataset,
27
+ model_list,
28
+ image_writer,
29
+ is_debug,
30
+ start_page_id,
31
+ end_page_id,
32
+ lang,
33
+ layout_model,
34
+ formula_enable,
35
+ table_enable,
36
+ )
17
37
 
18
38
  def pipe_classify(self):
19
39
  pass
20
40
 
21
41
  def pipe_analyze(self):
22
- self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
23
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
24
- lang=self.lang, layout_model=self.layout_model,
25
- formula_enable=self.formula_enable, table_enable=self.table_enable)
42
+ self.infer_res = doc_analyze(
43
+ self.dataset,
44
+ ocr=True,
45
+ start_page_id=self.start_page_id,
46
+ end_page_id=self.end_page_id,
47
+ lang=self.lang,
48
+ layout_model=self.layout_model,
49
+ formula_enable=self.formula_enable,
50
+ table_enable=self.table_enable,
51
+ )
26
52
 
27
53
  def pipe_parse(self):
28
- self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
29
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
30
- lang=self.lang, layout_model=self.layout_model,
31
- formula_enable=self.formula_enable, table_enable=self.table_enable)
54
+ self.pdf_mid_data = parse_ocr_pdf(
55
+ self.dataset,
56
+ self.infer_res,
57
+ self.image_writer,
58
+ is_debug=self.is_debug,
59
+ start_page_id=self.start_page_id,
60
+ end_page_id=self.end_page_id,
61
+ lang=self.lang,
62
+ layout_model=self.layout_model,
63
+ formula_enable=self.formula_enable,
64
+ table_enable=self.table_enable,
65
+ )
32
66
 
33
67
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
34
68
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
35
69
  logger.info('ocr_pipe mk content list finished')
36
70
  return result
37
71
 
38
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
72
+ def pipe_mk_markdown(
73
+ self,
74
+ img_parent_path: str,
75
+ drop_mode=DropMode.WHOLE_PDF,
76
+ md_make_mode=MakeMode.MM_MD,
77
+ ):
39
78
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
40
79
  logger.info(f'ocr_pipe mk {md_make_mode} finished')
41
80
  return result
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -2,6 +2,7 @@ from loguru import logger
2
2
 
3
3
  from magic_pdf.config.make_content_config import DropMode, MakeMode
4
4
  from magic_pdf.data.data_reader_writer import DataWriter
5
+ from magic_pdf.data.dataset import Dataset
5
6
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
6
7
  from magic_pdf.pipe.AbsPipe import AbsPipe
7
8
  from magic_pdf.user_api import parse_txt_pdf
@@ -9,23 +10,23 @@ from magic_pdf.user_api import parse_txt_pdf
9
10
 
10
11
  class TXTPipe(AbsPipe):
11
12
 
12
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
+ def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
14
  start_page_id=0, end_page_id=None, lang=None,
14
15
  layout_model=None, formula_enable=None, table_enable=None):
15
- super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
16
+ super().__init__(dataset, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
16
17
  layout_model, formula_enable, table_enable)
17
18
 
18
19
  def pipe_classify(self):
19
20
  pass
20
21
 
21
22
  def pipe_analyze(self):
22
- self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
23
+ self.model_list = doc_analyze(self.dataset, ocr=False,
23
24
  start_page_id=self.start_page_id, end_page_id=self.end_page_id,
24
25
  lang=self.lang, layout_model=self.layout_model,
25
26
  formula_enable=self.formula_enable, table_enable=self.table_enable)
26
27
 
27
28
  def pipe_parse(self):
28
- self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
29
+ self.pdf_mid_data = parse_txt_pdf(self.dataset, self.model_list, self.image_writer, is_debug=self.is_debug,
29
30
  start_page_id=self.start_page_id, end_page_id=self.end_page_id,
30
31
  lang=self.lang, layout_model=self.layout_model,
31
32
  formula_enable=self.formula_enable, table_enable=self.table_enable)
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -4,6 +4,7 @@ from loguru import logger
4
4
 
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.data.data_reader_writer import DataWriter
7
+ from magic_pdf.data.dataset import Dataset
7
8
  from magic_pdf.libs.commons import join_path
8
9
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
9
10
  from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -12,12 +13,32 @@ from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
12
13
 
13
14
  class UNIPipe(AbsPipe):
14
15
 
15
- def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
16
- start_page_id=0, end_page_id=None, lang=None,
17
- layout_model=None, formula_enable=None, table_enable=None):
16
+ def __init__(
17
+ self,
18
+ dataset: Dataset,
19
+ jso_useful_key: dict,
20
+ image_writer: DataWriter,
21
+ is_debug: bool = False,
22
+ start_page_id=0,
23
+ end_page_id=None,
24
+ lang=None,
25
+ layout_model=None,
26
+ formula_enable=None,
27
+ table_enable=None,
28
+ ):
18
29
  self.pdf_type = jso_useful_key['_pdf_type']
19
- super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
20
- lang, layout_model, formula_enable, table_enable)
30
+ super().__init__(
31
+ dataset,
32
+ jso_useful_key['model_list'],
33
+ image_writer,
34
+ is_debug,
35
+ start_page_id,
36
+ end_page_id,
37
+ lang,
38
+ layout_model,
39
+ formula_enable,
40
+ table_enable,
41
+ )
21
42
  if len(self.model_list) == 0:
22
43
  self.input_model_is_empty = True
23
44
  else:
@@ -28,35 +49,66 @@ class UNIPipe(AbsPipe):
28
49
 
29
50
  def pipe_analyze(self):
30
51
  if self.pdf_type == self.PIP_TXT:
31
- self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
32
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
33
- lang=self.lang, layout_model=self.layout_model,
34
- formula_enable=self.formula_enable, table_enable=self.table_enable)
52
+ self.model_list = doc_analyze(
53
+ self.dataset,
54
+ ocr=False,
55
+ start_page_id=self.start_page_id,
56
+ end_page_id=self.end_page_id,
57
+ lang=self.lang,
58
+ layout_model=self.layout_model,
59
+ formula_enable=self.formula_enable,
60
+ table_enable=self.table_enable,
61
+ )
35
62
  elif self.pdf_type == self.PIP_OCR:
36
- self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
37
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
38
- lang=self.lang, layout_model=self.layout_model,
39
- formula_enable=self.formula_enable, table_enable=self.table_enable)
63
+ self.model_list = doc_analyze(
64
+ self.dataset,
65
+ ocr=True,
66
+ start_page_id=self.start_page_id,
67
+ end_page_id=self.end_page_id,
68
+ lang=self.lang,
69
+ layout_model=self.layout_model,
70
+ formula_enable=self.formula_enable,
71
+ table_enable=self.table_enable,
72
+ )
40
73
 
41
74
  def pipe_parse(self):
42
75
  if self.pdf_type == self.PIP_TXT:
43
- self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
44
- is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
45
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
46
- lang=self.lang, layout_model=self.layout_model,
47
- formula_enable=self.formula_enable, table_enable=self.table_enable)
76
+ self.pdf_mid_data = parse_union_pdf(
77
+ self.dataset,
78
+ self.model_list,
79
+ self.image_writer,
80
+ is_debug=self.is_debug,
81
+ start_page_id=self.start_page_id,
82
+ end_page_id=self.end_page_id,
83
+ lang=self.lang,
84
+ layout_model=self.layout_model,
85
+ formula_enable=self.formula_enable,
86
+ table_enable=self.table_enable,
87
+ )
48
88
  elif self.pdf_type == self.PIP_OCR:
49
- self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
50
- is_debug=self.is_debug,
51
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
52
- lang=self.lang)
53
-
54
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
89
+ self.pdf_mid_data = parse_ocr_pdf(
90
+ self.dataset,
91
+ self.model_list,
92
+ self.image_writer,
93
+ is_debug=self.is_debug,
94
+ start_page_id=self.start_page_id,
95
+ end_page_id=self.end_page_id,
96
+ lang=self.lang,
97
+ )
98
+
99
+ def pipe_mk_uni_format(
100
+ self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON
101
+ ):
55
102
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
56
103
  logger.info('uni_pipe mk content list finished')
57
104
  return result
58
105
 
59
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
106
+ def pipe_mk_markdown(
107
+ self,
108
+ img_parent_path: str,
109
+ drop_mode=DropMode.WHOLE_PDF,
110
+ md_make_mode=MakeMode.MM_MD,
111
+ ):
60
112
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
61
113
  logger.info(f'uni_pipe mk {md_make_mode} finished')
62
114
  return result
@@ -65,6 +117,7 @@ class UNIPipe(AbsPipe):
65
117
  if __name__ == '__main__':
66
118
  # 测试
67
119
  from magic_pdf.data.data_reader_writer import DataReader
120
+
68
121
  drw = DataReader(r'D:/project/20231108code-clean')
69
122
 
70
123
  pdf_file_path = r'linshixuqiu\19983-00.pdf'
@@ -82,10 +135,7 @@ if __name__ == '__main__':
82
135
  # "model_list": model_list
83
136
  # }
84
137
 
85
- jso_useful_key = {
86
- '_pdf_type': '',
87
- 'model_list': model_list
88
- }
138
+ jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
89
139
  pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
90
140
  pipe.pipe_classify()
91
141
  pipe.pipe_parse()
@@ -94,5 +144,7 @@ if __name__ == '__main__':
94
144
 
95
145
  md_writer = DataWriter(write_path)
96
146
  md_writer.write_string('19983-00.md', md_content)
97
- md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
147
+ md_writer.write_string(
148
+ '19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
149
+ )
98
150
  md_writer.write_string('19983-00.txt', str(content_list))
@@ -0,0 +1,138 @@
1
+ import json
2
+ import os
3
+ from typing import Callable
4
+ import copy
5
+
6
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
7
+ from magic_pdf.data.data_reader_writer import DataWriter
8
+ from magic_pdf.data.dataset import Dataset
9
+ from magic_pdf.dict2md.ocr_mkcontent import union_make
10
+ from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
11
+ draw_span_bbox)
12
+ from magic_pdf.libs.json_compressor import JsonCompressor
13
+
14
+
15
class PipeResult:
    """Wrap a pipeline result together with the dataset it was produced from.

    Provides helpers to serialise the result (markdown, content list, middle
    json), to render debug overlays onto the source document, and to hand a
    copy of the result to arbitrary callables.
    """

    def __init__(self, pipe_res, dataset: Dataset):
        """Store the pipeline result and its dataset.

        Args:
            pipe_res: the pipeline-processed result of the model inference
                output. It is indexed with ``['pdf_info']`` by the methods
                below, so it must be a mapping that contains that key.
            dataset (Dataset): the dataset associated with pipe_res
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    @staticmethod
    def _prepare_output_location(file_path: str):
        """Split file_path into (directory, file name), creating the directory if missing.

        NOTE(review): when file_path has no directory component, dir_name is
        the empty string and os.makedirs('') raises FileNotFoundError; callers
        are presumably expected to pass a path that includes a directory.
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        return dir_name, base_name

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.WHOLE_PDF,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Render the result as markdown and write it through writer.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of the markdown
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
            drop_mode (str, optional): Drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
            md_make_mode (str, optional): The content type of the markdown to be made. Defaults to MakeMode.MM_MD.
        """
        pdf_info_list = self._pipe_res['pdf_info']
        md_content = union_make(
            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
        )
        writer.write_string(file_path, md_content)

    def dump_content_list(
        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
    ):
        """Build the content list and write it as indented JSON through writer.

        The list is always made with MakeMode.STANDARD_FORMAT and DropMode.NONE.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of the content list
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
        """
        pdf_info_list = self._pipe_res['pdf_info']
        content_list = union_make(
            pdf_info_list,
            MakeMode.STANDARD_FORMAT,
            DropMode.NONE,
            image_dir_or_bucket_prefix,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def dump_middle_json(self, writer: DataWriter, file_path: str):
        """Write the whole pipeline result as indented JSON through writer.

        Args:
            writer (DataWriter): File writer handle
            file_path (str): The file location of the middle json
        """
        writer.write_string(
            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
        )

    def draw_layout(self, file_path: str) -> None:
        """Draw the layout boxes.

        Args:
            file_path (str): The file location of the layout result file
        """
        dir_name, base_name = self._prepare_output_location(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_span(self, file_path: str):
        """Draw the span boxes.

        Args:
            file_path (str): The file location of the span result file
        """
        dir_name, base_name = self._prepare_output_location(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_line_sort(self, file_path: str):
        """Draw the line sort boxes.

        Args:
            file_path (str): The file location of the line sort result file
        """
        dir_name, base_name = self._prepare_output_location(file_path)
        pdf_info = self._pipe_res['pdf_info']
        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def get_compress_pdf_mid_data(self):
        """Compress the pipeline result.

        Returns:
            the value returned by JsonCompressor.compress_json for the
            pipeline result (documented by the original author as str)
        """
        # The result is stored on ``_pipe_res``; this class never defines a
        # ``pdf_mid_data`` attribute, so reading it raised AttributeError.
        return JsonCompressor.compress_json(self._pipe_res)

    def apply(self, proc: Callable, *args, **kwargs):
        """Invoke proc on a deep copy of the pipeline result.

        The copy keeps proc from mutating the result held by this object.

        Args:
            proc (Callable): invoked as follows:
                proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
12
12
  for span in spans:
13
13
  span_type = span['type']
14
14
  if span_type == ContentType.Image:
15
- if not check_img_bbox(span['bbox']):
15
+ if not check_img_bbox(span['bbox']) or not imageWriter:
16
16
  continue
17
17
  span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
18
18
  imageWriter=imageWriter)
19
19
  elif span_type == ContentType.Table:
20
- if not check_img_bbox(span['bbox']):
20
+ if not check_img_bbox(span['bbox']) or not imageWriter:
21
21
  continue
22
22
  span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
23
23
  imageWriter=imageWriter)