magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +10 -3
  2. magic_pdf/libs/Constants.py +4 -1
  3. magic_pdf/libs/config_reader.py +10 -10
  4. magic_pdf/libs/draw_bbox.py +66 -1
  5. magic_pdf/libs/ocr_content_type.py +14 -0
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +10 -4
  8. magic_pdf/model/magic_model.py +4 -0
  9. magic_pdf/model/pdf_extract_kit.py +83 -39
  10. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
  11. magic_pdf/resources/model_config/model_configs.yaml +4 -0
  12. magic_pdf/rw/AbsReaderWriter.py +1 -18
  13. magic_pdf/rw/DiskReaderWriter.py +32 -24
  14. magic_pdf/rw/S3ReaderWriter.py +83 -48
  15. magic_pdf/tools/cli.py +79 -0
  16. magic_pdf/tools/cli_dev.py +155 -0
  17. magic_pdf/tools/common.py +122 -0
  18. magic_pdf-0.7.0b1.dist-info/METADATA +421 -0
  19. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/RECORD +25 -27
  20. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/WHEEL +1 -1
  21. magic_pdf-0.7.0b1.dist-info/entry_points.txt +3 -0
  22. magic_pdf/cli/magicpdf.py +0 -359
  23. magic_pdf/pdf_parse_for_train.py +0 -685
  24. magic_pdf/train_utils/convert_to_train_format.py +0 -65
  25. magic_pdf/train_utils/extract_caption.py +0 -59
  26. magic_pdf/train_utils/remove_footer_header.py +0 -159
  27. magic_pdf/train_utils/vis_utils.py +0 -327
  28. magic_pdf-0.6.2b1.dist-info/METADATA +0 -344
  29. magic_pdf-0.6.2b1.dist-info/entry_points.txt +0 -2
  30. /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
  31. /magic_pdf/{train_utils → tools}/__init__.py +0 -0
  32. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/LICENSE.md +0 -0
  33. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/top_level.txt +0 -0
@@ -2,16 +2,18 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
2
2
  from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
3
3
  import boto3
4
4
  from loguru import logger
5
- from boto3.s3.transfer import TransferConfig
6
5
  from botocore.config import Config
7
- import os
8
-
9
- MODE_TXT = "text"
10
- MODE_BIN = "binary"
11
6
 
12
7
 
13
8
  class S3ReaderWriter(AbsReaderWriter):
14
- def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
9
+ def __init__(
10
+ self,
11
+ ak: str,
12
+ sk: str,
13
+ endpoint_url: str,
14
+ addressing_style: str = "auto",
15
+ parent_path: str = "",
16
+ ):
15
17
  self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
16
18
  self.path = parent_path
17
19
 
@@ -21,12 +23,14 @@ class S3ReaderWriter(AbsReaderWriter):
21
23
  aws_access_key_id=ak,
22
24
  aws_secret_access_key=sk,
23
25
  endpoint_url=endpoint_url,
24
- config=Config(s3={"addressing_style": addressing_style},
25
- retries={'max_attempts': 5, 'mode': 'standard'}),
26
+ config=Config(
27
+ s3={"addressing_style": addressing_style},
28
+ retries={"max_attempts": 5, "mode": "standard"},
29
+ ),
26
30
  )
27
31
  return s3_client
28
32
 
29
- def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
33
+ def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
30
34
  if s3_relative_path.startswith("s3://"):
31
35
  s3_path = s3_relative_path
32
36
  else:
@@ -34,22 +38,22 @@ class S3ReaderWriter(AbsReaderWriter):
34
38
  bucket_name, key = parse_bucket_key(s3_path)
35
39
  res = self.client.get_object(Bucket=bucket_name, Key=key)
36
40
  body = res["Body"].read()
37
- if mode == MODE_TXT:
41
+ if mode == AbsReaderWriter.MODE_TXT:
38
42
  data = body.decode(encoding) # Decode bytes to text
39
- elif mode == MODE_BIN:
43
+ elif mode == AbsReaderWriter.MODE_BIN:
40
44
  data = body
41
45
  else:
42
46
  raise ValueError("Invalid mode. Use 'text' or 'binary'.")
43
47
  return data
44
48
 
45
- def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
49
+ def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
46
50
  if s3_relative_path.startswith("s3://"):
47
51
  s3_path = s3_relative_path
48
52
  else:
49
53
  s3_path = join_path(self.path, s3_relative_path)
50
- if mode == MODE_TXT:
54
+ if mode == AbsReaderWriter.MODE_TXT:
51
55
  body = content.encode(encoding) # Encode text data as bytes
52
- elif mode == MODE_BIN:
56
+ elif mode == AbsReaderWriter.MODE_BIN:
53
57
  body = content
54
58
  else:
55
59
  raise ValueError("Invalid mode. Use 'text' or 'binary'.")
@@ -57,51 +61,82 @@ class S3ReaderWriter(AbsReaderWriter):
57
61
  self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
58
62
  logger.info(f"内容已写入 {s3_path} ")
59
63
 
60
- def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
64
+ def read_offset(self, path: str, offset=0, limit=None) -> bytes:
61
65
  if path.startswith("s3://"):
62
66
  s3_path = path
63
67
  else:
64
68
  s3_path = join_path(self.path, path)
65
69
  bucket_name, key = parse_bucket_key(s3_path)
66
70
 
67
- range_header = f'bytes={byte_start}-{byte_end}' if byte_end else f'bytes={byte_start}-'
71
+ range_header = (
72
+ f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
73
+ )
68
74
  res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
69
- body = res["Body"].read()
70
- if mode == MODE_TXT:
71
- data = body.decode(encoding) # Decode bytes to text
72
- elif mode == MODE_BIN:
73
- data = body
74
- else:
75
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
76
- return data
75
+ return res["Body"].read()
77
76
 
78
77
 
79
78
  if __name__ == "__main__":
80
- # Config the connection info
81
- ak = ""
82
- sk = ""
83
- endpoint_url = ""
84
- addressing_style = "auto"
85
- bucket_name = ""
86
- # Create an S3ReaderWriter object
87
- s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")
79
+ if 0:
80
+ # Config the connection info
81
+ ak = ""
82
+ sk = ""
83
+ endpoint_url = ""
84
+ addressing_style = "auto"
85
+ bucket_name = ""
86
+ # Create an S3ReaderWriter object
87
+ s3_reader_writer = S3ReaderWriter(
88
+ ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
89
+ )
88
90
 
89
- # Write text data to S3
90
- text_data = "This is some text data"
91
- s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
91
+ # Write text data to S3
92
+ text_data = "This is some text data"
93
+ s3_reader_writer.write(
94
+ text_data,
95
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
96
+ mode=AbsReaderWriter.MODE_TXT,
97
+ )
98
+
99
+ # Read text data from S3
100
+ text_data_read = s3_reader_writer.read(
101
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
102
+ )
103
+ logger.info(f"Read text data from S3: {text_data_read}")
104
+ # Write binary data to S3
105
+ binary_data = b"This is some binary data"
106
+ s3_reader_writer.write(
107
+ text_data,
108
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
109
+ mode=AbsReaderWriter.MODE_BIN,
110
+ )
92
111
 
93
- # Read text data from S3
94
- text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
95
- logger.info(f"Read text data from S3: {text_data_read}")
96
- # Write binary data to S3
97
- binary_data = b"This is some binary data"
98
- s3_reader_writer.write(data=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
112
+ # Read binary data from S3
113
+ binary_data_read = s3_reader_writer.read(
114
+ s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
115
+ )
116
+ logger.info(f"Read binary data from S3: {binary_data_read}")
117
+
118
+ # Range Read text data from S3
119
+ binary_data_read = s3_reader_writer.read_offset(
120
+ path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
121
+ )
122
+ logger.info(f"Read binary data from S3: {binary_data_read}")
123
+ if 1:
124
+ import os
125
+ import json
99
126
 
100
- # Read binary data from S3
101
- binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
102
- logger.info(f"Read binary data from S3: {binary_data_read}")
127
+ ak = os.getenv("AK", "")
128
+ sk = os.getenv("SK", "")
129
+ endpoint_url = os.getenv("ENDPOINT", "")
130
+ bucket = os.getenv("S3_BUCKET", "")
131
+ prefix = os.getenv("S3_PREFIX", "")
132
+ key_basename = os.getenv("S3_KEY_BASENAME", "")
133
+ s3_reader_writer = S3ReaderWriter(
134
+ ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
135
+ )
136
+ content_bin = s3_reader_writer.read_offset(key_basename)
137
+ assert content_bin[:10] == b'{"track_id'
138
+ assert content_bin[-10:] == b'r":null}}\n'
103
139
 
104
- # Range Read text data from S3
105
- binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
106
- byte_start=0, byte_end=10, mode=MODE_BIN)
107
- logger.info(f"Read binary data from S3: {binary_data_read}")
140
+ content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
141
+ jso = json.dumps(content_bin.decode("utf-8"))
142
+ print(jso)
magic_pdf/tools/cli.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+ import click
3
+ from loguru import logger
4
+ from pathlib import Path
5
+
6
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
7
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
8
+ import magic_pdf.model as model_config
9
+ from magic_pdf.tools.common import parse_pdf_methods, do_parse
10
+ from magic_pdf.libs.version import __version__
11
+
12
+
13
# Command-line entry point: parse a single local PDF, or every "*.pdf" directly
# inside a directory, and write the results below an output directory.
# NOTE: documented with comments rather than a docstring on purpose -- click
# would use a docstring as the command's --help text.
@click.command()
@click.version_option(__version__, "--version", "-v", help="display the version and exit")
@click.option(
    "-p",
    "--path",
    "path",
    type=click.Path(exists=True),
    required=True,
    help="local pdf filepath or directory",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=str,
    help="output local directory",
    default="",
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default="auto",
)
def cli(path, output_dir, method):
    # Switch on the built-in model and select the "full" model mode through
    # the module-level flags of magic_pdf.model.
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = "full"

    # Without an explicit output directory, write to "output" inside the input
    # directory, or next to the input file.
    if output_dir == "":
        base_dir = path if os.path.isdir(path) else os.path.dirname(path)
        output_dir = os.path.join(base_dir, "output")

    def load_pdf_bytes(pdf_path):
        # Read the file in binary mode through a reader rooted at its directory.
        reader = DiskReaderWriter(os.path.dirname(pdf_path))
        return reader.read(os.path.basename(pdf_path), AbsReaderWriter.MODE_BIN)

    def parse_one(doc_path):
        # Failures are logged rather than raised, so one bad document does not
        # stop the remaining documents of a directory run.
        try:
            stem = str(Path(doc_path).stem)
            data = load_pdf_bytes(doc_path)
            # Empty model list: do_parse is expected to produce one itself.
            do_parse(output_dir, stem, data, [], method)
        except Exception as err:
            logger.exception(err)

    if not os.path.isdir(path):
        parse_one(path)
        return
    for pdf_path in Path(path).glob("*.pdf"):
        parse_one(pdf_path)
76
+
77
+
78
# Allow running this module directly as a script.
if __name__ == "__main__":
    cli()
@@ -0,0 +1,155 @@
1
+ import os
2
+ import json as json_parse
3
+ import click
4
+ from pathlib import Path
5
+ from magic_pdf.libs.path_utils import (
6
+ parse_s3path,
7
+ parse_s3_range_params,
8
+ remove_non_official_s3_args,
9
+ )
10
+ from magic_pdf.libs.config_reader import (
11
+ get_s3_config,
12
+ )
13
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
14
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
15
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
+ import magic_pdf.model as model_config
17
+ from magic_pdf.tools.common import parse_pdf_methods, do_parse
18
+ from magic_pdf.libs.version import __version__
19
+
20
+
21
def read_s3_path(s3path):
    """Read an S3 object and return its raw bytes.

    Credentials and endpoint are looked up from the bucket named in
    ``s3path``. If ``parse_s3_range_params`` yields exactly two values they
    are converted to ``int`` and forwarded to ``S3ReaderWriter.read_offset``
    as its ``offset`` and ``limit`` arguments; otherwise the object is read
    from offset 0 with no limit.
    """
    bucket, _key = parse_s3path(s3path)
    access_key, secret_key, endpoint = get_s3_config(bucket)

    # Path with the non-official (range) arguments stripped; used both as the
    # reader's parent path and as the path that is actually read.
    plain_path = remove_non_official_s3_args(s3path)
    reader = S3ReaderWriter(access_key, secret_key, endpoint, "auto", plain_path)

    range_params = parse_s3_range_params(s3path)
    if range_params is not None and len(range_params) == 2:
        offset, limit = int(range_params[0]), int(range_params[1])
    else:
        offset, limit = 0, None
    return reader.read_offset(plain_path, offset, limit)
38
+
39
+
40
# Root click group of the developer CLI; sub-commands attach via @cli.command().
# Kept without a docstring on purpose: click would show it as the group help text.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
def cli():
    pass
44
+
45
+
46
# Developer command: parse the PDF described by one JSON record.
# The record comes from an S3 object (optionally a byte range of it) or from
# the first line of a local jsonl file. It must carry the PDF location under
# "file_location" or "path" and the model output under "doc_layout_result".
# NOTE: documented with comments rather than a docstring on purpose -- click
# would use a docstring as the command's --help text.
@cli.command()
@click.option(
    "-j",
    "--jsonl",
    "jsonl",
    type=str,
    help="输入 jsonl 路径,本地或者 s3 上的文件",
    required=True,
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=str,
    help="输出到本地目录",
    default="",
)
def jsonl(jsonl, method, output_dir):
    # The layout result is supplied by the record, so the built-in model is off.
    model_config.__use_inside_model__ = False
    if jsonl.startswith("s3://"):
        jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
        # No local file to anchor on: the default output lands in "./output".
        full_jsonl_path = "."
    else:
        full_jsonl_path = os.path.realpath(jsonl)
        # Decode explicitly as UTF-8 (matching the S3 branch) instead of the
        # platform's locale encoding, which breaks non-ASCII records on
        # systems whose default is not UTF-8.
        with open(jsonl, encoding="utf-8") as f:
            # Only the first record of the file is processed.
            jso = json_parse.loads(f.readline())

    if output_dir == "":
        output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")

    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a readable message instead of the TypeError that
        # Path(None) would raise below.
        raise click.ClickException(
            "jsonl record has neither a 'file_location' nor a 'path' field"
        )
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )
99
+
100
+
101
# Developer command: parse a local PDF using a local JSON file that holds the
# model inference result for that PDF.
# NOTE: documented with comments rather than a docstring on purpose -- click
# would use a docstring as the command's --help text.
@cli.command()
@click.option(
    "-p",
    "--pdf",
    "pdf",
    type=click.Path(exists=True),
    required=True,
    help="本地 PDF 文件",
)
@click.option(
    "-j",
    "--json",
    "json_data",
    type=click.Path(exists=True),
    required=True,
    help="本地模型推理出的 json 数据",
)
@click.option(
    "-o", "--output-dir", "output_dir", type=str, help="本地输出目录", default=""
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf(pdf, json_data, output_dir, method):
    # The model result is supplied by the caller, so the built-in model is off.
    model_config.__use_inside_model__ = False

    resolved_pdf = os.path.realpath(pdf)
    if output_dir == "":
        # Default: an "output" directory next to the PDF.
        output_dir = os.path.join(os.path.dirname(resolved_pdf), "output")

    def load_bytes(file_path):
        # Read a local file in binary mode through a reader rooted at its directory.
        folder, name = os.path.dirname(file_path), os.path.basename(file_path)
        return DiskReaderWriter(folder).read(name, AbsReaderWriter.MODE_BIN)

    # The model JSON is read before the PDF, and decoded as UTF-8.
    model_json_list = json_parse.loads(load_bytes(json_data).decode("utf-8"))
    pdf_bytes = load_bytes(resolved_pdf)

    do_parse(
        output_dir,
        Path(resolved_pdf).stem,
        pdf_bytes,
        model_json_list,
        method,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )
152
+
153
+
154
# Allow running this module directly as a script.
if __name__ == "__main__":
    cli()
@@ -0,0 +1,122 @@
1
+ import os
2
+ import json as json_parse
3
+ import copy
4
+ import click
5
+ from loguru import logger
6
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
7
+ from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
8
+ from magic_pdf.pipe.UNIPipe import UNIPipe
9
+ from magic_pdf.pipe.OCRPipe import OCRPipe
10
+ from magic_pdf.pipe.TXTPipe import TXTPipe
11
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
12
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
+ import magic_pdf.model as model_config
14
+
15
+
16
def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one document and return their paths.

    The markdown directory is ``<output_dir>/<pdf_file_name>/<method>`` and
    the image directory is its ``images`` sub-directory. Both are created if
    missing; existing directories are left untouched.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple.
    """
    md_dir = os.path.join(output_dir, pdf_file_name, method)
    image_dir = os.path.join(str(md_dir), "images")
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
24
+
25
+
26
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=False,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
):
    """Run the parsing pipeline for one PDF and write the requested artifacts.

    Output goes to ``<output_dir>/<pdf_file_name>/<parse_method>``, with
    images in its ``images`` sub-directory.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Document name; used for the output folder and the
            markdown file name.
        pdf_bytes: Raw content of the PDF.
        model_list: Model inference result. When empty, the built-in model is
            run if ``magic_pdf.model.__use_inside_model__`` is true.
        parse_method: ``"auto"``, ``"txt"`` or ``"ocr"``; selects the pipe class.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parsed result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parsed result.
        f_dump_md: Write ``<pdf_file_name>.md``.
        f_dump_middle_json: Write ``middle.json`` (the pipe's ``pdf_mid_data``).
        f_dump_model_json: Write ``model.json`` (the model list as it was
            before parsing).
        f_dump_orig_pdf: Write ``origin.pdf`` (a copy of ``pdf_bytes``).
        f_dump_content_list: Write ``content_list.json``.
        f_make_md_mode: Markdown make mode passed to ``pipe_mk_markdown``.
        f_draw_model_bbox: Call ``drow_model_bbox`` on the model list.

    Raises:
        SystemExit: With code 1 for an unknown ``parse_method``; with code 2
            when ``model_list`` is empty and the built-in model is disabled.
    """
    # Snapshot the model list now so the dumped/drawn copy is the input as
    # given, independent of anything the pipeline does with model_list.
    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)

    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
        local_md_dir
    )
    # Only the directory name (not the full path) is handed to the content makers.
    image_dir = str(os.path.basename(local_image_dir))

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        # Raise SystemExit directly: the exit() builtin is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    pipe.pipe_classify()

    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()
            # The model result now comes from the pipe; refresh the snapshot.
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error("need model list input")
            raise SystemExit(2)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_model_bbox:
        drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(
        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
    )
    if f_dump_md:
        md_writer.write(
            content=md_content,
            path=f"{pdf_file_name}.md",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        md_writer.write(
            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
            path="middle.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_model_json:
        md_writer.write(
            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
            path="model.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path="origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        md_writer.write(
            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
            path="content_list.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    logger.info(f"local output dir is {local_md_dir}")
120
+
121
+
122
# Click parameter type shared by the CLI commands: restricts the parse method
# option to one of the listed values.
parse_pdf_methods = click.Choice(
    ["ocr", "txt", "auto"],
)