magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +20 -7
- magic_pdf/libs/config_reader.py +28 -10
- magic_pdf/libs/language.py +12 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
- magic_pdf/model/magic_model.py +49 -41
- magic_pdf/model/pdf_extract_kit.py +155 -60
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
- magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
- magic_pdf/model/pp_structure_v2.py +1 -1
- magic_pdf/pdf_parse_union_core.py +4 -2
- magic_pdf/pre_proc/citationmarker_remove.py +5 -1
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
- magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
- magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
- magic_pdf/resources/model_config/model_configs.yaml +4 -0
- magic_pdf/rw/AbsReaderWriter.py +1 -18
- magic_pdf/rw/DiskReaderWriter.py +32 -24
- magic_pdf/rw/S3ReaderWriter.py +83 -48
- magic_pdf/tools/cli.py +79 -0
- magic_pdf/tools/cli_dev.py +156 -0
- magic_pdf/tools/common.py +119 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
- magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
- magic_pdf/cli/magicpdf.py +0 -337
- magic_pdf/pdf_parse_for_train.py +0 -685
- magic_pdf/train_utils/convert_to_train_format.py +0 -65
- magic_pdf/train_utils/extract_caption.py +0 -59
- magic_pdf/train_utils/remove_footer_header.py +0 -159
- magic_pdf/train_utils/vis_utils.py +0 -327
- magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
- /magic_pdf/libs/{math.py → local_math.py} +0 -0
- /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
- /magic_pdf/{train_utils → tools}/__init__.py +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
AUG:
|
2
2
|
DETR: true
|
3
|
-
CACHE_DIR:
|
3
|
+
CACHE_DIR: ~/cache/huggingface
|
4
4
|
CUDNN_BENCHMARK: false
|
5
5
|
DATALOADER:
|
6
6
|
ASPECT_RATIO_GROUPING: true
|
@@ -294,7 +294,7 @@ MODEL:
|
|
294
294
|
POS_TYPE: abs
|
295
295
|
WEIGHTS:
|
296
296
|
OUTPUT_DIR:
|
297
|
-
SCIHUB_DATA_DIR_TRAIN:
|
297
|
+
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
|
298
298
|
SEED: 42
|
299
299
|
SOLVER:
|
300
300
|
AMP:
|
magic_pdf/rw/AbsReaderWriter.py
CHANGED
@@ -2,33 +2,16 @@ from abc import ABC, abstractmethod
|
|
2
2
|
|
3
3
|
|
4
4
|
class AbsReaderWriter(ABC):
|
5
|
-
"""
|
6
|
-
同时支持二进制和文本读写的抽象类
|
7
|
-
"""
|
8
5
|
MODE_TXT = "text"
|
9
6
|
MODE_BIN = "binary"
|
10
|
-
|
11
|
-
def __init__(self, parent_path):
|
12
|
-
# 初始化代码可以在这里添加,如果需要的话
|
13
|
-
self.parent_path = parent_path # 对于本地目录是父目录,对于s3是会写到这个path下。
|
14
|
-
|
15
7
|
@abstractmethod
|
16
8
|
def read(self, path: str, mode=MODE_TXT):
|
17
|
-
"""
|
18
|
-
无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
|
19
|
-
"""
|
20
9
|
raise NotImplementedError
|
21
10
|
|
22
11
|
@abstractmethod
|
23
12
|
def write(self, content: str, path: str, mode=MODE_TXT):
|
24
|
-
"""
|
25
|
-
无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
|
26
|
-
"""
|
27
13
|
raise NotImplementedError
|
28
14
|
|
29
15
|
@abstractmethod
|
30
|
-
def
|
31
|
-
"""
|
32
|
-
无论对于本地还是s3的路径,检查如果path是绝对路径,那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
|
33
|
-
"""
|
16
|
+
def read_offset(self, path: str, offset=0, limit=None) -> bytes:
|
34
17
|
raise NotImplementedError
|
magic_pdf/rw/DiskReaderWriter.py
CHANGED
@@ -3,34 +3,29 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
|
3
3
|
from loguru import logger
|
4
4
|
|
5
5
|
|
6
|
-
MODE_TXT = "text"
|
7
|
-
MODE_BIN = "binary"
|
8
|
-
|
9
|
-
|
10
6
|
class DiskReaderWriter(AbsReaderWriter):
|
11
|
-
|
12
7
|
def __init__(self, parent_path, encoding="utf-8"):
|
13
8
|
self.path = parent_path
|
14
9
|
self.encoding = encoding
|
15
10
|
|
16
|
-
def read(self, path, mode=MODE_TXT):
|
11
|
+
def read(self, path, mode=AbsReaderWriter.MODE_TXT):
|
17
12
|
if os.path.isabs(path):
|
18
13
|
abspath = path
|
19
14
|
else:
|
20
15
|
abspath = os.path.join(self.path, path)
|
21
16
|
if not os.path.exists(abspath):
|
22
|
-
logger.error(f"
|
23
|
-
raise Exception(f"
|
24
|
-
if mode == MODE_TXT:
|
17
|
+
logger.error(f"file {abspath} not exists")
|
18
|
+
raise Exception(f"file {abspath} no exists")
|
19
|
+
if mode == AbsReaderWriter.MODE_TXT:
|
25
20
|
with open(abspath, "r", encoding=self.encoding) as f:
|
26
21
|
return f.read()
|
27
|
-
elif mode == MODE_BIN:
|
22
|
+
elif mode == AbsReaderWriter.MODE_BIN:
|
28
23
|
with open(abspath, "rb") as f:
|
29
24
|
return f.read()
|
30
25
|
else:
|
31
26
|
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
|
32
27
|
|
33
|
-
def write(self, content, path, mode=MODE_TXT):
|
28
|
+
def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
|
34
29
|
if os.path.isabs(path):
|
35
30
|
abspath = path
|
36
31
|
else:
|
@@ -38,29 +33,42 @@ class DiskReaderWriter(AbsReaderWriter):
|
|
38
33
|
directory_path = os.path.dirname(abspath)
|
39
34
|
if not os.path.exists(directory_path):
|
40
35
|
os.makedirs(directory_path)
|
41
|
-
if mode == MODE_TXT:
|
36
|
+
if mode == AbsReaderWriter.MODE_TXT:
|
42
37
|
with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
|
43
38
|
f.write(content)
|
44
39
|
|
45
|
-
elif mode == MODE_BIN:
|
40
|
+
elif mode == AbsReaderWriter.MODE_BIN:
|
46
41
|
with open(abspath, "wb") as f:
|
47
42
|
f.write(content)
|
48
43
|
else:
|
49
44
|
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
|
50
45
|
|
51
|
-
def
|
52
|
-
|
46
|
+
def read_offset(self, path: str, offset=0, limit=None):
|
47
|
+
abspath = path
|
48
|
+
if not os.path.isabs(path):
|
49
|
+
abspath = os.path.join(self.path, path)
|
50
|
+
with open(abspath, "rb") as f:
|
51
|
+
f.seek(offset)
|
52
|
+
return f.read(limit)
|
53
53
|
|
54
54
|
|
55
|
-
# 使用示例
|
56
55
|
if __name__ == "__main__":
|
57
|
-
|
58
|
-
|
56
|
+
if 0:
|
57
|
+
file_path = "io/test/example.txt"
|
58
|
+
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
|
59
|
+
|
60
|
+
# 写入内容到文件
|
61
|
+
drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
|
62
|
+
|
63
|
+
# 从文件读取内容
|
64
|
+
content = drw.read(path=file_path)
|
65
|
+
if content:
|
66
|
+
logger.info(f"从 {file_path} 读取的内容: {content}")
|
67
|
+
if 1:
|
68
|
+
drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
|
69
|
+
content_bin = drw.read_offset("1.txt")
|
70
|
+
assert content_bin == b"ABCD!"
|
59
71
|
|
60
|
-
|
61
|
-
|
72
|
+
content_bin = drw.read_offset("1.txt", offset=1, limit=2)
|
73
|
+
assert content_bin == b"BC"
|
62
74
|
|
63
|
-
# 从文件读取内容
|
64
|
-
content = drw.read(path=file_path)
|
65
|
-
if content:
|
66
|
-
logger.info(f"从 {file_path} 读取的内容: {content}")
|
magic_pdf/rw/S3ReaderWriter.py
CHANGED
@@ -2,16 +2,18 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
|
2
2
|
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
|
3
3
|
import boto3
|
4
4
|
from loguru import logger
|
5
|
-
from boto3.s3.transfer import TransferConfig
|
6
5
|
from botocore.config import Config
|
7
|
-
import os
|
8
|
-
|
9
|
-
MODE_TXT = "text"
|
10
|
-
MODE_BIN = "binary"
|
11
6
|
|
12
7
|
|
13
8
|
class S3ReaderWriter(AbsReaderWriter):
|
14
|
-
def __init__(
|
9
|
+
def __init__(
|
10
|
+
self,
|
11
|
+
ak: str,
|
12
|
+
sk: str,
|
13
|
+
endpoint_url: str,
|
14
|
+
addressing_style: str = "auto",
|
15
|
+
parent_path: str = "",
|
16
|
+
):
|
15
17
|
self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
|
16
18
|
self.path = parent_path
|
17
19
|
|
@@ -21,12 +23,14 @@ class S3ReaderWriter(AbsReaderWriter):
|
|
21
23
|
aws_access_key_id=ak,
|
22
24
|
aws_secret_access_key=sk,
|
23
25
|
endpoint_url=endpoint_url,
|
24
|
-
config=Config(
|
25
|
-
|
26
|
+
config=Config(
|
27
|
+
s3={"addressing_style": addressing_style},
|
28
|
+
retries={"max_attempts": 5, "mode": "standard"},
|
29
|
+
),
|
26
30
|
)
|
27
31
|
return s3_client
|
28
32
|
|
29
|
-
def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
|
33
|
+
def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
|
30
34
|
if s3_relative_path.startswith("s3://"):
|
31
35
|
s3_path = s3_relative_path
|
32
36
|
else:
|
@@ -34,22 +38,22 @@ class S3ReaderWriter(AbsReaderWriter):
|
|
34
38
|
bucket_name, key = parse_bucket_key(s3_path)
|
35
39
|
res = self.client.get_object(Bucket=bucket_name, Key=key)
|
36
40
|
body = res["Body"].read()
|
37
|
-
if mode == MODE_TXT:
|
41
|
+
if mode == AbsReaderWriter.MODE_TXT:
|
38
42
|
data = body.decode(encoding) # Decode bytes to text
|
39
|
-
elif mode == MODE_BIN:
|
43
|
+
elif mode == AbsReaderWriter.MODE_BIN:
|
40
44
|
data = body
|
41
45
|
else:
|
42
46
|
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
|
43
47
|
return data
|
44
48
|
|
45
|
-
def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
|
49
|
+
def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
|
46
50
|
if s3_relative_path.startswith("s3://"):
|
47
51
|
s3_path = s3_relative_path
|
48
52
|
else:
|
49
53
|
s3_path = join_path(self.path, s3_relative_path)
|
50
|
-
if mode == MODE_TXT:
|
54
|
+
if mode == AbsReaderWriter.MODE_TXT:
|
51
55
|
body = content.encode(encoding) # Encode text data as bytes
|
52
|
-
elif mode == MODE_BIN:
|
56
|
+
elif mode == AbsReaderWriter.MODE_BIN:
|
53
57
|
body = content
|
54
58
|
else:
|
55
59
|
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
|
@@ -57,51 +61,82 @@ class S3ReaderWriter(AbsReaderWriter):
|
|
57
61
|
self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
|
58
62
|
logger.info(f"内容已写入 {s3_path} ")
|
59
63
|
|
60
|
-
def
|
64
|
+
def read_offset(self, path: str, offset=0, limit=None) -> bytes:
|
61
65
|
if path.startswith("s3://"):
|
62
66
|
s3_path = path
|
63
67
|
else:
|
64
68
|
s3_path = join_path(self.path, path)
|
65
69
|
bucket_name, key = parse_bucket_key(s3_path)
|
66
70
|
|
67
|
-
range_header =
|
71
|
+
range_header = (
|
72
|
+
f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
|
73
|
+
)
|
68
74
|
res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
|
69
|
-
|
70
|
-
if mode == MODE_TXT:
|
71
|
-
data = body.decode(encoding) # Decode bytes to text
|
72
|
-
elif mode == MODE_BIN:
|
73
|
-
data = body
|
74
|
-
else:
|
75
|
-
raise ValueError("Invalid mode. Use 'text' or 'binary'.")
|
76
|
-
return data
|
75
|
+
return res["Body"].read()
|
77
76
|
|
78
77
|
|
79
78
|
if __name__ == "__main__":
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
79
|
+
if 0:
|
80
|
+
# Config the connection info
|
81
|
+
ak = ""
|
82
|
+
sk = ""
|
83
|
+
endpoint_url = ""
|
84
|
+
addressing_style = "auto"
|
85
|
+
bucket_name = ""
|
86
|
+
# Create an S3ReaderWriter object
|
87
|
+
s3_reader_writer = S3ReaderWriter(
|
88
|
+
ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
|
89
|
+
)
|
88
90
|
|
89
|
-
|
90
|
-
|
91
|
-
|
91
|
+
# Write text data to S3
|
92
|
+
text_data = "This is some text data"
|
93
|
+
s3_reader_writer.write(
|
94
|
+
text_data,
|
95
|
+
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
|
96
|
+
mode=AbsReaderWriter.MODE_TXT,
|
97
|
+
)
|
98
|
+
|
99
|
+
# Read text data from S3
|
100
|
+
text_data_read = s3_reader_writer.read(
|
101
|
+
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
|
102
|
+
)
|
103
|
+
logger.info(f"Read text data from S3: {text_data_read}")
|
104
|
+
# Write binary data to S3
|
105
|
+
binary_data = b"This is some binary data"
|
106
|
+
s3_reader_writer.write(
|
107
|
+
text_data,
|
108
|
+
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
|
109
|
+
mode=AbsReaderWriter.MODE_BIN,
|
110
|
+
)
|
92
111
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
112
|
+
# Read binary data from S3
|
113
|
+
binary_data_read = s3_reader_writer.read(
|
114
|
+
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
|
115
|
+
)
|
116
|
+
logger.info(f"Read binary data from S3: {binary_data_read}")
|
117
|
+
|
118
|
+
# Range Read text data from S3
|
119
|
+
binary_data_read = s3_reader_writer.read_offset(
|
120
|
+
path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
|
121
|
+
)
|
122
|
+
logger.info(f"Read binary data from S3: {binary_data_read}")
|
123
|
+
if 1:
|
124
|
+
import os
|
125
|
+
import json
|
99
126
|
|
100
|
-
|
101
|
-
|
102
|
-
|
127
|
+
ak = os.getenv("AK", "")
|
128
|
+
sk = os.getenv("SK", "")
|
129
|
+
endpoint_url = os.getenv("ENDPOINT", "")
|
130
|
+
bucket = os.getenv("S3_BUCKET", "")
|
131
|
+
prefix = os.getenv("S3_PREFIX", "")
|
132
|
+
key_basename = os.getenv("S3_KEY_BASENAME", "")
|
133
|
+
s3_reader_writer = S3ReaderWriter(
|
134
|
+
ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
|
135
|
+
)
|
136
|
+
content_bin = s3_reader_writer.read_offset(key_basename)
|
137
|
+
assert content_bin[:10] == b'{"track_id'
|
138
|
+
assert content_bin[-10:] == b'r":null}}\n'
|
103
139
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
logger.info(f"Read binary data from S3: {binary_data_read}")
|
140
|
+
content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
|
141
|
+
jso = json.dumps(content_bin.decode("utf-8"))
|
142
|
+
print(jso)
|
magic_pdf/tools/cli.py
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
import os
|
2
|
+
import click
|
3
|
+
from loguru import logger
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
7
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
8
|
+
import magic_pdf.model as model_config
|
9
|
+
from magic_pdf.tools.common import parse_pdf_methods, do_parse
|
10
|
+
from magic_pdf.libs.version import __version__
|
11
|
+
|
12
|
+
|
13
|
+
@click.command()
|
14
|
+
@click.version_option(__version__, "--version", "-v", help="display the version and exit")
|
15
|
+
@click.option(
|
16
|
+
"-p",
|
17
|
+
"--path",
|
18
|
+
"path",
|
19
|
+
type=click.Path(exists=True),
|
20
|
+
required=True,
|
21
|
+
help="local pdf filepath or directory",
|
22
|
+
)
|
23
|
+
@click.option(
|
24
|
+
"-o",
|
25
|
+
"--output-dir",
|
26
|
+
"output_dir",
|
27
|
+
type=str,
|
28
|
+
help="output local directory",
|
29
|
+
default="",
|
30
|
+
)
|
31
|
+
@click.option(
|
32
|
+
"-m",
|
33
|
+
"--method",
|
34
|
+
"method",
|
35
|
+
type=parse_pdf_methods,
|
36
|
+
help="""the method for parsing pdf.
|
37
|
+
ocr: using ocr technique to extract information from pdf.
|
38
|
+
txt: suitable for the text-based pdf only and outperform ocr.
|
39
|
+
auto: automatically choose the best method for parsing pdf from ocr and txt.
|
40
|
+
without method specified, auto will be used by default.""",
|
41
|
+
default="auto",
|
42
|
+
)
|
43
|
+
def cli(path, output_dir, method):
|
44
|
+
model_config.__use_inside_model__ = True
|
45
|
+
model_config.__model_mode__ = "full"
|
46
|
+
if output_dir == "":
|
47
|
+
if os.path.isdir(path):
|
48
|
+
output_dir = os.path.join(path, "output")
|
49
|
+
else:
|
50
|
+
output_dir = os.path.join(os.path.dirname(path), "output")
|
51
|
+
|
52
|
+
def read_fn(path):
|
53
|
+
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
54
|
+
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
55
|
+
|
56
|
+
def parse_doc(doc_path: str):
|
57
|
+
try:
|
58
|
+
file_name = str(Path(doc_path).stem)
|
59
|
+
pdf_data = read_fn(doc_path)
|
60
|
+
do_parse(
|
61
|
+
output_dir,
|
62
|
+
file_name,
|
63
|
+
pdf_data,
|
64
|
+
[],
|
65
|
+
method,
|
66
|
+
)
|
67
|
+
|
68
|
+
except Exception as e:
|
69
|
+
logger.exception(e)
|
70
|
+
|
71
|
+
if os.path.isdir(path):
|
72
|
+
for doc_path in Path(path).glob("*.pdf"):
|
73
|
+
parse_doc(doc_path)
|
74
|
+
else:
|
75
|
+
parse_doc(path)
|
76
|
+
|
77
|
+
|
78
|
+
if __name__ == "__main__":
|
79
|
+
cli()
|
magic_pdf/tools/cli_dev.py
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
import os
|
2
|
+
import json as json_parse
|
3
|
+
import click
|
4
|
+
from pathlib import Path
|
5
|
+
from magic_pdf.libs.path_utils import (
|
6
|
+
parse_s3path,
|
7
|
+
parse_s3_range_params,
|
8
|
+
remove_non_official_s3_args,
|
9
|
+
)
|
10
|
+
from magic_pdf.libs.config_reader import (
|
11
|
+
get_s3_config,
|
12
|
+
)
|
13
|
+
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
14
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
15
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
16
|
+
import magic_pdf.model as model_config
|
17
|
+
from magic_pdf.tools.common import parse_pdf_methods, do_parse
|
18
|
+
from magic_pdf.libs.version import __version__
|
19
|
+
|
20
|
+
|
21
|
+
def read_s3_path(s3path):
|
22
|
+
bucket, key = parse_s3path(s3path)
|
23
|
+
|
24
|
+
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
25
|
+
s3_rw = S3ReaderWriter(
|
26
|
+
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
|
27
|
+
)
|
28
|
+
may_range_params = parse_s3_range_params(s3path)
|
29
|
+
if may_range_params is None or 2 != len(may_range_params):
|
30
|
+
byte_start, byte_end = 0, None
|
31
|
+
else:
|
32
|
+
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
|
33
|
+
byte_end += byte_start - 1
|
34
|
+
return s3_rw.read_jsonl(
|
35
|
+
remove_non_official_s3_args(s3path),
|
36
|
+
byte_start,
|
37
|
+
byte_end,
|
38
|
+
AbsReaderWriter.MODE_BIN,
|
39
|
+
)
|
40
|
+
|
41
|
+
|
42
|
+
@click.group()
|
43
|
+
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
|
44
|
+
def cli():
|
45
|
+
pass
|
46
|
+
|
47
|
+
|
48
|
+
@cli.command()
|
49
|
+
@click.option(
|
50
|
+
"-j",
|
51
|
+
"--jsonl",
|
52
|
+
"jsonl",
|
53
|
+
type=str,
|
54
|
+
help="输入 jsonl 路径,本地或者 s3 上的文件",
|
55
|
+
required=True,
|
56
|
+
)
|
57
|
+
@click.option(
|
58
|
+
"-m",
|
59
|
+
"--method",
|
60
|
+
"method",
|
61
|
+
type=parse_pdf_methods,
|
62
|
+
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
63
|
+
default="auto",
|
64
|
+
)
|
65
|
+
@click.option(
|
66
|
+
"-o",
|
67
|
+
"--output-dir",
|
68
|
+
"output_dir",
|
69
|
+
type=str,
|
70
|
+
help="输出到本地目录",
|
71
|
+
default="",
|
72
|
+
)
|
73
|
+
def jsonl(jsonl, method, output_dir):
|
74
|
+
print("haha")
|
75
|
+
model_config.__use_inside_model__ = False
|
76
|
+
full_jsonl_path = os.path.realpath(jsonl)
|
77
|
+
if output_dir == "":
|
78
|
+
output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
|
79
|
+
|
80
|
+
if jsonl.startswith("s3://"):
|
81
|
+
jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
|
82
|
+
else:
|
83
|
+
with open(jsonl) as f:
|
84
|
+
jso = json_parse.loads(f.readline())
|
85
|
+
s3_file_path = jso.get("file_location")
|
86
|
+
if s3_file_path is None:
|
87
|
+
s3_file_path = jso.get("path")
|
88
|
+
pdf_file_name = Path(s3_file_path).stem
|
89
|
+
pdf_data = read_s3_path(s3_file_path)
|
90
|
+
|
91
|
+
|
92
|
+
print(pdf_file_name, jso, method)
|
93
|
+
do_parse(
|
94
|
+
output_dir,
|
95
|
+
pdf_file_name,
|
96
|
+
pdf_data,
|
97
|
+
jso["doc_layout_result"],
|
98
|
+
method,
|
99
|
+
f_dump_content_list=True,
|
100
|
+
)
|
101
|
+
|
102
|
+
|
103
|
+
@cli.command()
|
104
|
+
@click.option(
|
105
|
+
"-p",
|
106
|
+
"--pdf",
|
107
|
+
"pdf",
|
108
|
+
type=click.Path(exists=True),
|
109
|
+
required=True,
|
110
|
+
help="本地 PDF 文件",
|
111
|
+
)
|
112
|
+
@click.option(
|
113
|
+
"-j",
|
114
|
+
"--json",
|
115
|
+
"json_data",
|
116
|
+
type=click.Path(exists=True),
|
117
|
+
required=True,
|
118
|
+
help="本地模型推理出的 json 数据",
|
119
|
+
)
|
120
|
+
@click.option(
|
121
|
+
"-o", "--output-dir", "output_dir", type=str, help="本地输出目录", default=""
|
122
|
+
)
|
123
|
+
@click.option(
|
124
|
+
"-m",
|
125
|
+
"--method",
|
126
|
+
"method",
|
127
|
+
type=parse_pdf_methods,
|
128
|
+
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
129
|
+
default="auto",
|
130
|
+
)
|
131
|
+
def pdf(pdf, json_data, output_dir, method):
|
132
|
+
model_config.__use_inside_model__ = False
|
133
|
+
full_pdf_path = os.path.realpath(pdf)
|
134
|
+
if output_dir == "":
|
135
|
+
output_dir = os.path.join(os.path.dirname(full_pdf_path), "output")
|
136
|
+
|
137
|
+
def read_fn(path):
|
138
|
+
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
139
|
+
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
140
|
+
|
141
|
+
model_json_list = json_parse.loads(read_fn(json_data).decode("utf-8"))
|
142
|
+
|
143
|
+
file_name = str(Path(full_pdf_path).stem)
|
144
|
+
pdf_data = read_fn(full_pdf_path)
|
145
|
+
do_parse(
|
146
|
+
output_dir,
|
147
|
+
file_name,
|
148
|
+
pdf_data,
|
149
|
+
model_json_list,
|
150
|
+
method,
|
151
|
+
f_dump_content_list=True,
|
152
|
+
)
|
153
|
+
|
154
|
+
|
155
|
+
if __name__ == "__main__":
|
156
|
+
cli()
|