magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +7 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +188 -5
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +16 -15
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +19 -22
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +35 -5
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +43 -7
- magic_pdf/model/sub_modules/model_utils.py +17 -5
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/operators/models.py +154 -0
- magic_pdf/operators/pipes.py +191 -0
- magic_pdf/pdf_parse_union_core_v2.py +77 -27
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +120 -61
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -23
- magic_pdf/pdf_parse_by_txt.py +0 -24
- magic_pdf/pipe/AbsPipe.py +0 -98
- magic_pdf/pipe/OCRPipe.py +0 -41
- magic_pdf/pipe/TXTPipe.py +0 -41
- magic_pdf/pipe/UNIPipe.py +0 -98
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -121
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/rw/S3ReaderWriter.py
DELETED
@@ -1,142 +0,0 @@
|
|
1
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
2
|
-
from magic_pdf.libs.commons import parse_bucket_key, join_path
|
3
|
-
import boto3
|
4
|
-
from loguru import logger
|
5
|
-
from botocore.config import Config
|
6
|
-
|
7
|
-
|
8
|
-
class S3ReaderWriter(AbsReaderWriter):
    """Reader/writer that stores objects in an S3-compatible service.

    Every public method accepts either a full ``s3://`` URI or a path that is
    joined onto ``parent_path`` with ``join_path``.
    """

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        """Create the S3 client and remember the base path.

        Args:
            ak: Access key id handed to boto3.
            sk: Secret access key handed to boto3.
            endpoint_url: Endpoint URL of the S3 service.
            addressing_style: Value for the boto3 ``s3.addressing_style``
                config option.
            parent_path: Base path that non-``s3://`` paths are joined onto.
        """
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client using standard-mode retries (5 attempts)."""
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )

    def _resolve_path(self, path: str) -> str:
        """Return ``path`` as-is when it is an ``s3://`` URI, else join it onto ``self.path``."""
        if path.startswith("s3://"):
            return path
        return join_path(self.path, path)

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object and return it as text or raw bytes.

        Args:
            s3_relative_path: ``s3://`` URI, or a path relative to the parent path.
            mode: ``AbsReaderWriter.MODE_TXT`` to decode the body, or
                ``AbsReaderWriter.MODE_BIN`` to return it untouched.
            encoding: Text encoding used in text mode.

        Returns:
            ``str`` in text mode, ``bytes`` in binary mode.

        Raises:
            ValueError: If ``mode`` is neither of the two supported modes.
        """
        # Reject an unsupported mode before paying for the download.
        if mode not in (AbsReaderWriter.MODE_TXT, AbsReaderWriter.MODE_BIN):
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(self._resolve_path(s3_relative_path))
        res = self.client.get_object(Bucket=bucket_name, Key=key)
        body = res["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return body.decode(encoding)  # Decode bytes to text
        return body

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload ``content`` to the given object key.

        Args:
            content: ``str`` in text mode, bytes-like data in binary mode.
            s3_relative_path: ``s3://`` URI, or a path relative to the parent path.
            mode: ``AbsReaderWriter.MODE_TXT`` to encode ``content`` first, or
                ``AbsReaderWriter.MODE_BIN`` to upload it unchanged.
            encoding: Text encoding used in text mode.

        Raises:
            ValueError: If ``mode`` is neither of the two supported modes.
        """
        s3_path = self._resolve_path(s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)  # Encode text data as bytes
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Read a byte range of an object.

        Args:
            path: ``s3://`` URI, or a path relative to the parent path.
            offset: Index of the first byte to read.
            limit: Maximum number of bytes to read; ``None`` reads through to
                the end of the object.

        Returns:
            The requested bytes. ``limit=0`` returns ``b""`` without issuing
            a request.

        Raises:
            ValueError: If ``limit`` is negative.
        """
        if limit is None:
            range_header = f"bytes={offset}-"
        elif limit < 0:
            raise ValueError("limit must be non-negative or None.")
        elif limit == 0:
            # A zero-length range cannot be expressed in a Range header, and
            # falling back to an open-ended range would return the whole tail.
            return b""
        else:
            # HTTP byte ranges are inclusive on both ends, hence the -1.
            range_header = f"bytes={offset}-{offset + limit - 1}"
        bucket_name, key = parse_bucket_key(self._resolve_path(path))
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
|
76
|
-
|
77
|
-
|
78
|
-
if __name__ == "__main__":
    # Manual smoke tests against a live S3 endpoint. Flip the literal 0/1
    # guards below to choose which scenario runs.
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object rooted at the configured bucket
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, f"s3://{bucket_name}/"
        )

        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
        )
        logger.info(f"Read text data from S3: {text_data_read}")

        # Write binary data to S3 (binary mode uploads the bytes unchanged)
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            binary_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )

        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")

        # Range read: the first 10 bytes of the object
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json

        # Connection details and the object to probe come from the environment.
        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        # Whole-object read through read_offset (no limit), then check both ends.
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'

        # Ranged read of 426 bytes starting at byte 424.
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        # json.dumps on a str yields a quoted, escaped JSON string literal.
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
|
magic_pdf/rw/__init__.py
DELETED
File without changes
|
magic_pdf/user_api.py
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
"""用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
|
2
|
-
|
3
|
-
然后:
|
4
|
-
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
|
5
|
-
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
|
6
|
-
|
7
|
-
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
|
8
|
-
"""
|
9
|
-
|
10
|
-
from loguru import logger
|
11
|
-
|
12
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
13
|
-
from magic_pdf.libs.version import __version__
|
14
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
15
|
-
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
16
|
-
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
|
17
|
-
|
18
|
-
PARSE_TYPE_TXT = 'txt'
|
19
|
-
PARSE_TYPE_OCR = 'ocr'
|
20
|
-
|
21
|
-
|
22
|
-
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse a text-based PDF.

    Delegates to ``parse_pdf_by_txt`` and tags the returned dict with the
    parse type, the package version and, when given, the language.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_models: Per-page model output forwarded to the parser.
        imageWriter: Writer forwarded to the parser.
        is_debug: Forwarded to the parser as ``debug_mode``.
        start_page_id: Forwarded to the parser.
        end_page_id: Forwarded to the parser.
        lang: Language hint; stored under ``'_lang'`` when not ``None``.
        *args: Accepted but unused.
        **kwargs: Accepted but unused.

    Returns:
        The dict produced by ``parse_pdf_by_txt`` with the metadata keys added.
    """
    parser_options = {
        'start_page_id': start_page_id,
        'end_page_id': end_page_id,
        'debug_mode': is_debug,
        'lang': lang,
    }
    result = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Stamp provenance metadata onto the parser output.
    result['_parse_type'] = PARSE_TYPE_TXT
    result['_version_name'] = __version__
    if lang is None:
        return result

    result['_lang'] = lang
    return result
|
44
|
-
|
45
|
-
|
46
|
-
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse an OCR-based PDF.

    Delegates to ``parse_pdf_by_ocr`` and tags the returned dict with the
    parse type, the package version and, when given, the language.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_models: Per-page model output forwarded to the parser.
        imageWriter: Writer forwarded to the parser.
        is_debug: Forwarded to the parser as ``debug_mode``.
        start_page_id: Forwarded to the parser.
        end_page_id: Forwarded to the parser.
        lang: Language hint; stored under ``'_lang'`` when not ``None``.
        *args: Accepted but unused.
        **kwargs: Accepted but unused.

    Returns:
        The dict produced by ``parse_pdf_by_ocr`` with the metadata keys added.
    """
    parser_options = {
        'start_page_id': start_page_id,
        'end_page_id': end_page_id,
        'debug_mode': is_debug,
        'lang': lang,
    }
    result = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Stamp provenance metadata onto the parser output.
    result['_parse_type'] = PARSE_TYPE_OCR
    result['_version_name'] = __version__
    if lang is None:
        return result

    result['_lang'] = lang
    return result
|
68
|
-
|
69
|
-
|
70
|
-
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                    input_model_is_empty: bool = False,
                    start_page_id=0, end_page_id=None, lang=None,
                    *args, **kwargs):
    """Parse a PDF that mixes OCR and text content, extracting everything.

    The text parser is tried first. If it raises, or its result carries a
    truthy ``'_need_drop'`` entry, the OCR parser is used instead; when
    ``input_model_is_empty`` is set the models are regenerated with
    ``doc_analyze(..., ocr=True)`` before that second attempt.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_models: Per-page model output forwarded to the parsers.
        imageWriter: Writer forwarded to the parsers.
        is_debug: Forwarded to the parsers as ``debug_mode``.
        input_model_is_empty: Whether to rebuild the models before the OCR
            fallback.
        start_page_id: Forwarded to the parsers and to ``doc_analyze``.
        end_page_id: Forwarded to the parsers and to ``doc_analyze``.
        lang: Language hint; stored under ``'_lang'`` when not ``None``.
        *args: Accepted but unused.
        **kwargs: ``layout_model``, ``formula_enable`` and ``table_enable``
            are read and forwarded to ``doc_analyze``.

    Returns:
        The parser's dict with ``'_parse_type'``, ``'_version_name'`` and
        optionally ``'_lang'`` added.

    Raises:
        Exception: If the OCR fallback also fails to produce a result.
    """

    def _attempt(parser, models):
        # Best-effort call: any failure is logged and reported as None so the
        # caller can decide whether to fall back.
        try:
            return parser(
                pdf_bytes,
                models,
                imageWriter,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                debug_mode=is_debug,
                lang=lang,
            )
        except Exception as err:
            logger.exception(err)
            return None

    result = _attempt(parse_pdf_by_txt, pdf_models)
    txt_usable = result is not None and not result.get('_need_drop', False)

    if txt_usable:
        result['_parse_type'] = PARSE_TYPE_TXT
    else:
        logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
        if input_model_is_empty:
            # No caller-supplied models: rebuild them with OCR enabled.
            pdf_models = doc_analyze(
                pdf_bytes,
                ocr=True,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
                layout_model=kwargs.get('layout_model', None),
                formula_enable=kwargs.get('formula_enable', None),
                table_enable=kwargs.get('table_enable', None),
            )
        result = _attempt(parse_pdf_by_ocr, pdf_models)
        if result is None:
            raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
        result['_parse_type'] = PARSE_TYPE_OCR

    result['_version_name'] = __version__

    if lang is not None:
        result['_lang'] = lang

    return result
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|