magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
magic_pdf/__init__.py
ADDED
File without changes
|
File without changes
|
@@ -0,0 +1,294 @@
|
|
1
|
+
"""
|
2
|
+
这里实现2个click命令:
|
3
|
+
第一个:
|
4
|
+
接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
|
5
|
+
1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
|
6
|
+
2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
|
7
|
+
3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
|
8
|
+
4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
|
9
|
+
|
10
|
+
最后把以上步骤准备好的对象传入真正的解析API
|
11
|
+
|
12
|
+
第二个:
|
13
|
+
接收1)pdf的本地路径。2)模型json文件(可选)。然后:
|
14
|
+
1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
|
15
|
+
2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
|
16
|
+
3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
|
17
|
+
|
18
|
+
|
19
|
+
效果:
|
20
|
+
python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
|
21
|
+
python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
|
22
|
+
"""
|
23
|
+
|
24
|
+
import os
|
25
|
+
import json as json_parse
|
26
|
+
import sys
|
27
|
+
import click
|
28
|
+
from loguru import logger
|
29
|
+
from pathlib import Path
|
30
|
+
from magic_pdf.libs.version import __version__
|
31
|
+
|
32
|
+
from magic_pdf.libs.MakeContentConfig import DropMode
|
33
|
+
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
|
34
|
+
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
35
|
+
from magic_pdf.pipe.UNIPipe import UNIPipe
|
36
|
+
from magic_pdf.pipe.OCRPipe import OCRPipe
|
37
|
+
from magic_pdf.pipe.TXTPipe import TXTPipe
|
38
|
+
from magic_pdf.libs.config_reader import get_s3_config
|
39
|
+
from magic_pdf.libs.path_utils import (
|
40
|
+
parse_s3path,
|
41
|
+
parse_s3_range_params,
|
42
|
+
remove_non_official_s3_args,
|
43
|
+
)
|
44
|
+
from magic_pdf.libs.config_reader import get_local_dir
|
45
|
+
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
46
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
47
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
48
|
+
import csv
|
49
|
+
|
50
|
+
# Allowed values for the --method option shared by the commands in this module.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
51
|
+
|
52
|
+
|
53
|
+
def prepare_env(pdf_file_name, method):
    """Create the local output directories for one PDF and return them.

    The output root is ``<local dir>/magic-pdf/<pdf_file_name>/<method>``,
    where ``<local dir>`` is whatever ``get_local_dir()`` returns; images go
    into an ``images`` sub-directory of that root.

    Args:
        pdf_file_name: Name used as the per-document directory component.
        method: Parse method name, used as the last directory component.

    Returns:
        A ``(image_dir, md_dir)`` tuple of directory paths; both exist when
        this function returns.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    images_root = os.path.join(str(output_root), "images")

    # exist_ok=True makes repeated runs for the same document harmless.
    for directory in (images_root, output_root):
        os.makedirs(directory, exist_ok=True)

    return images_root, output_root
|
63
|
+
|
64
|
+
|
65
|
+
def write_to_csv(csv_file_path, csv_data):
    """Append a single row to a CSV file and report success on stdout.

    Args:
        csv_file_path: Path of the CSV file; opened in append mode as UTF-8.
        csv_data: Iterable of field values forming the row to write.
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow(csv_data)
    print(f"数据已成功追加到 '{csv_file_path}'")
|
72
|
+
|
73
|
+
|
74
|
+
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
    """Run the selected parsing pipe on one PDF and write its outputs.

    Three files are written through ``md_writer``: ``<pdf_file_name>.md``
    (markdown), ``<pdf_file_name>.json`` (the pipe's intermediate data) and
    ``<pdf_file_name>.txt`` (``str()`` of the unified-format content list).
    Layout and span bounding boxes are also drawn via ``draw_layout_bbox`` /
    ``draw_span_bbox`` with ``local_md_dir`` as their target argument.

    Args:
        pdf_file_name: Base name (no extension) used for the output files.
        pdf_bytes: Raw PDF content handed to the pipe and the draw helpers.
        model_list: Model output for the PDF; when empty, the pipe's own
            ``pipe_analyze()`` step is run to produce it.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``.
        image_writer: Writer object handed to the pipe.
        md_writer: Writer object receiving the .md/.json/.txt outputs.
        image_dir: Image directory passed to the markdown / unified-format
            generators.
        local_md_dir: Directory argument passed to the bbox drawing helpers.

    Exits the process with status 1 when ``parse_method`` is not recognised.
    """
    if parse_method == "auto":
        jso_useful_key = {
            "_pdf_type": "",
            "model_list": model_list,
        }
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        # Not reachable through the click commands (the option is a Choice),
        # but direct callers can still pass an arbitrary value.
        logger.error(f"unknown parse method: {parse_method}")
        sys.exit(1)

    pipe.pipe_classify()

    # No usable model data was supplied: fall back to the built-in analysis.
    if not model_list:
        pipe.pipe_analyze()

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data['pdf_info']
    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
    md_writer.write(
        content=md_content,
        path=f"{pdf_file_name}.md",
        mode=AbsReaderWriter.MODE_TXT,
    )
    md_writer.write(
        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
        path=f"{pdf_file_name}.json",
        mode=AbsReaderWriter.MODE_TXT,
    )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    md_writer.write(
        content=str(content_list),
        path=f"{pdf_file_name}.txt",
        mode=AbsReaderWriter.MODE_TXT,
    )
|
117
|
+
|
118
|
+
|
119
|
+
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    # Root click group; the commands in this module attach to it through
    # @cli.command(). Documented with a comment rather than a docstring
    # because click would display a docstring as the group's help text.
    return None
|
124
|
+
|
125
|
+
|
126
|
+
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # Parse the PDF described by one JSON record stored on S3.
    #
    # `json` is an s3:// path to the record (optionally carrying byte-range
    # parameters); `method` is the parse method name. The record must provide
    # the PDF location under "file_location" or "path" and the model output
    # under "doc_layout_result".
    #
    # Documented with comments rather than a docstring on purpose: click would
    # display a docstring as this command's --help text.

    # --json is not a required option, so it may arrive as None.
    if not json or not json.startswith("s3://"):
        print("usage: python magicpdf.py json-command --json s3://some_bucket/some_path")
        sys.exit(1)

    def read_s3_path(s3path):
        # Read the bytes behind an s3:// path, honouring byte-range parameters.
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second value is added to the start, i.e. it is handled as a
            # length and turned into an inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))

    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a clear message instead of a TypeError from Path(None).
        logger.error("json record has neither 'file_location' nor 'path'")
        sys.exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir,
    )
|
181
|
+
|
182
|
+
|
183
|
+
@cli.command()
@click.option("--local_json", type=str, required=True, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def local_json_command(local_json, method):
    # Parse every PDF described by a local JSONL file, one record per line.
    #
    # Each record must provide the PDF's S3 location under "file_location" or
    # "path" and the model output under "doc_layout_result".
    #
    # Documented with comments rather than a docstring on purpose: click would
    # display a docstring as this command's --help text.

    def read_s3_path(s3path):
        # Read the bytes behind an s3:// path, honouring byte-range parameters.
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second value is added to the start, i.e. it is handled as a
            # length and turned into an inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # Blank lines (e.g. a trailing newline) are not JSON records.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                # Fail with a clear message instead of a TypeError from Path(None).
                logger.error("json record has neither 'file_location' nor 'path'")
                sys.exit(1)

            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)
            local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

            local_image_rw = DiskReaderWriter(local_image_dir)
            local_md_rw = DiskReaderWriter(local_md_dir)

            _do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
                local_image_rw,
                local_md_rw,
                os.path.basename(local_image_dir),
                local_md_dir,
            )
|
237
|
+
|
238
|
+
|
239
|
+
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Parse a local PDF, optionally using a model JSON file.
    #
    # When --model is omitted, the model file is looked up next to the PDF
    # (same path with the extension swapped for ".json"); if that file does
    # not exist an empty model list is used instead.
    #
    # Documented with comments rather than a docstring on purpose: click would
    # display a docstring as this command's --help text.

    def read_fn(path):
        # Read a local file as bytes through DiskReaderWriter.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    pdf_data = read_fn(pdf)

    def get_model_json(model_path):
        # Return the model JSON text, or "[]" when no model file is available.
        if model_path is None:
            # Swap only the final extension. str.replace(".pdf", ".json")
            # would also rewrite ".pdf" inside directory names and would leave
            # "X.PDF" or extension-less paths untouched, so the PDF itself
            # would then be read as the model file.
            model_path = os.path.splitext(pdf)[0] + ".json"
            if not os.path.exists(model_path):
                logger.warning(f"not found json {model_path} existed, use paddle analyze")
                # An empty list makes the parse step run its built-in analysis.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    jso = json_parse.loads(get_model_json(model))
    pdf_file_name = Path(pdf).stem
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso,
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir,
    )
|
288
|
+
|
289
|
+
|
290
|
+
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()
|
File without changes
|