magic-pdf 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +76 -75
- magic_pdf/libs/version.py +1 -1
- magic_pdf-0.5.9.dist-info/METADATA +96 -0
- {magic_pdf-0.5.7.dist-info → magic_pdf-0.5.9.dist-info}/RECORD +8 -7
- magic_pdf-0.5.9.dist-info/entry_points.txt +2 -0
- magic_pdf-0.5.7.dist-info/METADATA +0 -28
- {magic_pdf-0.5.7.dist-info → magic_pdf-0.5.9.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.7.dist-info → magic_pdf-0.5.9.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.7.dist-info → magic_pdf-0.5.9.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py
CHANGED
@@ -50,9 +50,7 @@ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
|
50
50
|
|
51
51
|
|
52
52
|
def prepare_env(pdf_file_name, method):
|
53
|
-
local_parent_dir = os.path.join(
|
54
|
-
get_local_dir(), "magic-pdf", pdf_file_name, method
|
55
|
-
)
|
53
|
+
local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
|
56
54
|
|
57
55
|
local_image_dir = os.path.join(str(local_parent_dir), "images")
|
58
56
|
local_md_dir = local_parent_dir
|
@@ -62,7 +60,7 @@ def prepare_env(pdf_file_name, method):
|
|
62
60
|
|
63
61
|
|
64
62
|
def write_to_csv(csv_file_path, csv_data):
|
65
|
-
with open(csv_file_path, mode=
|
63
|
+
with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
|
66
64
|
# 创建csv writer对象
|
67
65
|
csv_writer = csv.writer(csvfile)
|
68
66
|
# 写入数据
|
@@ -70,65 +68,92 @@ def write_to_csv(csv_file_path, csv_data):
|
|
70
68
|
print(f"数据已成功追加到 '{csv_file_path}'")
|
71
69
|
|
72
70
|
|
73
|
-
def
|
71
|
+
def do_parse(
|
72
|
+
pdf_file_name,
|
73
|
+
pdf_bytes,
|
74
|
+
model_list,
|
75
|
+
parse_method,
|
76
|
+
f_draw_span_bbox=True,
|
77
|
+
f_draw_layout_bbox=True,
|
78
|
+
f_dump_md=True,
|
79
|
+
f_dump_middle_json=True,
|
80
|
+
f_dump_model_json=True,
|
81
|
+
f_dump_orig_pdf=True,
|
82
|
+
f_dump_content_list=True,
|
83
|
+
):
|
84
|
+
|
85
|
+
local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
|
86
|
+
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
|
87
|
+
image_dir = (os.path.basename(local_image_dir),)
|
88
|
+
|
74
89
|
if parse_method == "auto":
|
75
|
-
jso_useful_key = {
|
76
|
-
"_pdf_type": "",
|
77
|
-
"model_list": model_list
|
78
|
-
}
|
90
|
+
jso_useful_key = {"_pdf_type": "", "model_list": model_list}
|
79
91
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
|
80
92
|
elif parse_method == "txt":
|
81
93
|
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
|
82
94
|
elif parse_method == "ocr":
|
83
95
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
|
84
96
|
else:
|
85
|
-
print("
|
97
|
+
print("unknown parse method")
|
86
98
|
sys.exit(1)
|
87
99
|
|
88
100
|
pipe.pipe_classify()
|
89
101
|
|
90
|
-
|
102
|
+
"""如果没有传入有效的模型数据,则使用内置paddle解析"""
|
91
103
|
if len(model_list) == 0:
|
92
104
|
pipe.pipe_analyze()
|
93
105
|
|
94
106
|
pipe.pipe_parse()
|
95
|
-
pdf_info = pipe.pdf_mid_data[
|
96
|
-
|
97
|
-
|
107
|
+
pdf_info = pipe.pdf_mid_data["pdf_info"]
|
108
|
+
if f_draw_layout_bbox:
|
109
|
+
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
|
110
|
+
if f_draw_span_bbox:
|
111
|
+
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
|
98
112
|
|
99
113
|
# write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
|
100
114
|
# [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
|
101
115
|
|
102
|
-
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
116
|
+
md_content = pipe.pipe_mk_markdown(str(image_dir), drop_mode=DropMode.NONE)
|
117
|
+
if f_dump_md:
|
118
|
+
"""写markdown"""
|
119
|
+
md_writer.write(
|
120
|
+
content=md_content,
|
121
|
+
path=f"{pdf_file_name}.md",
|
122
|
+
mode=AbsReaderWriter.MODE_TXT,
|
123
|
+
)
|
124
|
+
|
125
|
+
if f_dump_middle_json:
|
126
|
+
"""写middle_json"""
|
127
|
+
md_writer.write(
|
128
|
+
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
|
129
|
+
path=f"{pdf_file_name}_middle.json",
|
130
|
+
mode=AbsReaderWriter.MODE_TXT,
|
131
|
+
)
|
132
|
+
|
133
|
+
if f_dump_model_json:
|
134
|
+
"""写model_json"""
|
135
|
+
md_writer.write(
|
136
|
+
content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
|
137
|
+
path=f"{pdf_file_name}_model.json",
|
138
|
+
mode=AbsReaderWriter.MODE_TXT,
|
139
|
+
)
|
140
|
+
|
141
|
+
if f_dump_orig_pdf:
|
142
|
+
"""写源pdf"""
|
143
|
+
md_writer.write(
|
144
|
+
content=pdf_bytes,
|
145
|
+
path=f"{pdf_file_name}_origin.pdf",
|
146
|
+
mode=AbsReaderWriter.MODE_BIN,
|
147
|
+
)
|
148
|
+
|
149
|
+
content_list = pipe.pipe_mk_uni_format(str(image_dir), drop_mode=DropMode.NONE)
|
150
|
+
if f_dump_content_list:
|
151
|
+
"""写content_list"""
|
152
|
+
md_writer.write(
|
153
|
+
content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
|
154
|
+
path=f"{pdf_file_name}_content_list.json",
|
155
|
+
mode=AbsReaderWriter.MODE_TXT,
|
156
|
+
)
|
132
157
|
|
133
158
|
|
134
159
|
@click.group()
|
@@ -177,21 +202,12 @@ def json_command(json, method):
|
|
177
202
|
s3_file_path = jso.get("path")
|
178
203
|
pdf_file_name = Path(s3_file_path).stem
|
179
204
|
pdf_data = read_s3_path(s3_file_path)
|
180
|
-
local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
|
181
|
-
|
182
|
-
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
183
|
-
local_md_dir
|
184
|
-
)
|
185
205
|
|
186
|
-
|
206
|
+
do_parse(
|
187
207
|
pdf_file_name,
|
188
208
|
pdf_data,
|
189
209
|
jso["doc_layout_result"],
|
190
210
|
method,
|
191
|
-
local_image_rw,
|
192
|
-
local_md_rw,
|
193
|
-
os.path.basename(local_image_dir),
|
194
|
-
local_md_dir
|
195
211
|
)
|
196
212
|
|
197
213
|
|
@@ -233,21 +249,11 @@ def local_json_command(local_json, method):
|
|
233
249
|
s3_file_path = jso.get("path")
|
234
250
|
pdf_file_name = Path(s3_file_path).stem
|
235
251
|
pdf_data = read_s3_path(s3_file_path)
|
236
|
-
|
237
|
-
|
238
|
-
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
239
|
-
local_md_dir
|
240
|
-
)
|
241
|
-
|
242
|
-
_do_parse(
|
252
|
+
do_parse(
|
243
253
|
pdf_file_name,
|
244
254
|
pdf_data,
|
245
255
|
jso["doc_layout_result"],
|
246
256
|
method,
|
247
|
-
local_image_rw,
|
248
|
-
local_md_rw,
|
249
|
-
os.path.basename(local_image_dir),
|
250
|
-
local_md_dir
|
251
257
|
)
|
252
258
|
|
253
259
|
|
@@ -274,7 +280,9 @@ def pdf_command(pdf, model, method):
|
|
274
280
|
if model_path is None:
|
275
281
|
model_path = pdf.replace(".pdf", ".json")
|
276
282
|
if not os.path.exists(model_path):
|
277
|
-
logger.warning(
|
283
|
+
logger.warning(
|
284
|
+
f"not found json {model_path} existed, use paddle analyze"
|
285
|
+
)
|
278
286
|
# 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
|
279
287
|
model_json = "[]"
|
280
288
|
else:
|
@@ -286,19 +294,12 @@ def pdf_command(pdf, model, method):
|
|
286
294
|
|
287
295
|
jso = json_parse.loads(get_model_json(model))
|
288
296
|
pdf_file_name = Path(pdf).stem
|
289
|
-
|
290
|
-
|
291
|
-
local_md_dir
|
292
|
-
)
|
293
|
-
_do_parse(
|
297
|
+
|
298
|
+
do_parse(
|
294
299
|
pdf_file_name,
|
295
300
|
pdf_data,
|
296
301
|
jso,
|
297
302
|
method,
|
298
|
-
local_image_rw,
|
299
|
-
local_md_rw,
|
300
|
-
os.path.basename(local_image_dir),
|
301
|
-
local_md_dir
|
302
303
|
)
|
303
304
|
|
304
305
|
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.
|
1
|
+
__version__ = "0.5.9"
|
@@ -0,0 +1,96 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: magic-pdf
|
3
|
+
Version: 0.5.9
|
4
|
+
Summary: A practical tool for converting PDF to Markdown
|
5
|
+
Home-page: https://github.com/magicpdf/Magic-PDF
|
6
|
+
Requires-Python: >=3.9
|
7
|
+
Description-Content-Type: text/markdown
|
8
|
+
License-File: LICENSE.md
|
9
|
+
Requires-Dist: boto3 >=1.28.43
|
10
|
+
Requires-Dist: Brotli >=1.1.0
|
11
|
+
Requires-Dist: click >=8.1.7
|
12
|
+
Requires-Dist: Distance >=0.1.3
|
13
|
+
Requires-Dist: PyMuPDF >=1.24.5
|
14
|
+
Requires-Dist: loguru >=0.6.0
|
15
|
+
Requires-Dist: matplotlib >=3.8.3
|
16
|
+
Requires-Dist: numpy >=1.21.6
|
17
|
+
Requires-Dist: pandas >=1.3.5
|
18
|
+
Requires-Dist: fast-langdetect >=0.1.1
|
19
|
+
Requires-Dist: regex >=2023.12.25
|
20
|
+
Requires-Dist: termcolor >=2.4.0
|
21
|
+
Requires-Dist: wordninja >=2.0.0
|
22
|
+
Requires-Dist: scikit-learn >=1.0.2
|
23
|
+
Requires-Dist: nltk ==3.8.1
|
24
|
+
Requires-Dist: s3pathlib >=2.1.1
|
25
|
+
Requires-Dist: paddleocr
|
26
|
+
Requires-Dist: pdfminer.six >=20231228
|
27
|
+
Provides-Extra: cpu
|
28
|
+
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
29
|
+
Provides-Extra: gpu
|
30
|
+
Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
31
|
+
|
32
|
+
<div id="top"></div>
|
33
|
+
<div align="center">
|
34
|
+
|
35
|
+
[](https://github.com/magicpdf/Magic-PDF)
|
36
|
+
[](https://github.com/magicpdf/Magic-PDF)
|
37
|
+
[](https://github.com/magicpdf/Magic-PDF/tree/main/LICENSE)
|
38
|
+
[](https://github.com/magicpdf/Magic-PDF/issues)
|
39
|
+
[](https://github.com/magicpdf/Magic-PDF/issues)
|
40
|
+
|
41
|
+
[English](README.md) | [简体中文](README_zh-CN.md)
|
42
|
+
|
43
|
+
</div>
|
44
|
+
|
45
|
+
<div align="center">
|
46
|
+
|
47
|
+
</div>
|
48
|
+
|
49
|
+
# Magic-PDF
|
50
|
+
|
51
|
+
## Introduction
|
52
|
+
|
53
|
+
Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
|
54
|
+
|
55
|
+
Key features include:
|
56
|
+
|
57
|
+
- Support for multiple front-end model inputs
|
58
|
+
- Removal of headers, footers, footnotes, and page numbers
|
59
|
+
- Human-readable layout formatting
|
60
|
+
- Retains the original document's structure and formatting, including headings, paragraphs, lists, and more
|
61
|
+
- Extraction and display of images and tables within markdown
|
62
|
+
- Conversion of equations into LaTeX format
|
63
|
+
- Automatic detection and conversion of garbled PDFs
|
64
|
+
- Compatibility with CPU and GPU environments
|
65
|
+
- Available for Windows, Linux, and macOS platforms
|
66
|
+
|
67
|
+
## Getting Started
|
68
|
+
|
69
|
+
### Requirements
|
70
|
+
|
71
|
+
- Python 3.9 or newer
|
72
|
+
|
73
|
+
### Usage Instructions
|
74
|
+
|
75
|
+
1. **Install Magic-PDF**
|
76
|
+
|
77
|
+
```bash
|
78
|
+
pip install magic-pdf[cpu] # Install the CPU version
|
79
|
+
or
|
80
|
+
pip install magic-pdf[gpu] # Install the GPU version
|
81
|
+
```
|
82
|
+
|
83
|
+
2. **Usage via Command Line**
|
84
|
+
|
85
|
+
```bash
|
86
|
+
magic-pdf --help
|
87
|
+
```
|
88
|
+
|
89
|
+
## License Information
|
90
|
+
|
91
|
+
See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
|
92
|
+
|
93
|
+
## Acknowledgments
|
94
|
+
|
95
|
+
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
96
|
+
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
@@ -7,7 +7,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
|
|
7
7
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
8
8
|
magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
|
9
9
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
10
|
+
magic_pdf/cli/magicpdf.py,sha256=aMmY_J83_8IdkkxyZUHg8WzIbCA_oW4cyjlUUZE0Wvc,11117
|
11
11
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
13
13
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
@@ -46,7 +46,7 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
46
46
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
47
47
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
48
48
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
49
|
-
magic_pdf/libs/version.py,sha256=
|
49
|
+
magic_pdf/libs/version.py,sha256=JXLyhF5WmLgRZBfWGz9zWe2g5ISKSLpn2jp8yLaC-s4,22
|
50
50
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
51
51
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
52
52
|
magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -117,8 +117,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
117
117
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
118
118
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
119
119
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
124
|
-
magic_pdf-0.5.
|
120
|
+
magic_pdf-0.5.9.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
121
|
+
magic_pdf-0.5.9.dist-info/METADATA,sha256=6Y0tWpKEWrjYaNVrBWddqU9mn4EKR8cSbka47hUSmog,2971
|
122
|
+
magic_pdf-0.5.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
123
|
+
magic_pdf-0.5.9.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
124
|
+
magic_pdf-0.5.9.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
125
|
+
magic_pdf-0.5.9.dist-info/RECORD,,
|
@@ -1,28 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: magic-pdf
|
3
|
-
Version: 0.5.7
|
4
|
-
Requires-Python: >=3.9
|
5
|
-
License-File: LICENSE.md
|
6
|
-
Requires-Dist: boto3 >=1.28.43
|
7
|
-
Requires-Dist: Brotli >=1.1.0
|
8
|
-
Requires-Dist: click >=8.1.7
|
9
|
-
Requires-Dist: Distance >=0.1.3
|
10
|
-
Requires-Dist: PyMuPDF >=1.24.5
|
11
|
-
Requires-Dist: loguru >=0.6.0
|
12
|
-
Requires-Dist: matplotlib >=3.8.3
|
13
|
-
Requires-Dist: numpy >=1.21.6
|
14
|
-
Requires-Dist: pandas >=1.3.5
|
15
|
-
Requires-Dist: fast-langdetect >=0.1.1
|
16
|
-
Requires-Dist: regex >=2023.12.25
|
17
|
-
Requires-Dist: termcolor >=2.4.0
|
18
|
-
Requires-Dist: wordninja >=2.0.0
|
19
|
-
Requires-Dist: scikit-learn >=1.0.2
|
20
|
-
Requires-Dist: nltk ==3.8.1
|
21
|
-
Requires-Dist: s3pathlib >=2.1.1
|
22
|
-
Requires-Dist: paddleocr
|
23
|
-
Requires-Dist: pdfminer.six >=20231228
|
24
|
-
Provides-Extra: cpu
|
25
|
-
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
26
|
-
Provides-Extra: gpu
|
27
|
-
Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
28
|
-
|
File without changes
|
File without changes
|
File without changes
|