magic-pdf 0.5.10__py3-none-any.whl → 0.5.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +35 -22
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +9 -1
- magic_pdf/model/pp_structure_v2.py +6 -1
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.11.dist-info}/METADATA +21 -2
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.11.dist-info}/RECORD +11 -11
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.11.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.11.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.11.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.11.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py
CHANGED
@@ -23,7 +23,6 @@ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home
|
|
23
23
|
|
24
24
|
import os
|
25
25
|
import json as json_parse
|
26
|
-
import sys
|
27
26
|
import click
|
28
27
|
from loguru import logger
|
29
28
|
from pathlib import Path
|
@@ -46,6 +45,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
|
46
45
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
47
46
|
import csv
|
48
47
|
import copy
|
48
|
+
import magic_pdf.model as model_config
|
49
49
|
|
50
50
|
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
51
51
|
|
@@ -66,21 +66,21 @@ def write_to_csv(csv_file_path, csv_data):
|
|
66
66
|
csv_writer = csv.writer(csvfile)
|
67
67
|
# 写入数据
|
68
68
|
csv_writer.writerow(csv_data)
|
69
|
-
|
69
|
+
logger.info(f"数据已成功追加到 '{csv_file_path}'")
|
70
70
|
|
71
71
|
|
72
72
|
def do_parse(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
73
|
+
pdf_file_name,
|
74
|
+
pdf_bytes,
|
75
|
+
model_list,
|
76
|
+
parse_method,
|
77
|
+
f_draw_span_bbox=True,
|
78
|
+
f_draw_layout_bbox=True,
|
79
|
+
f_dump_md=True,
|
80
|
+
f_dump_middle_json=True,
|
81
|
+
f_dump_model_json=True,
|
82
|
+
f_dump_orig_pdf=True,
|
83
|
+
f_dump_content_list=True,
|
84
84
|
):
|
85
85
|
orig_model_list = copy.deepcopy(model_list)
|
86
86
|
|
@@ -96,14 +96,18 @@ def do_parse(
|
|
96
96
|
elif parse_method == "ocr":
|
97
97
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
|
98
98
|
else:
|
99
|
-
|
100
|
-
|
99
|
+
logger.error("unknown parse method")
|
100
|
+
exit(1)
|
101
101
|
|
102
102
|
pipe.pipe_classify()
|
103
103
|
|
104
|
-
"""如果没有传入有效的模型数据,则使用内置
|
104
|
+
"""如果没有传入有效的模型数据,则使用内置model解析"""
|
105
105
|
if len(model_list) == 0:
|
106
|
-
|
106
|
+
if model_config.__use_inside_model__:
|
107
|
+
pipe.pipe_analyze()
|
108
|
+
else:
|
109
|
+
logger.error("need model list input")
|
110
|
+
exit(1)
|
107
111
|
|
108
112
|
pipe.pipe_parse()
|
109
113
|
pdf_info = pipe.pdf_mid_data["pdf_info"]
|
@@ -170,10 +174,13 @@ def cli():
|
|
170
174
|
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
171
175
|
default="auto",
|
172
176
|
)
|
173
|
-
|
177
|
+
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
|
178
|
+
def json_command(json, method, inside_model):
|
179
|
+
model_config.__use_inside_model__ = inside_model
|
180
|
+
|
174
181
|
if not json.startswith("s3://"):
|
175
|
-
|
176
|
-
|
182
|
+
logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
|
183
|
+
exit(1)
|
177
184
|
|
178
185
|
def read_s3_path(s3path):
|
179
186
|
bucket, key = parse_s3path(s3path)
|
@@ -218,7 +225,10 @@ def json_command(json, method):
|
|
218
225
|
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
219
226
|
default="auto",
|
220
227
|
)
|
221
|
-
|
228
|
+
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
|
229
|
+
def local_json_command(local_json, method, inside_model):
|
230
|
+
model_config.__use_inside_model__ = inside_model
|
231
|
+
|
222
232
|
def read_s3_path(s3path):
|
223
233
|
bucket, key = parse_s3path(s3path)
|
224
234
|
|
@@ -267,7 +277,10 @@ def local_json_command(local_json, method):
|
|
267
277
|
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
268
278
|
default="auto",
|
269
279
|
)
|
270
|
-
|
280
|
+
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
|
281
|
+
def pdf_command(pdf, model, method, inside_model):
|
282
|
+
model_config.__use_inside_model__ = inside_model
|
283
|
+
|
271
284
|
def read_fn(path):
|
272
285
|
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
273
286
|
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.10"
|
1
|
+
__version__ = "0.5.11"
|
magic_pdf/model/__init__.py
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
__use_inside_model__ = False
|
@@ -2,9 +2,10 @@ import fitz
|
|
2
2
|
import cv2
|
3
3
|
from PIL import Image
|
4
4
|
import numpy as np
|
5
|
+
from loguru import logger
|
5
6
|
|
6
7
|
from magic_pdf.model.model_list import MODEL
|
7
|
-
|
8
|
+
import magic_pdf.model as model_config
|
8
9
|
|
9
10
|
|
10
11
|
def dict_compare(d1, d2):
|
@@ -41,6 +42,13 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
|
41
42
|
|
42
43
|
|
43
44
|
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
|
45
|
+
|
46
|
+
if model_config.__use_inside_model__:
|
47
|
+
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
|
48
|
+
else:
|
49
|
+
logger.error("use_inside_model is False, not allow to use inside model")
|
50
|
+
exit(1)
|
51
|
+
|
44
52
|
images = load_images_from_pdf(pdf_bytes)
|
45
53
|
custom_model = None
|
46
54
|
if model == MODEL.Paddle:
|
@@ -1,7 +1,12 @@
|
|
1
1
|
import random
|
2
2
|
|
3
3
|
from loguru import logger
|
4
|
-
|
4
|
+
|
5
|
+
try:
|
6
|
+
from paddleocr import PPStructure
|
7
|
+
except ImportError:
|
8
|
+
logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"')
|
9
|
+
exit(1)
|
5
10
|
|
6
11
|
|
7
12
|
def region_to_bbox(region):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.5.10
|
3
|
+
Version: 0.5.11
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/magicpdf/Magic-PDF
|
6
6
|
Requires-Python: >=3.9
|
@@ -22,11 +22,26 @@ Requires-Dist: wordninja >=2.0.0
|
|
22
22
|
Requires-Dist: scikit-learn >=1.0.2
|
23
23
|
Requires-Dist: nltk ==3.8.1
|
24
24
|
Requires-Dist: s3pathlib >=2.1.1
|
25
|
-
Requires-Dist: paddleocr
|
26
25
|
Requires-Dist: pdfminer.six >=20231228
|
26
|
+
Requires-Dist: Levenshtein
|
27
|
+
Requires-Dist: nltk
|
28
|
+
Requires-Dist: rapidfuzz
|
29
|
+
Requires-Dist: statistics
|
30
|
+
Requires-Dist: openxlab
|
31
|
+
Requires-Dist: pandas
|
32
|
+
Requires-Dist: numpy
|
33
|
+
Requires-Dist: matplotlib
|
34
|
+
Requires-Dist: seaborn
|
35
|
+
Requires-Dist: scipy
|
36
|
+
Requires-Dist: scikit-learn
|
37
|
+
Requires-Dist: tqdm
|
38
|
+
Requires-Dist: htmltabletomd
|
39
|
+
Requires-Dist: pypandoc
|
27
40
|
Provides-Extra: cpu
|
41
|
+
Requires-Dist: paddleocr ; extra == 'cpu'
|
28
42
|
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
29
43
|
Provides-Extra: gpu
|
44
|
+
Requires-Dist: paddleocr ; extra == 'gpu'
|
30
45
|
Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
31
46
|
|
32
47
|
<div id="top"></div>
|
@@ -64,6 +79,10 @@ Key features include:
|
|
64
79
|
- Compatibility with CPU and GPU environments
|
65
80
|
- Available for Windows, Linux, and macOS platforms
|
66
81
|
|
82
|
+
## Project Panorama
|
83
|
+
|
84
|
+

|
85
|
+
|
67
86
|
## Getting Started
|
68
87
|
|
69
88
|
### Requirements
|
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
|
|
5
5
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
6
6
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
7
7
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
8
|
+
magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
|
9
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
11
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
@@ -44,14 +44,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
44
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
|
48
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
49
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
50
|
-
magic_pdf/model/__init__.py,sha256=
|
51
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
50
|
+
magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
|
51
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
|
52
52
|
magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
|
53
53
|
magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
|
54
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
54
|
+
magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
|
55
55
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
57
57
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
115
115
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
116
116
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
117
117
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
118
|
-
magic_pdf-0.5.
|
119
|
-
magic_pdf-0.5.
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
118
|
+
magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
119
|
+
magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
|
120
|
+
magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
121
|
+
magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
122
|
+
magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
123
|
+
magic_pdf-0.5.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|