magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +10 -3
- magic_pdf/libs/config_reader.py +10 -10
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +8 -2
- magic_pdf/model/magic_model.py +4 -0
- magic_pdf/model/pdf_extract_kit.py +45 -2
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
- magic_pdf/resources/model_config/model_configs.yaml +4 -0
- magic_pdf/rw/AbsReaderWriter.py +1 -18
- magic_pdf/rw/DiskReaderWriter.py +32 -24
- magic_pdf/rw/S3ReaderWriter.py +83 -48
- magic_pdf/tools/cli.py +79 -0
- magic_pdf/tools/cli_dev.py +156 -0
- magic_pdf/tools/common.py +119 -0
- {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +49 -31
- {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +22 -24
- {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
- magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
- magic_pdf/cli/magicpdf.py +0 -359
- magic_pdf/pdf_parse_for_train.py +0 -685
- magic_pdf/train_utils/convert_to_train_format.py +0 -65
- magic_pdf/train_utils/extract_caption.py +0 -59
- magic_pdf/train_utils/remove_footer_header.py +0 -159
- magic_pdf/train_utils/vis_utils.py +0 -327
- magic_pdf-0.6.2b1.dist-info/entry_points.txt +0 -2
- /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
- /magic_pdf/{train_utils → tools}/__init__.py +0 -0
- {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
import os
|
2
|
+
import json as json_parse
|
3
|
+
import click
|
4
|
+
from pathlib import Path
|
5
|
+
from magic_pdf.libs.path_utils import (
|
6
|
+
parse_s3path,
|
7
|
+
parse_s3_range_params,
|
8
|
+
remove_non_official_s3_args,
|
9
|
+
)
|
10
|
+
from magic_pdf.libs.config_reader import (
|
11
|
+
get_s3_config,
|
12
|
+
)
|
13
|
+
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
14
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
15
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
16
|
+
import magic_pdf.model as model_config
|
17
|
+
from magic_pdf.tools.common import parse_pdf_methods, do_parse
|
18
|
+
from magic_pdf.libs.version import __version__
|
19
|
+
|
20
|
+
|
21
|
+
def read_s3_path(s3path):
    """Fetch the bytes addressed by an S3 path, honouring optional range params.

    The bucket is extracted from ``s3path`` and used to look up the
    credentials/endpoint via ``get_s3_config``. When ``parse_s3_range_params``
    yields exactly two values they are converted to ints and turned into an
    inclusive ``(byte_start, byte_end)`` pair, where
    ``byte_end = first + second - 1``; otherwise the read starts at 0 with no
    end bound.

    Args:
        s3path: S3 path string, possibly carrying extra (non-official) range
            arguments that the path helpers know how to parse and strip.

    Returns:
        Whatever ``S3ReaderWriter.read_jsonl`` returns in binary mode
        (callers in this module call ``.decode("utf-8")`` on it or pass it on
        as PDF bytes).
    """
    bucket, _key = parse_s3path(s3path)
    access_key, secret_key, endpoint = get_s3_config(bucket)

    reader = S3ReaderWriter(
        access_key,
        secret_key,
        endpoint,
        "auto",
        remove_non_official_s3_args(s3path),
    )

    range_params = parse_s3_range_params(s3path)
    if range_params is not None and len(range_params) == 2:
        byte_start = int(range_params[0])
        # The second value is added to the start (minus one) to obtain the
        # inclusive end position.
        byte_end = int(range_params[1]) + byte_start - 1
    else:
        # No usable range information: read from the beginning, unbounded.
        byte_start = 0
        byte_end = None

    clean_path = remove_non_official_s3_args(s3path)
    return s3_rw_read(reader, clean_path, byte_start, byte_end)


def s3_rw_read(reader, path, byte_start, byte_end):
    """Read ``path`` through ``reader.read_jsonl`` in binary mode."""
    return reader.read_jsonl(path, byte_start, byte_end, AbsReaderWriter.MODE_BIN)
|
40
|
+
|
41
|
+
|
42
|
+
# Root command group of this developer CLI; sub-commands register themselves
# with ``@cli.command()``.  Intentionally documented with comments instead of
# a docstring: click would display a docstring as the group's help text.
@click.group()
@click.version_option(
    __version__,
    "--version",
    "-v",
    help="显示版本信息",
)
def cli():
    return None
|
46
|
+
|
47
|
+
|
48
|
+
@cli.command()
@click.option(
    "-j",
    "--jsonl",
    "jsonl",
    type=str,
    help="输入 jsonl 路径,本地或者 s3 上的文件",
    required=True,
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=str,
    help="输出到本地目录",
    default="",
)
def jsonl(jsonl, method, output_dir):
    # Documented with comments rather than a docstring on purpose: click would
    # expose a docstring as this command's --help text.
    #
    # Purpose: parse the PDF referenced by a jsonl record.
    #   jsonl      -- path of the record file; "s3://" paths are fetched via
    #                 read_s3_path, anything else is opened as a local file and
    #                 only its first line is parsed.
    #   method     -- parse method forwarded to do_parse ("ocr"/"txt"/"auto").
    #   output_dir -- local output directory; "" means "<dir of jsonl>/output".
    # Raises click.ClickException when the record names no PDF location.

    # This command relies on the layout result stored in the record, so the
    # built-in model is switched off.
    model_config.__use_inside_model__ = False

    full_jsonl_path = os.path.realpath(jsonl)
    if output_dir == "":
        # NOTE(review): for an "s3://" input this resolves relative to the
        # current working directory -- confirm that is the intended default.
        output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")

    if jsonl.startswith("s3://"):
        jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
    else:
        # Decode as UTF-8 explicitly so local files are read the same way as
        # the S3 branch instead of depending on the platform locale.
        with open(jsonl, encoding="utf-8") as f:
            jso = json_parse.loads(f.readline())

    # The PDF location may be stored under either key; "file_location" wins.
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a readable CLI error instead of a TypeError from Path(None).
        raise click.ClickException(
            'jsonl record contains neither a "file_location" nor a "path" field'
        )

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        f_dump_content_list=True,
    )
|
101
|
+
|
102
|
+
|
103
|
+
@cli.command()
@click.option(
    "-p",
    "--pdf",
    "pdf",
    type=click.Path(exists=True),
    required=True,
    help="本地 PDF 文件",
)
@click.option(
    "-j",
    "--json",
    "json_data",
    type=click.Path(exists=True),
    required=True,
    help="本地模型推理出的 json 数据",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=str,
    help="本地输出目录",
    default="",
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf(pdf, json_data, output_dir, method):
    # Comments are used instead of a docstring because click would show a
    # docstring as the command's --help text.
    #
    # Parses a local PDF using a local JSON file of model output; results go
    # to output_dir, or to "<dir of the pdf>/output" when output_dir is "".

    # The model output is supplied by the caller, so the built-in model is
    # switched off.
    model_config.__use_inside_model__ = False

    def load_bytes(file_path):
        # Read a file in binary mode through a DiskReaderWriter rooted at the
        # file's own directory.
        parent, leaf = os.path.dirname(file_path), os.path.basename(file_path)
        return DiskReaderWriter(parent).read(leaf, AbsReaderWriter.MODE_BIN)

    resolved_pdf = os.path.realpath(pdf)
    if not output_dir:
        output_dir = os.path.join(os.path.dirname(resolved_pdf), "output")

    raw_json = load_bytes(json_data)
    model_json_list = json_parse.loads(raw_json.decode("utf-8"))

    stem = str(Path(resolved_pdf).stem)
    pdf_bytes = load_bytes(resolved_pdf)

    do_parse(
        output_dir,
        stem,
        pdf_bytes,
        model_json_list,
        method,
        f_dump_content_list=True,
    )
|
153
|
+
|
154
|
+
|
155
|
+
if __name__ == "__main__":
|
156
|
+
cli()
|
@@ -0,0 +1,119 @@
|
|
1
|
+
import os
|
2
|
+
import json as json_parse
|
3
|
+
import copy
|
4
|
+
import click
|
5
|
+
from loguru import logger
|
6
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
7
|
+
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
|
8
|
+
from magic_pdf.pipe.UNIPipe import UNIPipe
|
9
|
+
from magic_pdf.pipe.OCRPipe import OCRPipe
|
10
|
+
from magic_pdf.pipe.TXTPipe import TXTPipe
|
11
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
12
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
|
+
import magic_pdf.model as model_config
|
14
|
+
|
15
|
+
|
16
|
+
def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one parse run and return their paths.

    The markdown directory is ``<output_dir>/<pdf_file_name>/<method>`` and
    the image directory is its ``images`` sub-directory. Both are created if
    missing; already existing directories are left untouched.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Name of the PDF (used as a directory name).
        method: Parse method (used as a directory name).

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of paths.
    """
    md_dir = os.path.join(output_dir, pdf_file_name, method)
    image_dir = os.path.join(str(md_dir), "images")

    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)

    return image_dir, md_dir
|
24
|
+
|
25
|
+
|
26
|
+
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=False,
    f_make_md_mode=MakeMode.MM_MD,
):
    """Run the parsing pipeline on one PDF and write the results to disk.

    All outputs are written below the directory returned by ``prepare_env``
    (``<output_dir>/<pdf_file_name>/<parse_method>``), images into its
    ``images`` sub-directory.

    Args:
        output_dir: Root directory for the outputs.
        pdf_file_name: Base name used for the output directory and the
            ``<pdf_file_name>.md`` file.
        pdf_bytes: Raw bytes of the PDF.
        model_list: Model output for the PDF; may be empty only when
            ``magic_pdf.model.__use_inside_model__`` is enabled.
        parse_method: ``"auto"``, ``"txt"`` or ``"ocr"``.
        f_draw_span_bbox: Call ``draw_span_bbox`` for the parsed pages.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` for the parsed pages.
        f_dump_md: Write the generated markdown.
        f_dump_middle_json: Write ``middle.json`` (the pipe's ``pdf_mid_data``).
        f_dump_model_json: Write ``model.json`` (the model list).
        f_dump_orig_pdf: Write the input bytes as ``origin.pdf``.
        f_dump_content_list: Write ``content_list.json``.
        f_make_md_mode: Markdown make mode passed to ``pipe_mk_markdown``.

    Raises:
        SystemExit: code 1 for an unknown ``parse_method``; code 2 when
            ``model_list`` is empty and the built-in model is disabled.
    """
    # Copy taken before the pipe is built; this copy is what ends up in
    # model.json (presumably to shield it from in-place changes by the pipe).
    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)

    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    # Only the directory name is handed to the markdown/content builders.
    image_dir = str(os.path.basename(local_image_dir))

    def write_json(obj, file_name):
        # Serialise ``obj`` as indented, non-ASCII-escaped JSON next to the markdown.
        md_writer.write(
            content=json_parse.dumps(obj, ensure_ascii=False, indent=4),
            path=file_name,
            mode=AbsReaderWriter.MODE_TXT,
        )

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        # ``exit()`` is injected by the ``site`` module and is not guaranteed
        # to exist (e.g. ``python -S``); raise SystemExit directly instead.
        raise SystemExit(1)

    pipe.pipe_classify()

    if not model_list:
        if model_config.__use_inside_model__:
            # No model output supplied: let the pipe run its own analysis and
            # keep a copy of what it produced for model.json.
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error("need model list input")
            raise SystemExit(2)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(
        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
    )
    if f_dump_md:
        md_writer.write(
            content=md_content,
            path=f"{pdf_file_name}.md",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        write_json(pipe.pdf_mid_data, "middle.json")

    if f_dump_model_json:
        write_json(orig_model_list, "model.json")

    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path="origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    # Built unconditionally, exactly as before; only the dump is optional.
    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        write_json(content_list, "content_list.json")

    logger.info(f"local output dir is {local_md_dir}")
|
117
|
+
|
118
|
+
|
119
|
+
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
@@ -1,33 +1,36 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.6.2b1
|
3
|
+
Version: 0.7.0a1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE.md
|
9
|
-
Requires-Dist: boto3
|
10
|
-
Requires-Dist: Brotli
|
11
|
-
Requires-Dist: click
|
12
|
-
Requires-Dist: PyMuPDF
|
13
|
-
Requires-Dist: loguru
|
14
|
-
Requires-Dist: numpy
|
15
|
-
Requires-Dist: fast-langdetect
|
16
|
-
Requires-Dist: wordninja
|
17
|
-
Requires-Dist: scikit-learn
|
18
|
-
Requires-Dist: pdfminer.six
|
9
|
+
Requires-Dist: boto3>=1.28.43
|
10
|
+
Requires-Dist: Brotli>=1.1.0
|
11
|
+
Requires-Dist: click>=8.1.7
|
12
|
+
Requires-Dist: PyMuPDF>=1.24.9
|
13
|
+
Requires-Dist: loguru>=0.6.0
|
14
|
+
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
+
Requires-Dist: fast-langdetect==0.2.0
|
16
|
+
Requires-Dist: wordninja>=2.0.0
|
17
|
+
Requires-Dist: scikit-learn>=1.0.2
|
18
|
+
Requires-Dist: pdfminer.six==20231228
|
19
19
|
Provides-Extra: full
|
20
|
-
Requires-Dist: unimernet
|
21
|
-
Requires-Dist:
|
22
|
-
Requires-Dist:
|
23
|
-
Requires-Dist:
|
24
|
-
Requires-Dist:
|
25
|
-
Requires-Dist:
|
26
|
-
Requires-Dist: paddlepaddle
|
20
|
+
Requires-Dist: unimernet==0.1.6; extra == "full"
|
21
|
+
Requires-Dist: ultralytics; extra == "full"
|
22
|
+
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
23
|
+
Requires-Dist: pypandoc; extra == "full"
|
24
|
+
Requires-Dist: struct-eqtable==0.1.0; extra == "full"
|
25
|
+
Requires-Dist: detectron2; extra == "full"
|
26
|
+
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
27
|
+
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
28
|
+
Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
|
29
|
+
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "full"
|
27
30
|
Provides-Extra: lite
|
28
|
-
Requires-Dist: paddleocr
|
29
|
-
Requires-Dist: paddlepaddle
|
30
|
-
Requires-Dist: paddlepaddle
|
31
|
+
Requires-Dist: paddleocr==2.7.3; extra == "lite"
|
32
|
+
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
|
33
|
+
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
|
31
34
|
|
32
35
|
<div id="top">
|
33
36
|
|
@@ -62,7 +65,7 @@ Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_
|
|
62
65
|
</p>
|
63
66
|
|
64
67
|
<p align="center">
|
65
|
-
👋 join us on <a href="https://discord.gg/
|
68
|
+
👋 join us on <a href="https://discord.gg/gPxmVeGC" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
|
66
69
|
</p>
|
67
70
|
</div>
|
68
71
|
|
@@ -131,21 +134,30 @@ conda activate MinerU
|
|
131
134
|
|
132
135
|
#### 1. Install Magic-PDF
|
133
136
|
|
134
|
-
Install
|
135
|
-
>Note: The pip-installed package supports CPU-only and is ideal for quick tests.
|
136
|
-
>
|
137
|
-
>For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
|
137
|
+
**1. Install dependencies**
|
138
138
|
|
139
|
-
```bash
|
140
|
-
pip install magic-pdf[full-cpu]
|
141
|
-
```
|
142
139
|
The full-feature package depends on detectron2, which requires a compilation installation.
|
143
140
|
If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114
|
144
141
|
Alternatively, you can directly use our precompiled whl package (limited to Python 3.10):
|
145
142
|
|
146
143
|
```bash
|
147
|
-
pip install detectron2 --extra-index-url https://myhloli.
|
144
|
+
pip install detectron2 --extra-index-url https://wheels.myhloli.com
|
145
|
+
```
|
146
|
+
|
147
|
+
**2. Install the full-feature package with pip**
|
148
|
+
>Note: The pip-installed package supports CPU-only and is ideal for quick tests.
|
149
|
+
>
|
150
|
+
>For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
|
151
|
+
|
152
|
+
```bash
|
153
|
+
pip install magic-pdf[full]==0.6.2b1
|
148
154
|
```
|
155
|
+
> ❗️❗️❗️
|
156
|
+
> We have pre-released the 0.6.2 beta version, addressing numerous issues mentioned in our logs. However, this build has not undergone full QA testing and does not represent the final release quality. Should you encounter any problems, please promptly report them to us via issues or revert to using version 0.6.1.
|
157
|
+
> ```bash
|
158
|
+
> pip install magic-pdf[full-cpu]==0.6.1
|
159
|
+
> ```
|
160
|
+
|
149
161
|
|
150
162
|
|
151
163
|
#### 2. Downloading model weights files
|
@@ -174,10 +186,16 @@ If you have an available Nvidia GPU or are using a Mac with Apple Silicon, you c
|
|
174
186
|
##### CUDA
|
175
187
|
|
176
188
|
You need to install the corresponding PyTorch version according to your CUDA version.
|
177
|
-
This example installs the CUDA 11.8 version.More information https://pytorch.org/get-started/locally/
|
189
|
+
This example installs the CUDA 11.8 version. More information: https://pytorch.org/get-started/locally/
|
178
190
|
```bash
|
179
191
|
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
|
180
192
|
```
|
193
|
+
> ❗️ Make sure to specify the versions
|
194
|
+
> ```bash
|
195
|
+
> torch==2.3.1 torchvision==0.18.1
|
196
|
+
> ```
|
197
|
+
> in the command, as these are the highest versions we support. Failing to specify the versions may result in automatically installing higher versions which can cause the program to fail.
|
198
|
+
|
181
199
|
Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json.
|
182
200
|
```json
|
183
201
|
{
|
@@ -1,14 +1,11 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
|
4
|
-
magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
|
5
4
|
magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zxFngs,10620
|
6
5
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
7
|
-
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
magic_pdf/cli/magicpdf.py,sha256=jzGSxLcx_tmjFiZfIKpomfU9rfvAwZDyMm4bmNGQmJw,13541
|
9
6
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
7
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
8
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=mq6tACGkL383bdUla7xOkRXRTBBydRtdbgIHwkk_daM,16169
|
12
9
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
10
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
14
11
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -25,7 +22,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
22
|
magic_pdf/libs/boxbase.py,sha256=MvD0DypR4sTEF3T2RrI_yJ8mPDUBYHAqAaau2mnBSxY,15343
|
26
23
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
27
24
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
28
|
-
magic_pdf/libs/config_reader.py,sha256=
|
25
|
+
magic_pdf/libs/config_reader.py,sha256=dPx6JJJuCw9AzNgKtrTG1elmfdeN6gDhgFK9r15-NsE,2505
|
29
26
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
30
27
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
31
28
|
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
@@ -44,13 +41,13 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
44
41
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
42
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
43
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
44
|
+
magic_pdf/libs/version.py,sha256=iEqOsQ5JUsdTQDAPqryKypkJMTOXXBl71cd4Drh5pDs,24
|
48
45
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
46
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
50
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
51
|
-
magic_pdf/model/magic_model.py,sha256=
|
47
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=cSmh27RG1cvY0pd98T82rq0pANBwkYN0siZIN6oPNI8,4332
|
48
|
+
magic_pdf/model/magic_model.py,sha256=xwKV9BrdjOJecJSzbErT54N6qeJu0tvFuJg2S1z_2kU,25413
|
52
49
|
magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
|
53
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
50
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=I3pZBWQu8y5YVjURTUJnsIySjeSGK-Yefit9PiqN9VE,12952
|
54
51
|
magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
|
55
52
|
magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
53
|
magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
|
@@ -75,6 +72,8 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
|
|
75
72
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
76
73
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
77
74
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
75
|
+
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=BdrBZ_2B2jgF0vzn_ted8bE9Te-DC1Ea2UijqULNKjg,928
|
76
|
+
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
78
77
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
79
78
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
80
79
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -128,23 +127,22 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
|
|
128
127
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
129
128
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
130
129
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
131
|
-
magic_pdf/resources/model_config/model_configs.yaml,sha256=
|
130
|
+
magic_pdf/resources/model_config/model_configs.yaml,sha256=2MxCOJ5yNUupQqvrAvEuJKlygjxxV_o1qE64K_4NWKA,235
|
132
131
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
|
133
132
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
134
|
-
magic_pdf/rw/AbsReaderWriter.py,sha256=
|
135
|
-
magic_pdf/rw/DiskReaderWriter.py,sha256=
|
136
|
-
magic_pdf/rw/S3ReaderWriter.py,sha256=
|
133
|
+
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
134
|
+
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
135
|
+
magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
|
137
136
|
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
138
137
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
139
138
|
magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,1131
|
140
|
-
magic_pdf/
|
141
|
-
magic_pdf/
|
142
|
-
magic_pdf/
|
143
|
-
magic_pdf/
|
144
|
-
magic_pdf/
|
145
|
-
magic_pdf-0.
|
146
|
-
magic_pdf-0.
|
147
|
-
magic_pdf-0.
|
148
|
-
magic_pdf-0.
|
149
|
-
magic_pdf-0.
|
150
|
-
magic_pdf-0.6.2b1.dist-info/RECORD,,
|
139
|
+
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
140
|
+
magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
|
141
|
+
magic_pdf/tools/cli_dev.py,sha256=uDc4fDxVuOIrkaKRdjNAqyh9htyLd-fYDEfJBNFUYao,4149
|
142
|
+
magic_pdf/tools/common.py,sha256=x4W-Tyo0A-TGsOjzlUGAhxiU2AisU3nBE3_2H_RLUO4,3801
|
143
|
+
magic_pdf-0.7.0a1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
144
|
+
magic_pdf-0.7.0a1.dist-info/METADATA,sha256=NBLsixinI-5iHwdweKr13SM5qg6Jf-fWCwg5ihavlpY,12455
|
145
|
+
magic_pdf-0.7.0a1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
146
|
+
magic_pdf-0.7.0a1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
147
|
+
magic_pdf-0.7.0a1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
148
|
+
magic_pdf-0.7.0a1.dist-info/RECORD,,
|