magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +20 -7
- magic_pdf/libs/config_reader.py +28 -10
- magic_pdf/libs/language.py +12 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
- magic_pdf/model/magic_model.py +49 -41
- magic_pdf/model/pdf_extract_kit.py +155 -60
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
- magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
- magic_pdf/model/pp_structure_v2.py +1 -1
- magic_pdf/pdf_parse_union_core.py +4 -2
- magic_pdf/pre_proc/citationmarker_remove.py +5 -1
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
- magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
- magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
- magic_pdf/resources/model_config/model_configs.yaml +4 -0
- magic_pdf/rw/AbsReaderWriter.py +1 -18
- magic_pdf/rw/DiskReaderWriter.py +32 -24
- magic_pdf/rw/S3ReaderWriter.py +83 -48
- magic_pdf/tools/cli.py +79 -0
- magic_pdf/tools/cli_dev.py +156 -0
- magic_pdf/tools/common.py +119 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
- magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
- magic_pdf/cli/magicpdf.py +0 -337
- magic_pdf/pdf_parse_for_train.py +0 -685
- magic_pdf/train_utils/convert_to_train_format.py +0 -65
- magic_pdf/train_utils/extract_caption.py +0 -59
- magic_pdf/train_utils/remove_footer_header.py +0 -159
- magic_pdf/train_utils/vis_utils.py +0 -327
- magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
- /magic_pdf/libs/{math.py → local_math.py} +0 -0
- /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
- /magic_pdf/{train_utils → tools}/__init__.py +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
import os
|
2
|
+
import json as json_parse
|
3
|
+
import copy
|
4
|
+
import click
|
5
|
+
from loguru import logger
|
6
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
7
|
+
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
|
8
|
+
from magic_pdf.pipe.UNIPipe import UNIPipe
|
9
|
+
from magic_pdf.pipe.OCRPipe import OCRPipe
|
10
|
+
from magic_pdf.pipe.TXTPipe import TXTPipe
|
11
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
12
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
|
+
import magic_pdf.model as model_config
|
14
|
+
|
15
|
+
|
16
|
+
def prepare_env(output_dir, pdf_file_name, method):
|
17
|
+
local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
|
18
|
+
|
19
|
+
local_image_dir = os.path.join(str(local_parent_dir), "images")
|
20
|
+
local_md_dir = local_parent_dir
|
21
|
+
os.makedirs(local_image_dir, exist_ok=True)
|
22
|
+
os.makedirs(local_md_dir, exist_ok=True)
|
23
|
+
return local_image_dir, local_md_dir
|
24
|
+
|
25
|
+
|
26
|
+
def do_parse(
|
27
|
+
output_dir,
|
28
|
+
pdf_file_name,
|
29
|
+
pdf_bytes,
|
30
|
+
model_list,
|
31
|
+
parse_method,
|
32
|
+
f_draw_span_bbox=True,
|
33
|
+
f_draw_layout_bbox=True,
|
34
|
+
f_dump_md=True,
|
35
|
+
f_dump_middle_json=True,
|
36
|
+
f_dump_model_json=True,
|
37
|
+
f_dump_orig_pdf=True,
|
38
|
+
f_dump_content_list=False,
|
39
|
+
f_make_md_mode=MakeMode.MM_MD,
|
40
|
+
):
|
41
|
+
orig_model_list = copy.deepcopy(model_list)
|
42
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
43
|
+
|
44
|
+
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
|
45
|
+
local_md_dir
|
46
|
+
)
|
47
|
+
image_dir = str(os.path.basename(local_image_dir))
|
48
|
+
|
49
|
+
if parse_method == "auto":
|
50
|
+
jso_useful_key = {"_pdf_type": "", "model_list": model_list}
|
51
|
+
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
|
52
|
+
elif parse_method == "txt":
|
53
|
+
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
|
54
|
+
elif parse_method == "ocr":
|
55
|
+
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
|
56
|
+
else:
|
57
|
+
logger.error("unknown parse method")
|
58
|
+
exit(1)
|
59
|
+
|
60
|
+
pipe.pipe_classify()
|
61
|
+
|
62
|
+
if len(model_list) == 0:
|
63
|
+
if model_config.__use_inside_model__:
|
64
|
+
pipe.pipe_analyze()
|
65
|
+
orig_model_list = copy.deepcopy(pipe.model_list)
|
66
|
+
else:
|
67
|
+
logger.error("need model list input")
|
68
|
+
exit(2)
|
69
|
+
|
70
|
+
pipe.pipe_parse()
|
71
|
+
pdf_info = pipe.pdf_mid_data["pdf_info"]
|
72
|
+
if f_draw_layout_bbox:
|
73
|
+
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
|
74
|
+
if f_draw_span_bbox:
|
75
|
+
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
|
76
|
+
|
77
|
+
md_content = pipe.pipe_mk_markdown(
|
78
|
+
image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
|
79
|
+
)
|
80
|
+
if f_dump_md:
|
81
|
+
md_writer.write(
|
82
|
+
content=md_content,
|
83
|
+
path=f"{pdf_file_name}.md",
|
84
|
+
mode=AbsReaderWriter.MODE_TXT,
|
85
|
+
)
|
86
|
+
|
87
|
+
if f_dump_middle_json:
|
88
|
+
md_writer.write(
|
89
|
+
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
|
90
|
+
path="middle.json",
|
91
|
+
mode=AbsReaderWriter.MODE_TXT,
|
92
|
+
)
|
93
|
+
|
94
|
+
if f_dump_model_json:
|
95
|
+
md_writer.write(
|
96
|
+
content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
|
97
|
+
path="model.json",
|
98
|
+
mode=AbsReaderWriter.MODE_TXT,
|
99
|
+
)
|
100
|
+
|
101
|
+
if f_dump_orig_pdf:
|
102
|
+
md_writer.write(
|
103
|
+
content=pdf_bytes,
|
104
|
+
path="origin.pdf",
|
105
|
+
mode=AbsReaderWriter.MODE_BIN,
|
106
|
+
)
|
107
|
+
|
108
|
+
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
109
|
+
if f_dump_content_list:
|
110
|
+
md_writer.write(
|
111
|
+
content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
|
112
|
+
path="content_list.json",
|
113
|
+
mode=AbsReaderWriter.MODE_TXT,
|
114
|
+
)
|
115
|
+
|
116
|
+
logger.info(f"local output dir is {local_md_dir}")
|
117
|
+
|
118
|
+
|
119
|
+
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
@@ -1,35 +1,44 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.6.1
|
3
|
+
Version: 0.7.0a1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE.md
|
9
|
-
Requires-Dist: boto3
|
10
|
-
Requires-Dist: Brotli
|
11
|
-
Requires-Dist: click
|
12
|
-
Requires-Dist: PyMuPDF
|
13
|
-
Requires-Dist: loguru
|
14
|
-
Requires-Dist: numpy
|
15
|
-
Requires-Dist: fast-langdetect
|
16
|
-
Requires-Dist: wordninja
|
17
|
-
Requires-Dist: scikit-learn
|
18
|
-
Requires-Dist: pdfminer.six
|
19
|
-
Provides-Extra:
|
20
|
-
Requires-Dist:
|
21
|
-
Requires-Dist:
|
22
|
-
|
23
|
-
Requires-Dist:
|
24
|
-
Requires-Dist:
|
25
|
-
Requires-Dist:
|
26
|
-
Requires-Dist:
|
27
|
-
Requires-Dist:
|
28
|
-
|
29
|
-
Requires-Dist:
|
30
|
-
|
31
|
-
|
32
|
-
|
9
|
+
Requires-Dist: boto3>=1.28.43
|
10
|
+
Requires-Dist: Brotli>=1.1.0
|
11
|
+
Requires-Dist: click>=8.1.7
|
12
|
+
Requires-Dist: PyMuPDF>=1.24.9
|
13
|
+
Requires-Dist: loguru>=0.6.0
|
14
|
+
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
+
Requires-Dist: fast-langdetect==0.2.0
|
16
|
+
Requires-Dist: wordninja>=2.0.0
|
17
|
+
Requires-Dist: scikit-learn>=1.0.2
|
18
|
+
Requires-Dist: pdfminer.six==20231228
|
19
|
+
Provides-Extra: full
|
20
|
+
Requires-Dist: unimernet==0.1.6; extra == "full"
|
21
|
+
Requires-Dist: ultralytics; extra == "full"
|
22
|
+
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
23
|
+
Requires-Dist: pypandoc; extra == "full"
|
24
|
+
Requires-Dist: struct-eqtable==0.1.0; extra == "full"
|
25
|
+
Requires-Dist: detectron2; extra == "full"
|
26
|
+
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
27
|
+
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
28
|
+
Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
|
29
|
+
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "full"
|
30
|
+
Provides-Extra: lite
|
31
|
+
Requires-Dist: paddleocr==2.7.3; extra == "lite"
|
32
|
+
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
|
33
|
+
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
|
34
|
+
|
35
|
+
<div id="top">
|
36
|
+
|
37
|
+
<p align="center">
|
38
|
+
<img src="docs/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
|
39
|
+
</p>
|
40
|
+
|
41
|
+
</div>
|
33
42
|
<div align="center">
|
34
43
|
|
35
44
|
[](https://github.com/opendatalab/MinerU)
|
@@ -40,15 +49,24 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
|
40
49
|
[](https://pepy.tech/project/magic-pdf)
|
41
50
|
[](https://pepy.tech/project/magic-pdf)
|
42
51
|
|
52
|
+
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 200px; height: 55px;"/></a>
|
43
53
|
|
44
54
|
|
45
55
|
|
46
|
-
|
56
|
+
|
57
|
+
[English](README.md) | [简体中文](README_zh-CN.md) | [日本語](README_ja-JP.md)
|
47
58
|
|
48
59
|
</div>
|
49
60
|
|
50
61
|
<div align="center">
|
51
|
-
|
62
|
+
<p align="center">
|
63
|
+
<a href="https://github.com/opendatalab/MinerU">MinerU: An end-to-end PDF parsing tool based on PDF-Extract-Kit, supporting conversion from PDF to Markdown.</a>🚀🚀🚀<br>
|
64
|
+
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: A Comprehensive Toolkit for High-Quality PDF Content Extraction</a>🔥🔥🔥
|
65
|
+
</p>
|
66
|
+
|
67
|
+
<p align="center">
|
68
|
+
👋 join us on <a href="https://discord.gg/gPxmVeGC" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
|
69
|
+
</p>
|
52
70
|
</div>
|
53
71
|
|
54
72
|
# MinerU
|
@@ -82,7 +100,7 @@ Key features include:
|
|
82
100
|
- Available for Windows, Linux, and macOS platforms
|
83
101
|
|
84
102
|
|
85
|
-
https://github.com/
|
103
|
+
https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
86
104
|
|
87
105
|
|
88
106
|
|
@@ -95,10 +113,9 @@ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3
|
|
95
113
|
|
96
114
|

|
97
115
|
|
98
|
-
###
|
116
|
+
### Dependency repositories
|
99
117
|
|
100
|
-
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
101
|
-
- A Comprehensive Toolkit for High-Quality PDF Content Extraction
|
118
|
+
- [PDF-Extract-Kit : A Comprehensive Toolkit for High-Quality PDF Content Extraction](https://github.com/opendatalab/PDF-Extract-Kit) 🚀🚀🚀
|
102
119
|
|
103
120
|
## Getting Started
|
104
121
|
|
@@ -113,24 +130,34 @@ conda create -n MinerU python=3.10
|
|
113
130
|
conda activate MinerU
|
114
131
|
```
|
115
132
|
|
116
|
-
###
|
133
|
+
### Installation and Configuration
|
117
134
|
|
118
135
|
#### 1. Install Magic-PDF
|
119
136
|
|
120
|
-
Install
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
Alternatively,
|
137
|
+
**1. Install dependencies**
|
138
|
+
|
139
|
+
The full-feature package depends on detectron2, which must be compiled during installation.
|
140
|
+
If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114
|
141
|
+
Alternatively, you can directly use our precompiled whl package (limited to Python 3.10):
|
142
|
+
|
125
143
|
```bash
|
126
|
-
pip install
|
144
|
+
pip install detectron2 --extra-index-url https://wheels.myhloli.com
|
127
145
|
```
|
128
|
-
|
129
|
-
|
130
|
-
|
146
|
+
|
147
|
+
**2. Install the full-feature package with pip**
|
148
|
+
>Note: The pip-installed package supports CPU-only and is ideal for quick tests.
|
149
|
+
>
|
150
|
+
>For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
|
151
|
+
|
131
152
|
```bash
|
132
|
-
pip install
|
153
|
+
pip install magic-pdf[full]==0.6.2b1
|
133
154
|
```
|
155
|
+
> ❗️❗️❗️
|
156
|
+
> We have pre-released the 0.6.2 beta version, addressing numerous issues mentioned in our logs. However, this build has not undergone full QA testing and does not represent the final release quality. Should you encounter any problems, please promptly report them to us via issues or revert to using version 0.6.1.
|
157
|
+
> ```bash
|
158
|
+
> pip install magic-pdf[full-cpu]==0.6.1
|
159
|
+
> ```
|
160
|
+
|
134
161
|
|
135
162
|
|
136
163
|
#### 2. Downloading model weights files
|
@@ -154,38 +181,21 @@ In magic-pdf.json, configure "models-dir" to point to the directory where the mo
|
|
154
181
|
```
|
155
182
|
|
156
183
|
|
157
|
-
#### 4.
|
158
|
-
|
159
|
-
###### simple
|
160
|
-
|
161
|
-
```bash
|
162
|
-
magic-pdf pdf-command --pdf "pdf_path" --inside_model true
|
163
|
-
```
|
164
|
-
After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
|
165
|
-
You can find the corresponding xxx_model.json file in the markdown directory.
|
166
|
-
If you intend to do secondary development on the post-processing pipeline, you can use the command:
|
167
|
-
```bash
|
168
|
-
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
169
|
-
```
|
170
|
-
In this way, you won't need to re-run the model data, making debugging more convenient.
|
171
|
-
|
172
|
-
|
173
|
-
###### more
|
174
|
-
|
175
|
-
```bash
|
176
|
-
magic-pdf --help
|
177
|
-
```
|
178
|
-
|
179
|
-
|
180
|
-
#### 5. Acceleration Using CUDA or MPS
|
181
|
-
|
184
|
+
#### 4. Acceleration Using CUDA or MPS
|
185
|
+
If you have an available Nvidia GPU or are using a Mac with Apple Silicon, you can leverage acceleration with CUDA or MPS respectively.
|
182
186
|
##### CUDA
|
183
187
|
|
184
188
|
You need to install the corresponding PyTorch version according to your CUDA version.
|
185
|
-
This example installs the CUDA 11.8 version.More information https://pytorch.org/get-started/locally/
|
189
|
+
This example installs the CUDA 11.8 version. More information: https://pytorch.org/get-started/locally/
|
186
190
|
```bash
|
187
191
|
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
|
188
192
|
```
|
193
|
+
> ❗️ Make sure to specify the versions
|
194
|
+
> ```bash
|
195
|
+
> torch==2.3.1 torchvision==0.18.1
|
196
|
+
> ```
|
197
|
+
> in the command, as these are the highest versions we support. Failing to specify the versions may result in automatically installing higher versions which can cause the program to fail.
|
198
|
+
|
189
199
|
Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json.
|
190
200
|
```json
|
191
201
|
{
|
@@ -203,13 +213,39 @@ You also need to modify the value of "device-mode" in the configuration file mag
|
|
203
213
|
}
|
204
214
|
```
|
205
215
|
|
206
|
-
|
216
|
+
|
217
|
+
### Usage
|
218
|
+
|
219
|
+
#### 1. Usage via Command Line
|
220
|
+
|
221
|
+
###### simple
|
222
|
+
|
223
|
+
```bash
|
224
|
+
magic-pdf pdf-command --pdf "pdf_path" --inside_model true
|
225
|
+
```
|
226
|
+
After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
|
227
|
+
You can find the corresponding xxx_model.json file in the markdown directory.
|
228
|
+
If you intend to do secondary development on the post-processing pipeline, you can use the command:
|
229
|
+
```bash
|
230
|
+
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
231
|
+
```
|
232
|
+
In this way, you won't need to re-run the model data, making debugging more convenient.
|
233
|
+
|
234
|
+
|
235
|
+
###### more
|
236
|
+
|
237
|
+
```bash
|
238
|
+
magic-pdf --help
|
239
|
+
```
|
240
|
+
|
241
|
+
|
242
|
+
#### 2. Usage via API
|
207
243
|
|
208
244
|
###### Local
|
209
245
|
```python
|
210
246
|
image_writer = DiskReaderWriter(local_image_dir)
|
211
247
|
image_dir = str(os.path.basename(local_image_dir))
|
212
|
-
jso_useful_key = {"_pdf_type": "", "model_list":
|
248
|
+
jso_useful_key = {"_pdf_type": "", "model_list": []}
|
213
249
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
|
214
250
|
pipe.pipe_classify()
|
215
251
|
pipe.pipe_parse()
|
@@ -222,7 +258,7 @@ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
|
|
222
258
|
image_dir = "s3://img_bucket/"
|
223
259
|
s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
|
224
260
|
pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
|
225
|
-
jso_useful_key = {"_pdf_type": "", "model_list":
|
261
|
+
jso_useful_key = {"_pdf_type": "", "model_list": []}
|
226
262
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
|
227
263
|
pipe.pipe_classify()
|
228
264
|
pipe.pipe_parse()
|
@@ -271,7 +307,7 @@ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d7
|
|
271
307
|
|
272
308
|
# All Thanks To Our Contributors
|
273
309
|
|
274
|
-
<a href="https://github.com/
|
310
|
+
<a href="https://github.com/opendatalab/MinerU/graphs/contributors">
|
275
311
|
<img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
|
276
312
|
</a>
|
277
313
|
|
@@ -294,6 +330,13 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
|
|
294
330
|
# Citation
|
295
331
|
|
296
332
|
```bibtex
|
333
|
+
@article{he2024opendatalab,
|
334
|
+
title={Opendatalab: Empowering general artificial intelligence with open datasets},
|
335
|
+
author={He, Conghui and Li, Wei and Jin, Zhenjiang and Xu, Chao and Wang, Bin and Lin, Dahua},
|
336
|
+
journal={arXiv preprint arXiv:2407.13773},
|
337
|
+
year={2024}
|
338
|
+
}
|
339
|
+
|
297
340
|
@misc{2024mineru,
|
298
341
|
title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
|
299
342
|
author={MinerU Contributors},
|
@@ -312,3 +355,8 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
|
|
312
355
|
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
|
313
356
|
</picture>
|
314
357
|
</a>
|
358
|
+
|
359
|
+
# Links
|
360
|
+
- [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
|
361
|
+
- [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
|
362
|
+
- [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
|
@@ -1,14 +1,11 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
|
4
|
-
magic_pdf/
|
5
|
-
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
4
|
+
magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zxFngs,10620
|
6
5
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
7
|
-
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
magic_pdf/cli/magicpdf.py,sha256=EcTiX-MaiDc4Fv9qZ_UdjHt5tYnBEu6vlbp0w030sA0,12691
|
9
6
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
7
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
8
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=mq6tACGkL383bdUla7xOkRXRTBBydRtdbgIHwkk_daM,16169
|
12
9
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
10
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
14
11
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -25,7 +22,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
22
|
magic_pdf/libs/boxbase.py,sha256=MvD0DypR4sTEF3T2RrI_yJ8mPDUBYHAqAaau2mnBSxY,15343
|
26
23
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
27
24
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
28
|
-
magic_pdf/libs/config_reader.py,sha256=
|
25
|
+
magic_pdf/libs/config_reader.py,sha256=dPx6JJJuCw9AzNgKtrTG1elmfdeN6gDhgFK9r15-NsE,2505
|
29
26
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
30
27
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
31
28
|
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
@@ -34,9 +31,9 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
|
|
34
31
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
35
32
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
36
33
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
37
|
-
magic_pdf/libs/language.py,sha256=
|
34
|
+
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
35
|
+
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
38
36
|
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
39
|
-
magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
40
37
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
41
38
|
magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
|
42
39
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
@@ -44,22 +41,22 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
44
41
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
42
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
43
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
44
|
+
magic_pdf/libs/version.py,sha256=iEqOsQ5JUsdTQDAPqryKypkJMTOXXBl71cd4Drh5pDs,24
|
48
45
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
|
-
magic_pdf/model/__init__.py,sha256=
|
50
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
51
|
-
magic_pdf/model/magic_model.py,sha256=
|
46
|
+
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
47
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=cSmh27RG1cvY0pd98T82rq0pANBwkYN0siZIN6oPNI8,4332
|
48
|
+
magic_pdf/model/magic_model.py,sha256=xwKV9BrdjOJecJSzbErT54N6qeJu0tvFuJg2S1z_2kU,25413
|
52
49
|
magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
|
53
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
54
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
50
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=I3pZBWQu8y5YVjURTUJnsIySjeSGK-Yefit9PiqN9VE,12952
|
51
|
+
magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
|
55
52
|
magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
53
|
magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
|
57
|
-
magic_pdf/model/pek_sub_modules/self_modify.py,sha256=
|
54
|
+
magic_pdf/model/pek_sub_modules/self_modify.py,sha256=wl7HPzfUyKrjsSRp_iMXlbM2q1brVk5TD9NbgyY6UWU,11397
|
58
55
|
magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
59
56
|
magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
60
57
|
magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
|
61
58
|
magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
|
62
|
-
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=
|
59
|
+
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
|
63
60
|
magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
|
64
61
|
magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
|
65
62
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
|
@@ -75,6 +72,8 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
|
|
75
72
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
76
73
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
77
74
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
75
|
+
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=BdrBZ_2B2jgF0vzn_ted8bE9Te-DC1Ea2UijqULNKjg,928
|
76
|
+
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
78
77
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
79
78
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
80
79
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -99,7 +98,7 @@ magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_
|
|
99
98
|
magic_pdf/post_proc/pdf_post_filter.py,sha256=FeZceyjGG_UvBrBoa51Ohge5edQzCoJtZTaocidKCHg,2530
|
100
99
|
magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
|
101
100
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
102
|
-
magic_pdf/pre_proc/citationmarker_remove.py,sha256=
|
101
|
+
magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
|
103
102
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
|
104
103
|
magic_pdf/pre_proc/cut_image.py,sha256=bbeELTg2-SFyHkVEnGAL_7S6k8hyy1xtDSoFmXDQDOA,2768
|
105
104
|
magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
|
@@ -114,10 +113,10 @@ magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLb
|
|
114
113
|
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
115
114
|
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
116
115
|
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
117
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=
|
116
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=aM_RZWg-2nPBQtqOVoMuvexmP5g1CuP94WxKxXuqnW4,6574
|
118
117
|
magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
|
119
118
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=lz1viJJkjZAklKZYmkWP39YBnv5BoWVr3fv6mmNr33E,12620
|
120
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=
|
119
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=jqBheXF8EuYCfS9tn6typr-aE57nfMoeBC36J5GjpbQ,11519
|
121
120
|
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=FIMwe8Lei9LI2RmkqiaSyTHV5b7ViADbpyBwgVwZH-c,2687
|
122
121
|
magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
123
122
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=u_ObNLkZ8pPDNBUkSMpA9ffiSpfz42B4807cdBPZmLU,3085
|
@@ -127,23 +126,23 @@ magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=0FlBXeiEwjZAGAWo-DiMptclFOj04POu
|
|
127
126
|
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6TOCW0TLXbPii_Q,7307
|
128
127
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
129
128
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
130
|
-
magic_pdf/resources/
|
129
|
+
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
130
|
+
magic_pdf/resources/model_config/model_configs.yaml,sha256=2MxCOJ5yNUupQqvrAvEuJKlygjxxV_o1qE64K_4NWKA,235
|
131
131
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
|
132
|
-
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=
|
133
|
-
magic_pdf/rw/AbsReaderWriter.py,sha256=
|
134
|
-
magic_pdf/rw/DiskReaderWriter.py,sha256=
|
135
|
-
magic_pdf/rw/S3ReaderWriter.py,sha256=
|
132
|
+
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
133
|
+
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
134
|
+
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
135
|
+
magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
|
136
136
|
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
137
137
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
138
138
|
magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,1131
|
139
|
-
magic_pdf/
|
140
|
-
magic_pdf/
|
141
|
-
magic_pdf/
|
142
|
-
magic_pdf/
|
143
|
-
magic_pdf/
|
144
|
-
magic_pdf-0.
|
145
|
-
magic_pdf-0.
|
146
|
-
magic_pdf-0.
|
147
|
-
magic_pdf-0.
|
148
|
-
magic_pdf-0.
|
149
|
-
magic_pdf-0.6.1.dist-info/RECORD,,
|
139
|
+
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
140
|
+
magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
|
141
|
+
magic_pdf/tools/cli_dev.py,sha256=uDc4fDxVuOIrkaKRdjNAqyh9htyLd-fYDEfJBNFUYao,4149
|
142
|
+
magic_pdf/tools/common.py,sha256=x4W-Tyo0A-TGsOjzlUGAhxiU2AisU3nBE3_2H_RLUO4,3801
|
143
|
+
magic_pdf-0.7.0a1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
144
|
+
magic_pdf-0.7.0a1.dist-info/METADATA,sha256=NBLsixinI-5iHwdweKr13SM5qg6Jf-fWCwg5ihavlpY,12455
|
145
|
+
magic_pdf-0.7.0a1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
146
|
+
magic_pdf-0.7.0a1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
147
|
+
magic_pdf-0.7.0a1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
148
|
+
magic_pdf-0.7.0a1.dist-info/RECORD,,
|