magic-pdf 0.7.0a1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,12 +30,10 @@ def read_s3_path(s3path):
30
30
  byte_start, byte_end = 0, None
31
31
  else:
32
32
  byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
33
- byte_end += byte_start - 1
34
- return s3_rw.read_jsonl(
33
+ return s3_rw.read_offset(
35
34
  remove_non_official_s3_args(s3path),
36
35
  byte_start,
37
36
  byte_end,
38
- AbsReaderWriter.MODE_BIN,
39
37
  )
40
38
 
41
39
 
@@ -71,24 +69,23 @@ def cli():
71
69
  default="",
72
70
  )
73
71
  def jsonl(jsonl, method, output_dir):
74
- print("haha")
75
72
  model_config.__use_inside_model__ = False
76
- full_jsonl_path = os.path.realpath(jsonl)
77
- if output_dir == "":
78
- output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
79
-
80
73
  if jsonl.startswith("s3://"):
81
74
  jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
75
+ full_jsonl_path = "."
82
76
  else:
77
+ full_jsonl_path = os.path.realpath(jsonl)
83
78
  with open(jsonl) as f:
84
79
  jso = json_parse.loads(f.readline())
80
+
81
+ if output_dir == "":
82
+ output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")
85
83
  s3_file_path = jso.get("file_location")
86
84
  if s3_file_path is None:
87
85
  s3_file_path = jso.get("path")
88
86
  pdf_file_name = Path(s3_file_path).stem
89
87
  pdf_data = read_s3_path(s3_file_path)
90
88
 
91
-
92
89
  print(pdf_file_name, jso, method)
93
90
  do_parse(
94
91
  output_dir,
@@ -97,6 +94,7 @@ def jsonl(jsonl, method, output_dir):
97
94
  jso["doc_layout_result"],
98
95
  method,
99
96
  f_dump_content_list=True,
97
+ f_draw_model_bbox=True,
100
98
  )
101
99
 
102
100
 
@@ -149,6 +147,7 @@ def pdf(pdf, json_data, output_dir, method):
149
147
  model_json_list,
150
148
  method,
151
149
  f_dump_content_list=True,
150
+ f_draw_model_bbox=True,
152
151
  )
153
152
 
154
153
 
magic_pdf/tools/common.py CHANGED
@@ -4,7 +4,7 @@ import copy
4
4
  import click
5
5
  from loguru import logger
6
6
  from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
7
- from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
7
+ from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
8
8
  from magic_pdf.pipe.UNIPipe import UNIPipe
9
9
  from magic_pdf.pipe.OCRPipe import OCRPipe
10
10
  from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -37,6 +37,7 @@ def do_parse(
37
37
  f_dump_orig_pdf=True,
38
38
  f_dump_content_list=False,
39
39
  f_make_md_mode=MakeMode.MM_MD,
40
+ f_draw_model_bbox=False,
40
41
  ):
41
42
  orig_model_list = copy.deepcopy(model_list)
42
43
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
@@ -73,6 +74,8 @@ def do_parse(
73
74
  draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
74
75
  if f_draw_span_bbox:
75
76
  draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
77
+ if f_draw_model_bbox:
78
+ drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
76
79
 
77
80
  md_content = pipe.pipe_mk_markdown(
78
81
  image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
@@ -0,0 +1,417 @@
1
+ Metadata-Version: 2.1
2
+ Name: magic-pdf
3
+ Version: 0.7.1
4
+ Summary: A practical tool for converting PDF to Markdown
5
+ Home-page: https://github.com/opendatalab/MinerU
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE.md
9
+ Requires-Dist: boto3>=1.28.43
10
+ Requires-Dist: Brotli>=1.1.0
11
+ Requires-Dist: click>=8.1.7
12
+ Requires-Dist: PyMuPDF>=1.24.9
13
+ Requires-Dist: loguru>=0.6.0
14
+ Requires-Dist: numpy<2.0.0,>=1.21.6
15
+ Requires-Dist: fast-langdetect==0.2.0
16
+ Requires-Dist: wordninja>=2.0.0
17
+ Requires-Dist: scikit-learn>=1.0.2
18
+ Requires-Dist: pdfminer.six==20231228
19
+ Provides-Extra: full
20
+ Requires-Dist: unimernet==0.1.6; extra == "full"
21
+ Requires-Dist: ultralytics; extra == "full"
22
+ Requires-Dist: paddleocr==2.7.3; extra == "full"
23
+ Requires-Dist: pypandoc; extra == "full"
24
+ Requires-Dist: struct-eqtable==0.1.0; extra == "full"
25
+ Requires-Dist: detectron2; extra == "full"
26
+ Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
27
+ Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
28
+ Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
29
+ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "full"
30
+ Provides-Extra: lite
31
+ Requires-Dist: paddleocr==2.7.3; extra == "lite"
32
+ Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
33
+ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
34
+
35
+ <div align="center" xmlns="http://www.w3.org/1999/html">
36
+ <!-- logo -->
37
+ <p align="center">
38
+ <img src="docs/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
39
+ </p>
40
+
41
+ <!-- icon -->
42
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
43
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
44
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
45
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
46
+ [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
47
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
48
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
49
+ <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
50
+
51
+ <!-- language -->
52
+ [English](README.md) | [简体中文](README_zh-CN.md)
53
+
54
+ <!-- hot link -->
55
+ <p align="center">
56
+ <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
57
+ </p>
58
+
59
+ <!-- join us -->
60
+ <p align="center">
61
+ 👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
62
+ </p>
63
+
64
+ </div>
65
+
66
+ # Changelog
67
+ - 2024/08/30: Version 0.7.1 released, added the Paddle TableMaster table recognition option
68
+ - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
69
+ - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
70
+ - 2024/07/05: Initial open-source release
71
+
72
+ <!-- TABLE OF CONTENT -->
73
+ <details open="open">
74
+ <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
75
+ <ol>
76
+ <li>
77
+ <a href="#mineru">MinerU</a>
78
+ <ul>
79
+ <li><a href="#project-introduction">Project Introduction</a></li>
80
+ <li><a href="#key-features">Key Features</a></li>
81
+ <li><a href="#quick-start">Quick Start</a>
82
+ <ul>
83
+ <li><a href="#online-demo">Online Demo</a></li>
84
+ <li><a href="#quick-cpu-demo">Quick CPU Demo</a></li>
85
+ <li><a href="#using-gpu">Using GPU</a></li>
86
+ </ul>
87
+ </li>
88
+ <li><a href="#usage">Usage</a>
89
+ <ul>
90
+ <li><a href="#command-line">Command Line</a></li>
91
+ <li><a href="#api">API</a></li>
92
+ <li><a href="#development-guide">Development Guide</a></li>
93
+ </ul>
94
+ </li>
95
+ </ul>
96
+ </li>
97
+ <li><a href="#todo">TODO</a></li>
98
+ <li><a href="#known-issues">Known Issues</a></li>
99
+ <li><a href="#faq">FAQ</a></li>
100
+ <li><a href="#all-thanks-to-our-contributors">All Thanks To Our Contributors</a></li>
101
+ <li><a href="#license-information">License Information</a></li>
102
+ <li><a href="#acknowledgments">Acknowledgments</a></li>
103
+ <li><a href="#citation">Citation</a></li>
104
+ <li><a href="#star-history">Star History</a></li>
105
+ <li><a href="#magic-doc">Magic-doc</a></li>
106
+ <li><a href="#magic-html">Magic-html</a></li>
107
+ <li><a href="#links">Links</a></li>
108
+ </ol>
109
+ </details>
110
+
111
+
112
+
113
+ # MinerU
114
+ ## Project Introduction
115
+ MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
116
+ MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
117
+ Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on the [issue tracker](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
118
+
119
+ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
120
+
121
+ ## Key Features
122
+
123
+ - Removes elements such as headers, footers, footnotes, and page numbers while maintaining semantic continuity
124
+ - Outputs text in a human-readable order from multi-column documents
125
+ - Retains the original structure of the document, including titles, paragraphs, and lists
126
+ - Extracts images, image captions, tables, and table captions
127
+ - Automatically recognizes formulas in the document and converts them to LaTeX
128
+ - Automatically recognizes tables in the document and converts them to LaTeX
129
+ - Automatically detects and enables OCR for corrupted PDFs
130
+ - Supports both CPU and GPU environments
131
+ - Supports Windows, Linux, and Mac platforms
132
+
133
+ ## Quick Start
134
+
135
+ If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. <br>
136
+ If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. <br>
137
+ There are three different ways to experience MinerU:
138
+ - [Online Demo (No Installation Required)](#online-demo)
139
+ - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
140
+ - [Linux/Windows + CUDA](#using-gpu)
141
+
142
+ **⚠️ Pre-installation Notice—Hardware and Software Environment Support**
143
+
144
+ To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
145
+
146
+ By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
147
+
148
+ In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
149
+
150
+ <table>
151
+ <tr>
152
+ <td colspan="3" rowspan="2">Operating System</td>
153
+ </tr>
154
+ <tr>
155
+ <td>Ubuntu 22.04 LTS</td>
156
+ <td>Windows 10 / 11</td>
157
+ <td>macOS 11+</td>
158
+ </tr>
159
+ <tr>
160
+ <td colspan="3">CPU</td>
161
+ <td>x86_64</td>
162
+ <td>x86_64</td>
163
+ <td>x86_64 / arm64</td>
164
+ </tr>
165
+ <tr>
166
+ <td colspan="3">Memory</td>
167
+ <td colspan="3">16GB or more, recommended 32GB+</td>
168
+ </tr>
169
+ <tr>
170
+ <td colspan="3">Python Version</td>
171
+ <td colspan="3">3.10</td>
172
+ </tr>
173
+ <tr>
174
+ <td colspan="3">Nvidia Driver Version</td>
175
+ <td>latest (Proprietary Driver)</td>
176
+ <td>latest</td>
177
+ <td>None</td>
178
+ </tr>
179
+ <tr>
180
+ <td colspan="3">CUDA Environment</td>
181
+ <td>Automatic installation [12.1 (pytorch) + 11.8 (paddle)]</td>
182
+ <td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
183
+ <td>None</td>
184
+ </tr>
185
+ <tr>
186
+ <td rowspan="2">GPU Hardware Support List</td>
187
+ <td colspan="2">Minimum Requirement 8G+ VRAM</td>
188
+ <td colspan="2">3060ti/3070/3080/3080ti/4060/4070/4070ti<br>
189
+ 8G VRAM only enables layout and formula recognition acceleration</td>
190
+ <td rowspan="2">None</td>
191
+ </tr>
192
+ <tr>
193
+ <td colspan="2">Recommended Configuration 16G+ VRAM</td>
194
+ <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
195
+ 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously</td>
196
+ </tr>
197
+ </table>
198
+
199
+ ### Online Demo
200
+
201
+ [Click here for the online demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
202
+
203
+ ### Quick CPU Demo
204
+
205
+ #### 1. Install magic-pdf
206
+ ```bash
207
+ conda create -n MinerU python=3.10
208
+ conda activate MinerU
209
+ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
210
+ ```
211
+ #### 2. Download model weight files
212
+
213
+ Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
214
+ > ❗️After downloading the models, please make sure to verify the completeness of the model files.
215
+ >
216
+ > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
217
+
218
+ #### 3. Copy and configure the template file
219
+ You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
220
+ > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
221
+ >
222
+ > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
223
+ ```bash
224
+ cp magic-pdf.template.json ~/magic-pdf.json
225
+ ```
226
+
227
+ Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
228
+ > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
229
+ >
230
+ > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
231
+ >
232
+ > For example: If the models are stored in the "models" directory at the root of the D drive, the "models-dir" value should be `D:/models`.
233
+ ```json
234
+ {
235
+ // other config
236
+ "models-dir": "D:/models",
237
+ "table-config": {
238
+ "model": "TableMaster", // Another option of this value is 'struct_eqtable'
239
+ "is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
240
+ "max_time": 400
241
+ }
242
+ }
243
+ ```
244
+
245
+
246
+ ### Using GPU
247
+ If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
248
+
249
+ - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
250
+ - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
251
+
252
+
253
+ ## Usage
254
+
255
+ ### Command Line
256
+
257
+ ```bash
258
+ magic-pdf --help
259
+ Usage: magic-pdf [OPTIONS]
260
+
261
+ Options:
262
+ -v, --version display the version and exit
263
+ -p, --path PATH local pdf filepath or directory [required]
264
+ -o, --output-dir TEXT output local directory
265
+ -m, --method [ocr|txt|auto] the method for parsing pdf.
266
+ ocr: using ocr technique to extract information from pdf,
267
+ txt: suitable for the text-based pdf only and outperform ocr,
268
+ auto: automatically choose the best method for parsing pdf
269
+ from ocr and txt.
270
+ without method specified, auto will be used by default.
271
+ --help Show this message and exit.
272
+
273
+
274
+ ## show version
275
+ magic-pdf -v
276
+
277
+ ## command line example
278
+ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
279
+ ```
280
+
281
+ `{some_pdf}` can be a single PDF file or a directory containing multiple PDFs.
282
+ The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
283
+
284
+ ```text
285
+ ├── some_pdf.md # markdown file
286
+ ├── images # directory for storing images
287
+ ├── layout.pdf # layout diagram
288
+ ├── middle.json # MinerU intermediate processing result
289
+ ├── model.json # model inference result
290
+ ├── origin.pdf # original PDF file
291
+ └── spans.pdf # smallest granularity bbox position information diagram
292
+ ```
293
+
294
+ For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
295
+
296
+ ### API
297
+
298
+ Processing files from local disk
299
+ ```python
300
+ image_writer = DiskReaderWriter(local_image_dir)
301
+ image_dir = str(os.path.basename(local_image_dir))
302
+ jso_useful_key = {"_pdf_type": "", "model_list": []}
303
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
304
+ pipe.pipe_classify()
305
+ pipe.pipe_analyze()
306
+ pipe.pipe_parse()
307
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
308
+ ```
309
+
310
+ Processing files from object storage
311
+ ```python
312
+ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
313
+ image_dir = "s3://img_bucket/"
314
+ s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
315
+ pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
316
+ jso_useful_key = {"_pdf_type": "", "model_list": []}
317
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
318
+ pipe.pipe_classify()
319
+ pipe.pipe_analyze()
320
+ pipe.pipe_parse()
321
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
322
+ ```
323
+
324
+ For detailed implementation, refer to:
325
+ - [demo.py Simplest Processing Method](demo/demo.py)
326
+ - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
327
+
328
+
329
+ ### Development Guide
330
+
331
+ TODO
332
+
333
+ # TODO
334
+
335
+ - [ ] Semantic-based reading order
336
+ - [ ] List recognition within the text
337
+ - [ ] Code block recognition within the text
338
+ - [ ] Table of contents recognition
339
+ - [x] Table recognition
340
+ - [ ] Chemical formula recognition
341
+ - [ ] Geometric shape recognition
342
+
343
+ # Known Issues
344
+ - Reading order is segmented based on rules, which can cause disordered sequences in some cases
345
+ - Vertical text is not supported
346
+ - Lists, code blocks, and table of contents are not yet supported in the layout model
347
+ - Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
348
+ - Enabling OCR may produce better results in PDFs with a high density of formulas
349
+ - If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
350
+
351
+
352
+ # FAQ
353
+ [FAQ in Chinese](docs/FAQ_zh_cn.md)
354
+
355
+ [FAQ in English](docs/FAQ_en_us.md)
356
+
357
+
358
+ # All Thanks To Our Contributors
359
+
360
+ <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
361
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
362
+ </a>
363
+
364
+ # License Information
365
+
366
+ [LICENSE.md](LICENSE.md)
367
+
368
+ This project currently uses PyMuPDF to achieve advanced functionality. However, since PyMuPDF is licensed under the AGPL, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
369
+
370
+
371
+ # Acknowledgments
372
+ - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
373
+ - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
374
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
375
+ - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
376
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
377
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
378
+
379
+ # Citation
380
+
381
+ ```bibtex
382
+ @article{he2024opendatalab,
383
+ title={Opendatalab: Empowering general artificial intelligence with open datasets},
384
+ author={He, Conghui and Li, Wei and Jin, Zhenjiang and Xu, Chao and Wang, Bin and Lin, Dahua},
385
+ journal={arXiv preprint arXiv:2407.13773},
386
+ year={2024}
387
+ }
388
+
389
+ @misc{2024mineru,
390
+ title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
391
+ author={MinerU Contributors},
392
+ howpublished = {\url{https://github.com/opendatalab/MinerU}},
393
+ year={2024}
394
+ }
395
+ ```
396
+
397
+ # Star History
398
+
399
+ <a>
400
+ <picture>
401
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
402
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
403
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
404
+ </picture>
405
+ </a>
406
+
407
+ # Magic-doc
408
+ [Magic-Doc](https://github.com/InternLM/magic-doc) Fast ppt/pptx/doc/docx/pdf extraction tool
409
+
410
+ # Magic-html
411
+ [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
412
+
413
+ # Links
414
+
415
+ - [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
416
+ - [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
417
+ - [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zx
5
5
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
6
6
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
8
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=mq6tACGkL383bdUla7xOkRXRTBBydRtdbgIHwkk_daM,16169
8
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=jg_v2Bj62xBObg0LDayvqUVX_O9DrIBli5Z9_i7Qduw,16479
9
9
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
11
11
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -15,7 +15,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
15
15
  magic_pdf/layout/layout_sort.py,sha256=ovqRX1xcRA7E7s8VvsI7ZNbaNSElJe07bApCh5hxwIE,33533
16
16
  magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
17
17
  magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
18
- magic_pdf/libs/Constants.py,sha256=AwQw5aK7JkWjerEyq5vxxMTHH1Gvku8K9NS8xjHKimI,189
18
+ magic_pdf/libs/Constants.py,sha256=rdJVadmgN0UlIB-xcMQ9j7Qk9q1Qahxt3KEY-vL7hSU,774
19
19
  magic_pdf/libs/MakeContentConfig.py,sha256=UDZPpsv8q4DqTy8h0vRtrT2kHqWiVI205VnVhlUEQc0,206
20
20
  magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
21
21
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,7 +26,7 @@ magic_pdf/libs/config_reader.py,sha256=dPx6JJJuCw9AzNgKtrTG1elmfdeN6gDhgFK9r15-N
26
26
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
27
27
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
28
28
  magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
29
- magic_pdf/libs/draw_bbox.py,sha256=90FDAYN3dxgN07_xRzdUgnDAyEswpl9VCXaDo_SMZkA,9449
29
+ magic_pdf/libs/draw_bbox.py,sha256=wPoaxXteZDCL20pjVEQ4kcu0u6Ea-9balUaxrkMsYks,12531
30
30
  magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
31
31
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
32
32
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
@@ -35,19 +35,20 @@ magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,10
35
35
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
36
36
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
37
37
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
38
- magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
38
+ magic_pdf/libs/ocr_content_type.py,sha256=9c12CoJ8xvdFa4Rk81J_S238yuQl0bDQeapvqb-JkEk,794
39
39
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
40
40
  magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
41
41
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
42
42
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
43
43
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
44
- magic_pdf/libs/version.py,sha256=iEqOsQ5JUsdTQDAPqryKypkJMTOXXBl71cd4Drh5pDs,24
44
+ magic_pdf/libs/version.py,sha256=2KJZDSMOG7KS82AxYOrZ4ZihYxX0wjfUjDsIZh3L024,22
45
45
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
46
46
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
47
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=cSmh27RG1cvY0pd98T82rq0pANBwkYN0siZIN6oPNI8,4332
48
- magic_pdf/model/magic_model.py,sha256=xwKV9BrdjOJecJSzbErT54N6qeJu0tvFuJg2S1z_2kU,25413
47
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=SoT21VHy6ICXoRfC9V3XS6BMiX8EZI6zaqSNgoE17oo,4347
48
+ magic_pdf/model/magic_model.py,sha256=3eAfmglKFkmIVPoz3TG8xAzkNK2g_VLI5rRMQAb_cK4,25544
49
49
  magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
50
- magic_pdf/model/pdf_extract_kit.py,sha256=I3pZBWQu8y5YVjURTUJnsIySjeSGK-Yefit9PiqN9VE,12952
50
+ magic_pdf/model/pdf_extract_kit.py,sha256=WO54IoxX8XYXLGrjPts--84qRO1FQZm9f_yVyfpPi0s,14539
51
+ magic_pdf/model/ppTableModel.py,sha256=wWiui9VOjkKYlNX-viPqsWpzgkNJ-9_S2Se-j4oyLqU,2687
51
52
  magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
52
53
  magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
54
  magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -72,7 +73,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
72
73
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
73
74
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
74
75
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
75
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=BdrBZ_2B2jgF0vzn_ted8bE9Te-DC1Ea2UijqULNKjg,928
76
+ magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=C9WluPhwaqsFg154WsNxN2HlhFXVkAAw0prR7t8r5J4,918
76
77
  magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
78
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
79
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
@@ -84,7 +85,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
84
85
  magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
85
86
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
86
87
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
87
- magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
88
+ magic_pdf/para/para_split_v2.py,sha256=jJnn8numhxVgojGwKGCqBNIIYn2AYsucO-q-eQgsPb4,36911
88
89
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
89
90
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
90
91
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -127,7 +128,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
127
128
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
128
129
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
129
130
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
130
- magic_pdf/resources/model_config/model_configs.yaml,sha256=2MxCOJ5yNUupQqvrAvEuJKlygjxxV_o1qE64K_4NWKA,235
131
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=_gOSxK9jxe1bFwtH_uwovsyZnRi1sEVNYb1OAexDmF4,301
131
132
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
132
133
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
133
134
  magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
@@ -138,11 +139,11 @@ magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
139
  magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,1131
139
140
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
141
  magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
141
- magic_pdf/tools/cli_dev.py,sha256=uDc4fDxVuOIrkaKRdjNAqyh9htyLd-fYDEfJBNFUYao,4149
142
- magic_pdf/tools/common.py,sha256=x4W-Tyo0A-TGsOjzlUGAhxiU2AisU3nBE3_2H_RLUO4,3801
143
- magic_pdf-0.7.0a1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
144
- magic_pdf-0.7.0a1.dist-info/METADATA,sha256=NBLsixinI-5iHwdweKr13SM5qg6Jf-fWCwg5ihavlpY,12455
145
- magic_pdf-0.7.0a1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
146
- magic_pdf-0.7.0a1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
147
- magic_pdf-0.7.0a1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
148
- magic_pdf-0.7.0a1.dist-info/RECORD,,
142
+ magic_pdf/tools/cli_dev.py,sha256=w-J4OixDzHjknnUuRW44PXsUlUqyiD4nPbBSSk9WkXM,4160
143
+ magic_pdf/tools/common.py,sha256=XoSs19DD-4ubbjrDFQer83T9O6O_MmgEO61NbjlP_2M,3939
144
+ magic_pdf-0.7.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
145
+ magic_pdf-0.7.1.dist-info/METADATA,sha256=SD5oVg3vUEuFg7IyAbwncQ_mtgXljhKiJCOwRCTSOVo,18232
146
+ magic_pdf-0.7.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
147
+ magic_pdf-0.7.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
148
+ magic_pdf-0.7.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
149
+ magic_pdf-0.7.1.dist-info/RECORD,,