magic-pdf 0.6.1__py3-none-any.whl → 0.6.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +43 -21
- magic_pdf/dict2md/ocr_mkcontent.py +10 -4
- magic_pdf/libs/config_reader.py +25 -7
- magic_pdf/libs/language.py +12 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +27 -1
- magic_pdf/model/magic_model.py +45 -41
- magic_pdf/model/pdf_extract_kit.py +111 -59
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
- magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
- magic_pdf/model/pp_structure_v2.py +1 -1
- magic_pdf/pdf_parse_union_core.py +4 -2
- magic_pdf/pre_proc/citationmarker_remove.py +5 -1
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
- magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
- magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.6.2b1.dist-info}/METADATA +93 -63
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.6.2b1.dist-info}/RECORD +25 -24
- /magic_pdf/libs/{math.py → local_math.py} +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.6.2b1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.6.2b1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.6.2b1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.6.2b1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.6.1
|
3
|
+
Version: 0.6.2b1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,27 +9,33 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3 >=1.28.43
|
10
10
|
Requires-Dist: Brotli >=1.1.0
|
11
11
|
Requires-Dist: click >=8.1.7
|
12
|
-
Requires-Dist: PyMuPDF >=1.24.
|
12
|
+
Requires-Dist: PyMuPDF >=1.24.9
|
13
13
|
Requires-Dist: loguru >=0.6.0
|
14
|
-
Requires-Dist: numpy
|
15
|
-
Requires-Dist: fast-langdetect
|
14
|
+
Requires-Dist: numpy <2.0.0,>=1.21.6
|
15
|
+
Requires-Dist: fast-langdetect ==0.2.0
|
16
16
|
Requires-Dist: wordninja >=2.0.0
|
17
17
|
Requires-Dist: scikit-learn >=1.0.2
|
18
|
-
Requires-Dist: pdfminer.six
|
19
|
-
Provides-Extra:
|
20
|
-
Requires-Dist:
|
21
|
-
Requires-Dist:
|
22
|
-
|
23
|
-
Requires-Dist:
|
24
|
-
Requires-Dist:
|
25
|
-
Requires-Dist:
|
26
|
-
Requires-Dist:
|
27
|
-
|
28
|
-
|
29
|
-
Requires-Dist:
|
30
|
-
Requires-Dist: paddlepaddle
|
31
|
-
|
32
|
-
<div id="top"
|
18
|
+
Requires-Dist: pdfminer.six ==20231228
|
19
|
+
Provides-Extra: full
|
20
|
+
Requires-Dist: unimernet ==0.1.6 ; extra == 'full'
|
21
|
+
Requires-Dist: matplotlib ; extra == 'full'
|
22
|
+
Requires-Dist: ultralytics ; extra == 'full'
|
23
|
+
Requires-Dist: paddleocr ==2.7.3 ; extra == 'full'
|
24
|
+
Requires-Dist: detectron2 ; extra == 'full'
|
25
|
+
Requires-Dist: paddlepaddle ==3.0.0b1 ; (platform_system == "Linux") and extra == 'full'
|
26
|
+
Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_system == "Darwin") and extra == 'full'
|
27
|
+
Provides-Extra: lite
|
28
|
+
Requires-Dist: paddleocr ==2.7.3 ; extra == 'lite'
|
29
|
+
Requires-Dist: paddlepaddle ==3.0.0b1 ; (platform_system == "Linux") and extra == 'lite'
|
30
|
+
Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_system == "Darwin") and extra == 'lite'
|
31
|
+
|
32
|
+
<div id="top">
|
33
|
+
|
34
|
+
<p align="center">
|
35
|
+
<img src="docs/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
|
36
|
+
</p>
|
37
|
+
|
38
|
+
</div>
|
33
39
|
<div align="center">
|
34
40
|
|
35
41
|
[](https://github.com/opendatalab/MinerU)
|
@@ -40,15 +46,24 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
|
40
46
|
[](https://pepy.tech/project/magic-pdf)
|
41
47
|
[](https://pepy.tech/project/magic-pdf)
|
42
48
|
|
49
|
+
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 200px; height: 55px;"/></a>
|
43
50
|
|
44
51
|
|
45
52
|
|
46
|
-
|
53
|
+
|
54
|
+
[English](README.md) | [简体中文](README_zh-CN.md) | [日本語](README_ja-JP.md)
|
47
55
|
|
48
56
|
</div>
|
49
57
|
|
50
58
|
<div align="center">
|
51
|
-
|
59
|
+
<p align="center">
|
60
|
+
<a href="https://github.com/opendatalab/MinerU">MinerU: An end-to-end PDF parsing tool based on PDF-Extract-Kit, supporting conversion from PDF to Markdown.</a>🚀🚀🚀<br>
|
61
|
+
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: A Comprehensive Toolkit for High-Quality PDF Content Extraction</a>🔥🔥🔥
|
62
|
+
</p>
|
63
|
+
|
64
|
+
<p align="center">
|
65
|
+
👋 join us on <a href="https://discord.gg/AsQMhuMN" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
|
66
|
+
</p>
|
52
67
|
</div>
|
53
68
|
|
54
69
|
# MinerU
|
@@ -82,7 +97,7 @@ Key features include:
|
|
82
97
|
- Available for Windows, Linux, and macOS platforms
|
83
98
|
|
84
99
|
|
85
|
-
https://github.com/
|
100
|
+
https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
86
101
|
|
87
102
|
|
88
103
|
|
@@ -95,10 +110,9 @@ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3
|
|
95
110
|
|
96
111
|

|
97
112
|
|
98
|
-
###
|
113
|
+
### Dependency repositories
|
99
114
|
|
100
|
-
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
101
|
-
- A Comprehensive Toolkit for High-Quality PDF Content Extraction
|
115
|
+
- [PDF-Extract-Kit : A Comprehensive Toolkit for High-Quality PDF Content Extraction](https://github.com/opendatalab/PDF-Extract-Kit) 🚀🚀🚀
|
102
116
|
|
103
117
|
## Getting Started
|
104
118
|
|
@@ -113,21 +127,22 @@ conda create -n MinerU python=3.10
|
|
113
127
|
conda activate MinerU
|
114
128
|
```
|
115
129
|
|
116
|
-
###
|
130
|
+
### Installation and Configuration
|
117
131
|
|
118
132
|
#### 1. Install Magic-PDF
|
119
133
|
|
120
|
-
Install
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
134
|
+
Install the full-feature package with pip:
|
135
|
+
>Note: The pip-installed package supports CPU only and is ideal for quick tests.
|
136
|
+
>
|
137
|
+
>For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
|
138
|
+
|
125
139
|
```bash
|
126
140
|
pip install magic-pdf[full-cpu]
|
127
141
|
```
|
128
|
-
The
|
129
|
-
If you need to compile it yourself, refer to https://github.com/facebookresearch/detectron2/issues/5114
|
130
|
-
|
142
|
+
The full-feature package depends on detectron2, which must be compiled during installation.
|
143
|
+
If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114
|
144
|
+
Alternatively, you can directly use our precompiled whl package (limited to Python 3.10):
|
145
|
+
|
131
146
|
```bash
|
132
147
|
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
|
133
148
|
```
|
@@ -154,31 +169,8 @@ In magic-pdf.json, configure "models-dir" to point to the directory where the mo
|
|
154
169
|
```
|
155
170
|
|
156
171
|
|
157
|
-
#### 4.
|
158
|
-
|
159
|
-
###### simple
|
160
|
-
|
161
|
-
```bash
|
162
|
-
magic-pdf pdf-command --pdf "pdf_path" --inside_model true
|
163
|
-
```
|
164
|
-
After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
|
165
|
-
You can find the corresponding xxx_model.json file in the markdown directory.
|
166
|
-
If you intend to do secondary development on the post-processing pipeline, you can use the command:
|
167
|
-
```bash
|
168
|
-
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
169
|
-
```
|
170
|
-
In this way, you won't need to re-run the model data, making debugging more convenient.
|
171
|
-
|
172
|
-
|
173
|
-
###### more
|
174
|
-
|
175
|
-
```bash
|
176
|
-
magic-pdf --help
|
177
|
-
```
|
178
|
-
|
179
|
-
|
180
|
-
#### 5. Acceleration Using CUDA or MPS
|
181
|
-
|
172
|
+
#### 4. Acceleration Using CUDA or MPS
|
173
|
+
If you have an available Nvidia GPU or are using a Mac with Apple Silicon, you can leverage acceleration with CUDA or MPS respectively.
|
182
174
|
##### CUDA
|
183
175
|
|
184
176
|
You need to install the corresponding PyTorch version according to your CUDA version.
|
@@ -203,13 +195,39 @@ You also need to modify the value of "device-mode" in the configuration file mag
|
|
203
195
|
}
|
204
196
|
```
|
205
197
|
|
206
|
-
|
198
|
+
|
199
|
+
### Usage
|
200
|
+
|
201
|
+
#### 1. Usage via Command Line
|
202
|
+
|
203
|
+
###### simple
|
204
|
+
|
205
|
+
```bash
|
206
|
+
magic-pdf pdf-command --pdf "pdf_path" --inside_model true
|
207
|
+
```
|
208
|
+
After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
|
209
|
+
You can find the corresponding xxx_model.json file in the markdown directory.
|
210
|
+
If you intend to do secondary development on the post-processing pipeline, you can use the command:
|
211
|
+
```bash
|
212
|
+
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
213
|
+
```
|
214
|
+
In this way, you won't need to re-run the model data, making debugging more convenient.
|
215
|
+
|
216
|
+
|
217
|
+
###### more
|
218
|
+
|
219
|
+
```bash
|
220
|
+
magic-pdf --help
|
221
|
+
```
|
222
|
+
|
223
|
+
|
224
|
+
#### 2. Usage via API
|
207
225
|
|
208
226
|
###### Local
|
209
227
|
```python
|
210
228
|
image_writer = DiskReaderWriter(local_image_dir)
|
211
229
|
image_dir = str(os.path.basename(local_image_dir))
|
212
|
-
jso_useful_key = {"_pdf_type": "", "model_list":
|
230
|
+
jso_useful_key = {"_pdf_type": "", "model_list": []}
|
213
231
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
|
214
232
|
pipe.pipe_classify()
|
215
233
|
pipe.pipe_parse()
|
@@ -222,7 +240,7 @@ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
|
|
222
240
|
image_dir = "s3://img_bucket/"
|
223
241
|
s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
|
224
242
|
pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
|
225
|
-
jso_useful_key = {"_pdf_type": "", "model_list":
|
243
|
+
jso_useful_key = {"_pdf_type": "", "model_list": []}
|
226
244
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
|
227
245
|
pipe.pipe_classify()
|
228
246
|
pipe.pipe_parse()
|
@@ -271,7 +289,7 @@ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d7
|
|
271
289
|
|
272
290
|
# All Thanks To Our Contributors
|
273
291
|
|
274
|
-
<a href="https://github.com/
|
292
|
+
<a href="https://github.com/opendatalab/MinerU/graphs/contributors">
|
275
293
|
<img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
|
276
294
|
</a>
|
277
295
|
|
@@ -294,6 +312,13 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
|
|
294
312
|
# Citation
|
295
313
|
|
296
314
|
```bibtex
|
315
|
+
@article{he2024opendatalab,
|
316
|
+
title={Opendatalab: Empowering general artificial intelligence with open datasets},
|
317
|
+
author={He, Conghui and Li, Wei and Jin, Zhenjiang and Xu, Chao and Wang, Bin and Lin, Dahua},
|
318
|
+
journal={arXiv preprint arXiv:2407.13773},
|
319
|
+
year={2024}
|
320
|
+
}
|
321
|
+
|
297
322
|
@misc{2024mineru,
|
298
323
|
title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
|
299
324
|
author={MinerU Contributors},
|
@@ -312,3 +337,8 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
|
|
312
337
|
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
|
313
338
|
</picture>
|
314
339
|
</a>
|
340
|
+
|
341
|
+
# Links
|
342
|
+
- [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
|
343
|
+
- [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
|
344
|
+
- [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
|
@@ -2,13 +2,13 @@ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
|
4
4
|
magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
|
5
|
-
magic_pdf/pdf_parse_union_core.py,sha256=
|
5
|
+
magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zxFngs,10620
|
6
6
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
7
7
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
8
|
+
magic_pdf/cli/magicpdf.py,sha256=jzGSxLcx_tmjFiZfIKpomfU9rfvAwZDyMm4bmNGQmJw,13541
|
9
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
11
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=L5D6aLB3vSATfDmsXRI_Mnq79ijH6yPYdSxSJb5aFh8,15678
|
12
12
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
14
14
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -25,7 +25,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
25
|
magic_pdf/libs/boxbase.py,sha256=MvD0DypR4sTEF3T2RrI_yJ8mPDUBYHAqAaau2mnBSxY,15343
|
26
26
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
27
27
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
28
|
-
magic_pdf/libs/config_reader.py,sha256=
|
28
|
+
magic_pdf/libs/config_reader.py,sha256=Wlig8KRhKpz8fVNc7dlspxoccX5SnyT7FmwKj9i-gWE,2435
|
29
29
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
30
30
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
31
31
|
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
@@ -34,9 +34,9 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
|
|
34
34
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
35
35
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
36
36
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
37
|
-
magic_pdf/libs/language.py,sha256=
|
37
|
+
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
38
|
+
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
38
39
|
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
39
|
-
magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
40
40
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
41
41
|
magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
|
42
42
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
@@ -44,22 +44,22 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
44
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=ZMr9EH74nzdVNg9PXVs5XLZcabTX-_J2XZD1VgXHRBE,24
|
48
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
|
-
magic_pdf/model/__init__.py,sha256=
|
50
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
51
|
-
magic_pdf/model/magic_model.py,sha256=
|
49
|
+
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
50
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=0YyZoDUQBFQIDiHDK59JEql9FKSkZMr8aGN0Tw1eL8k,4064
|
51
|
+
magic_pdf/model/magic_model.py,sha256=tIGi-vtn6OUDqs3mtW2YLzoFvSPu4kkBiHJUv6o0Kic,25233
|
52
52
|
magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
|
53
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
54
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
53
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=ecoTBn7ROmfU1KxFpFzu7yXw7aezmPV6xOht7PRmPHA,10692
|
54
|
+
magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
|
55
55
|
magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
|
57
|
-
magic_pdf/model/pek_sub_modules/self_modify.py,sha256=
|
57
|
+
magic_pdf/model/pek_sub_modules/self_modify.py,sha256=wl7HPzfUyKrjsSRp_iMXlbM2q1brVk5TD9NbgyY6UWU,11397
|
58
58
|
magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
59
59
|
magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
60
60
|
magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
|
61
61
|
magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
|
62
|
-
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=
|
62
|
+
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
|
63
63
|
magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
|
64
64
|
magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
|
65
65
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
|
@@ -99,7 +99,7 @@ magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_
|
|
99
99
|
magic_pdf/post_proc/pdf_post_filter.py,sha256=FeZceyjGG_UvBrBoa51Ohge5edQzCoJtZTaocidKCHg,2530
|
100
100
|
magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
|
101
101
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
102
|
-
magic_pdf/pre_proc/citationmarker_remove.py,sha256=
|
102
|
+
magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
|
103
103
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
|
104
104
|
magic_pdf/pre_proc/cut_image.py,sha256=bbeELTg2-SFyHkVEnGAL_7S6k8hyy1xtDSoFmXDQDOA,2768
|
105
105
|
magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
|
@@ -114,10 +114,10 @@ magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLb
|
|
114
114
|
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
115
115
|
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
116
116
|
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
117
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=
|
117
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=aM_RZWg-2nPBQtqOVoMuvexmP5g1CuP94WxKxXuqnW4,6574
|
118
118
|
magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
|
119
119
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=lz1viJJkjZAklKZYmkWP39YBnv5BoWVr3fv6mmNr33E,12620
|
120
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=
|
120
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=jqBheXF8EuYCfS9tn6typr-aE57nfMoeBC36J5GjpbQ,11519
|
121
121
|
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=FIMwe8Lei9LI2RmkqiaSyTHV5b7ViADbpyBwgVwZH-c,2687
|
122
122
|
magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
123
123
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=u_ObNLkZ8pPDNBUkSMpA9ffiSpfz42B4807cdBPZmLU,3085
|
@@ -127,9 +127,10 @@ magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=0FlBXeiEwjZAGAWo-DiMptclFOj04POu
|
|
127
127
|
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6TOCW0TLXbPii_Q,7307
|
128
128
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
129
129
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
130
|
+
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
130
131
|
magic_pdf/resources/model_config/model_configs.yaml,sha256=C_9UfFMlHOX-iSgcwCHjyHKazKKuwpy1RcGHeTQD1kY,139
|
131
132
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
|
132
|
-
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=
|
133
|
+
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
133
134
|
magic_pdf/rw/AbsReaderWriter.py,sha256=1Hd6Xo2g12CaRAo5Sze-R_GSQA6GQ0rQwSmgQvw4V_c,1297
|
134
135
|
magic_pdf/rw/DiskReaderWriter.py,sha256=0tt8lbRyqrOfFgGlhjt24YMdj2xN7QUIVysfhFIxPgo,2113
|
135
136
|
magic_pdf/rw/S3ReaderWriter.py,sha256=O7Quf3CUqXBjMz4sIE7kNVI3TIQROeg5PuXneAacieY,4474
|
@@ -141,9 +142,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
141
142
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
142
143
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
143
144
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
144
|
-
magic_pdf-0.6.
|
145
|
-
magic_pdf-0.6.
|
146
|
-
magic_pdf-0.6.
|
147
|
-
magic_pdf-0.6.
|
148
|
-
magic_pdf-0.6.
|
149
|
-
magic_pdf-0.6.
|
145
|
+
magic_pdf-0.6.2b1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
146
|
+
magic_pdf-0.6.2b1.dist-info/METADATA,sha256=DVFHn5RBQilga9OZp87sGf4e4wAE-XT8QpB6KSVUgoo,11541
|
147
|
+
magic_pdf-0.6.2b1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
148
|
+
magic_pdf-0.6.2b1.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
149
|
+
magic_pdf-0.6.2b1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
150
|
+
magic_pdf-0.6.2b1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|