magic-pdf 1.3.7__py3-none-any.whl → 1.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +310 -15
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +3 -2
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +25 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt +15629 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +5 -1
- {magic_pdf-1.3.7.dist-info → magic_pdf-1.3.9.dist-info}/METADATA +12 -3
- {magic_pdf-1.3.7.dist-info → magic_pdf-1.3.9.dist-info}/RECORD +12 -11
- {magic_pdf-1.3.7.dist-info → magic_pdf-1.3.9.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.7.dist-info → magic_pdf-1.3.9.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.7.dist-info → magic_pdf-1.3.9.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.7.dist-info → magic_pdf-1.3.9.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
CHANGED
@@ -3,10 +3,14 @@ lang:
|
|
3
3
|
det: ch_PP-OCRv3_det_infer.pth
|
4
4
|
rec: ch_PP-OCRv4_rec_infer.pth
|
5
5
|
dict: ppocr_keys_v1.txt
|
6
|
-
|
6
|
+
ch_server:
|
7
7
|
det: ch_PP-OCRv3_det_infer.pth
|
8
8
|
rec: ch_PP-OCRv4_rec_server_infer.pth
|
9
9
|
dict: ppocr_keys_v1.txt
|
10
|
+
ch:
|
11
|
+
det: ch_PP-OCRv3_det_infer.pth
|
12
|
+
rec: ch_PP-OCRv4_rec_server_doc_infer.pth
|
13
|
+
dict: ppocrv4_doc_dict.txt
|
10
14
|
en:
|
11
15
|
det: en_PP-OCRv3_det_infer.pth
|
12
16
|
rec: en_PP-OCRv4_rec_infer.pth
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.3.7
|
3
|
+
Version: 1.3.9
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
License: AGPL-3.0
|
6
6
|
Project-URL: Home, https://mineru.net/
|
@@ -20,7 +20,7 @@ Requires-Dist: click >=8.1.7
|
|
20
20
|
Requires-Dist: fast-langdetect <0.3.0,>=0.2.3
|
21
21
|
Requires-Dist: loguru >=0.6.0
|
22
22
|
Requires-Dist: numpy >=1.21.6
|
23
|
-
Requires-Dist: pdfminer.six
|
23
|
+
Requires-Dist: pdfminer.six >=20250416
|
24
24
|
Requires-Dist: pydantic <2.11,>=2.7.2
|
25
25
|
Requires-Dist: scikit-learn >=1.0.2
|
26
26
|
Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
|
@@ -107,6 +107,15 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
107
107
|
</div>
|
108
108
|
|
109
109
|
# Changelog
|
110
|
+
- 2025/04/27 1.3.9 Released
|
111
|
+
- Optimized the formula parsing function to improve the success rate of formula rendering
|
112
|
+
- Updated `pdfminer.six` to the latest version, fixing some abnormal PDF parsing issues
|
113
|
+
- 2025/04/23 1.3.8 Released
|
114
|
+
- The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
|
115
|
+
- `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
|
116
|
+
- [Performance comparison between PP-OCRv4_server_rec_doc, PP-OCRv4_server_rec, and PP-OCRv4_mobile_rec](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/text_recognition.html#ii-supported-model-list)
|
117
|
+
- Verified results show that the `PP-OCRv4_server_rec_doc` model significantly improves accuracy in both single-language (`Chinese`, `English`, `Japanese`, `Traditional Chinese`) and mixed-language scenarios, with speed comparable to `PP-OCRv4_server_rec`, making it suitable for most use cases.
|
118
|
+
- In a small number of pure English scenarios, the `PP-OCRv4_server_rec_doc` model may encounter word concatenation issues, whereas `PP-OCRv4_server_rec` performs better in such cases. Therefore, we have retained the `PP-OCRv4_server_rec` model, which users can invoke by passing the parameter `lang='ch_server'`(python api) or `--lang ch_server`(cli).
|
110
119
|
- 2025/04/22 1.3.7 Released
|
111
120
|
- Fixed the issue where the `lang` parameter was ineffective during table parsing model initialization.
|
112
121
|
- Fixed the significant slowdown in OCR and table parsing speed in `cpu` mode.
|
@@ -427,7 +436,7 @@ There are three different ways to experience MinerU:
|
|
427
436
|
<td colspan="2">GPU VRAM 6GB or more</td>
|
428
437
|
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
|
429
438
|
More than 6GB VRAM </td>
|
430
|
-
<td rowspan="2">
|
439
|
+
<td rowspan="2">Apple silicon</td>
|
431
440
|
</tr>
|
432
441
|
</table>
|
433
442
|
|
@@ -52,7 +52,7 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
|
|
52
52
|
magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
|
53
53
|
magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
|
54
54
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
55
|
-
magic_pdf/libs/version.py,sha256=
|
55
|
+
magic_pdf/libs/version.py,sha256=SaWgUI6v92kVfF_Qdoxbfc38bwA34RuDGZmXMqa5g3c,22
|
56
56
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
57
57
|
magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
|
58
58
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
|
@@ -97,7 +97,7 @@ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
97
97
|
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
|
98
98
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
99
99
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
|
100
|
-
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=
|
100
|
+
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=uRqyXsr3S0bw6CnkkJzAgne_MT3Q9Wz-npIXIMlRnlo,17986
|
101
101
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
|
102
102
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
|
103
103
|
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
|
@@ -109,7 +109,7 @@ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unim
|
|
109
109
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
110
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
111
111
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/ocr_utils.py,sha256=3qxu0lAjqzZQ2Ci-C_wz_YSakyq_5-KnckA3-5bICTM,12589
|
112
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=
|
112
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=xbQWxgeOTNvLNFSio4Too2Yyg20uikjJlyVXuQqwq3Y,7555
|
113
113
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
114
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py,sha256=5bI7MAu65r-vn28krwdJ6pjZMkEvWjspE7EQaTsRERw,1319
|
115
115
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py,sha256=YYu3c-W4fgEErxxDM98uQ3oWwPEh-6w75LY4zcj4VtM,199
|
@@ -140,8 +140,8 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_pos
|
|
140
140
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py,sha256=EAc-cmhXtxLfFA6h5C871hIcDfXsjPDISiVCKwxh-qM,6339
|
141
141
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py,sha256=HiHNr4bhW5U1j4pYoyi8fPOaFsn8TUc4nSB6q8chfV4,26899
|
142
142
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
143
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=
|
144
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=
|
143
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=UzG7gH1unPJWwICJWdz1VNUXmetSqrSb_iR5o59Lo9A,7508
|
144
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=_AjgxSML_Tj2VkARYFIrhZzrz36PZX9koGRF-xZC2ho,1629
|
145
145
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
|
146
146
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
|
147
147
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
|
@@ -152,6 +152,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dic
|
|
152
152
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt,sha256=qh_ciuj3zUCg7E7bRy6wQh4RQn5sz-6ZFUQHQsGLCiA,14480
|
153
153
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt,sha256=jm1ONil4jDXDH35TAofWFHtUm7eiZb1nCLsoETRCniw,468
|
154
154
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt,sha256=KLI2KtSrLcOHaapy_rU146nds_0qdYWgWSDmOTsdx_c,26249
|
155
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt,sha256=pbw4h8Q8kB5aP5exP_rfHFdU7efMjJ9aviLodafEg3I,62346
|
155
156
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt,sha256=6T5pSBSv2f8ekYtvS7Qmf7TGWpNE7l10ZPkTW5DAonA,352
|
156
157
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt,sha256=7plGpg13AZd0dOiYg2lKTKIOqjhoojM0v3lA3NAI8Pk,429
|
157
158
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py,sha256=xEqR65Z8YOzOLorLjK0LCHos2zX-tCuxSrxndjU00hE,49
|
@@ -195,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
|
|
195
196
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
196
197
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
197
198
|
magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
|
198
|
-
magic_pdf-1.3.7.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
199
|
-
magic_pdf-1.3.
|
200
|
-
magic_pdf-1.3.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
201
|
-
magic_pdf-1.3.7.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
202
|
-
magic_pdf-1.3.7.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
203
|
-
magic_pdf-1.3.7.dist-info/RECORD,,
|
199
|
+
magic_pdf-1.3.9.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
200
|
+
magic_pdf-1.3.9.dist-info/METADATA,sha256=L98_kmfvo1RrBO3LR3Np2ySVd9nkMmXDf3TA5LmTVcQ,47611
|
201
|
+
magic_pdf-1.3.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
202
|
+
magic_pdf-1.3.9.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
203
|
+
magic_pdf-1.3.9.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
204
|
+
magic_pdf-1.3.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|