magic-pdf 1.3.6__py3-none-any.whl → 1.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +1 -8
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +7 -1
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +25 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt +15629 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +5 -1
- {magic_pdf-1.3.6.dist-info → magic_pdf-1.3.8.dist-info}/METADATA +10 -1
- {magic_pdf-1.3.6.dist-info → magic_pdf-1.3.8.dist-info}/RECORD +12 -11
- {magic_pdf-1.3.6.dist-info → magic_pdf-1.3.8.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.6.dist-info → magic_pdf-1.3.8.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.6.dist-info → magic_pdf-1.3.8.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.6.dist-info → magic_pdf-1.3.8.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
CHANGED
@@ -3,10 +3,14 @@ lang:
|
|
3
3
|
det: ch_PP-OCRv3_det_infer.pth
|
4
4
|
rec: ch_PP-OCRv4_rec_infer.pth
|
5
5
|
dict: ppocr_keys_v1.txt
|
6
|
-
|
6
|
+
ch_server:
|
7
7
|
det: ch_PP-OCRv3_det_infer.pth
|
8
8
|
rec: ch_PP-OCRv4_rec_server_infer.pth
|
9
9
|
dict: ppocr_keys_v1.txt
|
10
|
+
ch:
|
11
|
+
det: ch_PP-OCRv3_det_infer.pth
|
12
|
+
rec: ch_PP-OCRv4_rec_server_doc_infer.pth
|
13
|
+
dict: ppocrv4_doc_dict.txt
|
10
14
|
en:
|
11
15
|
det: en_PP-OCRv3_det_infer.pth
|
12
16
|
rec: en_PP-OCRv4_rec_infer.pth
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.3.6
|
3
|
+
Version: 1.3.8
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
License: AGPL-3.0
|
6
6
|
Project-URL: Home, https://mineru.net/
|
@@ -107,6 +107,15 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
107
107
|
</div>
|
108
108
|
|
109
109
|
# Changelog
|
110
|
+
- 2025/04/23 1.3.8 Released
|
111
|
+
- The default `ocr` model (`ch`) has been updated to `PP-OCRv4_server_rec_doc` (model update required)
|
112
|
+
- `PP-OCRv4_server_rec_doc` is trained on a mix of more Chinese document data and PP-OCR training data, enhancing recognition capabilities for some traditional Chinese characters, Japanese, and special characters. It supports over 15,000 recognizable characters, improving text recognition in documents while also boosting general text recognition.
|
113
|
+
- [Performance comparison between PP-OCRv4_server_rec_doc, PP-OCRv4_server_rec, and PP-OCRv4_mobile_rec](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/text_recognition.html#ii-supported-model-list)
|
114
|
+
- Verified results show that the `PP-OCRv4_server_rec_doc` model significantly improves accuracy in both single-language (`Chinese`, `English`, `Japanese`, `Traditional Chinese`) and mixed-language scenarios, with speed comparable to `PP-OCRv4_server_rec`, making it suitable for most use cases.
|
115
|
+
- In a small number of pure English scenarios, the `PP-OCRv4_server_rec_doc` model may encounter word concatenation issues, whereas `PP-OCRv4_server_rec` performs better in such cases. Therefore, we have retained the `PP-OCRv4_server_rec` model, which users can invoke by passing the parameter `lang='ch_server'` (Python API) or `--lang ch_server` (CLI).
|
116
|
+
- 2025/04/22 1.3.7 Released
|
117
|
+
- Fixed the issue where the `lang` parameter was ineffective during table parsing model initialization.
|
118
|
+
- Fixed the significant slowdown in OCR and table parsing speed in `cpu` mode.
|
110
119
|
- 2025/04/16 1.3.4 Released
|
111
120
|
- Slightly improved the speed of OCR detection by removing some unused blocks.
|
112
121
|
- Fixed page-level sorting errors caused by footnotes in certain cases.
|
@@ -52,9 +52,9 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
|
|
52
52
|
magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
|
53
53
|
magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
|
54
54
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
55
|
-
magic_pdf/libs/version.py,sha256=
|
55
|
+
magic_pdf/libs/version.py,sha256=47xEhOdVR5Y8-pZH8aVP6Z2UhhY8jGWTQ-rJHt5fIeU,22
|
56
56
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
57
|
-
magic_pdf/model/batch_analyze.py,sha256=
|
57
|
+
magic_pdf/model/batch_analyze.py,sha256=F0WsjbQ6z9txdiUiVy6n6zhyJWJ-4moljNx8fe8HFws,10977
|
58
58
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
|
59
59
|
magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
|
60
60
|
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
@@ -109,7 +109,7 @@ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unim
|
|
109
109
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
110
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
111
111
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/ocr_utils.py,sha256=3qxu0lAjqzZQ2Ci-C_wz_YSakyq_5-KnckA3-5bICTM,12589
|
112
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=
|
112
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=xbQWxgeOTNvLNFSio4Too2Yyg20uikjJlyVXuQqwq3Y,7555
|
113
113
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
114
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py,sha256=5bI7MAu65r-vn28krwdJ6pjZMkEvWjspE7EQaTsRERw,1319
|
115
115
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py,sha256=YYu3c-W4fgEErxxDM98uQ3oWwPEh-6w75LY4zcj4VtM,199
|
@@ -140,8 +140,8 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_pos
|
|
140
140
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py,sha256=EAc-cmhXtxLfFA6h5C871hIcDfXsjPDISiVCKwxh-qM,6339
|
141
141
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py,sha256=HiHNr4bhW5U1j4pYoyi8fPOaFsn8TUc4nSB6q8chfV4,26899
|
142
142
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
143
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=
|
144
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=
|
143
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=UzG7gH1unPJWwICJWdz1VNUXmetSqrSb_iR5o59Lo9A,7508
|
144
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=_AjgxSML_Tj2VkARYFIrhZzrz36PZX9koGRF-xZC2ho,1629
|
145
145
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
|
146
146
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
|
147
147
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
|
@@ -152,6 +152,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dic
|
|
152
152
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt,sha256=qh_ciuj3zUCg7E7bRy6wQh4RQn5sz-6ZFUQHQsGLCiA,14480
|
153
153
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt,sha256=jm1ONil4jDXDH35TAofWFHtUm7eiZb1nCLsoETRCniw,468
|
154
154
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt,sha256=KLI2KtSrLcOHaapy_rU146nds_0qdYWgWSDmOTsdx_c,26249
|
155
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt,sha256=pbw4h8Q8kB5aP5exP_rfHFdU7efMjJ9aviLodafEg3I,62346
|
155
156
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt,sha256=6T5pSBSv2f8ekYtvS7Qmf7TGWpNE7l10ZPkTW5DAonA,352
|
156
157
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt,sha256=7plGpg13AZd0dOiYg2lKTKIOqjhoojM0v3lA3NAI8Pk,429
|
157
158
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py,sha256=xEqR65Z8YOzOLorLjK0LCHos2zX-tCuxSrxndjU00hE,49
|
@@ -195,9 +196,9 @@ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,126
|
|
195
196
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
196
197
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
197
198
|
magic_pdf/utils/office_to_pdf.py,sha256=bFRYe6v3-pfx5R8-bV8cmf12jPnOrYZsleKoECTXzbM,3958
|
198
|
-
magic_pdf-1.3.6.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
199
|
-
magic_pdf-1.3.
|
200
|
-
magic_pdf-1.3.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
201
|
-
magic_pdf-1.3.6.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
202
|
-
magic_pdf-1.3.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
203
|
-
magic_pdf-1.3.6.dist-info/RECORD,,
|
199
|
+
magic_pdf-1.3.8.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
200
|
+
magic_pdf-1.3.8.dist-info/METADATA,sha256=8DvNpL0env37CB35wE7uErGK_mxx3LVSC6Qgr1kttEM,47408
|
201
|
+
magic_pdf-1.3.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
202
|
+
magic_pdf-1.3.8.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
203
|
+
magic_pdf-1.3.8.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
204
|
+
magic_pdf-1.3.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|