mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. mineru/backend/hybrid/__init__.py +1 -0
  2. mineru/backend/hybrid/hybrid_analyze.py +526 -0
  3. mineru/backend/hybrid/hybrid_magic_model.py +617 -0
  4. mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
  5. mineru/backend/pipeline/batch_analyze.py +9 -1
  6. mineru/backend/pipeline/model_init.py +96 -1
  7. mineru/backend/pipeline/pipeline_analyze.py +6 -4
  8. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
  9. mineru/backend/vlm/utils.py +3 -1
  10. mineru/backend/vlm/vlm_analyze.py +12 -12
  11. mineru/backend/vlm/vlm_magic_model.py +24 -89
  12. mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
  13. mineru/cli/client.py +17 -17
  14. mineru/cli/common.py +169 -20
  15. mineru/cli/fast_api.py +39 -13
  16. mineru/cli/gradio_app.py +232 -206
  17. mineru/model/mfd/yolo_v8.py +12 -6
  18. mineru/model/mfr/unimernet/Unimernet.py +71 -3
  19. mineru/resources/header.html +5 -1
  20. mineru/utils/boxbase.py +23 -0
  21. mineru/utils/char_utils.py +55 -0
  22. mineru/utils/engine_utils.py +74 -0
  23. mineru/utils/enum_class.py +18 -1
  24. mineru/utils/magic_model_utils.py +85 -2
  25. mineru/utils/pdf_image_tools.py +37 -17
  26. mineru/utils/span_pre_proc.py +5 -3
  27. mineru/utils/table_merge.py +13 -22
  28. mineru/version.py +1 -1
  29. mineru-2.7.1.dist-info/METADATA +438 -0
  30. {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/RECORD +34 -28
  31. mineru-2.6.8.dist-info/METADATA +0 -954
  32. {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/WHEEL +0 -0
  33. {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/entry_points.txt +0 -0
  34. {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/licenses/LICENSE.md +0 -0
  35. {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,438 @@
1
+ Metadata-Version: 2.4
2
+ Name: mineru
3
+ Version: 2.7.1
4
+ Summary: A practical tool for converting PDF to Markdown
5
+ License: AGPL-3.0
6
+ Project-URL: homepage, https://mineru.net/
7
+ Project-URL: documentation, https://opendatalab.github.io/MinerU/
8
+ Project-URL: repository, https://github.com/opendatalab/MinerU
9
+ Project-URL: issues, https://github.com/opendatalab/MinerU/issues
10
+ Keywords: magic-pdf,mineru,MinerU,convert,pdf,markdown
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Python: <3.14,>=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE.md
18
+ Requires-Dist: boto3>=1.28.43
19
+ Requires-Dist: click>=8.1.7
20
+ Requires-Dist: loguru>=0.7.2
21
+ Requires-Dist: numpy>=1.21.6
22
+ Requires-Dist: pdfminer.six>=20251230
23
+ Requires-Dist: tqdm>=4.67.1
24
+ Requires-Dist: requests
25
+ Requires-Dist: httpx
26
+ Requires-Dist: pillow>=11.0.0
27
+ Requires-Dist: pypdfium2>=4.30.0
28
+ Requires-Dist: pypdf>=5.6.0
29
+ Requires-Dist: reportlab
30
+ Requires-Dist: pdftext>=0.6.3
31
+ Requires-Dist: modelscope>=1.26.0
32
+ Requires-Dist: huggingface-hub>=0.32.4
33
+ Requires-Dist: json-repair>=0.46.2
34
+ Requires-Dist: opencv-python>=4.11.0.86
35
+ Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
36
+ Requires-Dist: scikit-image<1.0.0,>=0.25.0
37
+ Requires-Dist: openai<3,>=1.70.0
38
+ Requires-Dist: beautifulsoup4<5,>=4.13.5
39
+ Requires-Dist: magika<1.1.0,>=0.6.2
40
+ Requires-Dist: mineru-vl-utils<1,>=0.1.19.1
41
+ Requires-Dist: qwen-vl-utils<1,>=0.0.14
42
+ Provides-Extra: test
43
+ Requires-Dist: mineru[core]; extra == "test"
44
+ Requires-Dist: pytest; extra == "test"
45
+ Requires-Dist: pytest-cov; extra == "test"
46
+ Requires-Dist: coverage; extra == "test"
47
+ Requires-Dist: fuzzywuzzy; extra == "test"
48
+ Provides-Extra: vlm
49
+ Requires-Dist: torch<3,>=2.6.0; extra == "vlm"
50
+ Requires-Dist: transformers!=4.57.2,<5.0.0,>=4.51.1; extra == "vlm"
51
+ Requires-Dist: accelerate>=1.5.1; extra == "vlm"
52
+ Provides-Extra: vllm
53
+ Requires-Dist: vllm<0.12,>=0.10.1.1; extra == "vllm"
54
+ Provides-Extra: lmdeploy
55
+ Requires-Dist: lmdeploy<0.12,>=0.10.2; extra == "lmdeploy"
56
+ Provides-Extra: mlx
57
+ Requires-Dist: mlx-vlm<0.4,>=0.3.3; extra == "mlx"
58
+ Provides-Extra: pipeline
59
+ Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
60
+ Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
61
+ Requires-Dist: doclayout_yolo==0.0.4; extra == "pipeline"
62
+ Requires-Dist: dill<1,>=0.3.8; extra == "pipeline"
63
+ Requires-Dist: PyYAML<7,>=6.0.2; extra == "pipeline"
64
+ Requires-Dist: ftfy<7,>=6.3.1; extra == "pipeline"
65
+ Requires-Dist: shapely<3,>=2.0.7; extra == "pipeline"
66
+ Requires-Dist: pyclipper<2,>=1.3.0; extra == "pipeline"
67
+ Requires-Dist: omegaconf<3,>=2.3.0; extra == "pipeline"
68
+ Requires-Dist: torch<3,>=2.6.0; extra == "pipeline"
69
+ Requires-Dist: torchvision; extra == "pipeline"
70
+ Requires-Dist: transformers!=4.51.0,<5.0.0,>=4.49.0; extra == "pipeline"
71
+ Requires-Dist: onnxruntime>1.17.0; extra == "pipeline"
72
+ Provides-Extra: api
73
+ Requires-Dist: fastapi; extra == "api"
74
+ Requires-Dist: python-multipart; extra == "api"
75
+ Requires-Dist: uvicorn; extra == "api"
76
+ Provides-Extra: gradio
77
+ Requires-Dist: gradio==5.49.1; extra == "gradio"
78
+ Requires-Dist: gradio-pdf==0.0.22; extra == "gradio"
79
+ Provides-Extra: core
80
+ Requires-Dist: mineru[vlm]; extra == "core"
81
+ Requires-Dist: mineru[pipeline]; extra == "core"
82
+ Requires-Dist: mineru[api]; extra == "core"
83
+ Requires-Dist: mineru[gradio]; extra == "core"
84
+ Provides-Extra: all
85
+ Requires-Dist: mineru[core]; extra == "all"
86
+ Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "all"
87
+ Requires-Dist: mineru[vllm]; sys_platform == "linux" and extra == "all"
88
+ Requires-Dist: mineru[lmdeploy]; sys_platform == "win32" and extra == "all"
89
+ Dynamic: license-file
90
+
91
+ <div align="center" xmlns="http://www.w3.org/1999/html">
92
+ <!-- logo -->
93
+ <p align="center">
94
+ <img src="https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docs/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
95
+ </p>
96
+
97
+ <!-- icon -->
98
+
99
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
100
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
101
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
102
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
103
+ [![PyPI version](https://img.shields.io/pypi/v/mineru)](https://pypi.org/project/mineru/)
104
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mineru)](https://pypi.org/project/mineru/)
105
+ [![Downloads](https://static.pepy.tech/badge/mineru)](https://pepy.tech/project/mineru)
106
+ [![Downloads](https://static.pepy.tech/badge/mineru/month)](https://pepy.tech/project/mineru)
107
+ [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTM0IiBoZWlnaHQ9IjEzNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48cGF0aCBkPSJtMTIyLDljMCw1LTQsOS05LDlzLTktNC05LTksNC05LDktOSw5LDQsOSw5eiIgZmlsbD0idXJsKCNhKSIvPjxwYXRoIGQ9Im0xMjIsOWMwLDUtNCw5LTksOXMtOS00LTktOSw0LTksOS05LDksNCw5LDl6IiBmaWxsPSIjMDEwMTAxIi8+PHBhdGggZD0ibTkxLDE4YzAsNS00LDktOSw5cy05LTQtOS05LDQtOSw5LTksOSw0LDksOXoiIGZpbGw9InVybCgjYikiLz48cGF0aCBkPSJtOTEsMThjMCw1LTQsOS05LDlzLTktNC05LTksNC05LDktOSw5LDQsOSw5eiIgZmlsbD0iIzAxMDEwMSIvPjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJtMzksNjJjMCwxNiw4LDMwLDIwLDM4LDctNiwxMi0xNiwxMi0yNlY0OWMwLTQsMy03LDYtOGw0Ni0xMmM1LTEsMTEsMywxMSw4djMxYzAsMzctMzAsNjYtNjYsNjYtMzcsMC02Ni0zMC02Ni02NlY0NmMwLTQsMy03LDYtOGwyMC02YzUtMSwxMSwzLDExLDh2MjF6bS0yOSw2YzAsMTYsNiwzMCwxNyw0MCwzLDEsNSwxLDgsMSw1LDAsMTAtMSwxNS0zQzM3LDk1LDI5LDc5LDI5LDYyVjQybC0xOSw1djIweiIgZmlsbD0idXJsKCNjKSIvPjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJtMzksNjJjMCwxNiw4LDMwLDIwLDM4LDctNiwxMi0xNiwxMi0yNlY0OWMwLTQsMy03LDYtOGw0Ni0xMmM1LTEsMTEsMywxMSw4djMxYzAsMzctMzAsNjYtNjYsNjYtMzcsMC02Ni0zMC02Ni02NlY0NmMwLTQsMy03LDYtOGwyMC02YzUtMSwxMSwzLDExLDh2MjF6bS0yOSw2YzAsMTYsNiwzMCwxNyw0MCwzLDEsNSwxLDgsMSw1LDAsMTAtMSwxNS0zQzM3LDk1LDI5LDc5LDI5LDYyVjQybC0xOSw1djIweiIgZmlsbD0iIzAxMDEwMSIvPjxkZWZzPjxsaW5lYXJHcmFkaWVudCBpZD0iYSIgeDE9Ijg0IiB5MT0iNDEiIHgyPSI3NSIgeTI9IjEyMCIgZ3JhZGllbnRVbml0cz0idXNlclNwYWNlT25Vc2UiPjxzdG9wIHN0b3AtY29sb3I9IiNmZmYiLz48c3RvcCBvZmZzZXQ9IjEiIHN0b3AtY29sb3I9IiMyZTJlMmUiLz48L2xpbmVhckdyYWRpZW50PjxsaW5lYXJHcmFkaWVudCBpZD0iYiIgeDE9Ijg0IiB5MT0iNDEiIHgyPSI3NSIgeTI9IjEyMCIgZ3JhZGllbnRVbml0cz0idXNlclNwYWNlT25Vc2UiPjxzdG9wIHN0b3AtY29sb3I9IiNmZmYiLz48c3RvcCBvZmZzZXQ9IjEiIHN0b3AtY29sb3I9IiMyZTJlMmUiLz48L2xpbmVhckdyYWRpZW50PjxsaW5lYXJHcmFkaWVudCBpZD0iYyIgeDE9Ijg0IiB5MT0iNDEiIHgyPSI3NSIgeTI9IjEyMCIgZ3JhZGllbnRVbml0cz0idXNlclNwYWNlT25Vc2UiPjxzdG9wIHN0b3AtY29sb3I9IiNmZmYiLz48c3RvcCBvZmZzZXQ9IjEiIHN0b3AtY29sb3I9IiMy
ZTJlMmUiLz48L2xpbmVhckdyYWRpZW50PjwvZGVmcz48L3N2Zz4=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
108
+ [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUVal
zjI7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
109
+ [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
110
+ [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/a3cb16570ab3cfeadf9d8f0ac91b4fca/mineru_demo.ipynb)
111
+ [![arXiv](https://img.shields.io/badge/MinerU-Technical%20Report-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2409.18839)
112
+ [![arXiv](https://img.shields.io/badge/MinerU2.5-Technical%20Report-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2509.22186)
113
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/opendatalab/MinerU)
114
+
115
+
116
+ <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
117
+
118
+ <!-- language -->
119
+
120
+ [English](README.md) | [简体中文](README_zh-CN.md)
121
+
122
+ <!-- hot link -->
123
+
124
+ <p align="center">
125
+ 🚀<a href="https://mineru.net/?source=github">Access MinerU Now→✅ Zero-Install Web Version ✅ Full-Featured Desktop Client ✅ Instant API Access; Skip deployment headaches – get all product formats in one click. Developers, dive in!</a>
126
+ </p>
127
+
128
+ <!-- join us -->
129
+
130
+ <p align="center">
131
+ 👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://mineru.net/community-portal/?aliasId=3c430f94" target="_blank">WeChat</a>
132
+ </p>
133
+
134
+ </div>
135
+
136
+ # Changelog
137
+
138
+ - 2026/01/06 2.7.1 Release
139
+ - fix bug: #4300
140
+ - Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
141
+ - Support automatic correction of input image exif orientation to improve OCR recognition accuracy #4283
142
+
143
+ - 2025/12/30 2.7.0 Release
144
+ - Simplified installation process. No need to separately install `vlm` acceleration engine dependencies. Using `uv pip install mineru[all]` during installation will install all optional backend dependencies.
145
+ - Added new `hybrid` backend, which combines the advantages of `pipeline` and `vlm` backends. Built on vlm, it integrates some capabilities of pipeline, adding extra extensibility on top of high accuracy:
146
+ - Directly extracts text from text PDFs, natively supports multi-language recognition in text PDF scenarios, and greatly reduces parsing hallucinations;
147
+ - Supports text recognition in 109 languages for scanned PDF scenarios by specifying OCR language;
148
+ - Independent inline formula recognition switch, which can be disabled separately when inline formula recognition is not needed, improving the visual effect of parsing results.
149
+ - Simplified engine selection logic for `vlm/hybrid` backends. Users only need to specify the backend as `*-auto-engine`, and the system will automatically select the appropriate engine for inference acceleration based on the current environment, improving usability.
150
+ - Switched default parsing backend from `pipeline` to `hybrid-auto-engine`, improving out-of-the-box result consistency for new users and avoiding confusion caused by differing parsing results across backends.
151
+ - Added i18n support to gradio application, supporting switching between Chinese and English languages.
152
+
153
+ > 📝 View the complete [Changelog](https://opendatalab.github.io/MinerU/reference/changelog/) for more historical version information
154
+
155
+ # MinerU
156
+
157
+ ## Project Introduction
158
+
159
+ MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
160
+ MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
161
+ Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an [issue](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
162
+
163
+ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
164
+
165
+ ## Key Features
166
+
167
+ - Remove headers, footers, footnotes, page numbers, etc., to ensure semantic coherence.
168
+ - Output text in human-readable order, suitable for single-column, multi-column, and complex layouts.
169
+ - Preserve the structure of the original document, including headings, paragraphs, lists, etc.
170
+ - Extract images, image descriptions, tables, table titles, and footnotes.
171
+ - Automatically recognize and convert formulas in the document to LaTeX format.
172
+ - Automatically recognize and convert tables in the document to HTML format.
173
+ - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
174
+ - OCR supports detection and recognition of 109 languages.
175
+ - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
176
+ - Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality.
177
+ - Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration.
178
+ - Compatible with Windows, Linux, and Mac platforms.
179
+
180
+ # Quick Start
181
+
182
+ If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. <br/>
183
+ If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. <br/>
184
+
185
+ ## Online Experience
186
+
187
+ ### Official online web application
188
+ The official online version has the same functionality as the client, with a beautiful interface and rich features. Login is required.
189
+
190
+ - [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTM0IiBoZWlnaHQ9IjEzNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48cGF0aCBkPSJtMTIyLDljMCw1LTQsOS05LDlzLTktNC05LTksNC05LDktOSw5LDQsOSw5eiIgZmlsbD0idXJsKCNhKSIvPjxwYXRoIGQ9Im0xMjIsOWMwLDUtNCw5LTksOXMtOS00LTktOSw0LTksOS05LDksNCw5LDl6IiBmaWxsPSIjMDEwMTAxIi8+PHBhdGggZD0ibTkxLDE4YzAsNS00LDktOSw5cy05LTQtOS05LDQtOSw5LTksOSw0LDksOXoiIGZpbGw9InVybCgjYikiLz48cGF0aCBkPSJtOTEsMThjMCw1LTQsOS05LDlzLTktNC05LTksNC05LDktOSw5LDQsOSw5eiIgZmlsbD0iIzAxMDEwMSIvPjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJtMzksNjJjMCwxNiw4LDMwLDIwLDM4LDctNiwxMi0xNiwxMi0yNlY0OWMwLTQsMy03LDYtOGw0Ni0xMmM1LTEsMTEsMywxMSw4djMxYzAsMzctMzAsNjYtNjYsNjYtMzcsMC02Ni0zMC02Ni02NlY0NmMwLTQsMy03LDYtOGwyMC02YzUtMSwxMSwzLDExLDh2MjF6bS0yOSw2YzAsMTYsNiwzMCwxNyw0MCwzLDEsNSwxLDgsMSw1LDAsMTAtMSwxNS0zQzM3LDk1LDI5LDc5LDI5LDYyVjQybC0xOSw1djIweiIgZmlsbD0idXJsKCNjKSIvPjxwYXRoIGZpbGwtcnVsZT0iZXZlbm9kZCIgY2xpcC1ydWxlPSJldmVub2RkIiBkPSJtMzksNjJjMCwxNiw4LDMwLDIwLDM4LDctNiwxMi0xNiwxMi0yNlY0OWMwLTQsMy03LDYtOGw0Ni0xMmM1LTEsMTEsMywxMSw4djMxYzAsMzctMzAsNjYtNjYsNjYtMzcsMC02Ni0zMC02Ni02NlY0NmMwLTQsMy03LDYtOGwyMC02YzUtMSwxMSwzLDExLDh2MjF6bS0yOSw2YzAsMTYsNiwzMCwxNyw0MCwzLDEsNSwxLDgsMSw1LDAsMTAtMSwxNS0zQzM3LDk1LDI5LDc5LDI5LDYyVjQybC0xOSw1djIweiIgZmlsbD0iIzAxMDEwMSIvPjxkZWZzPjxsaW5lYXJHcmFkaWVudCBpZD0iYSIgeDE9Ijg0IiB5MT0iNDEiIHgyPSI3NSIgeTI9IjEyMCIgZ3JhZGllbnRVbml0cz0idXNlclNwYWNlT25Vc2UiPjxzdG9wIHN0b3AtY29sb3I9IiNmZmYiLz48c3RvcCBvZmZzZXQ9IjEiIHN0b3AtY29sb3I9IiMyZTJlMmUiLz48L2xpbmVhckdyYWRpZW50PjxsaW5lYXJHcmFkaWVudCBpZD0iYiIgeDE9Ijg0IiB5MT0iNDEiIHgyPSI3NSIgeTI9IjEyMCIgZ3JhZGllbnRVbml0cz0idXNlclNwYWNlT25Vc2UiPjxzdG9wIHN0b3AtY29sb3I9IiNmZmYiLz48c3RvcCBvZmZzZXQ9IjEiIHN0b3AtY29sb3I9IiMyZTJlMmUiLz48L2xpbmVhckdyYWRpZW50PjxsaW5lYXJHcmFkaWVudCBpZD0iYyIgeDE9Ijg0IiB5MT0iNDEiIHgyPSI3NSIgeTI9IjEyMCIgZ3JhZGllbnRVbml0cz0idXNlclNwYWNlT25Vc2UiPjxzdG9wIHN0b3AtY29sb3I9IiNmZmYiLz48c3RvcCBvZmZzZXQ9IjEiIHN0b3AtY29sb3I9Ii
MyZTJlMmUiLz48L2xpbmVhckdyYWRpZW50PjwvZGVmcz48L3N2Zz4=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
191
+
192
+ ### Gradio-based online demo
193
+ A WebUI built with Gradio, with a simple interface and only core parsing functionality. No login is required.
194
+
195
+ - [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
196
+ - [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUV
alzjI7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
197
+
198
+ ## Local Deployment
199
+
200
+
201
+ > [!WARNING]
202
+ > **Pre-installation Notice—Hardware and Software Environment Support**
203
+ >
204
+ > To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
205
+ >
206
+ > By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
207
+ >
208
+ > In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
209
+
210
+ <table>
211
+ <thead>
212
+ <tr>
213
+ <th rowspan="2">Parsing Backend</th>
214
+ <th rowspan="2">pipeline</th>
215
+ <th colspan="2">*-auto-engine</th>
216
+ <th colspan="2">*-http-client</th>
217
+ </tr>
218
+ <tr>
219
+ <th>hybrid</th>
220
+ <th>vlm</th>
221
+ <th>hybrid</th>
222
+ <th>vlm</th>
223
+ </tr>
224
+ </thead>
225
+ <tbody>
226
+ <tr>
227
+ <th>Backend Features</th>
228
+ <td >Good Compatibility</td>
229
+ <td colspan="2">High Hardware Requirements</td>
230
+ <td colspan="2">For OpenAI Compatible Servers<sup>2</sup></td>
231
+ </tr>
232
+ <tr>
233
+ <th>Accuracy<sup>1</sup></th>
234
+ <td style="text-align:center;">82+</td>
235
+ <td colspan="4" style="text-align:center;">90+</td>
236
+ </tr>
237
+ <tr>
238
+ <th>Operating System</th>
239
+ <td colspan="5" style="text-align:center;">Linux<sup>3</sup> / Windows<sup>4</sup> / macOS<sup>5</sup></td>
240
+ </tr>
241
+ <tr>
242
+ <th>Pure CPU Support</th>
243
+ <td style="text-align:center;">✅</td>
244
+ <td colspan="2" style="text-align:center;">❌</td>
245
+ <td colspan="2" style="text-align:center;">✅</td>
246
+ </tr>
247
+ <tr>
248
+ <th>GPU Acceleration</th>
249
+ <td colspan="4" style="text-align:center;">Volta and later architecture GPUs or Apple Silicon</td>
250
+ <td rowspan="2">Not Required</td>
251
+ </tr>
252
+ <tr>
253
+ <th>Min VRAM</th>
254
+ <td style="text-align:center;">6GB</td>
255
+ <td style="text-align:center;">10GB</td>
256
+ <td style="text-align:center;">8GB</td>
257
+ <td style="text-align:center;">3GB</td>
258
+ </tr>
259
+ <tr>
260
+ <th>RAM</th>
261
+ <td colspan="3" style="text-align:center;">Min 16GB, Recommended 32GB or more</td>
262
+ <td colspan="2" style="text-align:center;">Min 8GB</td>
263
+ </tr>
264
+ <tr>
265
+ <th>Disk Space</th>
266
+ <td colspan="3" style="text-align:center;">Min 20GB, SSD Recommended</td>
267
+ <td colspan="2" style="text-align:center;">Min 2GB</td>
268
+ </tr>
269
+ <tr>
270
+ <th>Python Version</th>
271
+ <td colspan="5" style="text-align:center;">3.10-3.13</td>
272
+ </tr>
273
+ </tbody>
274
+ </table>
275
+
276
+ <sup>1</sup> Accuracy metrics are the End-to-End Evaluation Overall scores from OmniDocBench (v1.5), based on the latest version of `MinerU`.
277
+ <sup>2</sup> Servers compatible with OpenAI API, such as local model servers or remote model services deployed via inference frameworks like `vLLM`/`SGLang`/`LMDeploy`.
278
+ <sup>3</sup> Linux only supports distributions from 2019 and later.
279
+ <sup>4</sup> Since the key dependency `ray` does not support Python 3.13 on Windows, only versions 3.10~3.12 are supported.
280
+ <sup>5</sup> macOS requires version 14.0 or later.
281
+
282
+
283
+ ### Install MinerU
284
+
285
+ #### Install MinerU using pip or uv
286
+ ```bash
287
+ pip install --upgrade pip
288
+ pip install uv
289
+ uv pip install -U "mineru[all]"
290
+ ```
291
+
292
+ #### Install MinerU from source code
293
+ ```bash
294
+ git clone https://github.com/opendatalab/MinerU.git
295
+ cd MinerU
296
+ uv pip install -e .[all]
297
+ ```
298
+
299
+ > [!TIP]
300
+ > `mineru[all]` includes all core features, compatible with Windows / Linux / macOS systems, suitable for most users.
301
+ > If you need to specify the inference framework for the VLM model, or only intend to install a lightweight client on an edge device, please refer to the documentation [Extension Modules Installation Guide](https://opendatalab.github.io/MinerU/quick_start/extension_modules/).
302
+
303
+ ---
304
+
305
+ #### Deploy MinerU using Docker
306
+ MinerU provides a convenient Docker deployment method, which helps quickly set up the environment and solve some tricky environment compatibility issues.
307
+ You can get the [Docker Deployment Instructions](https://opendatalab.github.io/MinerU/quick_start/docker_deployment/) in the documentation.
308
+
309
+ ---
310
+
311
+ ### Using MinerU
312
+
313
+
314
+ If your device meets the GPU acceleration requirements in the table above, you can use a simple command line for document parsing:
315
+ ```bash
316
+ mineru -p <input_path> -o <output_path>
317
+ ```
318
+ If your device does not meet the GPU acceleration requirements, you can specify the backend as `pipeline` to run in a pure CPU environment:
319
+ ```bash
320
+ mineru -p <input_path> -o <output_path> -b pipeline
321
+ ```
322
+
323
+ You can use MinerU for PDF parsing through various methods such as command line, API, and WebUI. For detailed instructions, please refer to the [Usage Guide](https://opendatalab.github.io/MinerU/usage/).
324
+
325
+ # TODO
326
+
327
+ - [x] Reading order based on the model
328
+ - [x] Recognition of `index` and `list` in the main text
329
+ - [x] Table recognition
330
+ - [x] Heading Classification
331
+ - [x] Handwritten Text Recognition
332
+ - [x] Vertical Text Recognition
333
+ - [x] Latin Accent Mark Recognition
334
+ - [x] Code block recognition in the main text
335
+ - [x] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf) (mineru.net)
336
+ - [ ] Geometric shape recognition
337
+
338
+ # Known Issues
339
+
340
+ - Reading order is determined by the model based on the spatial distribution of readable content, and may be out of order in some areas under extremely complex layouts.
341
+ - Limited support for vertical text.
342
+ - Tables of contents and lists are recognized through rules, and some uncommon list formats may not be recognized.
343
+ - Code blocks are not yet supported in the layout model.
344
+ - Comic books, art albums, primary school textbooks, and exercises cannot be parsed well.
345
+ - Table recognition may result in row/column recognition errors in complex tables.
346
+ - OCR may produce inaccurate characters in PDFs written in less widely used languages (e.g., diacritical marks in Latin script, easily confused characters in Arabic script).
347
+ - Some formulas may not render correctly in Markdown.
348
+
349
+ # FAQ
350
+
351
+ - If you encounter any issues during usage, you can first check the [FAQ](https://opendatalab.github.io/MinerU/faq/) for solutions.
352
+ - If your issue remains unresolved, you may also use [DeepWiki](https://deepwiki.com/opendatalab/MinerU) to interact with an AI assistant, which can address most common problems.
353
+ - If you still cannot resolve the issue, you are welcome to join our community via [Discord](https://discord.gg/Tdedn9GTXq) or [WeChat](https://mineru.net/community-portal/?aliasId=3c430f94) to discuss with other users and developers.
354
+
355
+ # All Thanks To Our Contributors
356
+
357
+ <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
358
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
359
+ </a>
360
+
361
+ # License Information
362
+
363
+ [LICENSE.md](LICENSE.md)
364
+
365
+ Currently, some models in this project are trained based on YOLO. However, since YOLO follows the AGPL license, it may impose restrictions on certain use cases. In future iterations, we plan to explore and replace these with models under more permissive licenses to enhance user-friendliness and flexibility.
366
+
367
+ # Acknowledgments
368
+
369
+ - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
370
+ - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
371
+ - [UniMERNet](https://github.com/opendatalab/UniMERNet)
372
+ - [RapidTable](https://github.com/RapidAI/RapidTable)
373
+ - [TableStructureRec](https://github.com/RapidAI/TableStructureRec)
374
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
375
+ - [PaddleOCR2Pytorch](https://github.com/frotms/PaddleOCR2Pytorch)
376
+ - [layoutreader](https://github.com/ppaanngggg/layoutreader)
377
+ - [xy-cut](https://github.com/Sanster/xy-cut)
378
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
379
+ - [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
380
+ - [pdftext](https://github.com/datalab-to/pdftext)
381
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
382
+ - [pypdf](https://github.com/py-pdf/pypdf)
383
+ - [magika](https://github.com/google/magika)
384
+ - [vLLM](https://github.com/vllm-project/vllm)
385
+ - [LMDeploy](https://github.com/InternLM/lmdeploy)
386
+
387
+ # Citation
388
+
389
+ ```bibtex
390
+ @misc{niu2025mineru25decoupledvisionlanguagemodel,
391
+ title={MinerU2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing},
392
+ author={Junbo Niu and Zheng Liu and Zhuangcheng Gu and Bin Wang and Linke Ouyang and Zhiyuan Zhao and Tao Chu and Tianyao He and Fan Wu and Qintong Zhang and Zhenjiang Jin and Guang Liang and Rui Zhang and Wenzheng Zhang and Yuan Qu and Zhifei Ren and Yuefeng Sun and Yuanhong Zheng and Dongsheng Ma and Zirui Tang and Boyu Niu and Ziyang Miao and Hejun Dong and Siyi Qian and Junyuan Zhang and Jingzhou Chen and Fangdong Wang and Xiaomeng Zhao and Liqun Wei and Wei Li and Shasha Wang and Ruiliang Xu and Yuanyuan Cao and Lu Chen and Qianqian Wu and Huaiyu Gu and Lindong Lu and Keming Wang and Dechen Lin and Guanlin Shen and Xuanhe Zhou and Linfeng Zhang and Yuhang Zang and Xiaoyi Dong and Jiaqi Wang and Bo Zhang and Lei Bai and Pei Chu and Weijia Li and Jiang Wu and Lijun Wu and Zhenxiang Li and Guangyu Wang and Zhongying Tu and Chao Xu and Kai Chen and Yu Qiao and Bowen Zhou and Dahua Lin and Wentao Zhang and Conghui He},
393
+ year={2025},
394
+ eprint={2509.22186},
395
+ archivePrefix={arXiv},
396
+ primaryClass={cs.CV},
397
+ url={https://arxiv.org/abs/2509.22186},
398
+ }
399
+
400
+ @misc{wang2024mineruopensourcesolutionprecise,
401
+ title={MinerU: An Open-Source Solution for Precise Document Content Extraction},
402
+ author={Bin Wang and Chao Xu and Xiaomeng Zhao and Linke Ouyang and Fan Wu and Zhiyuan Zhao and Rui Xu and Kaiwen Liu and Yuan Qu and Fukai Shang and Bo Zhang and Liqun Wei and Zhihao Sui and Wei Li and Botian Shi and Yu Qiao and Dahua Lin and Conghui He},
403
+ year={2024},
404
+ eprint={2409.18839},
405
+ archivePrefix={arXiv},
406
+ primaryClass={cs.CV},
407
+ url={https://arxiv.org/abs/2409.18839},
408
+ }
409
+
410
+ @article{he2024opendatalab,
411
+ title={Opendatalab: Empowering general artificial intelligence with open datasets},
412
+ author={He, Conghui and Li, Wei and Jin, Zhenjiang and Xu, Chao and Wang, Bin and Lin, Dahua},
413
+ journal={arXiv preprint arXiv:2407.13773},
414
+ year={2024}
415
+ }
416
+ ```
417
+
418
+ # Star History
419
+
420
+ <a>
421
+ <picture>
422
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
423
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
424
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
425
+ </picture>
426
+ </a>
427
+
428
+
429
+ # Links
430
+ - [Easy Data Preparation with latest LLMs-based Operators and Pipelines](https://github.com/OpenDCAI/DataFlow)
431
+ - [Vis3 (OSS browser based on s3)](https://github.com/opendatalab/Vis3)
432
+ - [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
433
+ - [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
434
+ - [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
435
+ - [OmniDocBench (A Comprehensive Benchmark for Document Parsing and Evaluation)](https://github.com/opendatalab/OmniDocBench)
436
+ - [Magic-HTML (Mixed web page extraction tool)](https://github.com/opendatalab/magic-html)
437
+ - [Magic-Doc (Fast ppt/pptx/doc/docx/pdf extraction tool)](https://github.com/InternLM/magic-doc)
438
+ - [Dingo: A Comprehensive AI Data Quality Evaluation Tool](https://github.com/MigoXLab/dingo)
@@ -1,27 +1,31 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=3WRCYKawwH5XZgRMQ8fXEoGyCz5YorbyTfOWuY9SV3g,22
2
+ mineru/version.py,sha256=yRpSH6mBb4BJgbFlT7rt8MSjCUW17Ycx0RziLf-lQLA,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
+ mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
6
+ mineru/backend/hybrid/hybrid_analyze.py,sha256=Sckw6T-pvMv3V_nqZkBeW8kY4zNIBlWxqeS2vXqNqtY,20939
7
+ mineru/backend/hybrid/hybrid_magic_model.py,sha256=39ByeZh54KBbPe77bzGCqZrZ5RNwNxGYttcoisgDOrc,24668
8
+ mineru/backend/hybrid/hybrid_model_output_to_middle_json.py,sha256=yE-c1eGa5LzPqLfKfvBON_SJRljqyz2B7LiglFcE7FQ,8468
5
9
  mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
6
- mineru/backend/pipeline/batch_analyze.py,sha256=gnilKhFlMe8-55X2PJnb-ZSVeZIS-5DxIbMpHnwLne8,20889
7
- mineru/backend/pipeline/model_init.py,sha256=OAylOcQD9gu5TBcX7nMt7X5NpJMtQICI5IvEQ648lpI,9358
10
+ mineru/backend/pipeline/batch_analyze.py,sha256=3UBs2WOwcI-mfGAlxZt437OqSOleXPLnpYbrD9h5D54,21303
11
+ mineru/backend/pipeline/model_init.py,sha256=wz8VdieUXCxzS9NQqkmaiXweey8xz-_DBvmC1OuK_B4,12469
8
12
  mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
9
13
  mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
10
14
  mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
11
- mineru/backend/pipeline/pipeline_analyze.py,sha256=GkGOrWGnBSswUik3nt_m76bCwDISC9sxXZ6xRX3L154,6528
15
+ mineru/backend/pipeline/pipeline_analyze.py,sha256=RPQTtu-m4WYVQxIRKLj39VPyaKqroi5Tc9-FCEuQhX8,6599
12
16
  mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
13
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=YlnEbbUnkniZXS13aLo5mjfFQvQM5SrIVvTAGBZsLmw,14478
17
+ mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
14
18
  mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
15
19
  mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
16
- mineru/backend/vlm/utils.py,sha256=JMgS3SMFcHJYH2jIx-Xhs-P2a1bmT8U6Kn60IL0OmQA,3570
17
- mineru/backend/vlm/vlm_analyze.py,sha256=wP3vuYGVec0hRsDAuzfSm2HD4Muu7wSWL767qxd_yqw,11690
18
- mineru/backend/vlm/vlm_magic_model.py,sha256=e8BWkfeRkZNJjFdm9oPmvIs9ATjdzCy_5OZw2qTziZA,23839
19
- mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=Oc2w2BSRXeJPKVcM1Bkb2LjLNTQAG5yUYRtF7YNBTss,24070
20
+ mineru/backend/vlm/utils.py,sha256=sqcS4WVCcxVL1aElKII1zNYMu2yH5tRpVqRb4lXVm38,3650
21
+ mineru/backend/vlm/vlm_analyze.py,sha256=EdfEmkroA3lafRZLqN4uOaLWx9oxVnUocqFsWZyS82c,11666
22
+ mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflbB4a-TmsU,21402
23
+ mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
20
24
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
21
- mineru/cli/client.py,sha256=XSEIr4klUuufMAWn5IioZdXpg1xAxqRZF0HkaVIhxh0,6815
22
- mineru/cli/common.py,sha256=Rld3P4FzbTYngy3BP-Irto9aMS7EPq2lIWgeIF1xPMc,14667
23
- mineru/cli/fast_api.py,sha256=lLxQKKHmD8ruoZGcE6LrXzr3pQIxvw8OdJrQq_FNLSM,14447
24
- mineru/cli/gradio_app.py,sha256=EUPuRHHCOECrE3E3VNEeuMDYeC3nicurOYfk8YJSOMw,15646
25
+ mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
26
+ mineru/cli/common.py,sha256=fMPc235DtnupQkh9uFIMHUpxOSvCp5yc3A56sAabAWY,20475
27
+ mineru/cli/fast_api.py,sha256=TGpZqyUE1kg2eXsP76pr0p1yqNOOU9jyjL5Pc0FJwRc,16637
28
+ mineru/cli/gradio_app.py,sha256=2IIWOm2bEHHq5BZMlfmN3yAJw1Nf8SUALTQ95o-bYy0,21863
25
29
  mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
26
30
  mineru/cli/vlm_server.py,sha256=27HaqO3wpMXSA_nA3CC6JOBTHK3q66SP00cD6m9HuQE,1974
27
31
  mineru/data/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
@@ -43,13 +47,13 @@ mineru/model/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
43
47
  mineru/model/layout/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
44
48
  mineru/model/layout/doclayoutyolo.py,sha256=DttINdulzTiYcVDl_70oDtUdfVmGc9qkKWmbPOGAeV0,3867
45
49
  mineru/model/mfd/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
46
- mineru/model/mfd/yolo_v8.py,sha256=3zrxPQWgrSdq13CqcL9dNtZ8oJPLjQzH10hptNA1iLA,3492
50
+ mineru/model/mfd/yolo_v8.py,sha256=t7ptmShHoZCW9WkqLNCL1KRChxpa2E7j5g5fibXlUvY,3681
47
51
  mineru/model/mfr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
48
52
  mineru/model/mfr/utils.py,sha256=pAi1HnkTuO0R6251Hdl-o50m0wH0Ce89PAf74WCsXPU,11499
49
53
  mineru/model/mfr/pp_formulanet_plus_m/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
54
  mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=alGX_sPJxZh_7v1sOK3DJ8akfkWO-2c5I_JR7aXMTLU,5588
51
55
  mineru/model/mfr/pp_formulanet_plus_m/processors.py,sha256=MSKyanxiDDjgDQHBov-GjKtPnMx9tSmxBC9GIkM3ft8,23832
52
- mineru/model/mfr/unimernet/Unimernet.py,sha256=MrW6F084EHBmD-IbbtKbllrZ6MSH65otMJfrhBuRweg,5589
56
+ mineru/model/mfr/unimernet/Unimernet.py,sha256=1SGLSQ2rc6oslnEwP4Ti7JxaNlyCSGge0js-Tr1VikE,7864
53
57
  mineru/model/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
58
  mineru/model/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
55
59
  mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=_lN3zDKxeqsW-h9tXx79DYiT5uT4P9ixG49WrSYKFxE,7551
@@ -148,40 +152,42 @@ mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7ig
148
152
  mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
153
  mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
150
154
  mineru/model/vlm/vllm_server.py,sha256=w5ddusPbcVaEoWAo_BRjmwv_Ywxrc_bCMRhxihoyykY,2263
151
- mineru/resources/header.html,sha256=PUselBXLBn8gfeP3zwEtj6zIxfhcCN4vN_B796nQFNQ,4410
155
+ mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
152
156
  mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
153
157
  mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
154
158
  mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
155
159
  mineru/utils/block_sort.py,sha256=5e1mOLB3W7xu5Y1hmhvGSHPL_aQ41R_4VXcP4vjYAOU,12976
156
- mineru/utils/boxbase.py,sha256=moP660AmZq_udHEsfvFkTQdJ4gjrrBwN7t0Enx7CIL8,6903
160
+ mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
161
+ mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
157
162
  mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
158
163
  mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
159
164
  mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
160
165
  mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
161
166
  mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
162
- mineru/utils/enum_class.py,sha256=NO9FiM2Ni6-SI1rzMm5X98ssNDxgJTyykZddJSscgJs,3221
167
+ mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
168
+ mineru/utils/enum_class.py,sha256=Wejt7zX-EbmvQM_aiIPCS4cFN_K0-8OYzlO5Pyysugc,3728
163
169
  mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,10233
164
170
  mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6of8eDVkc,1198
165
171
  mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
166
172
  mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
167
173
  mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
168
- mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZSaYs,5650
174
+ mineru/utils/magic_model_utils.py,sha256=I6vdN56aqhQBGOasoWHiJbjnXsBwUojw6xFjbWZSHaU,8656
169
175
  mineru/utils/model_utils.py,sha256=6moOQqE5ShHaJKkENXP8BXJA7RCWtOGlYHZ3nidwmZs,18977
170
176
  mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
171
177
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
172
178
  mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
173
179
  mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
174
- mineru/utils/pdf_image_tools.py,sha256=86_xvsGOEde5QGlKz5uJemjoO1upr6n_K7o3lCdyIjQ,7981
180
+ mineru/utils/pdf_image_tools.py,sha256=L2kHKoFaQo4CGjS1d68JACrlBycx6gyCnnFlbBFRKuw,8273
175
181
  mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
176
182
  mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
177
183
  mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
178
184
  mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
179
185
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
180
- mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
181
- mineru/utils/table_merge.py,sha256=pZHP0mIUcWPcdu0HecZDEFZ_ms_rT7C43m0Eg2iEEF4,15592
182
- mineru-2.6.8.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
183
- mineru-2.6.8.dist-info/METADATA,sha256=nbMEGfGHC_Z7OUDegE18y0lrzSXaQCTtpWDYi_KZVSE,73142
184
- mineru-2.6.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
185
- mineru-2.6.8.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
186
- mineru-2.6.8.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
187
- mineru-2.6.8.dist-info/RECORD,,
186
+ mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
187
+ mineru/utils/table_merge.py,sha256=X2vQCCKx8hG9Iipn4UEP8pXHc9jeNmYNYvl5zxaTS2E,15185
188
+ mineru-2.7.1.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
+ mineru-2.7.1.dist-info/METADATA,sha256=gtaeoZmMvmHA8JDW1QnpszDa0-cTwogQ-5BOPTdikWA,35540
190
+ mineru-2.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
191
+ mineru-2.7.1.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
+ mineru-2.7.1.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
+ mineru-2.7.1.dist-info/RECORD,,