magic-pdf 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +2 -2
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/pdf_extract_kit.py +24 -20
- {magic_pdf-0.6.0.dist-info → magic_pdf-0.6.1.dist-info}/METADATA +78 -5
- {magic_pdf-0.6.0.dist-info → magic_pdf-0.6.1.dist-info}/RECORD +9 -9
- {magic_pdf-0.6.0.dist-info → magic_pdf-0.6.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.6.0.dist-info → magic_pdf-0.6.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.6.0.dist-info → magic_pdf-0.6.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.6.0.dist-info → magic_pdf-0.6.1.dist-info}/top_level.txt +0 -0
@@ -112,7 +112,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
112
112
|
for line in block['lines']:
|
113
113
|
for span in line['spans']:
|
114
114
|
if span['type'] == ContentType.Image:
|
115
|
-
para_text += f"\n})\n"
|
115
|
+
para_text += f"\n}) \n"
|
116
116
|
for block in para_block['blocks']: # 2nd.拼image_caption
|
117
117
|
if block['type'] == BlockType.ImageCaption:
|
118
118
|
para_text += merge_para_with_text(block)
|
@@ -128,7 +128,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
128
128
|
for line in block['lines']:
|
129
129
|
for span in line['spans']:
|
130
130
|
if span['type'] == ContentType.Table:
|
131
|
-
para_text += f"\n})\n"
|
131
|
+
para_text += f"\n}) \n"
|
132
132
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
133
133
|
if block['type'] == BlockType.TableFootnote:
|
134
134
|
para_text += merge_para_with_text(block)
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.6.0"
|
1
|
+
__version__ = "0.6.1"
|
@@ -1,24 +1,28 @@
|
|
1
|
-
import os
|
2
|
-
import cv2
|
3
|
-
import yaml
|
4
|
-
import time
|
5
|
-
import argparse
|
6
|
-
import numpy as np
|
7
|
-
import torch
|
8
1
|
from loguru import logger
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
import
|
17
|
-
|
18
|
-
|
19
|
-
from
|
20
|
-
from
|
21
|
-
from
|
2
|
+
import os
|
3
|
+
try:
|
4
|
+
import cv2
|
5
|
+
import yaml
|
6
|
+
import time
|
7
|
+
import argparse
|
8
|
+
import numpy as np
|
9
|
+
import torch
|
10
|
+
|
11
|
+
from paddleocr import draw_ocr
|
12
|
+
from PIL import Image
|
13
|
+
from torchvision import transforms
|
14
|
+
from torch.utils.data import Dataset, DataLoader
|
15
|
+
from ultralytics import YOLO
|
16
|
+
from unimernet.common.config import Config
|
17
|
+
import unimernet.tasks as tasks
|
18
|
+
from unimernet.processors import load_processor
|
19
|
+
|
20
|
+
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
|
21
|
+
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
|
22
|
+
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
|
23
|
+
except ImportError:
|
24
|
+
logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
|
25
|
+
exit(1)
|
22
26
|
|
23
27
|
|
24
28
|
def mfd_model_init(weight):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.6.0
|
3
|
+
Version: 0.6.1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -106,23 +106,69 @@ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3
|
|
106
106
|
|
107
107
|
- Python >= 3.9
|
108
108
|
|
109
|
+
Using a virtual environment is recommended to avoid potential dependency conflicts; both venv and conda are suitable.
|
110
|
+
For example:
|
111
|
+
```bash
|
112
|
+
conda create -n MinerU python=3.10
|
113
|
+
conda activate MinerU
|
114
|
+
```
|
115
|
+
|
109
116
|
### Usage Instructions
|
110
117
|
|
111
118
|
#### 1. Install Magic-PDF
|
112
119
|
|
120
|
+
Install using pip:
|
113
121
|
```bash
|
114
122
|
pip install magic-pdf
|
115
123
|
```
|
124
|
+
Alternatively, for built-in high-precision model parsing capabilities, use:
|
125
|
+
```bash
|
126
|
+
pip install magic-pdf[full-cpu]
|
127
|
+
```
|
128
|
+
The high-precision models depend on detectron2, which requires a compiled installation.
|
129
|
+
If you need to compile it yourself, refer to https://github.com/facebookresearch/detectron2/issues/5114
|
130
|
+
Or directly use our pre-compiled wheel packages (limited to python 3.10):
|
131
|
+
```bash
|
132
|
+
pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
|
133
|
+
```
|
116
134
|
|
117
|
-
#### 2. Usage via Command Line
|
118
135
|
|
119
|
-
|
136
|
+
#### 2. Downloading model weights files
|
137
|
+
|
138
|
+
For details, see [how_to_download_models](docs/how_to_download_models_en.md).
|
120
139
|
|
140
|
+
After downloading the model weights, move the 'models' directory to a disk with plenty of free space, preferably an SSD.
|
141
|
+
|
142
|
+
|
143
|
+
#### 3. Copy the Configuration File and Make Configurations
|
144
|
+
You can get the [magic-pdf.template.json](magic-pdf.template.json) file in the repository root directory.
|
121
145
|
```bash
|
122
146
|
cp magic-pdf.template.json ~/magic-pdf.json
|
147
|
+
```
|
148
|
+
In magic-pdf.json, configure "models-dir" to point to the directory where the model weights files are located.
|
149
|
+
|
150
|
+
```json
|
151
|
+
{
|
152
|
+
"models-dir": "/tmp/models"
|
153
|
+
}
|
154
|
+
```
|
155
|
+
|
156
|
+
|
157
|
+
#### 4. Usage via Command Line
|
158
|
+
|
159
|
+
###### simple
|
160
|
+
|
161
|
+
```bash
|
162
|
+
magic-pdf pdf-command --pdf "pdf_path" --inside_model true
|
163
|
+
```
|
164
|
+
After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
|
165
|
+
You can find the corresponding xxx_model.json file in the markdown directory.
|
166
|
+
If you intend to do secondary development on the post-processing pipeline, you can use the command:
|
167
|
+
```bash
|
123
168
|
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
124
169
|
```
|
125
|
-
|
170
|
+
In this way, you won't need to re-run the model data, making debugging more convenient.
|
171
|
+
|
126
172
|
|
127
173
|
###### more
|
128
174
|
|
@@ -130,7 +176,34 @@ After the program has finished, you can find the generated markdown files under
|
|
130
176
|
magic-pdf --help
|
131
177
|
```
|
132
178
|
|
133
|
-
|
179
|
+
|
180
|
+
#### 5. Acceleration Using CUDA or MPS
|
181
|
+
|
182
|
+
##### CUDA
|
183
|
+
|
184
|
+
You need to install the corresponding PyTorch version according to your CUDA version.
|
185
|
+
This example installs the CUDA 11.8 version. For more information, see https://pytorch.org/get-started/locally/
|
186
|
+
```bash
|
187
|
+
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
|
188
|
+
```
|
189
|
+
Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json.
|
190
|
+
```json
|
191
|
+
{
|
192
|
+
"device-mode":"cuda"
|
193
|
+
}
|
194
|
+
```
|
195
|
+
|
196
|
+
##### MPS
|
197
|
+
|
198
|
+
For macOS users with M-series chip devices, you can use MPS for inference acceleration.
|
199
|
+
You also need to modify the value of "device-mode" in the configuration file magic-pdf.json.
|
200
|
+
```json
|
201
|
+
{
|
202
|
+
"device-mode":"mps"
|
203
|
+
}
|
204
|
+
```
|
205
|
+
|
206
|
+
#### 6. Usage via API
|
134
207
|
|
135
208
|
###### Local
|
136
209
|
```python
|
@@ -8,7 +8,7 @@ magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
magic_pdf/cli/magicpdf.py,sha256=EcTiX-MaiDc4Fv9qZ_UdjHt5tYnBEu6vlbp0w030sA0,12691
|
9
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
11
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=oV1x1N6AoVkRcQEdrCQ5uaFwOrmirosnGgTeqLzbJCA,15446
|
12
12
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
14
14
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -44,13 +44,13 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
44
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=baAcEjLSYFIeNZF51tOMmA_zAMhN8HvKael-UU-Ruec,22
|
48
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
49
|
magic_pdf/model/__init__.py,sha256=XeYcF4RMZ3DosyLqiz0_n1JVa2k5RhTwUXwKt5sAjEQ,53
|
50
50
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=kssz_Nn6zTYED_iEgGuFRjus947xoK5dTqj88FOehE0,3256
|
51
51
|
magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
|
52
52
|
magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
|
53
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
53
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=yuhelZaVq2NzuL0hygX61i0OmxfD1BIYhtZ0nFHwitw,8651
|
54
54
|
magic_pdf/model/pp_structure_v2.py,sha256=apYWwWiCjlks5CLXolcynnuPV7llCm2PdP-6tg0-Kt0,2903
|
55
55
|
magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
|
@@ -141,9 +141,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
141
141
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
142
142
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
143
143
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
144
|
-
magic_pdf-0.6.0.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
145
|
-
magic_pdf-0.6.
|
146
|
-
magic_pdf-0.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
147
|
-
magic_pdf-0.6.0.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
148
|
-
magic_pdf-0.6.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
149
|
-
magic_pdf-0.6.0.dist-info/RECORD,,
|
144
|
+
magic_pdf-0.6.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
145
|
+
magic_pdf-0.6.1.dist-info/METADATA,sha256=uu10UAPzDB8N0AMZGaftgw-OEVpbgOrxVgm5EykcC8w,9488
|
146
|
+
magic_pdf-0.6.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
147
|
+
magic_pdf-0.6.1.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
148
|
+
magic_pdf-0.6.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
149
|
+
magic_pdf-0.6.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|