magic-pdf 0.5.12__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. magic_pdf/cli/magicpdf.py +23 -8
  2. magic_pdf/libs/config_reader.py +10 -0
  3. magic_pdf/libs/language.py +3 -3
  4. magic_pdf/libs/version.py +1 -1
  5. magic_pdf/model/__init__.py +1 -0
  6. magic_pdf/model/doc_analyze_by_custom_model.py +38 -15
  7. magic_pdf/model/model_list.py +1 -0
  8. magic_pdf/model/pdf_extract_kit.py +196 -0
  9. magic_pdf/model/pek_sub_modules/__init__.py +0 -0
  10. magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py +0 -0
  11. magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py +179 -0
  12. magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py +671 -0
  13. magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py +476 -0
  14. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py +7 -0
  15. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py +2 -0
  16. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py +171 -0
  17. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py +124 -0
  18. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py +136 -0
  19. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py +284 -0
  20. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py +213 -0
  21. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py +7 -0
  22. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +24 -0
  23. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +60 -0
  24. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +1282 -0
  25. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +32 -0
  26. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +34 -0
  27. magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +150 -0
  28. magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py +163 -0
  29. magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py +1236 -0
  30. magic_pdf/model/pek_sub_modules/post_process.py +36 -0
  31. magic_pdf/model/pek_sub_modules/self_modify.py +260 -0
  32. magic_pdf/model/pp_structure_v2.py +7 -0
  33. magic_pdf/pipe/AbsPipe.py +8 -14
  34. magic_pdf/pipe/OCRPipe.py +12 -8
  35. magic_pdf/pipe/TXTPipe.py +12 -8
  36. magic_pdf/pipe/UNIPipe.py +9 -7
  37. magic_pdf/resources/model_config/UniMERNet/demo.yaml +46 -0
  38. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +351 -0
  39. magic_pdf/resources/model_config/model_configs.yaml +9 -0
  40. {magic_pdf-0.5.12.dist-info → magic_pdf-0.6.0.dist-info}/METADATA +68 -34
  41. {magic_pdf-0.5.12.dist-info → magic_pdf-0.6.0.dist-info}/RECORD +45 -19
  42. magic_pdf/model/360_layout_analysis.py +0 -8
  43. {magic_pdf-0.5.12.dist-info → magic_pdf-0.6.0.dist-info}/LICENSE.md +0 -0
  44. {magic_pdf-0.5.12.dist-info → magic_pdf-0.6.0.dist-info}/WHEEL +0 -0
  45. {magic_pdf-0.5.12.dist-info → magic_pdf-0.6.0.dist-info}/entry_points.txt +0 -0
  46. {magic_pdf-0.5.12.dist-info → magic_pdf-0.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,351 @@
1
+ AUG:
2
+ DETR: true
3
+ CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface
4
+ CUDNN_BENCHMARK: false
5
+ DATALOADER:
6
+ ASPECT_RATIO_GROUPING: true
7
+ FILTER_EMPTY_ANNOTATIONS: false
8
+ NUM_WORKERS: 4
9
+ REPEAT_THRESHOLD: 0.0
10
+ SAMPLER_TRAIN: TrainingSampler
11
+ DATASETS:
12
+ PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
13
+ PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
14
+ PROPOSAL_FILES_TEST: []
15
+ PROPOSAL_FILES_TRAIN: []
16
+ TEST:
17
+ - scihub_train
18
+ TRAIN:
19
+ - scihub_train
20
+ GLOBAL:
21
+ HACK: 1.0
22
+ ICDAR_DATA_DIR_TEST: ''
23
+ ICDAR_DATA_DIR_TRAIN: ''
24
+ INPUT:
25
+ CROP:
26
+ ENABLED: true
27
+ SIZE:
28
+ - 384
29
+ - 600
30
+ TYPE: absolute_range
31
+ FORMAT: RGB
32
+ MASK_FORMAT: polygon
33
+ MAX_SIZE_TEST: 1333
34
+ MAX_SIZE_TRAIN: 1333
35
+ MIN_SIZE_TEST: 800
36
+ MIN_SIZE_TRAIN:
37
+ - 480
38
+ - 512
39
+ - 544
40
+ - 576
41
+ - 608
42
+ - 640
43
+ - 672
44
+ - 704
45
+ - 736
46
+ - 768
47
+ - 800
48
+ MIN_SIZE_TRAIN_SAMPLING: choice
49
+ RANDOM_FLIP: horizontal
50
+ MODEL:
51
+ ANCHOR_GENERATOR:
52
+ ANGLES:
53
+ - - -90
54
+ - 0
55
+ - 90
56
+ ASPECT_RATIOS:
57
+ - - 0.5
58
+ - 1.0
59
+ - 2.0
60
+ NAME: DefaultAnchorGenerator
61
+ OFFSET: 0.0
62
+ SIZES:
63
+ - - 32
64
+ - - 64
65
+ - - 128
66
+ - - 256
67
+ - - 512
68
+ BACKBONE:
69
+ FREEZE_AT: 2
70
+ NAME: build_vit_fpn_backbone
71
+ CONFIG_PATH: ''
72
+ DEVICE: cuda
73
+ FPN:
74
+ FUSE_TYPE: sum
75
+ IN_FEATURES:
76
+ - layer3
77
+ - layer5
78
+ - layer7
79
+ - layer11
80
+ NORM: ''
81
+ OUT_CHANNELS: 256
82
+ IMAGE_ONLY: true
83
+ KEYPOINT_ON: false
84
+ LOAD_PROPOSALS: false
85
+ MASK_ON: true
86
+ META_ARCHITECTURE: VLGeneralizedRCNN
87
+ PANOPTIC_FPN:
88
+ COMBINE:
89
+ ENABLED: true
90
+ INSTANCES_CONFIDENCE_THRESH: 0.5
91
+ OVERLAP_THRESH: 0.5
92
+ STUFF_AREA_LIMIT: 4096
93
+ INSTANCE_LOSS_WEIGHT: 1.0
94
+ PIXEL_MEAN:
95
+ - 127.5
96
+ - 127.5
97
+ - 127.5
98
+ PIXEL_STD:
99
+ - 127.5
100
+ - 127.5
101
+ - 127.5
102
+ PROPOSAL_GENERATOR:
103
+ MIN_SIZE: 0
104
+ NAME: RPN
105
+ RESNETS:
106
+ DEFORM_MODULATED: false
107
+ DEFORM_NUM_GROUPS: 1
108
+ DEFORM_ON_PER_STAGE:
109
+ - false
110
+ - false
111
+ - false
112
+ - false
113
+ DEPTH: 50
114
+ NORM: FrozenBN
115
+ NUM_GROUPS: 1
116
+ OUT_FEATURES:
117
+ - res4
118
+ RES2_OUT_CHANNELS: 256
119
+ RES5_DILATION: 1
120
+ STEM_OUT_CHANNELS: 64
121
+ STRIDE_IN_1X1: true
122
+ WIDTH_PER_GROUP: 64
123
+ RETINANET:
124
+ BBOX_REG_LOSS_TYPE: smooth_l1
125
+ BBOX_REG_WEIGHTS:
126
+ - 1.0
127
+ - 1.0
128
+ - 1.0
129
+ - 1.0
130
+ FOCAL_LOSS_ALPHA: 0.25
131
+ FOCAL_LOSS_GAMMA: 2.0
132
+ IN_FEATURES:
133
+ - p3
134
+ - p4
135
+ - p5
136
+ - p6
137
+ - p7
138
+ IOU_LABELS:
139
+ - 0
140
+ - -1
141
+ - 1
142
+ IOU_THRESHOLDS:
143
+ - 0.4
144
+ - 0.5
145
+ NMS_THRESH_TEST: 0.5
146
+ NORM: ''
147
+ NUM_CLASSES: 10
148
+ NUM_CONVS: 4
149
+ PRIOR_PROB: 0.01
150
+ SCORE_THRESH_TEST: 0.05
151
+ SMOOTH_L1_LOSS_BETA: 0.1
152
+ TOPK_CANDIDATES_TEST: 1000
153
+ ROI_BOX_CASCADE_HEAD:
154
+ BBOX_REG_WEIGHTS:
155
+ - - 10.0
156
+ - 10.0
157
+ - 5.0
158
+ - 5.0
159
+ - - 20.0
160
+ - 20.0
161
+ - 10.0
162
+ - 10.0
163
+ - - 30.0
164
+ - 30.0
165
+ - 15.0
166
+ - 15.0
167
+ IOUS:
168
+ - 0.5
169
+ - 0.6
170
+ - 0.7
171
+ ROI_BOX_HEAD:
172
+ BBOX_REG_LOSS_TYPE: smooth_l1
173
+ BBOX_REG_LOSS_WEIGHT: 1.0
174
+ BBOX_REG_WEIGHTS:
175
+ - 10.0
176
+ - 10.0
177
+ - 5.0
178
+ - 5.0
179
+ CLS_AGNOSTIC_BBOX_REG: true
180
+ CONV_DIM: 256
181
+ FC_DIM: 1024
182
+ NAME: FastRCNNConvFCHead
183
+ NORM: ''
184
+ NUM_CONV: 0
185
+ NUM_FC: 2
186
+ POOLER_RESOLUTION: 7
187
+ POOLER_SAMPLING_RATIO: 0
188
+ POOLER_TYPE: ROIAlignV2
189
+ SMOOTH_L1_BETA: 0.0
190
+ TRAIN_ON_PRED_BOXES: false
191
+ ROI_HEADS:
192
+ BATCH_SIZE_PER_IMAGE: 512
193
+ IN_FEATURES:
194
+ - p2
195
+ - p3
196
+ - p4
197
+ - p5
198
+ IOU_LABELS:
199
+ - 0
200
+ - 1
201
+ IOU_THRESHOLDS:
202
+ - 0.5
203
+ NAME: CascadeROIHeads
204
+ NMS_THRESH_TEST: 0.5
205
+ NUM_CLASSES: 10
206
+ POSITIVE_FRACTION: 0.25
207
+ PROPOSAL_APPEND_GT: true
208
+ SCORE_THRESH_TEST: 0.05
209
+ ROI_KEYPOINT_HEAD:
210
+ CONV_DIMS:
211
+ - 512
212
+ - 512
213
+ - 512
214
+ - 512
215
+ - 512
216
+ - 512
217
+ - 512
218
+ - 512
219
+ LOSS_WEIGHT: 1.0
220
+ MIN_KEYPOINTS_PER_IMAGE: 1
221
+ NAME: KRCNNConvDeconvUpsampleHead
222
+ NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
223
+ NUM_KEYPOINTS: 17
224
+ POOLER_RESOLUTION: 14
225
+ POOLER_SAMPLING_RATIO: 0
226
+ POOLER_TYPE: ROIAlignV2
227
+ ROI_MASK_HEAD:
228
+ CLS_AGNOSTIC_MASK: false
229
+ CONV_DIM: 256
230
+ NAME: MaskRCNNConvUpsampleHead
231
+ NORM: ''
232
+ NUM_CONV: 4
233
+ POOLER_RESOLUTION: 14
234
+ POOLER_SAMPLING_RATIO: 0
235
+ POOLER_TYPE: ROIAlignV2
236
+ RPN:
237
+ BATCH_SIZE_PER_IMAGE: 256
238
+ BBOX_REG_LOSS_TYPE: smooth_l1
239
+ BBOX_REG_LOSS_WEIGHT: 1.0
240
+ BBOX_REG_WEIGHTS:
241
+ - 1.0
242
+ - 1.0
243
+ - 1.0
244
+ - 1.0
245
+ BOUNDARY_THRESH: -1
246
+ CONV_DIMS:
247
+ - -1
248
+ HEAD_NAME: StandardRPNHead
249
+ IN_FEATURES:
250
+ - p2
251
+ - p3
252
+ - p4
253
+ - p5
254
+ - p6
255
+ IOU_LABELS:
256
+ - 0
257
+ - -1
258
+ - 1
259
+ IOU_THRESHOLDS:
260
+ - 0.3
261
+ - 0.7
262
+ LOSS_WEIGHT: 1.0
263
+ NMS_THRESH: 0.7
264
+ POSITIVE_FRACTION: 0.5
265
+ POST_NMS_TOPK_TEST: 1000
266
+ POST_NMS_TOPK_TRAIN: 2000
267
+ PRE_NMS_TOPK_TEST: 1000
268
+ PRE_NMS_TOPK_TRAIN: 2000
269
+ SMOOTH_L1_BETA: 0.0
270
+ SEM_SEG_HEAD:
271
+ COMMON_STRIDE: 4
272
+ CONVS_DIM: 128
273
+ IGNORE_VALUE: 255
274
+ IN_FEATURES:
275
+ - p2
276
+ - p3
277
+ - p4
278
+ - p5
279
+ LOSS_WEIGHT: 1.0
280
+ NAME: SemSegFPNHead
281
+ NORM: GN
282
+ NUM_CLASSES: 10
283
+ VIT:
284
+ DROP_PATH: 0.1
285
+ IMG_SIZE:
286
+ - 224
287
+ - 224
288
+ NAME: layoutlmv3_base
289
+ OUT_FEATURES:
290
+ - layer3
291
+ - layer5
292
+ - layer7
293
+ - layer11
294
+ POS_TYPE: abs
295
+ WEIGHTS:
296
+ OUTPUT_DIR:
297
+ SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train
298
+ SEED: 42
299
+ SOLVER:
300
+ AMP:
301
+ ENABLED: true
302
+ BACKBONE_MULTIPLIER: 1.0
303
+ BASE_LR: 0.0002
304
+ BIAS_LR_FACTOR: 1.0
305
+ CHECKPOINT_PERIOD: 2000
306
+ CLIP_GRADIENTS:
307
+ CLIP_TYPE: full_model
308
+ CLIP_VALUE: 1.0
309
+ ENABLED: true
310
+ NORM_TYPE: 2.0
311
+ GAMMA: 0.1
312
+ GRADIENT_ACCUMULATION_STEPS: 1
313
+ IMS_PER_BATCH: 32
314
+ LR_SCHEDULER_NAME: WarmupCosineLR
315
+ MAX_ITER: 20000
316
+ MOMENTUM: 0.9
317
+ NESTEROV: false
318
+ OPTIMIZER: ADAMW
319
+ REFERENCE_WORLD_SIZE: 0
320
+ STEPS:
321
+ - 10000
322
+ WARMUP_FACTOR: 0.01
323
+ WARMUP_ITERS: 333
324
+ WARMUP_METHOD: linear
325
+ WEIGHT_DECAY: 0.05
326
+ WEIGHT_DECAY_BIAS: null
327
+ WEIGHT_DECAY_NORM: 0.0
328
+ TEST:
329
+ AUG:
330
+ ENABLED: false
331
+ FLIP: true
332
+ MAX_SIZE: 4000
333
+ MIN_SIZES:
334
+ - 400
335
+ - 500
336
+ - 600
337
+ - 700
338
+ - 800
339
+ - 900
340
+ - 1000
341
+ - 1100
342
+ - 1200
343
+ DETECTIONS_PER_IMAGE: 100
344
+ EVAL_PERIOD: 1000
345
+ EXPECTED_RESULTS: []
346
+ KEYPOINT_OKS_SIGMAS: []
347
+ PRECISE_BN:
348
+ ENABLED: false
349
+ NUM_ITER: 200
350
+ VERSION: 2
351
+ VIS_PERIOD: 0
@@ -0,0 +1,9 @@
1
+ config:
2
+ device: cpu
3
+ layout: True
4
+ formula: True
5
+
6
+ weights:
7
+ layout: Layout/model_final.pth
8
+ mfd: MFD/weights.pt
9
+ mfr: MFR/UniMERNet
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.12
3
+ Version: 0.6.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
- Home-page: https://github.com/magicpdf/Magic-PDF
5
+ Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE.md
@@ -12,15 +12,21 @@ Requires-Dist: click >=8.1.7
12
12
  Requires-Dist: PyMuPDF >=1.24.7
13
13
  Requires-Dist: loguru >=0.6.0
14
14
  Requires-Dist: numpy >=1.21.6
15
- Requires-Dist: fast-langdetect >=0.1.1
15
+ Requires-Dist: fast-langdetect >=0.2.1
16
16
  Requires-Dist: wordninja >=2.0.0
17
17
  Requires-Dist: scikit-learn >=1.0.2
18
18
  Requires-Dist: pdfminer.six >=20231228
19
19
  Provides-Extra: cpu
20
- Requires-Dist: paddleocr ; extra == 'cpu'
20
+ Requires-Dist: paddleocr ==2.7.3 ; extra == 'cpu'
21
21
  Requires-Dist: paddlepaddle ; extra == 'cpu'
22
+ Provides-Extra: full-cpu
23
+ Requires-Dist: unimernet ; extra == 'full-cpu'
24
+ Requires-Dist: matplotlib ; extra == 'full-cpu'
25
+ Requires-Dist: ultralytics ; extra == 'full-cpu'
26
+ Requires-Dist: paddleocr ==2.7.3 ; extra == 'full-cpu'
27
+ Requires-Dist: paddlepaddle ; extra == 'full-cpu'
22
28
  Provides-Extra: gpu
23
- Requires-Dist: paddleocr ; extra == 'gpu'
29
+ Requires-Dist: paddleocr ==2.7.3 ; extra == 'gpu'
24
30
  Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
25
31
 
26
32
  <div id="top"></div>
@@ -28,9 +34,14 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
28
34
 
29
35
  [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
30
36
  [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
31
- [![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
32
- [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
33
37
  [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
38
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
39
+ [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
40
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
41
+ [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
42
+
43
+
44
+
34
45
 
35
46
  [English](README.md) | [简体中文](README_zh-CN.md)
36
47
 
@@ -45,7 +56,7 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
45
56
 
46
57
  ## Introduction
47
58
 
48
- MinerU is a one-stop, open-source data extraction tool, primarily includes the following features:
59
+ MinerU is a one-stop, open-source, high-quality data extraction tool that includes the following primary features:
49
60
 
50
61
  - [Magic-PDF](#Magic-PDF) PDF Document Extraction
51
62
  - [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
@@ -71,7 +82,7 @@ Key features include:
71
82
  - Available for Windows, Linux, and macOS platforms
72
83
 
73
84
 
74
- https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
85
+ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
75
86
 
76
87
 
77
88
 
@@ -88,9 +99,6 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
88
99
 
89
100
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
90
101
  - A Comprehensive Toolkit for High-Quality PDF Content Extraction
91
- - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
92
- - An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
93
-
94
102
 
95
103
  ## Getting Started
96
104
 
@@ -111,9 +119,10 @@ pip install magic-pdf
111
119
  ###### simple
112
120
 
113
121
  ```bash
114
- cp magic-pdf.template.json to ~/magic-pdf.json
122
+ cp magic-pdf.template.json ~/magic-pdf.json
115
123
  magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
116
124
  ```
125
+ After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
117
126
 
118
127
  ###### more
119
128
 
@@ -150,26 +159,6 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
150
159
  Demo can be referred to [demo.py](demo/demo.py)
151
160
 
152
161
 
153
- ## All Thanks To Our Contributors
154
-
155
- <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
156
- <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
157
- </a>
158
-
159
-
160
- ## License Information
161
-
162
- [LICENSE.md](LICENSE.md)
163
-
164
- The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
165
-
166
-
167
- ## Acknowledgments
168
-
169
- - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
170
- - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
171
-
172
-
173
162
  # Magic-Doc
174
163
 
175
164
 
@@ -203,5 +192,50 @@ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d7
203
192
 
204
193
  ## Project Repository
205
194
 
206
- - [Magic-Doc](https://github.com/magicpdf/Magic-Doc)
195
+ - [Magic-Doc](https://github.com/InternLM/magic-doc)
207
196
  Outstanding Webpage and E-book Extraction Tool
197
+
198
+
199
+ # All Thanks To Our Contributors
200
+
201
+ <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
202
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
203
+ </a>
204
+
205
+
206
+ # License Information
207
+
208
+ [LICENSE.md](LICENSE.md)
209
+
210
+ The project currently uses PyMuPDF to deliver advanced functionality; however, PyMuPDF is licensed under the AGPL, which may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
211
+
212
+
213
+ # Acknowledgments
214
+
215
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
216
+ - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
217
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
218
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
219
+
220
+
221
+ # Citation
222
+
223
+ ```bibtex
224
+ @misc{2024mineru,
225
+ title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
226
+ author={MinerU Contributors},
227
+ howpublished = {\url{https://github.com/opendatalab/MinerU}},
228
+ year={2024}
229
+ }
230
+ ```
231
+
232
+
233
+ # Star History
234
+
235
+ <a>
236
+ <picture>
237
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
238
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
239
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
240
+ </picture>
241
+ </a>
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
5
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
6
6
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
7
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
8
+ magic_pdf/cli/magicpdf.py,sha256=EcTiX-MaiDc4Fv9qZ_UdjHt5tYnBEu6vlbp0w030sA0,12691
9
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
11
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -25,7 +25,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  magic_pdf/libs/boxbase.py,sha256=MvD0DypR4sTEF3T2RrI_yJ8mPDUBYHAqAaau2mnBSxY,15343
26
26
  magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
27
27
  magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
28
- magic_pdf/libs/config_reader.py,sha256=ADe9DknbSBb3-JQlQJix-fkVDPIQCkytl4mKdXnIraA,1607
28
+ magic_pdf/libs/config_reader.py,sha256=wB0Zn6qEwuAWmv2Icz9owPIqxrhFEH5i6sUr8Nt5ULo,1806
29
29
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
30
30
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
31
31
  magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
@@ -34,7 +34,7 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
34
34
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
35
35
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
36
36
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
37
- magic_pdf/libs/language.py,sha256=U8bOttqtJiBvqOFUksiHeeC3vgjzJIWTLqQrmorg7T0,683
37
+ magic_pdf/libs/language.py,sha256=l0LGIz-dlerU9Xct-7ypNKGNEI_q-CTadsJAnVTF9VY,692
38
38
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
39
39
  magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
40
40
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
@@ -44,14 +44,37 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
44
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=LznNzk7nDbJCv7NVxCOu958-1uT_nFJ79_3vJt7WPDc,23
47
+ magic_pdf/libs/version.py,sha256=cID1jLnC_vj48GgMN6Yb1FA3JsQ95zNmCHmRYE8TFhY,22
48
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
- magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
- magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
51
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
49
+ magic_pdf/model/__init__.py,sha256=XeYcF4RMZ3DosyLqiz0_n1JVa2k5RhTwUXwKt5sAjEQ,53
50
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=kssz_Nn6zTYED_iEgGuFRjus947xoK5dTqj88FOehE0,3256
52
51
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
53
- magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
54
- magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
52
+ magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
53
+ magic_pdf/model/pdf_extract_kit.py,sha256=hiK1zDrwn5QhqUwI7BvM1JOoq_JIab4uVx_flHrBmWE,8374
54
+ magic_pdf/model/pp_structure_v2.py,sha256=apYWwWiCjlks5CLXolcynnuPV7llCm2PdP-6tg0-Kt0,2903
55
+ magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
+ magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
57
+ magic_pdf/model/pek_sub_modules/self_modify.py,sha256=XiwLUCiY_E0JkaIQr5m1hOD75-iGrgkMVe-1bzeF_Go,10522
58
+ magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
+ magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
60
+ magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
61
+ magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
62
+ magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=POs5s4_9rS-GlE7f_iHBuZpTwOuyfI6VE3DUb37fgxA,4483
63
+ magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
64
+ magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
65
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
66
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
67
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
68
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
69
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
70
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
71
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
72
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
73
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
74
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
75
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
76
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
77
+ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
55
78
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
79
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
57
80
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -66,10 +89,10 @@ magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gP
66
89
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
67
90
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
68
91
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
69
- magic_pdf/pipe/AbsPipe.py,sha256=28e3HxybBO86npy_L4WD6F7hfjKHHc86-IhiwzAnLdk,3979
70
- magic_pdf/pipe/OCRPipe.py,sha256=iKnNveVfsrBGl_2Xtd4hAAS5HntYyjwfBeVIKGc8V5U,1196
71
- magic_pdf/pipe/TXTPipe.py,sha256=R0UzMZ7Z_59Vh7cPdBAO4gvHtgA5wLoODnCPnpEjbPM,1255
72
- magic_pdf/pipe/UNIPipe.py,sha256=47a9jx1a_zO4m3sVnhcOnrmNc_QT-TI-9mv2x7L6SrQ,3507
92
+ magic_pdf/pipe/AbsPipe.py,sha256=rMZd0FRTxGWt-7MZNmjgI1bKXlmSb9ZTA6A9fhEE7Gk,4131
93
+ magic_pdf/pipe/OCRPipe.py,sha256=55VGQVxxjunnmt3L8tYlo9A8y3vVB1JRCO9wMQtk-N0,1317
94
+ magic_pdf/pipe/TXTPipe.py,sha256=2Xn0fDDbLm2qW6xtXXHsNwXlAKnMHFbiIgnP1J2zNh8,1376
95
+ magic_pdf/pipe/UNIPipe.py,sha256=0w1XLmUQUxvqm3BaVB800pZIeLiDD3NGvQb32OcI0Fg,3587
73
96
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
97
  magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
75
98
  magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
@@ -104,6 +127,9 @@ magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=0FlBXeiEwjZAGAWo-DiMptclFOj04POu
104
127
  magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6TOCW0TLXbPii_Q,7307
105
128
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
106
129
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
130
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=C_9UfFMlHOX-iSgcwCHjyHKazKKuwpy1RcGHeTQD1kY,139
131
+ magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
132
+ magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=noqVE3GmZUG86NYDfs9DiFgdQFjXaICyCU7KPzgR3II,6174
107
133
  magic_pdf/rw/AbsReaderWriter.py,sha256=1Hd6Xo2g12CaRAo5Sze-R_GSQA6GQ0rQwSmgQvw4V_c,1297
108
134
  magic_pdf/rw/DiskReaderWriter.py,sha256=0tt8lbRyqrOfFgGlhjt24YMdj2xN7QUIVysfhFIxPgo,2113
109
135
  magic_pdf/rw/S3ReaderWriter.py,sha256=O7Quf3CUqXBjMz4sIE7kNVI3TIQROeg5PuXneAacieY,4474
@@ -115,9 +141,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
115
141
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
116
142
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
117
143
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
118
- magic_pdf-0.5.12.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
- magic_pdf-0.5.12.dist-info/METADATA,sha256=iNTDKGkj4D77ErkS0P1dNZ4ttFriYHbTSjsEE3f8MP0,5917
120
- magic_pdf-0.5.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
- magic_pdf-0.5.12.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
- magic_pdf-0.5.12.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
- magic_pdf-0.5.12.dist-info/RECORD,,
144
+ magic_pdf-0.6.0.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
145
+ magic_pdf-0.6.0.dist-info/METADATA,sha256=rqkC7PCOuqDzqnsVWW8DSqwogF3jHwGp6-sS8xjCi6o,7093
146
+ magic_pdf-0.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
147
+ magic_pdf-0.6.0.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
148
+ magic_pdf-0.6.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
149
+ magic_pdf-0.6.0.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- from ultralytics import YOLO
2
-
3
- image_path = '' # 待预测图片路径
4
- model_path = '' # 权重路径
5
- model = YOLO(model_path)
6
-
7
- result = model(image_path, save=True, conf=0.5, save_crop=False, line_width=2)
8
- print(result)