magic-pdf 0.5.10__py3-none-any.whl → 0.5.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +35 -22
- magic_pdf/libs/language.py +0 -7
- magic_pdf/libs/path_utils.py +13 -4
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -4
- magic_pdf/model/pp_structure_v2.py +6 -1
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.12.dist-info}/METADATA +93 -19
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.12.dist-info}/RECORD +13 -13
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.12.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.12.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.12.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.10.dist-info → magic_pdf-0.5.12.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py
CHANGED
@@ -23,7 +23,6 @@ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home
|
|
23
23
|
|
24
24
|
import os
|
25
25
|
import json as json_parse
|
26
|
-
import sys
|
27
26
|
import click
|
28
27
|
from loguru import logger
|
29
28
|
from pathlib import Path
|
@@ -46,6 +45,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
|
46
45
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
47
46
|
import csv
|
48
47
|
import copy
|
48
|
+
import magic_pdf.model as model_config
|
49
49
|
|
50
50
|
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
51
51
|
|
@@ -66,21 +66,21 @@ def write_to_csv(csv_file_path, csv_data):
|
|
66
66
|
csv_writer = csv.writer(csvfile)
|
67
67
|
# 写入数据
|
68
68
|
csv_writer.writerow(csv_data)
|
69
|
-
|
69
|
+
logger.info(f"数据已成功追加到 '{csv_file_path}'")
|
70
70
|
|
71
71
|
|
72
72
|
def do_parse(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
73
|
+
pdf_file_name,
|
74
|
+
pdf_bytes,
|
75
|
+
model_list,
|
76
|
+
parse_method,
|
77
|
+
f_draw_span_bbox=True,
|
78
|
+
f_draw_layout_bbox=True,
|
79
|
+
f_dump_md=True,
|
80
|
+
f_dump_middle_json=True,
|
81
|
+
f_dump_model_json=True,
|
82
|
+
f_dump_orig_pdf=True,
|
83
|
+
f_dump_content_list=True,
|
84
84
|
):
|
85
85
|
orig_model_list = copy.deepcopy(model_list)
|
86
86
|
|
@@ -96,14 +96,18 @@ def do_parse(
|
|
96
96
|
elif parse_method == "ocr":
|
97
97
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
|
98
98
|
else:
|
99
|
-
|
100
|
-
|
99
|
+
logger.error("unknown parse method")
|
100
|
+
exit(1)
|
101
101
|
|
102
102
|
pipe.pipe_classify()
|
103
103
|
|
104
|
-
"""如果没有传入有效的模型数据,则使用内置
|
104
|
+
"""如果没有传入有效的模型数据,则使用内置model解析"""
|
105
105
|
if len(model_list) == 0:
|
106
|
-
|
106
|
+
if model_config.__use_inside_model__:
|
107
|
+
pipe.pipe_analyze()
|
108
|
+
else:
|
109
|
+
logger.error("need model list input")
|
110
|
+
exit(1)
|
107
111
|
|
108
112
|
pipe.pipe_parse()
|
109
113
|
pdf_info = pipe.pdf_mid_data["pdf_info"]
|
@@ -170,10 +174,13 @@ def cli():
|
|
170
174
|
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
171
175
|
default="auto",
|
172
176
|
)
|
173
|
-
|
177
|
+
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
|
178
|
+
def json_command(json, method, inside_model):
|
179
|
+
model_config.__use_inside_model__ = inside_model
|
180
|
+
|
174
181
|
if not json.startswith("s3://"):
|
175
|
-
|
176
|
-
|
182
|
+
logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
|
183
|
+
exit(1)
|
177
184
|
|
178
185
|
def read_s3_path(s3path):
|
179
186
|
bucket, key = parse_s3path(s3path)
|
@@ -218,7 +225,10 @@ def json_command(json, method):
|
|
218
225
|
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
219
226
|
default="auto",
|
220
227
|
)
|
221
|
-
|
228
|
+
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
|
229
|
+
def local_json_command(local_json, method, inside_model):
|
230
|
+
model_config.__use_inside_model__ = inside_model
|
231
|
+
|
222
232
|
def read_s3_path(s3path):
|
223
233
|
bucket, key = parse_s3path(s3path)
|
224
234
|
|
@@ -267,7 +277,10 @@ def local_json_command(local_json, method):
|
|
267
277
|
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
|
268
278
|
default="auto",
|
269
279
|
)
|
270
|
-
|
280
|
+
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
|
281
|
+
def pdf_command(pdf, model, method, inside_model):
|
282
|
+
model_config.__use_inside_model__ = inside_model
|
283
|
+
|
271
284
|
def read_fn(path):
|
272
285
|
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
273
286
|
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
magic_pdf/libs/language.py
CHANGED
@@ -1,13 +1,6 @@
|
|
1
|
-
import regex
|
2
1
|
import unicodedata
|
3
2
|
from fast_langdetect import detect_langs
|
4
3
|
|
5
|
-
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
|
6
|
-
|
7
|
-
|
8
|
-
def remove_bad_chars(text):
|
9
|
-
return RE_BAD_CHARS.sub("", text)
|
10
|
-
|
11
4
|
|
12
5
|
def detect_lang(text: str) -> str:
|
13
6
|
if len(text) == 0:
|
magic_pdf/libs/path_utils.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
from s3pathlib import S3Path
|
4
|
-
|
5
3
|
def remove_non_official_s3_args(s3path):
|
6
4
|
"""
|
7
5
|
example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
|
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
|
|
10
8
|
return arr[0]
|
11
9
|
|
12
10
|
def parse_s3path(s3path: str):
|
13
|
-
|
14
|
-
|
11
|
+
# from s3pathlib import S3Path
|
12
|
+
# p = S3Path(remove_non_official_s3_args(s3path))
|
13
|
+
# return p.bucket, p.key
|
14
|
+
s3path = remove_non_official_s3_args(s3path).strip()
|
15
|
+
if s3path.startswith(('s3://', 's3a://')):
|
16
|
+
prefix, path = s3path.split('://', 1)
|
17
|
+
bucket_name, key = path.split('/', 1)
|
18
|
+
return bucket_name, key
|
19
|
+
elif s3path.startswith('/'):
|
20
|
+
raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
|
21
|
+
else:
|
22
|
+
raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
|
23
|
+
|
15
24
|
|
16
25
|
def parse_s3_range_params(s3path: str):
|
17
26
|
"""
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.
|
1
|
+
__version__ = "0.5.12"
|
magic_pdf/model/__init__.py
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
__use_inside_model__ = False
|
@@ -1,10 +1,8 @@
|
|
1
1
|
import fitz
|
2
|
-
import cv2
|
3
|
-
from PIL import Image
|
4
2
|
import numpy as np
|
5
|
-
|
3
|
+
from loguru import logger
|
6
4
|
from magic_pdf.model.model_list import MODEL
|
7
|
-
|
5
|
+
import magic_pdf.model as model_config
|
8
6
|
|
9
7
|
|
10
8
|
def dict_compare(d1, d2):
|
@@ -22,6 +20,11 @@ def remove_duplicates_dicts(lst):
|
|
22
20
|
|
23
21
|
|
24
22
|
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
23
|
+
try:
|
24
|
+
import cv2
|
25
|
+
from PIL import Image
|
26
|
+
except ImportError:
|
27
|
+
logger.error("opencv-python and Pillow are not installed, please install by pip.")
|
25
28
|
images = []
|
26
29
|
with fitz.open("pdf", pdf_bytes) as doc:
|
27
30
|
for index in range(0, doc.page_count):
|
@@ -41,6 +44,13 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
|
41
44
|
|
42
45
|
|
43
46
|
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
|
47
|
+
|
48
|
+
if model_config.__use_inside_model__:
|
49
|
+
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
|
50
|
+
else:
|
51
|
+
logger.error("use_inside_model is False, not allow to use inside model")
|
52
|
+
exit(1)
|
53
|
+
|
44
54
|
images = load_images_from_pdf(pdf_bytes)
|
45
55
|
custom_model = None
|
46
56
|
if model == MODEL.Paddle:
|
@@ -1,7 +1,12 @@
|
|
1
1
|
import random
|
2
2
|
|
3
3
|
from loguru import logger
|
4
|
-
|
4
|
+
|
5
|
+
try:
|
6
|
+
from paddleocr import PPStructure
|
7
|
+
except ImportError:
|
8
|
+
logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"')
|
9
|
+
exit(1)
|
5
10
|
|
6
11
|
|
7
12
|
def region_to_bbox(region):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.5.
|
3
|
+
Version: 0.5.12
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/magicpdf/Magic-PDF
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,34 +9,28 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3 >=1.28.43
|
10
10
|
Requires-Dist: Brotli >=1.1.0
|
11
11
|
Requires-Dist: click >=8.1.7
|
12
|
-
Requires-Dist:
|
13
|
-
Requires-Dist: PyMuPDF >=1.24.5
|
12
|
+
Requires-Dist: PyMuPDF >=1.24.7
|
14
13
|
Requires-Dist: loguru >=0.6.0
|
15
|
-
Requires-Dist: matplotlib >=3.8.3
|
16
14
|
Requires-Dist: numpy >=1.21.6
|
17
|
-
Requires-Dist: pandas >=1.3.5
|
18
15
|
Requires-Dist: fast-langdetect >=0.1.1
|
19
|
-
Requires-Dist: regex >=2023.12.25
|
20
|
-
Requires-Dist: termcolor >=2.4.0
|
21
16
|
Requires-Dist: wordninja >=2.0.0
|
22
17
|
Requires-Dist: scikit-learn >=1.0.2
|
23
|
-
Requires-Dist: nltk ==3.8.1
|
24
|
-
Requires-Dist: s3pathlib >=2.1.1
|
25
|
-
Requires-Dist: paddleocr
|
26
18
|
Requires-Dist: pdfminer.six >=20231228
|
27
19
|
Provides-Extra: cpu
|
20
|
+
Requires-Dist: paddleocr ; extra == 'cpu'
|
28
21
|
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
29
22
|
Provides-Extra: gpu
|
23
|
+
Requires-Dist: paddleocr ; extra == 'gpu'
|
30
24
|
Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
31
25
|
|
32
26
|
<div id="top"></div>
|
33
27
|
<div align="center">
|
34
28
|
|
35
|
-
[](https://github.com/opendatalab/MinerU)
|
30
|
+
[](https://github.com/opendatalab/MinerU)
|
31
|
+
[](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
|
32
|
+
[](https://github.com/opendatalab/MinerU/issues)
|
33
|
+
[](https://github.com/opendatalab/MinerU/issues)
|
40
34
|
|
41
35
|
[English](README.md) | [简体中文](README_zh-CN.md)
|
42
36
|
|
@@ -46,8 +40,20 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
|
46
40
|
|
47
41
|
</div>
|
48
42
|
|
43
|
+
# MinerU
|
44
|
+
|
45
|
+
|
46
|
+
## Introduction
|
47
|
+
|
48
|
+
MinerU is a one-stop, open-source data extraction tool that primarily includes the following features:
|
49
|
+
|
50
|
+
- [Magic-PDF](#Magic-PDF) PDF Document Extraction
|
51
|
+
- [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
|
52
|
+
|
53
|
+
|
49
54
|
# Magic-PDF
|
50
55
|
|
56
|
+
|
51
57
|
## Introduction
|
52
58
|
|
53
59
|
Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
|
@@ -64,15 +70,38 @@ Key features include:
|
|
64
70
|
- Compatibility with CPU and GPU environments
|
65
71
|
- Available for Windows, Linux, and macOS platforms
|
66
72
|
|
73
|
+
|
74
|
+
https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
## Project Panorama
|
79
|
+
|
80
|
+

|
81
|
+
|
82
|
+
|
83
|
+
## Flowchart
|
84
|
+
|
85
|
+

|
86
|
+
|
87
|
+
### Submodule Repositories
|
88
|
+
|
89
|
+
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
90
|
+
- A Comprehensive Toolkit for High-Quality PDF Content Extraction
|
91
|
+
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
|
92
|
+
- An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
|
93
|
+
|
94
|
+
|
67
95
|
## Getting Started
|
68
96
|
|
69
97
|
### Requirements
|
70
98
|
|
71
|
-
- Python 3.9
|
99
|
+
- Python >= 3.9
|
72
100
|
|
73
101
|
### Usage Instructions
|
74
102
|
|
75
103
|
#### 1. Install Magic-PDF
|
104
|
+
|
76
105
|
```bash
|
77
106
|
pip install magic-pdf
|
78
107
|
```
|
@@ -80,11 +109,14 @@ pip install magic-pdf
|
|
80
109
|
#### 2. Usage via Command Line
|
81
110
|
|
82
111
|
###### simple
|
112
|
+
|
83
113
|
```bash
|
84
114
|
cp magic-pdf.template.json ~/magic-pdf.json
|
85
115
|
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
86
116
|
```
|
117
|
+
|
87
118
|
###### more
|
119
|
+
|
88
120
|
```bash
|
89
121
|
magic-pdf --help
|
90
122
|
```
|
@@ -115,19 +147,61 @@ pipe.pipe_parse()
|
|
115
147
|
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
116
148
|
```
|
117
149
|
|
118
|
-
Demo can be referred to [demo.py](
|
150
|
+
For a complete demo, refer to [demo.py](demo/demo.py)
|
151
|
+
|
119
152
|
|
120
153
|
## All Thanks To Our Contributors
|
121
154
|
|
122
155
|
<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
|
123
|
-
<img src="https://contrib.rocks/image?repo=
|
156
|
+
<img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
|
124
157
|
</a>
|
125
158
|
|
159
|
+
|
126
160
|
## License Information
|
127
161
|
|
128
|
-
|
162
|
+
[LICENSE.md](LICENSE.md)
|
163
|
+
|
164
|
+
The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
|
165
|
+
|
129
166
|
|
130
167
|
## Acknowledgments
|
131
168
|
|
132
169
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
133
170
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
171
|
+
|
172
|
+
|
173
|
+
# Magic-Doc
|
174
|
+
|
175
|
+
|
176
|
+
## Introduction
|
177
|
+
|
178
|
+
Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
|
179
|
+
|
180
|
+
Key Features Include:
|
181
|
+
|
182
|
+
- Web Page Extraction
|
183
|
+
- Cross-modal precise parsing of text, images, tables, and formula information.
|
184
|
+
|
185
|
+
- E-Book Document Extraction
|
186
|
+
- Supports various document formats, including epub and mobi, with full adaptation for text and images.
|
187
|
+
|
188
|
+
- Language Type Identification
|
189
|
+
- Accurate recognition of 176 languages.
|
190
|
+
|
191
|
+
https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
|
200
|
+
|
201
|
+
|
202
|
+
|
203
|
+
|
204
|
+
## Project Repository
|
205
|
+
|
206
|
+
- [Magic-Doc](https://github.com/magicpdf/Magic-Doc)
|
207
|
+
Outstanding Webpage and E-book Extraction Tool
|
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
|
|
5
5
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
6
6
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
7
7
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
8
|
+
magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
|
9
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
11
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
@@ -34,24 +34,24 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
|
|
34
34
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
35
35
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
36
36
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
37
|
-
magic_pdf/libs/language.py,sha256=
|
37
|
+
magic_pdf/libs/language.py,sha256=U8bOttqtJiBvqOFUksiHeeC3vgjzJIWTLqQrmorg7T0,683
|
38
38
|
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
39
39
|
magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
40
40
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
41
41
|
magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
|
42
|
-
magic_pdf/libs/path_utils.py,sha256=
|
42
|
+
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
43
43
|
magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
|
44
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=LznNzk7nDbJCv7NVxCOu958-1uT_nFJ79_3vJt7WPDc,23
|
48
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
49
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
50
|
-
magic_pdf/model/__init__.py,sha256=
|
51
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
50
|
+
magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
|
51
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
|
52
52
|
magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
|
53
53
|
magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
|
54
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
54
|
+
magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
|
55
55
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
57
57
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
115
115
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
116
116
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
117
117
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
118
|
-
magic_pdf-0.5.
|
119
|
-
magic_pdf-0.5.
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
118
|
+
magic_pdf-0.5.12.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
119
|
+
magic_pdf-0.5.12.dist-info/METADATA,sha256=iNTDKGkj4D77ErkS0P1dNZ4ttFriYHbTSjsEE3f8MP0,5917
|
120
|
+
magic_pdf-0.5.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
121
|
+
magic_pdf-0.5.12.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
122
|
+
magic_pdf-0.5.12.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
123
|
+
magic_pdf-0.5.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|