magic-pdf 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +5 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/dataset.py +175 -4
- magic_pdf/dict2md/ocr_mkcontent.py +2 -2
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/draw_bbox.py +11 -10
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +124 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +119 -60
- magic_pdf/model/operators.py +190 -0
- magic_pdf/model/pdf_extract_kit.py +20 -1
- magic_pdf/model/sub_modules/model_init.py +13 -3
- magic_pdf/model/sub_modules/model_utils.py +11 -5
- magic_pdf/pdf_parse_by_ocr.py +4 -5
- magic_pdf/pdf_parse_by_txt.py +4 -5
- magic_pdf/pdf_parse_union_core_v2.py +10 -11
- magic_pdf/pipe/AbsPipe.py +3 -2
- magic_pdf/pipe/OCRPipe.py +54 -15
- magic_pdf/pipe/TXTPipe.py +5 -4
- magic_pdf/pipe/UNIPipe.py +82 -30
- magic_pdf/pipe/operators.py +138 -0
- magic_pdf/tools/common.py +108 -59
- magic_pdf/user_api.py +47 -24
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/METADATA +7 -4
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/RECORD +31 -29
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
import copy
|
2
|
-
import json as json_parse
|
3
1
|
import os
|
4
2
|
|
5
3
|
import click
|
@@ -7,13 +5,12 @@ import fitz
|
|
7
5
|
from loguru import logger
|
8
6
|
|
9
7
|
import magic_pdf.model as model_config
|
8
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
10
9
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
11
10
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
12
|
-
from magic_pdf.
|
13
|
-
|
14
|
-
from magic_pdf.
|
15
|
-
from magic_pdf.pipe.TXTPipe import TXTPipe
|
16
|
-
from magic_pdf.pipe.UNIPipe import UNIPipe
|
11
|
+
from magic_pdf.data.dataset import PymuDocDataset
|
12
|
+
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
13
|
+
from magic_pdf.model.operators import InferenceResult
|
17
14
|
|
18
15
|
# from io import BytesIO
|
19
16
|
# from pypdf import PdfReader, PdfWriter
|
@@ -56,7 +53,11 @@ def prepare_env(output_dir, pdf_file_name, method):
|
|
56
53
|
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
57
54
|
document = fitz.open('pdf', pdf_bytes)
|
58
55
|
output_document = fitz.open()
|
59
|
-
end_page_id =
|
56
|
+
end_page_id = (
|
57
|
+
end_page_id
|
58
|
+
if end_page_id is not None and end_page_id >= 0
|
59
|
+
else len(document) - 1
|
60
|
+
)
|
60
61
|
if end_page_id > len(document) - 1:
|
61
62
|
logger.warning('end_page_id is out of range, use pdf_docs length')
|
62
63
|
end_page_id = len(document) - 1
|
@@ -94,78 +95,126 @@ def do_parse(
|
|
94
95
|
f_draw_model_bbox = True
|
95
96
|
f_draw_line_sort_bbox = True
|
96
97
|
|
97
|
-
if lang ==
|
98
|
+
if lang == '':
|
98
99
|
lang = None
|
99
100
|
|
100
|
-
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
|
101
|
+
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
|
102
|
+
pdf_bytes, start_page_id, end_page_id
|
103
|
+
)
|
101
104
|
|
102
|
-
|
103
|
-
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
104
|
-
parse_method)
|
105
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
105
106
|
|
106
|
-
image_writer, md_writer = FileBasedDataWriter(
|
107
|
-
|
107
|
+
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
|
108
|
+
local_md_dir
|
109
|
+
)
|
108
110
|
image_dir = str(os.path.basename(local_image_dir))
|
109
111
|
|
110
|
-
|
111
|
-
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
112
|
-
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
113
|
-
# start_page_id=start_page_id, end_page_id=end_page_id,
|
114
|
-
lang=lang,
|
115
|
-
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
116
|
-
elif parse_method == 'txt':
|
117
|
-
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
118
|
-
# start_page_id=start_page_id, end_page_id=end_page_id,
|
119
|
-
lang=lang,
|
120
|
-
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
121
|
-
elif parse_method == 'ocr':
|
122
|
-
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
123
|
-
# start_page_id=start_page_id, end_page_id=end_page_id,
|
124
|
-
lang=lang,
|
125
|
-
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
126
|
-
else:
|
127
|
-
logger.error('unknown parse method')
|
128
|
-
exit(1)
|
129
|
-
|
130
|
-
pipe.pipe_classify()
|
112
|
+
ds = PymuDocDataset(pdf_bytes)
|
131
113
|
|
132
114
|
if len(model_list) == 0:
|
133
115
|
if model_config.__use_inside_model__:
|
134
|
-
|
135
|
-
|
116
|
+
if parse_method == 'auto':
|
117
|
+
if ds.classify() == SupportedPdfParseMethod.TXT:
|
118
|
+
infer_result = ds.apply(
|
119
|
+
doc_analyze,
|
120
|
+
ocr=False,
|
121
|
+
lang=lang,
|
122
|
+
layout_model=layout_model,
|
123
|
+
formula_enable=formula_enable,
|
124
|
+
table_enable=table_enable,
|
125
|
+
)
|
126
|
+
pipe_result = infer_result.pipe_txt_mode(
|
127
|
+
image_writer, debug_mode=True, lang=lang
|
128
|
+
)
|
129
|
+
else:
|
130
|
+
infer_result = ds.apply(
|
131
|
+
doc_analyze,
|
132
|
+
ocr=True,
|
133
|
+
lang=lang,
|
134
|
+
layout_model=layout_model,
|
135
|
+
formula_enable=formula_enable,
|
136
|
+
table_enable=table_enable,
|
137
|
+
)
|
138
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
139
|
+
image_writer, debug_mode=True, lang=lang
|
140
|
+
)
|
141
|
+
|
142
|
+
elif parse_method == 'txt':
|
143
|
+
infer_result = ds.apply(
|
144
|
+
doc_analyze,
|
145
|
+
ocr=False,
|
146
|
+
lang=lang,
|
147
|
+
layout_model=layout_model,
|
148
|
+
formula_enable=formula_enable,
|
149
|
+
table_enable=table_enable,
|
150
|
+
)
|
151
|
+
pipe_result = infer_result.pipe_txt_mode(
|
152
|
+
image_writer, debug_mode=True, lang=lang
|
153
|
+
)
|
154
|
+
elif parse_method == 'ocr':
|
155
|
+
infer_result = ds.apply(
|
156
|
+
doc_analyze,
|
157
|
+
ocr=True,
|
158
|
+
lang=lang,
|
159
|
+
layout_model=layout_model,
|
160
|
+
formula_enable=formula_enable,
|
161
|
+
table_enable=table_enable,
|
162
|
+
)
|
163
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
164
|
+
image_writer, debug_mode=True, lang=lang
|
165
|
+
)
|
166
|
+
else:
|
167
|
+
logger.error('unknown parse method')
|
168
|
+
exit(1)
|
136
169
|
else:
|
137
170
|
logger.error('need model list input')
|
138
171
|
exit(2)
|
172
|
+
else:
|
173
|
+
infer_result = InferenceResult(model_list, ds)
|
174
|
+
if parse_method == 'ocr':
|
175
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
176
|
+
image_writer, debug_mode=True, lang=lang
|
177
|
+
)
|
178
|
+
elif parse_method == 'txt':
|
179
|
+
pipe_result = infer_result.pipe_txt_mode(
|
180
|
+
image_writer, debug_mode=True, lang=lang
|
181
|
+
)
|
182
|
+
else:
|
183
|
+
pipe_result = infer_result.pipe_auto_mode(
|
184
|
+
image_writer, debug_mode=True, lang=lang
|
185
|
+
)
|
186
|
+
|
187
|
+
if f_draw_model_bbox:
|
188
|
+
infer_result.draw_model(
|
189
|
+
os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
|
190
|
+
)
|
139
191
|
|
140
|
-
pipe.pipe_parse()
|
141
|
-
pdf_info = pipe.pdf_mid_data['pdf_info']
|
142
192
|
if f_draw_layout_bbox:
|
143
|
-
|
193
|
+
pipe_result.draw_layout(
|
194
|
+
os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
|
195
|
+
)
|
144
196
|
if f_draw_span_bbox:
|
145
|
-
|
146
|
-
|
147
|
-
draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
|
197
|
+
pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
|
198
|
+
|
148
199
|
if f_draw_line_sort_bbox:
|
149
|
-
|
200
|
+
pipe_result.draw_line_sort(
|
201
|
+
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
|
202
|
+
)
|
150
203
|
|
151
|
-
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
|
152
204
|
if f_dump_md:
|
153
|
-
|
205
|
+
pipe_result.dump_md(
|
206
|
+
md_writer,
|
154
207
|
f'{pdf_file_name}.md',
|
155
|
-
|
208
|
+
image_dir,
|
209
|
+
drop_mode=DropMode.NONE,
|
210
|
+
md_make_mode=f_make_md_mode,
|
156
211
|
)
|
157
212
|
|
158
213
|
if f_dump_middle_json:
|
159
|
-
md_writer.
|
160
|
-
f'{pdf_file_name}_middle.json',
|
161
|
-
json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
|
162
|
-
)
|
214
|
+
pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')
|
163
215
|
|
164
216
|
if f_dump_model_json:
|
165
|
-
md_writer.
|
166
|
-
f'{pdf_file_name}_model.json',
|
167
|
-
json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
|
168
|
-
)
|
217
|
+
infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')
|
169
218
|
|
170
219
|
if f_dump_orig_pdf:
|
171
220
|
md_writer.write(
|
@@ -173,11 +222,11 @@ def do_parse(
|
|
173
222
|
pdf_bytes,
|
174
223
|
)
|
175
224
|
|
176
|
-
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
177
225
|
if f_dump_content_list:
|
178
|
-
|
226
|
+
pipe_result.dump_content_list(
|
227
|
+
md_writer,
|
179
228
|
f'{pdf_file_name}_content_list.json',
|
180
|
-
|
229
|
+
image_dir
|
181
230
|
)
|
182
231
|
|
183
232
|
logger.info(f'local output dir is {local_md_dir}')
|
magic_pdf/user_api.py
CHANGED
@@ -10,22 +10,29 @@
|
|
10
10
|
from loguru import logger
|
11
11
|
|
12
12
|
from magic_pdf.data.data_reader_writer import DataWriter
|
13
|
+
from magic_pdf.data.dataset import Dataset
|
13
14
|
from magic_pdf.libs.version import __version__
|
14
15
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
15
16
|
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
16
17
|
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
18
|
+
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
|
19
|
+
|
20
|
+
|
21
|
+
def parse_txt_pdf(
|
22
|
+
dataset: Dataset,
|
23
|
+
model_list: list,
|
24
|
+
imageWriter: DataWriter,
|
25
|
+
is_debug=False,
|
26
|
+
start_page_id=0,
|
27
|
+
end_page_id=None,
|
28
|
+
lang=None,
|
29
|
+
*args,
|
30
|
+
**kwargs
|
31
|
+
):
|
25
32
|
"""解析文本类pdf."""
|
26
33
|
pdf_info_dict = parse_pdf_by_txt(
|
27
|
-
|
28
|
-
|
34
|
+
dataset,
|
35
|
+
model_list,
|
29
36
|
imageWriter,
|
30
37
|
start_page_id=start_page_id,
|
31
38
|
end_page_id=end_page_id,
|
@@ -43,13 +50,21 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, i
|
|
43
50
|
return pdf_info_dict
|
44
51
|
|
45
52
|
|
46
|
-
def parse_ocr_pdf(
|
47
|
-
|
48
|
-
|
53
|
+
def parse_ocr_pdf(
|
54
|
+
dataset: Dataset,
|
55
|
+
model_list: list,
|
56
|
+
imageWriter: DataWriter,
|
57
|
+
is_debug=False,
|
58
|
+
start_page_id=0,
|
59
|
+
end_page_id=None,
|
60
|
+
lang=None,
|
61
|
+
*args,
|
62
|
+
**kwargs
|
63
|
+
):
|
49
64
|
"""解析ocr类pdf."""
|
50
65
|
pdf_info_dict = parse_pdf_by_ocr(
|
51
|
-
|
52
|
-
|
66
|
+
dataset,
|
67
|
+
model_list,
|
53
68
|
imageWriter,
|
54
69
|
start_page_id=start_page_id,
|
55
70
|
end_page_id=end_page_id,
|
@@ -67,17 +82,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, i
|
|
67
82
|
return pdf_info_dict
|
68
83
|
|
69
84
|
|
70
|
-
def parse_union_pdf(
|
71
|
-
|
72
|
-
|
73
|
-
|
85
|
+
def parse_union_pdf(
|
86
|
+
dataset: Dataset,
|
87
|
+
model_list: list,
|
88
|
+
imageWriter: DataWriter,
|
89
|
+
is_debug=False,
|
90
|
+
start_page_id=0,
|
91
|
+
end_page_id=None,
|
92
|
+
lang=None,
|
93
|
+
*args,
|
94
|
+
**kwargs
|
95
|
+
):
|
74
96
|
"""ocr和文本混合的pdf,全部解析出来."""
|
75
97
|
|
76
98
|
def parse_pdf(method):
|
77
99
|
try:
|
78
100
|
return method(
|
79
|
-
|
80
|
-
|
101
|
+
dataset,
|
102
|
+
model_list,
|
81
103
|
imageWriter,
|
82
104
|
start_page_id=start_page_id,
|
83
105
|
end_page_id=end_page_id,
|
@@ -91,12 +113,12 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter,
|
|
91
113
|
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
|
92
114
|
if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
|
93
115
|
logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
|
94
|
-
if
|
116
|
+
if len(model_list) == 0:
|
95
117
|
layout_model = kwargs.get('layout_model', None)
|
96
118
|
formula_enable = kwargs.get('formula_enable', None)
|
97
119
|
table_enable = kwargs.get('table_enable', None)
|
98
|
-
|
99
|
-
|
120
|
+
infer_res = doc_analyze(
|
121
|
+
dataset,
|
100
122
|
ocr=True,
|
101
123
|
start_page_id=start_page_id,
|
102
124
|
end_page_id=end_page_id,
|
@@ -105,6 +127,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter,
|
|
105
127
|
formula_enable=formula_enable,
|
106
128
|
table_enable=table_enable,
|
107
129
|
)
|
130
|
+
model_list = infer_res.get_infer_res()
|
108
131
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
109
132
|
if pdf_info_dict is None:
|
110
133
|
raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.10.
|
3
|
+
Version: 0.10.6
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -15,11 +15,14 @@ Requires-Dist: numpy<2.0.0,>=1.21.6
|
|
15
15
|
Requires-Dist: pydantic<2.8.0,>=2.7.2
|
16
16
|
Requires-Dist: PyMuPDF>=1.24.9
|
17
17
|
Requires-Dist: scikit-learn>=1.0.2
|
18
|
-
Requires-Dist: torch
|
18
|
+
Requires-Dist: torch>=2.2.2
|
19
19
|
Requires-Dist: transformers
|
20
|
+
Requires-Dist: pdfminer.six==20231228
|
20
21
|
Provides-Extra: full
|
21
|
-
Requires-Dist: unimernet==0.2.
|
22
|
-
Requires-Dist:
|
22
|
+
Requires-Dist: unimernet==0.2.2; extra == "full"
|
23
|
+
Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
|
24
|
+
Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
|
25
|
+
Requires-Dist: ultralytics>=8.3.48; extra == "full"
|
23
26
|
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
24
27
|
Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
25
28
|
Requires-Dist: einops; extra == "full"
|
@@ -1,10 +1,10 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/pdf_parse_by_ocr.py,sha256=
|
3
|
-
magic_pdf/pdf_parse_by_txt.py,sha256=
|
4
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=
|
5
|
-
magic_pdf/user_api.py,sha256=
|
2
|
+
magic_pdf/pdf_parse_by_ocr.py,sha256=WFk6jhHSGvy8-hU2Qlpo5q-VORdSK_5Erh9IA_H7ZbQ,840
|
3
|
+
magic_pdf/pdf_parse_by_txt.py,sha256=1-xieVOP8qmAC957ftzSzaeviv0-QC4yL6Lv6Pcg_6Y,722
|
4
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=Hl8PSJOJFHAGCdTwX4YY2_MMgjAuat47yALLb_E-DYg,30879
|
5
|
+
magic_pdf/user_api.py,sha256=EAalk3WfQTfBq4qKMcISuHSjQg2Ku61ox_WiOPeFfuY,4060
|
6
6
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
magic_pdf/config/constants.py,sha256=
|
7
|
+
magic_pdf/config/constants.py,sha256=CEhNtP8o_2zcK6DesO6cNDlpS9fUdRv-QUyHw0_vsso,1222
|
8
8
|
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
9
9
|
magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
|
10
10
|
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
@@ -13,12 +13,12 @@ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLx
|
|
13
13
|
magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
|
14
14
|
magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
15
15
|
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
magic_pdf/data/dataset.py,sha256=
|
16
|
+
magic_pdf/data/dataset.py,sha256=NpljxcttgRk4_Rl8Rf191t_vNIdbqIpK5x1xHAGE2iI,10686
|
17
17
|
magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
|
18
18
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
19
19
|
magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
|
20
20
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
21
|
-
magic_pdf/data/data_reader_writer/base.py,sha256=
|
21
|
+
magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
|
22
22
|
magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
|
23
23
|
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
|
24
24
|
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
@@ -27,10 +27,10 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
27
27
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
28
28
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
29
29
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
31
|
-
magic_pdf/filter/__init__.py,sha256=
|
30
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=zmEbxuIdFPfy3W72Zx_EEgyYtIOKcTa-0JoXHgXkEJ8,13046
|
31
|
+
magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
|
32
32
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
33
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
33
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
|
34
34
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
35
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
36
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
@@ -43,26 +43,27 @@ magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,117
|
|
43
43
|
magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
|
44
44
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
45
45
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
46
|
-
magic_pdf/libs/draw_bbox.py,sha256=
|
46
|
+
magic_pdf/libs/draw_bbox.py,sha256=Z7-OOETUo90yj3tCV8MwbiJwckThcC0bjs4MXI9ocac,17561
|
47
47
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
48
48
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
49
49
|
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
50
50
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
51
51
|
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
52
52
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
53
|
-
magic_pdf/libs/pdf_check.py,sha256=
|
53
|
+
magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
|
54
54
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
55
55
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
56
|
-
magic_pdf/libs/version.py,sha256=
|
57
|
-
magic_pdf/model/__init__.py,sha256=
|
58
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
56
|
+
magic_pdf/libs/version.py,sha256=7qmFu9Qmzy5OxKJPN-LQOkzV_2T4cJYrUSLTfq7F3kE,23
|
57
|
+
magic_pdf/model/__init__.py,sha256=R6uhAQucHJa87V81ahYHWEffG0-3F1792J4kaSxZpi8,3698
|
58
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=KAPRDgWUAzsXbofZ6i0ll9eaanPdPnfjM1nn4Pl8-Zo,7588
|
59
59
|
magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
|
60
60
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
61
|
-
magic_pdf/model/
|
61
|
+
magic_pdf/model/operators.py,sha256=qcacETf6j-gDUj9g0zYJgBrkq0YWe6ZlfoPjJhCMUYU,6628
|
62
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=6JdWkdKOgL9UyAlI5znPMexs0AMZzn1SgrIpJUxWiGs,11839
|
62
63
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
63
64
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
|
-
magic_pdf/model/sub_modules/model_init.py,sha256=
|
65
|
-
magic_pdf/model/sub_modules/model_utils.py,sha256=
|
65
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=Sp4I2tQ2oFsTIBRHXv8-44WU1PvPSx4L3VfwnQUaRFo,5438
|
66
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=svV5bn_Xw3QqSa22h7OrmlQQQySSqe3DdE6KMEURr2c,2219
|
66
67
|
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
67
68
|
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
|
68
69
|
magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -111,11 +112,12 @@ magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_T
|
|
111
112
|
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
112
113
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
113
114
|
magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
|
114
|
-
magic_pdf/pipe/AbsPipe.py,sha256=
|
115
|
-
magic_pdf/pipe/OCRPipe.py,sha256=
|
116
|
-
magic_pdf/pipe/TXTPipe.py,sha256=
|
117
|
-
magic_pdf/pipe/UNIPipe.py,sha256=
|
115
|
+
magic_pdf/pipe/AbsPipe.py,sha256=_Lx4Ags5suEvmJEvgHEvg6n0RP4Yqjc1VBWaCP0la2o,4410
|
116
|
+
magic_pdf/pipe/OCRPipe.py,sha256=nH21Rq7mQEw7pS7AVD2MRFdSE0DxGc1wk9VXB6T0m3A,2396
|
117
|
+
magic_pdf/pipe/TXTPipe.py,sha256=JXJ7hzD7TNq5VnCt33dck2FM15GpozJoHibaRlYD14s,2196
|
118
|
+
magic_pdf/pipe/UNIPipe.py,sha256=i0kWflZ5BFHrx8p8vDntRcN6jecaxOfGq11ANtYvrZY,5011
|
118
119
|
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
+
magic_pdf/pipe/operators.py,sha256=5z7kF95IWyBGxs4tIhqJml2YMlfDkU9B5xy__NiUxz0,4962
|
119
121
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
122
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
123
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
@@ -136,12 +138,12 @@ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,
|
|
136
138
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
137
139
|
magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
|
138
140
|
magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
|
139
|
-
magic_pdf/tools/common.py,sha256=
|
141
|
+
magic_pdf/tools/common.py,sha256=x3dNHT9wEpdmkkEb4Y70DmUMMPavre5C82T0v9OmA2g,7894
|
140
142
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
141
143
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
142
|
-
magic_pdf-0.10.
|
143
|
-
magic_pdf-0.10.
|
144
|
-
magic_pdf-0.10.
|
145
|
-
magic_pdf-0.10.
|
146
|
-
magic_pdf-0.10.
|
147
|
-
magic_pdf-0.10.
|
144
|
+
magic_pdf-0.10.6.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
145
|
+
magic_pdf-0.10.6.dist-info/METADATA,sha256=CbT8tghajhhMHEawiHakbU-ndjeJ_J9J1011PFoYDbA,37144
|
146
|
+
magic_pdf-0.10.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
147
|
+
magic_pdf-0.10.6.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
148
|
+
magic_pdf-0.10.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
149
|
+
magic_pdf-0.10.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|