magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +12 -12
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +6 -6
- magic_pdf/libs/draw_bbox.py +13 -6
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +142 -351
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +149 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +121 -66
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +253 -50
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +70 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
- magic_pdf-0.10.0.dist-info/RECORD +198 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -53
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf-0.9.2.dist-info/RECORD +0 -178
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py
CHANGED
@@ -3,17 +3,20 @@ import json as json_parse
|
|
3
3
|
import os
|
4
4
|
|
5
5
|
import click
|
6
|
+
import fitz
|
6
7
|
from loguru import logger
|
7
8
|
|
8
9
|
import magic_pdf.model as model_config
|
10
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
11
|
+
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
9
12
|
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
|
10
13
|
draw_model_bbox, draw_span_bbox)
|
11
|
-
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
12
14
|
from magic_pdf.pipe.OCRPipe import OCRPipe
|
13
15
|
from magic_pdf.pipe.TXTPipe import TXTPipe
|
14
16
|
from magic_pdf.pipe.UNIPipe import UNIPipe
|
15
|
-
|
16
|
-
from
|
17
|
+
|
18
|
+
# from io import BytesIO
|
19
|
+
# from pypdf import PdfReader, PdfWriter
|
17
20
|
|
18
21
|
|
19
22
|
def prepare_env(output_dir, pdf_file_name, method):
|
@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
|
|
26
29
|
return local_image_dir, local_md_dir
|
27
30
|
|
28
31
|
|
32
|
+
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
33
|
+
# # 将字节数据包装在 BytesIO 对象中
|
34
|
+
# pdf_file = BytesIO(pdf_bytes)
|
35
|
+
# # 读取 PDF 的字节数据
|
36
|
+
# reader = PdfReader(pdf_file)
|
37
|
+
# # 创建一个新的 PDF 写入器
|
38
|
+
# writer = PdfWriter()
|
39
|
+
# # 将所有页面添加到新的 PDF 写入器中
|
40
|
+
# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
|
41
|
+
# if end_page_id > len(reader.pages) - 1:
|
42
|
+
# logger.warning("end_page_id is out of range, use pdf_docs length")
|
43
|
+
# end_page_id = len(reader.pages) - 1
|
44
|
+
# for i, page in enumerate(reader.pages):
|
45
|
+
# if start_page_id <= i <= end_page_id:
|
46
|
+
# writer.add_page(page)
|
47
|
+
# # 创建一个字节缓冲区来存储输出的 PDF 数据
|
48
|
+
# output_buffer = BytesIO()
|
49
|
+
# # 将 PDF 写入字节缓冲区
|
50
|
+
# writer.write(output_buffer)
|
51
|
+
# # 获取字节缓冲区的内容
|
52
|
+
# converted_pdf_bytes = output_buffer.getvalue()
|
53
|
+
# return converted_pdf_bytes
|
54
|
+
|
55
|
+
|
56
|
+
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF, as bytes, holding only the requested page range.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page_id: Zero-based index of the first page to keep.
        end_page_id: Zero-based index of the last page to keep (inclusive).
            ``None`` or a negative value selects the last page of the
            document; a value past the end is clamped to the last page and
            a warning is logged.

    Returns:
        bytes: The serialized PDF containing the selected pages.
    """
    # Both documents are closed on exit, even if copying or serializing
    # fails, so the PyMuPDF handles are not leaked.
    with fitz.open('pdf', pdf_bytes) as document, fitz.open() as output_document:
        last_page_id = len(document) - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use pdf_docs length')
            end_page_id = last_page_id
        # NOTE(review): start_page_id is passed through unvalidated. PyMuPDF
        # is documented to copy pages in reverse order when from_page >
        # to_page -- confirm callers never pass start_page_id > end_page_id.
        output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
        return output_document.tobytes()
|
66
|
+
|
67
|
+
|
29
68
|
def do_parse(
|
30
69
|
output_dir,
|
31
70
|
pdf_file_name,
|
@@ -55,26 +94,34 @@ def do_parse(
|
|
55
94
|
f_draw_model_bbox = True
|
56
95
|
f_draw_line_sort_bbox = True
|
57
96
|
|
97
|
+
if lang == "":
|
98
|
+
lang = None
|
99
|
+
|
100
|
+
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
|
101
|
+
|
58
102
|
orig_model_list = copy.deepcopy(model_list)
|
59
103
|
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
60
104
|
parse_method)
|
61
105
|
|
62
|
-
image_writer, md_writer =
|
63
|
-
local_image_dir),
|
106
|
+
image_writer, md_writer = FileBasedDataWriter(
|
107
|
+
local_image_dir), FileBasedDataWriter(local_md_dir)
|
64
108
|
image_dir = str(os.path.basename(local_image_dir))
|
65
109
|
|
66
110
|
if parse_method == 'auto':
|
67
111
|
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
68
112
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
69
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
113
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
114
|
+
lang=lang,
|
70
115
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
71
116
|
elif parse_method == 'txt':
|
72
117
|
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
73
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
118
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
119
|
+
lang=lang,
|
74
120
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
75
121
|
elif parse_method == 'ocr':
|
76
122
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
77
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
123
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
124
|
+
lang=lang,
|
78
125
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
79
126
|
else:
|
80
127
|
logger.error('unknown parse method')
|
@@ -101,49 +148,36 @@ def do_parse(
|
|
101
148
|
if f_draw_line_sort_bbox:
|
102
149
|
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
|
103
150
|
|
104
|
-
md_content = pipe.pipe_mk_markdown(image_dir,
|
105
|
-
drop_mode=DropMode.NONE,
|
106
|
-
md_make_mode=f_make_md_mode)
|
151
|
+
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
|
107
152
|
if f_dump_md:
|
108
|
-
md_writer.
|
109
|
-
|
110
|
-
|
111
|
-
mode=AbsReaderWriter.MODE_TXT,
|
153
|
+
md_writer.write_string(
|
154
|
+
f'{pdf_file_name}.md',
|
155
|
+
md_content
|
112
156
|
)
|
113
157
|
|
114
158
|
if f_dump_middle_json:
|
115
|
-
md_writer.
|
116
|
-
|
117
|
-
|
118
|
-
indent=4),
|
119
|
-
path=f'{pdf_file_name}_middle.json',
|
120
|
-
mode=AbsReaderWriter.MODE_TXT,
|
159
|
+
md_writer.write_string(
|
160
|
+
f'{pdf_file_name}_middle.json',
|
161
|
+
json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
|
121
162
|
)
|
122
163
|
|
123
164
|
if f_dump_model_json:
|
124
|
-
md_writer.
|
125
|
-
|
126
|
-
|
127
|
-
indent=4),
|
128
|
-
path=f'{pdf_file_name}_model.json',
|
129
|
-
mode=AbsReaderWriter.MODE_TXT,
|
165
|
+
md_writer.write_string(
|
166
|
+
f'{pdf_file_name}_model.json',
|
167
|
+
json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
|
130
168
|
)
|
131
169
|
|
132
170
|
if f_dump_orig_pdf:
|
133
171
|
md_writer.write(
|
134
|
-
|
135
|
-
|
136
|
-
mode=AbsReaderWriter.MODE_BIN,
|
172
|
+
f'{pdf_file_name}_origin.pdf',
|
173
|
+
pdf_bytes,
|
137
174
|
)
|
138
175
|
|
139
176
|
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
140
177
|
if f_dump_content_list:
|
141
|
-
md_writer.
|
142
|
-
|
143
|
-
|
144
|
-
indent=4),
|
145
|
-
path=f'{pdf_file_name}_content_list.json',
|
146
|
-
mode=AbsReaderWriter.MODE_TXT,
|
178
|
+
md_writer.write_string(
|
179
|
+
f'{pdf_file_name}_content_list.json',
|
180
|
+
json_parse.dumps(content_list, ensure_ascii=False, indent=4)
|
147
181
|
)
|
148
182
|
|
149
183
|
logger.info(f'local output dir is {local_md_dir}')
|
magic_pdf/user_api.py
CHANGED
@@ -1,36 +1,28 @@
|
|
1
|
-
"""
|
2
|
-
用户输入:
|
3
|
-
model数组,每个元素代表一个页面
|
4
|
-
pdf在s3的路径
|
5
|
-
截图保存的s3位置
|
1
|
+
"""用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
|
6
2
|
|
7
3
|
然后:
|
8
4
|
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
|
9
5
|
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
|
10
6
|
|
11
7
|
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
|
12
|
-
|
13
8
|
"""
|
14
|
-
import re
|
15
9
|
|
16
10
|
from loguru import logger
|
17
11
|
|
12
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
18
13
|
from magic_pdf.libs.version import __version__
|
19
14
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
20
|
-
from magic_pdf.rw import AbsReaderWriter
|
21
15
|
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
22
16
|
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
|
23
17
|
|
24
|
-
PARSE_TYPE_TXT =
|
25
|
-
PARSE_TYPE_OCR =
|
18
|
+
PARSE_TYPE_TXT = 'txt'
|
19
|
+
PARSE_TYPE_OCR = 'ocr'
|
26
20
|
|
27
21
|
|
28
|
-
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter:
|
22
|
+
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
|
29
23
|
start_page_id=0, end_page_id=None, lang=None,
|
30
24
|
*args, **kwargs):
|
31
|
-
"""
|
32
|
-
解析文本类pdf
|
33
|
-
"""
|
25
|
+
"""解析文本类pdf."""
|
34
26
|
pdf_info_dict = parse_pdf_by_txt(
|
35
27
|
pdf_bytes,
|
36
28
|
pdf_models,
|
@@ -38,24 +30,23 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
38
30
|
start_page_id=start_page_id,
|
39
31
|
end_page_id=end_page_id,
|
40
32
|
debug_mode=is_debug,
|
33
|
+
lang=lang,
|
41
34
|
)
|
42
35
|
|
43
|
-
pdf_info_dict[
|
36
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
|
44
37
|
|
45
|
-
pdf_info_dict[
|
38
|
+
pdf_info_dict['_version_name'] = __version__
|
46
39
|
|
47
40
|
if lang is not None:
|
48
|
-
pdf_info_dict[
|
41
|
+
pdf_info_dict['_lang'] = lang
|
49
42
|
|
50
43
|
return pdf_info_dict
|
51
44
|
|
52
45
|
|
53
|
-
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter:
|
46
|
+
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
|
54
47
|
start_page_id=0, end_page_id=None, lang=None,
|
55
48
|
*args, **kwargs):
|
56
|
-
"""
|
57
|
-
解析ocr类pdf
|
58
|
-
"""
|
49
|
+
"""解析ocr类pdf."""
|
59
50
|
pdf_info_dict = parse_pdf_by_ocr(
|
60
51
|
pdf_bytes,
|
61
52
|
pdf_models,
|
@@ -63,25 +54,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
63
54
|
start_page_id=start_page_id,
|
64
55
|
end_page_id=end_page_id,
|
65
56
|
debug_mode=is_debug,
|
57
|
+
lang=lang,
|
66
58
|
)
|
67
59
|
|
68
|
-
pdf_info_dict[
|
60
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
|
69
61
|
|
70
|
-
pdf_info_dict[
|
62
|
+
pdf_info_dict['_version_name'] = __version__
|
71
63
|
|
72
64
|
if lang is not None:
|
73
|
-
pdf_info_dict[
|
65
|
+
pdf_info_dict['_lang'] = lang
|
74
66
|
|
75
67
|
return pdf_info_dict
|
76
68
|
|
77
69
|
|
78
|
-
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter:
|
70
|
+
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
|
79
71
|
input_model_is_empty: bool = False,
|
80
72
|
start_page_id=0, end_page_id=None, lang=None,
|
81
73
|
*args, **kwargs):
|
82
|
-
"""
|
83
|
-
ocr和文本混合的pdf,全部解析出来
|
84
|
-
"""
|
74
|
+
"""ocr和文本混合的pdf,全部解析出来."""
|
85
75
|
|
86
76
|
def parse_pdf(method):
|
87
77
|
try:
|
@@ -92,18 +82,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
92
82
|
start_page_id=start_page_id,
|
93
83
|
end_page_id=end_page_id,
|
94
84
|
debug_mode=is_debug,
|
85
|
+
lang=lang,
|
95
86
|
)
|
96
87
|
except Exception as e:
|
97
88
|
logger.exception(e)
|
98
89
|
return None
|
99
90
|
|
100
91
|
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
|
101
|
-
if pdf_info_dict is None or pdf_info_dict.get(
|
102
|
-
logger.warning(
|
92
|
+
if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
|
93
|
+
logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
|
103
94
|
if input_model_is_empty:
|
104
|
-
layout_model = kwargs.get(
|
105
|
-
formula_enable = kwargs.get(
|
106
|
-
table_enable = kwargs.get(
|
95
|
+
layout_model = kwargs.get('layout_model', None)
|
96
|
+
formula_enable = kwargs.get('formula_enable', None)
|
97
|
+
table_enable = kwargs.get('table_enable', None)
|
107
98
|
pdf_models = doc_analyze(
|
108
99
|
pdf_bytes,
|
109
100
|
ocr=True,
|
@@ -116,15 +107,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
116
107
|
)
|
117
108
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
118
109
|
if pdf_info_dict is None:
|
119
|
-
raise Exception(
|
110
|
+
raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
|
120
111
|
else:
|
121
|
-
pdf_info_dict[
|
112
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
|
122
113
|
else:
|
123
|
-
pdf_info_dict[
|
114
|
+
pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
|
124
115
|
|
125
|
-
pdf_info_dict[
|
116
|
+
pdf_info_dict['_version_name'] = __version__
|
126
117
|
|
127
118
|
if lang is not None:
|
128
|
-
pdf_info_dict[
|
119
|
+
pdf_info_dict['_lang'] = lang
|
129
120
|
|
130
121
|
return pdf_info_dict
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.9.2
|
3
|
+
Version: 0.10.0
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -26,6 +26,9 @@ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
|
26
26
|
Requires-Dist: einops; extra == "full"
|
27
27
|
Requires-Dist: accelerate; extra == "full"
|
28
28
|
Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
|
29
|
+
Requires-Dist: rapidocr-paddle; extra == "full"
|
30
|
+
Requires-Dist: rapid-table; extra == "full"
|
31
|
+
Requires-Dist: PyYAML; extra == "full"
|
29
32
|
Requires-Dist: detectron2; extra == "full"
|
30
33
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
31
34
|
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
@@ -35,6 +38,8 @@ Provides-Extra: lite
|
|
35
38
|
Requires-Dist: paddleocr==2.7.3; extra == "lite"
|
36
39
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
|
37
40
|
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
|
41
|
+
Provides-Extra: old_linux
|
42
|
+
Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
38
43
|
|
39
44
|
<div align="center" xmlns="http://www.w3.org/1999/html">
|
40
45
|
<!-- logo -->
|
@@ -80,6 +85,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
80
85
|
</div>
|
81
86
|
|
82
87
|
# Changelog
|
88
|
+
- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities:
|
89
|
+
- Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
|
90
|
+
- Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
|
91
|
+
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
|
83
92
|
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
|
84
93
|
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
|
85
94
|
- Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
|
@@ -158,7 +167,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
|
158
167
|
- Preserve the structure of the original document, including headings, paragraphs, lists, etc.
|
159
168
|
- Extract images, image descriptions, tables, table titles, and footnotes.
|
160
169
|
- Automatically recognize and convert formulas in the document to LaTeX format.
|
161
|
-
- Automatically recognize and convert tables in the document to
|
170
|
+
- Automatically recognize and convert tables in the document to HTML format.
|
162
171
|
- Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
|
163
172
|
- OCR supports detection and recognition of 84 languages.
|
164
173
|
- Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
|
@@ -222,17 +231,11 @@ There are three different ways to experience MinerU:
|
|
222
231
|
</tr>
|
223
232
|
<tr>
|
224
233
|
<td rowspan="2">GPU Hardware Support List</td>
|
225
|
-
<td colspan="2">
|
226
|
-
<td colspan="2">
|
227
|
-
8G VRAM
|
234
|
+
<td colspan="2">GPU VRAM 8GB or more</td>
|
235
|
+
<td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
|
236
|
+
8G VRAM can enable all acceleration features</td>
|
228
237
|
<td rowspan="2">None</td>
|
229
238
|
</tr>
|
230
|
-
<tr>
|
231
|
-
<td colspan="2">Recommended Configuration 10G+ VRAM</td>
|
232
|
-
<td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
|
233
|
-
10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
|
234
|
-
</td>
|
235
|
-
</tr>
|
236
239
|
</table>
|
237
240
|
|
238
241
|
### Online Demo
|
@@ -284,7 +287,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
284
287
|
"enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
285
288
|
},
|
286
289
|
"table-config": {
|
287
|
-
"model": "
|
290
|
+
"model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
|
288
291
|
"enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
|
289
292
|
"max_time": 400
|
290
293
|
}
|
@@ -299,7 +302,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
299
302
|
- [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
|
300
303
|
- Quick Deployment with Docker
|
301
304
|
> [!IMPORTANT]
|
302
|
-
> Docker requires a GPU with at least
|
305
|
+
> Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
|
303
306
|
>
|
304
307
|
> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
|
305
308
|
>
|
@@ -459,7 +462,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
|
|
459
462
|
# Acknowledgments
|
460
463
|
|
461
464
|
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
465
|
+
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
|
462
466
|
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
|
467
|
+
- [RapidTable](https://github.com/RapidAI/RapidTable)
|
463
468
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
464
469
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
465
470
|
- [layoutreader](https://github.com/ppaanngggg/layoutreader)
|
@@ -0,0 +1,198 @@
|
|
1
|
+
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
|
3
|
+
magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
|
4
|
+
magic_pdf/pdf_parse_union_core.py,sha256=w90lFIMOYUMAq4iv8bpsbBtLXFphPV4HyYeqbOTYQUI,12420
|
5
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
|
6
|
+
magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
|
7
|
+
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
+
magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
|
9
|
+
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
10
|
+
magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
|
11
|
+
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
12
|
+
magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zkE,622
|
13
|
+
magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
|
14
|
+
magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
|
15
|
+
magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
16
|
+
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
|
18
|
+
magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
|
19
|
+
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
20
|
+
magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
|
21
|
+
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
22
|
+
magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
|
23
|
+
magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
|
24
|
+
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
|
25
|
+
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
26
|
+
magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
|
27
|
+
magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
|
28
|
+
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
29
|
+
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
30
|
+
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
+
magic_pdf/dict2md/mkcontent.py,sha256=bMQK7uiay76YaWA92VIK57YajINV20SnOs65wOEXyKE,18667
|
32
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
|
33
|
+
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
+
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
35
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=h4D4O0OeAlEy2A8mJ6E0aQ8wIizIfsIxEagbjaomnAo,17823
|
36
|
+
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
+
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
39
|
+
magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
|
40
|
+
magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
|
41
|
+
magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
+
magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
|
43
|
+
magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
|
44
|
+
magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
|
45
|
+
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
46
|
+
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
47
|
+
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
48
|
+
magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
|
49
|
+
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
50
|
+
magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
|
51
|
+
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
52
|
+
magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
|
53
|
+
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
54
|
+
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
55
|
+
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
56
|
+
magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
|
57
|
+
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
58
|
+
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
59
|
+
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
60
|
+
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
61
|
+
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
62
|
+
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
63
|
+
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
64
|
+
magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
|
65
|
+
magic_pdf/libs/pdf_image_tools.py,sha256=sh8hgBQu_83R71qBLodOFdByBUuQujsOMfgpSD9mrhE,1981
|
66
|
+
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
|
+
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
68
|
+
magic_pdf/libs/version.py,sha256=v4zmKjsKOPZbp6BrWoz7iK4ST0sdZdUh9bQSJmluZ5o,23
|
69
|
+
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
70
|
+
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=HOT6chGx2VPyH6O9WB0c6xGPeDs9m_6oZn3iOa745yw,7125
|
72
|
+
magic_pdf/model/magic_model.py,sha256=8nJLzNCa0Ag4JhMAQbjj5qrkj617qKPCXVJAiT9DnaA,43472
|
73
|
+
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
|
75
|
+
magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
|
76
|
+
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
|
78
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
|
79
|
+
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
80
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
|
81
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
82
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
83
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
84
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
|
85
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
|
86
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
|
87
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
|
88
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
|
89
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
|
90
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
|
91
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
|
92
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
|
93
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
|
94
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
|
95
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
|
96
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
|
97
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
|
98
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
|
99
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
100
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
101
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
102
|
+
magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
103
|
+
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
|
104
|
+
magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
|
+
magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
106
|
+
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
|
107
|
+
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
|
+
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=UP7fADPGoxAMj2SUKmeW-fe_AcAQxlT9Mfy4WF6vHmU,9796
|
111
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=a6xkQHqLMUL4NCaORp8oo4Tfa8GB8PN9MVvG7Pj6jIE,7316
|
112
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
113
|
+
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
115
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
|
116
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
|
117
|
+
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
118
|
+
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
119
|
+
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx6DCnhqYzP-4b1zSWptrefimxFTmy8Q,583
|
121
|
+
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
|
+
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
123
|
+
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
124
|
+
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
|
125
|
+
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
126
|
+
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
127
|
+
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
128
|
+
magic_pdf/para/commons.py,sha256=VdJ8SY9qJTtcRyx8HH-PFeZSJwL4Tsf50197RD_-dwc,5414
|
129
|
+
magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,10443
|
130
|
+
magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
|
131
|
+
magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
|
132
|
+
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
133
|
+
magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
|
134
|
+
magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
|
135
|
+
magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
|
136
|
+
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
137
|
+
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
138
|
+
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
139
|
+
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
140
|
+
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
141
|
+
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
142
|
+
magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
|
143
|
+
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
144
|
+
magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
145
|
+
magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
|
146
|
+
magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
|
147
|
+
magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
|
148
|
+
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
149
|
+
magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
|
150
|
+
magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
|
151
|
+
magic_pdf/pre_proc/cut_image.py,sha256=TghshkDTgdUbyLSbKZoFI9-n-xaFub02IYPyu0IAnRY,2761
|
152
|
+
magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
|
153
|
+
magic_pdf/pre_proc/detect_footer_by_model.py,sha256=_EghAM_zWBcqVY8XBkbSoprKqKUa0mlN1U8YNWxNNLI,2848
|
154
|
+
magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
|
155
|
+
magic_pdf/pre_proc/detect_footnote.py,sha256=UxFuTCRwXdAv3wKCgRQJJVt12hM9O9oPTwzPAChQXoM,8309
|
156
|
+
magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1itbY7g,2848
|
157
|
+
magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
|
158
|
+
magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
|
159
|
+
magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
|
160
|
+
magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
|
161
|
+
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
162
|
+
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
163
|
+
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
164
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
|
165
|
+
magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
|
166
|
+
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
|
167
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
|
168
|
+
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
|
169
|
+
magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
170
|
+
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
|
171
|
+
magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
|
172
|
+
magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
|
173
|
+
magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
|
174
|
+
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
|
175
|
+
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
176
|
+
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
177
|
+
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
178
|
+
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
179
|
+
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
180
|
+
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
181
|
+
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
182
|
+
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
183
|
+
magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
|
184
|
+
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
185
|
+
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
186
|
+
magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
|
187
|
+
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
188
|
+
magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
|
189
|
+
magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
|
190
|
+
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
191
|
+
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
192
|
+
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
193
|
+
magic_pdf-0.10.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
194
|
+
magic_pdf-0.10.0.dist-info/METADATA,sha256=U_TtQjdODFjAADoZro_ipfGiasBCVq2_zZlF2DFyNpM,40300
|
195
|
+
magic_pdf-0.10.0.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
|
196
|
+
magic_pdf-0.10.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
197
|
+
magic_pdf-0.10.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
198
|
+
magic_pdf-0.10.0.dist-info/RECORD,,
|