magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
- magic_pdf/data/data_reader_writer/s3.py +69 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +0 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +15 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +74 -234
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +54 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +418 -51
- magic_pdf/model/pdf_extract_kit.py +164 -80
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
- magic_pdf/model/ppTableModel.py +2 -2
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +296 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +19 -9
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
- magic_pdf-0.9.0.dist-info/METADATA +507 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
- magic_pdf-0.8.0.dist-info/METADATA +0 -459
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
magic_pdf/libs/config_reader.py
CHANGED
@@ -1,46 +1,44 @@
|
|
1
|
-
"""
|
2
|
-
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
|
3
|
-
|
4
|
-
"""
|
1
|
+
"""根据bucket的名字返回对应的s3 AK, SK,endpoint三元组."""
|
5
2
|
|
6
3
|
import json
|
7
4
|
import os
|
8
5
|
|
9
6
|
from loguru import logger
|
10
7
|
|
8
|
+
from magic_pdf.libs.Constants import MODEL_NAME
|
11
9
|
from magic_pdf.libs.commons import parse_bucket_key
|
12
10
|
|
13
11
|
# 定义配置文件名常量
|
14
|
-
CONFIG_FILE_NAME =
|
12
|
+
CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'magic-pdf.json')
|
15
13
|
|
16
14
|
|
17
15
|
def read_config():
|
18
|
-
|
19
|
-
|
20
|
-
|
16
|
+
if os.path.isabs(CONFIG_FILE_NAME):
|
17
|
+
config_file = CONFIG_FILE_NAME
|
18
|
+
else:
|
19
|
+
home_dir = os.path.expanduser('~')
|
20
|
+
config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
|
21
21
|
|
22
22
|
if not os.path.exists(config_file):
|
23
|
-
raise FileNotFoundError(f
|
23
|
+
raise FileNotFoundError(f'{config_file} not found')
|
24
24
|
|
25
|
-
with open(config_file,
|
25
|
+
with open(config_file, 'r', encoding='utf-8') as f:
|
26
26
|
config = json.load(f)
|
27
27
|
return config
|
28
28
|
|
29
29
|
|
30
30
|
def get_s3_config(bucket_name: str):
|
31
|
-
"""
|
32
|
-
~/magic-pdf.json 读出来
|
33
|
-
"""
|
31
|
+
"""~/magic-pdf.json 读出来."""
|
34
32
|
config = read_config()
|
35
33
|
|
36
|
-
bucket_info = config.get(
|
34
|
+
bucket_info = config.get('bucket_info')
|
37
35
|
if bucket_name not in bucket_info:
|
38
|
-
access_key, secret_key, storage_endpoint = bucket_info[
|
36
|
+
access_key, secret_key, storage_endpoint = bucket_info['[default]']
|
39
37
|
else:
|
40
38
|
access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
|
41
39
|
|
42
40
|
if access_key is None or secret_key is None or storage_endpoint is None:
|
43
|
-
raise Exception(f
|
41
|
+
raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}')
|
44
42
|
|
45
43
|
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
|
46
44
|
|
@@ -49,7 +47,7 @@ def get_s3_config(bucket_name: str):
|
|
49
47
|
|
50
48
|
def get_s3_config_dict(path: str):
|
51
49
|
access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
|
52
|
-
return {
|
50
|
+
return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint}
|
53
51
|
|
54
52
|
|
55
53
|
def get_bucket_name(path):
|
@@ -59,33 +57,65 @@ def get_bucket_name(path):
|
|
59
57
|
|
60
58
|
def get_local_models_dir():
|
61
59
|
config = read_config()
|
62
|
-
models_dir = config.get(
|
60
|
+
models_dir = config.get('models-dir')
|
63
61
|
if models_dir is None:
|
64
62
|
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
|
65
|
-
return
|
63
|
+
return '/tmp/models'
|
66
64
|
else:
|
67
65
|
return models_dir
|
68
66
|
|
69
67
|
|
68
|
+
def get_local_layoutreader_model_dir():
|
69
|
+
config = read_config()
|
70
|
+
layoutreader_model_dir = config.get('layoutreader-model-dir')
|
71
|
+
if layoutreader_model_dir is None or not os.path.exists(layoutreader_model_dir):
|
72
|
+
home_dir = os.path.expanduser('~')
|
73
|
+
layoutreader_at_modelscope_dir_path = os.path.join(home_dir, '.cache/modelscope/hub/ppaanngggg/layoutreader')
|
74
|
+
logger.warning(f"'layoutreader-model-dir' not exists, use {layoutreader_at_modelscope_dir_path} as default")
|
75
|
+
return layoutreader_at_modelscope_dir_path
|
76
|
+
else:
|
77
|
+
return layoutreader_model_dir
|
78
|
+
|
79
|
+
|
70
80
|
def get_device():
|
71
81
|
config = read_config()
|
72
|
-
device = config.get(
|
82
|
+
device = config.get('device-mode')
|
73
83
|
if device is None:
|
74
84
|
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
|
75
|
-
return
|
85
|
+
return 'cpu'
|
76
86
|
else:
|
77
87
|
return device
|
78
88
|
|
79
89
|
|
80
90
|
def get_table_recog_config():
|
81
91
|
config = read_config()
|
82
|
-
table_config = config.get(
|
92
|
+
table_config = config.get('table-config')
|
83
93
|
if table_config is None:
|
84
94
|
logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
|
85
|
-
return json.loads('{"
|
95
|
+
return json.loads(f'{{"model": "{MODEL_NAME.TABLE_MASTER}","enable": false, "max_time": 400}}')
|
86
96
|
else:
|
87
97
|
return table_config
|
88
98
|
|
89
99
|
|
100
|
+
def get_layout_config():
|
101
|
+
config = read_config()
|
102
|
+
layout_config = config.get("layout-config")
|
103
|
+
if layout_config is None:
|
104
|
+
logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
|
105
|
+
return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
|
106
|
+
else:
|
107
|
+
return layout_config
|
108
|
+
|
109
|
+
|
110
|
+
def get_formula_config():
|
111
|
+
config = read_config()
|
112
|
+
formula_config = config.get("formula-config")
|
113
|
+
if formula_config is None:
|
114
|
+
logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
|
115
|
+
return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
|
116
|
+
else:
|
117
|
+
return formula_config
|
118
|
+
|
119
|
+
|
90
120
|
if __name__ == "__main__":
|
91
121
|
ak, sk, endpoint = get_s3_config("llm-raw")
|
magic_pdf/libs/draw_bbox.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
from magic_pdf.data.dataset import PymuDocDataset
|
1
2
|
from magic_pdf.libs.commons import fitz # PyMuPDF
|
2
3
|
from magic_pdf.libs.Constants import CROSS_PAGE
|
3
4
|
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
|
@@ -33,7 +34,7 @@ def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
|
|
33
34
|
) # Draw the rectangle
|
34
35
|
|
35
36
|
|
36
|
-
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
|
37
|
+
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
|
37
38
|
new_rgb = []
|
38
39
|
for item in rgb_config:
|
39
40
|
item = float(item) / 255
|
@@ -42,31 +43,31 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
|
|
42
43
|
for j, bbox in enumerate(page_data):
|
43
44
|
x0, y0, x1, y1 = bbox
|
44
45
|
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
|
45
|
-
if
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
46
|
+
if draw_bbox:
|
47
|
+
if fill_config:
|
48
|
+
page.draw_rect(
|
49
|
+
rect_coords,
|
50
|
+
color=None,
|
51
|
+
fill=new_rgb,
|
52
|
+
fill_opacity=0.3,
|
53
|
+
width=0.5,
|
54
|
+
overlay=True,
|
55
|
+
) # Draw the rectangle
|
56
|
+
else:
|
57
|
+
page.draw_rect(
|
58
|
+
rect_coords,
|
59
|
+
color=new_rgb,
|
60
|
+
fill=None,
|
61
|
+
fill_opacity=1,
|
62
|
+
width=0.5,
|
63
|
+
overlay=True,
|
64
|
+
) # Draw the rectangle
|
63
65
|
page.insert_text(
|
64
|
-
(
|
66
|
+
(x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
|
65
67
|
) # Insert the index in the top left corner of the rectangle
|
66
68
|
|
67
69
|
|
68
70
|
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
69
|
-
layout_bbox_list = []
|
70
71
|
dropped_bbox_list = []
|
71
72
|
tables_list, tables_body_list = [], []
|
72
73
|
tables_caption_list, tables_footnote_list = [], []
|
@@ -75,17 +76,19 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
75
76
|
titles_list = []
|
76
77
|
texts_list = []
|
77
78
|
interequations_list = []
|
79
|
+
lists_list = []
|
80
|
+
indexs_list = []
|
78
81
|
for page in pdf_info:
|
79
|
-
|
82
|
+
|
80
83
|
page_dropped_list = []
|
81
84
|
tables, tables_body, tables_caption, tables_footnote = [], [], [], []
|
82
85
|
imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
|
83
86
|
titles = []
|
84
87
|
texts = []
|
85
88
|
interequations = []
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
+
lists = []
|
90
|
+
indices = []
|
91
|
+
|
89
92
|
for dropped_bbox in page['discarded_blocks']:
|
90
93
|
page_dropped_list.append(dropped_bbox['bbox'])
|
91
94
|
dropped_bbox_list.append(page_dropped_list)
|
@@ -117,6 +120,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
117
120
|
texts.append(bbox)
|
118
121
|
elif block['type'] == BlockType.InterlineEquation:
|
119
122
|
interequations.append(bbox)
|
123
|
+
elif block['type'] == BlockType.List:
|
124
|
+
lists.append(bbox)
|
125
|
+
elif block['type'] == BlockType.Index:
|
126
|
+
indices.append(bbox)
|
127
|
+
|
120
128
|
tables_list.append(tables)
|
121
129
|
tables_body_list.append(tables_body)
|
122
130
|
tables_caption_list.append(tables_caption)
|
@@ -128,30 +136,62 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
128
136
|
titles_list.append(titles)
|
129
137
|
texts_list.append(texts)
|
130
138
|
interequations_list.append(interequations)
|
139
|
+
lists_list.append(lists)
|
140
|
+
indexs_list.append(indices)
|
141
|
+
|
142
|
+
layout_bbox_list = []
|
143
|
+
|
144
|
+
table_type_order = {
|
145
|
+
'table_caption': 1,
|
146
|
+
'table_body': 2,
|
147
|
+
'table_footnote': 3
|
148
|
+
}
|
149
|
+
for page in pdf_info:
|
150
|
+
page_block_list = []
|
151
|
+
for block in page['para_blocks']:
|
152
|
+
if block['type'] in [
|
153
|
+
BlockType.Text,
|
154
|
+
BlockType.Title,
|
155
|
+
BlockType.InterlineEquation,
|
156
|
+
BlockType.List,
|
157
|
+
BlockType.Index,
|
158
|
+
]:
|
159
|
+
bbox = block['bbox']
|
160
|
+
page_block_list.append(bbox)
|
161
|
+
elif block['type'] in [BlockType.Image]:
|
162
|
+
for sub_block in block['blocks']:
|
163
|
+
bbox = sub_block['bbox']
|
164
|
+
page_block_list.append(bbox)
|
165
|
+
elif block['type'] in [BlockType.Table]:
|
166
|
+
sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
|
167
|
+
for sub_block in sorted_blocks:
|
168
|
+
bbox = sub_block['bbox']
|
169
|
+
page_block_list.append(bbox)
|
170
|
+
|
171
|
+
layout_bbox_list.append(page_block_list)
|
131
172
|
|
132
173
|
pdf_docs = fitz.open('pdf', pdf_bytes)
|
174
|
+
|
133
175
|
for i, page in enumerate(pdf_docs):
|
134
|
-
|
135
|
-
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
|
136
|
-
|
137
|
-
draw_bbox_without_number(i,
|
138
|
-
|
139
|
-
draw_bbox_without_number(i,
|
140
|
-
|
141
|
-
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
|
142
|
-
True)
|
143
|
-
draw_bbox_without_number(i, tables_footnote_list, page,
|
144
|
-
[229, 255, 204], True)
|
145
|
-
draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
|
176
|
+
|
177
|
+
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
|
178
|
+
# draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
|
179
|
+
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
|
180
|
+
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
|
181
|
+
draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
|
182
|
+
# draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
|
146
183
|
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
|
147
|
-
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
|
148
|
-
|
149
|
-
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
|
150
|
-
True),
|
184
|
+
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
|
185
|
+
draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
|
151
186
|
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
|
152
187
|
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
|
153
|
-
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
|
154
|
-
|
188
|
+
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
|
189
|
+
draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
|
190
|
+
draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
|
191
|
+
|
192
|
+
draw_bbox_with_number(
|
193
|
+
i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
|
194
|
+
)
|
155
195
|
|
156
196
|
# Save the PDF
|
157
197
|
pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
|
@@ -209,11 +249,14 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
209
249
|
page_dropped_list.append(span['bbox'])
|
210
250
|
dropped_list.append(page_dropped_list)
|
211
251
|
# 构造其余useful_list
|
212
|
-
for block in page['para_blocks']:
|
252
|
+
# for block in page['para_blocks']: # span直接用分段合并前的结果就可以
|
253
|
+
for block in page['preproc_blocks']:
|
213
254
|
if block['type'] in [
|
214
|
-
|
215
|
-
|
216
|
-
|
255
|
+
BlockType.Text,
|
256
|
+
BlockType.Title,
|
257
|
+
BlockType.InterlineEquation,
|
258
|
+
BlockType.List,
|
259
|
+
BlockType.Index,
|
217
260
|
]:
|
218
261
|
for line in block['lines']:
|
219
262
|
for span in line['spans']:
|
@@ -232,10 +275,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
232
275
|
for i, page in enumerate(pdf_docs):
|
233
276
|
# 获取当前页面的数据
|
234
277
|
draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
|
235
|
-
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
|
236
|
-
|
237
|
-
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
|
238
|
-
False)
|
278
|
+
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
|
279
|
+
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
|
239
280
|
draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
|
240
281
|
draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
|
241
282
|
draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
|
@@ -244,7 +285,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
|
|
244
285
|
pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
|
245
286
|
|
246
287
|
|
247
|
-
def
|
288
|
+
def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
|
248
289
|
dropped_bbox_list = []
|
249
290
|
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
|
250
291
|
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
|
@@ -252,7 +293,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
|
|
252
293
|
texts_list = []
|
253
294
|
interequations_list = []
|
254
295
|
pdf_docs = fitz.open('pdf', pdf_bytes)
|
255
|
-
magic_model = MagicModel(model_list,
|
296
|
+
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
|
256
297
|
for i in range(len(model_list)):
|
257
298
|
page_dropped_list = []
|
258
299
|
tables_body, tables_caption, tables_footnote = [], [], []
|
@@ -278,8 +319,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
|
|
278
319
|
imgs_body.append(bbox)
|
279
320
|
elif layout_det['category_id'] == CategoryId.ImageCaption:
|
280
321
|
imgs_caption.append(bbox)
|
281
|
-
elif layout_det[
|
282
|
-
'category_id'] == CategoryId.InterlineEquation_YOLO:
|
322
|
+
elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO:
|
283
323
|
interequations.append(bbox)
|
284
324
|
elif layout_det['category_id'] == CategoryId.Abandon:
|
285
325
|
page_dropped_list.append(bbox)
|
@@ -298,21 +338,66 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
|
|
298
338
|
imgs_footnote_list.append(imgs_footnote)
|
299
339
|
|
300
340
|
for i, page in enumerate(pdf_docs):
|
301
|
-
draw_bbox_with_number(
|
302
|
-
|
341
|
+
draw_bbox_with_number(
|
342
|
+
i, dropped_bbox_list, page, [158, 158, 158], True
|
343
|
+
) # color !
|
303
344
|
draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
|
304
|
-
draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
|
305
|
-
|
306
|
-
draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
|
307
|
-
True)
|
345
|
+
draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
|
346
|
+
draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
|
308
347
|
draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
|
309
|
-
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
|
310
|
-
|
311
|
-
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
|
312
|
-
True)
|
348
|
+
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
|
349
|
+
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True)
|
313
350
|
draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
|
314
351
|
draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
|
315
352
|
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
|
316
353
|
|
317
354
|
# Save the PDF
|
318
355
|
pdf_docs.save(f'{out_path}/{filename}_model.pdf')
|
356
|
+
|
357
|
+
|
358
|
+
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
359
|
+
layout_bbox_list = []
|
360
|
+
|
361
|
+
for page in pdf_info:
|
362
|
+
page_line_list = []
|
363
|
+
for block in page['preproc_blocks']:
|
364
|
+
if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
|
365
|
+
for line in block['lines']:
|
366
|
+
bbox = line['bbox']
|
367
|
+
index = line['index']
|
368
|
+
page_line_list.append({'index': index, 'bbox': bbox})
|
369
|
+
if block['type'] in [BlockType.Image, BlockType.Table]:
|
370
|
+
for sub_block in block['blocks']:
|
371
|
+
if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
372
|
+
for line in sub_block['virtual_lines']:
|
373
|
+
bbox = line['bbox']
|
374
|
+
index = line['index']
|
375
|
+
page_line_list.append({'index': index, 'bbox': bbox})
|
376
|
+
elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
|
377
|
+
for line in sub_block['lines']:
|
378
|
+
bbox = line['bbox']
|
379
|
+
index = line['index']
|
380
|
+
page_line_list.append({'index': index, 'bbox': bbox})
|
381
|
+
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
|
382
|
+
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
|
383
|
+
pdf_docs = fitz.open('pdf', pdf_bytes)
|
384
|
+
for i, page in enumerate(pdf_docs):
|
385
|
+
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
|
386
|
+
|
387
|
+
pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
|
388
|
+
|
389
|
+
|
390
|
+
def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
|
391
|
+
layout_bbox_list = []
|
392
|
+
|
393
|
+
for page in pdf_info:
|
394
|
+
page_block_list = []
|
395
|
+
for block in page['para_blocks']:
|
396
|
+
bbox = block['bbox']
|
397
|
+
page_block_list.append(bbox)
|
398
|
+
layout_bbox_list.append(page_block_list)
|
399
|
+
pdf_docs = fitz.open('pdf', pdf_bytes)
|
400
|
+
for i, page in enumerate(pdf_docs):
|
401
|
+
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
|
402
|
+
|
403
|
+
pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.
|
1
|
+
__version__ = "0.9.0"
|
@@ -4,7 +4,9 @@ import fitz
|
|
4
4
|
import numpy as np
|
5
5
|
from loguru import logger
|
6
6
|
|
7
|
-
from magic_pdf.libs.
|
7
|
+
from magic_pdf.libs.clean_memory import clean_memory
|
8
|
+
from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config, get_layout_config, \
|
9
|
+
get_formula_config
|
8
10
|
from magic_pdf.model.model_list import MODEL
|
9
11
|
import magic_pdf.model as model_config
|
10
12
|
|
@@ -23,7 +25,7 @@ def remove_duplicates_dicts(lst):
|
|
23
25
|
return unique_dicts
|
24
26
|
|
25
27
|
|
26
|
-
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
28
|
+
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
|
27
29
|
try:
|
28
30
|
from PIL import Image
|
29
31
|
except ImportError:
|
@@ -32,18 +34,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
|
32
34
|
|
33
35
|
images = []
|
34
36
|
with fitz.open("pdf", pdf_bytes) as doc:
|
37
|
+
pdf_page_num = doc.page_count
|
38
|
+
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
39
|
+
if end_page_id > pdf_page_num - 1:
|
40
|
+
logger.warning("end_page_id is out of range, use images length")
|
41
|
+
end_page_id = pdf_page_num - 1
|
42
|
+
|
35
43
|
for index in range(0, doc.page_count):
|
36
|
-
|
37
|
-
|
38
|
-
|
44
|
+
if start_page_id <= index <= end_page_id:
|
45
|
+
page = doc[index]
|
46
|
+
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
47
|
+
pm = page.get_pixmap(matrix=mat, alpha=False)
|
48
|
+
|
49
|
+
# If the width or height exceeds 9000 after scaling, do not scale further.
|
50
|
+
if pm.width > 9000 or pm.height > 9000:
|
51
|
+
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
39
52
|
|
40
|
-
|
41
|
-
|
42
|
-
|
53
|
+
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
54
|
+
img = np.array(img)
|
55
|
+
img_dict = {"img": img, "width": pm.width, "height": pm.height}
|
56
|
+
else:
|
57
|
+
img_dict = {"img": [], "width": 0, "height": 0}
|
43
58
|
|
44
|
-
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
45
|
-
img = np.array(img)
|
46
|
-
img_dict = {"img": img, "width": pm.width, "height": pm.height}
|
47
59
|
images.append(img_dict)
|
48
60
|
return images
|
49
61
|
|
@@ -57,14 +69,17 @@ class ModelSingleton:
|
|
57
69
|
cls._instance = super().__new__(cls)
|
58
70
|
return cls._instance
|
59
71
|
|
60
|
-
def get_model(self, ocr: bool, show_log: bool):
|
61
|
-
key = (ocr, show_log)
|
72
|
+
def get_model(self, ocr: bool, show_log: bool, lang=None, layout_model=None, formula_enable=None, table_enable=None):
|
73
|
+
key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
|
62
74
|
if key not in self._models:
|
63
|
-
self._models[key] = custom_model_init(ocr=ocr, show_log=show_log
|
75
|
+
self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang, layout_model=layout_model,
|
76
|
+
formula_enable=formula_enable, table_enable=table_enable)
|
64
77
|
return self._models[key]
|
65
78
|
|
66
79
|
|
67
|
-
def custom_model_init(ocr: bool = False, show_log: bool = False
|
80
|
+
def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None,
|
81
|
+
layout_model=None, formula_enable=None, table_enable=None):
|
82
|
+
|
68
83
|
model = None
|
69
84
|
|
70
85
|
if model_config.__model_mode__ == "lite":
|
@@ -78,18 +93,36 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
|
|
78
93
|
model_init_start = time.time()
|
79
94
|
if model == MODEL.Paddle:
|
80
95
|
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
|
81
|
-
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
|
96
|
+
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
|
82
97
|
elif model == MODEL.PEK:
|
83
98
|
from magic_pdf.model.pdf_extract_kit import CustomPEKModel
|
84
99
|
# 从配置文件读取model-dir和device
|
85
100
|
local_models_dir = get_local_models_dir()
|
86
101
|
device = get_device()
|
102
|
+
|
103
|
+
layout_config = get_layout_config()
|
104
|
+
if layout_model is not None:
|
105
|
+
layout_config["model"] = layout_model
|
106
|
+
|
107
|
+
formula_config = get_formula_config()
|
108
|
+
if formula_enable is not None:
|
109
|
+
formula_config["enable"] = formula_enable
|
110
|
+
|
87
111
|
table_config = get_table_recog_config()
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
112
|
+
if table_enable is not None:
|
113
|
+
table_config["enable"] = table_enable
|
114
|
+
|
115
|
+
model_input = {
|
116
|
+
"ocr": ocr,
|
117
|
+
"show_log": show_log,
|
118
|
+
"models_dir": local_models_dir,
|
119
|
+
"device": device,
|
120
|
+
"table_config": table_config,
|
121
|
+
"layout_config": layout_config,
|
122
|
+
"formula_config": formula_config,
|
123
|
+
"lang": lang,
|
124
|
+
}
|
125
|
+
|
93
126
|
custom_model = CustomPEKModel(**model_input)
|
94
127
|
else:
|
95
128
|
logger.error("Not allow model_name!")
|
@@ -104,19 +137,23 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
|
|
104
137
|
|
105
138
|
|
106
139
|
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
|
107
|
-
start_page_id=0, end_page_id=None
|
140
|
+
start_page_id=0, end_page_id=None, lang=None,
|
141
|
+
layout_model=None, formula_enable=None, table_enable=None):
|
108
142
|
|
109
|
-
|
110
|
-
|
143
|
+
if lang == "":
|
144
|
+
lang = None
|
111
145
|
|
112
|
-
|
146
|
+
model_manager = ModelSingleton()
|
147
|
+
custom_model = model_manager.get_model(ocr, show_log, lang, layout_model, formula_enable, table_enable)
|
113
148
|
|
114
|
-
|
115
|
-
|
149
|
+
with fitz.open("pdf", pdf_bytes) as doc:
|
150
|
+
pdf_page_num = doc.page_count
|
151
|
+
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
152
|
+
if end_page_id > pdf_page_num - 1:
|
153
|
+
logger.warning("end_page_id is out of range, use images length")
|
154
|
+
end_page_id = pdf_page_num - 1
|
116
155
|
|
117
|
-
|
118
|
-
logger.warning("end_page_id is out of range, use images length")
|
119
|
-
end_page_id = len(images) - 1
|
156
|
+
images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)
|
120
157
|
|
121
158
|
model_json = []
|
122
159
|
doc_analyze_start = time.time()
|
@@ -132,7 +169,15 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
|
|
132
169
|
page_info = {"page_no": index, "height": page_height, "width": page_width}
|
133
170
|
page_dict = {"layout_dets": result, "page_info": page_info}
|
134
171
|
model_json.append(page_dict)
|
135
|
-
|
136
|
-
|
172
|
+
|
173
|
+
gc_start = time.time()
|
174
|
+
clean_memory()
|
175
|
+
gc_time = round(time.time() - gc_start, 2)
|
176
|
+
logger.info(f"gc time: {gc_time}")
|
177
|
+
|
178
|
+
doc_analyze_time = round(time.time() - doc_analyze_start, 2)
|
179
|
+
doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
|
180
|
+
logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
|
181
|
+
f" speed: {doc_analyze_speed} pages/second")
|
137
182
|
|
138
183
|
return model_json
|