magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/dict2md/mkcontent.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
import math
|
2
|
+
|
2
3
|
from loguru import logger
|
3
4
|
|
4
|
-
from magic_pdf.
|
5
|
+
from magic_pdf.config.ocr_content_type import ContentType
|
6
|
+
from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
|
7
|
+
find_top_nearest_text_bbox)
|
5
8
|
from magic_pdf.libs.commons import join_path
|
6
|
-
from magic_pdf.libs.ocr_content_type import ContentType
|
7
9
|
|
8
10
|
TYPE_INLINE_EQUATION = ContentType.InlineEquation
|
9
11
|
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
|
@@ -12,33 +14,30 @@ UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
|
|
12
14
|
|
13
15
|
@DeprecationWarning
|
14
16
|
def mk_nlp_markdown_1(para_dict: dict):
|
15
|
-
"""
|
16
|
-
对排序后的bboxes拼接内容
|
17
|
-
"""
|
17
|
+
"""对排序后的bboxes拼接内容."""
|
18
18
|
content_lst = []
|
19
19
|
for _, page_info in para_dict.items():
|
20
|
-
para_blocks = page_info.get(
|
20
|
+
para_blocks = page_info.get('para_blocks')
|
21
21
|
if not para_blocks:
|
22
22
|
continue
|
23
23
|
|
24
24
|
for block in para_blocks:
|
25
|
-
item = block[
|
25
|
+
item = block['paras']
|
26
26
|
for _, p in item.items():
|
27
|
-
para_text = p[
|
28
|
-
is_title = p[
|
27
|
+
para_text = p['para_text']
|
28
|
+
is_title = p['is_para_title']
|
29
29
|
title_level = p['para_title_level']
|
30
|
-
md_title_prefix =
|
30
|
+
md_title_prefix = '#' * title_level
|
31
31
|
if is_title:
|
32
|
-
content_lst.append(f
|
32
|
+
content_lst.append(f'{md_title_prefix} {para_text}')
|
33
33
|
else:
|
34
34
|
content_lst.append(para_text)
|
35
35
|
|
36
|
-
content_text =
|
36
|
+
content_text = '\n\n'.join(content_lst)
|
37
37
|
|
38
38
|
return content_text
|
39
39
|
|
40
40
|
|
41
|
-
|
42
41
|
# 找到目标字符串在段落中的索引
|
43
42
|
def __find_index(paragraph, target):
|
44
43
|
index = paragraph.find(target)
|
@@ -48,69 +47,76 @@ def __find_index(paragraph, target):
|
|
48
47
|
return None
|
49
48
|
|
50
49
|
|
51
|
-
def __insert_string(paragraph, target,
|
52
|
-
new_paragraph = paragraph[:
|
50
|
+
def __insert_string(paragraph, target, position):
|
51
|
+
new_paragraph = paragraph[:position] + target + paragraph[position:]
|
53
52
|
return new_paragraph
|
54
53
|
|
55
54
|
|
56
55
|
def __insert_after(content, image_content, target):
|
57
|
-
"""
|
58
|
-
在content中找到target,将image_content插入到target后面
|
59
|
-
"""
|
56
|
+
"""在content中找到target,将image_content插入到target后面."""
|
60
57
|
index = content.find(target)
|
61
58
|
if index != -1:
|
62
|
-
content =
|
59
|
+
content = (
|
60
|
+
content[: index + len(target)]
|
61
|
+
+ '\n\n'
|
62
|
+
+ image_content
|
63
|
+
+ '\n\n'
|
64
|
+
+ content[index + len(target) :]
|
65
|
+
)
|
63
66
|
else:
|
64
|
-
logger.error(
|
67
|
+
logger.error(
|
68
|
+
f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
|
69
|
+
)
|
65
70
|
return content
|
66
71
|
|
72
|
+
|
67
73
|
def __insert_before(content, image_content, target):
|
68
|
-
"""
|
69
|
-
在content中找到target,将image_content插入到target前面
|
70
|
-
"""
|
74
|
+
"""在content中找到target,将image_content插入到target前面."""
|
71
75
|
index = content.find(target)
|
72
76
|
if index != -1:
|
73
|
-
content = content[:index] +
|
77
|
+
content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
|
74
78
|
else:
|
75
|
-
logger.error(
|
79
|
+
logger.error(
|
80
|
+
f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
|
81
|
+
)
|
76
82
|
return content
|
77
83
|
|
78
84
|
|
79
85
|
@DeprecationWarning
|
80
86
|
def mk_mm_markdown_1(para_dict: dict):
|
81
|
-
"""拼装多模态markdown"""
|
87
|
+
"""拼装多模态markdown."""
|
82
88
|
content_lst = []
|
83
89
|
for _, page_info in para_dict.items():
|
84
|
-
page_lst = []
|
85
|
-
para_blocks = page_info.get(
|
86
|
-
pymu_raw_blocks = page_info.get(
|
87
|
-
|
90
|
+
page_lst = [] # 一个page内的段落列表
|
91
|
+
para_blocks = page_info.get('para_blocks')
|
92
|
+
pymu_raw_blocks = page_info.get('preproc_blocks')
|
93
|
+
|
88
94
|
all_page_images = []
|
89
|
-
all_page_images.extend(page_info.get(
|
90
|
-
all_page_images.extend(page_info.get(
|
91
|
-
all_page_images.extend(page_info.get(
|
92
|
-
all_page_images.extend(page_info.get(
|
93
|
-
|
94
|
-
if not para_blocks or not pymu_raw_blocks:
|
95
|
+
all_page_images.extend(page_info.get('images', []))
|
96
|
+
all_page_images.extend(page_info.get('image_backup', []))
|
97
|
+
all_page_images.extend(page_info.get('tables', []))
|
98
|
+
all_page_images.extend(page_info.get('table_backup', []))
|
99
|
+
|
100
|
+
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
|
95
101
|
for img in all_page_images:
|
96
|
-
page_lst.append(f"")
|
97
|
-
page_md =
|
98
|
-
|
102
|
+
page_lst.append(f"") # TODO 图片顺序
|
103
|
+
page_md = '\n\n'.join(page_lst)
|
104
|
+
|
99
105
|
else:
|
100
106
|
for block in para_blocks:
|
101
|
-
item = block[
|
107
|
+
item = block['paras']
|
102
108
|
for _, p in item.items():
|
103
|
-
para_text = p[
|
104
|
-
is_title = p[
|
109
|
+
para_text = p['para_text']
|
110
|
+
is_title = p['is_para_title']
|
105
111
|
title_level = p['para_title_level']
|
106
|
-
md_title_prefix =
|
112
|
+
md_title_prefix = '#' * title_level
|
107
113
|
if is_title:
|
108
|
-
page_lst.append(f
|
114
|
+
page_lst.append(f'{md_title_prefix} {para_text}')
|
109
115
|
else:
|
110
116
|
page_lst.append(para_text)
|
111
|
-
|
117
|
+
|
112
118
|
"""拼装成一个页面的文本"""
|
113
|
-
page_md =
|
119
|
+
page_md = '\n\n'.join(page_lst)
|
114
120
|
"""插入图片"""
|
115
121
|
for img in all_page_images:
|
116
122
|
imgbox = img['bbox']
|
@@ -118,192 +124,215 @@ def mk_mm_markdown_1(para_dict: dict):
|
|
118
124
|
# 先看在哪个block内
|
119
125
|
for block in pymu_raw_blocks:
|
120
126
|
bbox = block['bbox']
|
121
|
-
if
|
122
|
-
|
127
|
+
if (
|
128
|
+
bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
|
129
|
+
and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
|
130
|
+
): # 确定在block内
|
131
|
+
for l in block['lines']: # noqa: E741
|
123
132
|
line_box = l['bbox']
|
124
|
-
if
|
125
|
-
|
126
|
-
|
133
|
+
if (
|
134
|
+
line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
|
135
|
+
and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
|
136
|
+
): # 在line内的,插入line前面
|
137
|
+
line_txt = ''.join([s['text'] for s in l['spans']])
|
138
|
+
page_md = __insert_before(
|
139
|
+
page_md, img_content, line_txt
|
140
|
+
)
|
127
141
|
break
|
128
142
|
break
|
129
|
-
else
|
143
|
+
else: # 在行与行之间
|
130
144
|
# 找到图片x0,y0与line的x0,y0最近的line
|
131
145
|
min_distance = 100000
|
132
146
|
min_line = None
|
133
|
-
for l in block['lines']:
|
147
|
+
for l in block['lines']: # noqa: E741
|
134
148
|
line_box = l['bbox']
|
135
|
-
distance = math.sqrt(
|
149
|
+
distance = math.sqrt(
|
150
|
+
(line_box[0] - imgbox[0]) ** 2
|
151
|
+
+ (line_box[1] - imgbox[1]) ** 2
|
152
|
+
)
|
136
153
|
if distance < min_distance:
|
137
154
|
min_distance = distance
|
138
155
|
min_line = l
|
139
156
|
if min_line:
|
140
|
-
line_txt =
|
157
|
+
line_txt = ''.join(
|
158
|
+
[s['text'] for s in min_line['spans']]
|
159
|
+
)
|
141
160
|
img_h = imgbox[3] - imgbox[1]
|
142
|
-
if min_distance<img_h:
|
143
|
-
page_md = __insert_after(
|
161
|
+
if min_distance < img_h: # 文字在图片前面
|
162
|
+
page_md = __insert_after(
|
163
|
+
page_md, img_content, line_txt
|
164
|
+
)
|
144
165
|
else:
|
145
|
-
page_md = __insert_before(
|
166
|
+
page_md = __insert_before(
|
167
|
+
page_md, img_content, line_txt
|
168
|
+
)
|
146
169
|
else:
|
147
|
-
logger.error(
|
148
|
-
|
170
|
+
logger.error(
|
171
|
+
f"Can't find the location of image {img['image_path']} in the markdown file #1"
|
172
|
+
)
|
173
|
+
else: # 应当在两个block之间
|
149
174
|
# 找到上方最近的block,如果上方没有就找大下方最近的block
|
150
175
|
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
|
151
176
|
if top_txt_block:
|
152
|
-
line_txt =
|
177
|
+
line_txt = ''.join(
|
178
|
+
[s['text'] for s in top_txt_block['lines'][-1]['spans']]
|
179
|
+
)
|
153
180
|
page_md = __insert_after(page_md, img_content, line_txt)
|
154
181
|
else:
|
155
|
-
bottom_txt_block = find_bottom_nearest_text_bbox(
|
182
|
+
bottom_txt_block = find_bottom_nearest_text_bbox(
|
183
|
+
pymu_raw_blocks, imgbox
|
184
|
+
)
|
156
185
|
if bottom_txt_block:
|
157
|
-
line_txt =
|
186
|
+
line_txt = ''.join(
|
187
|
+
[
|
188
|
+
s['text']
|
189
|
+
for s in bottom_txt_block['lines'][0]['spans']
|
190
|
+
]
|
191
|
+
)
|
158
192
|
page_md = __insert_before(page_md, img_content, line_txt)
|
159
193
|
else:
|
160
|
-
logger.error(
|
161
|
-
|
194
|
+
logger.error(
|
195
|
+
f"Can't find the location of image {img['image_path']} in the markdown file #2"
|
196
|
+
)
|
197
|
+
|
162
198
|
content_lst.append(page_md)
|
163
|
-
|
199
|
+
|
164
200
|
"""拼装成全部页面的文本"""
|
165
|
-
content_text =
|
201
|
+
content_text = '\n\n'.join(content_lst)
|
166
202
|
|
167
203
|
return content_text
|
168
204
|
|
169
205
|
|
170
206
|
def __insert_after_para(text, type, element, content_list):
|
171
|
-
"""
|
172
|
-
在content_list中找到text,将image_path作为一个新的node插入到text后面
|
173
|
-
"""
|
207
|
+
"""在content_list中找到text,将image_path作为一个新的node插入到text后面."""
|
174
208
|
for i, c in enumerate(content_list):
|
175
|
-
content_type = c.get(
|
176
|
-
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get(
|
177
|
-
if type ==
|
209
|
+
content_type = c.get('type')
|
210
|
+
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
|
211
|
+
if type == 'image':
|
178
212
|
content_node = {
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
213
|
+
'type': 'image',
|
214
|
+
'img_path': element.get('image_path'),
|
215
|
+
'img_alt': '',
|
216
|
+
'img_title': '',
|
217
|
+
'img_caption': '',
|
184
218
|
}
|
185
|
-
elif type ==
|
219
|
+
elif type == 'table':
|
186
220
|
content_node = {
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
221
|
+
'type': 'table',
|
222
|
+
'img_path': element.get('image_path'),
|
223
|
+
'table_latex': element.get('text'),
|
224
|
+
'table_title': '',
|
225
|
+
'table_caption': '',
|
226
|
+
'table_quality': element.get('quality'),
|
193
227
|
}
|
194
|
-
content_list.insert(i+1, content_node)
|
228
|
+
content_list.insert(i + 1, content_node)
|
195
229
|
break
|
196
230
|
else:
|
197
|
-
logger.error(
|
198
|
-
|
231
|
+
logger.error(
|
232
|
+
f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
|
233
|
+
)
|
199
234
|
|
200
235
|
|
201
236
|
def __insert_before_para(text, type, element, content_list):
|
202
|
-
"""
|
203
|
-
在content_list中找到text,将image_path作为一个新的node插入到text前面
|
204
|
-
"""
|
237
|
+
"""在content_list中找到text,将image_path作为一个新的node插入到text前面."""
|
205
238
|
for i, c in enumerate(content_list):
|
206
|
-
content_type = c.get(
|
207
|
-
if content_type in
|
208
|
-
if type ==
|
239
|
+
content_type = c.get('type')
|
240
|
+
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
|
241
|
+
if type == 'image':
|
209
242
|
content_node = {
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
243
|
+
'type': 'image',
|
244
|
+
'img_path': element.get('image_path'),
|
245
|
+
'img_alt': '',
|
246
|
+
'img_title': '',
|
247
|
+
'img_caption': '',
|
215
248
|
}
|
216
|
-
elif type ==
|
249
|
+
elif type == 'table':
|
217
250
|
content_node = {
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
251
|
+
'type': 'table',
|
252
|
+
'img_path': element.get('image_path'),
|
253
|
+
'table_latex': element.get('text'),
|
254
|
+
'table_title': '',
|
255
|
+
'table_caption': '',
|
256
|
+
'table_quality': element.get('quality'),
|
224
257
|
}
|
225
258
|
content_list.insert(i, content_node)
|
226
259
|
break
|
227
260
|
else:
|
228
|
-
logger.error(
|
229
|
-
|
261
|
+
logger.error(
|
262
|
+
f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
|
263
|
+
)
|
264
|
+
|
230
265
|
|
231
266
|
def mk_universal_format(pdf_info_list: list, img_buket_path):
|
232
|
-
"""
|
233
|
-
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
|
234
|
-
"""
|
267
|
+
"""构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
|
235
268
|
content_lst = []
|
236
269
|
for page_info in pdf_info_list:
|
237
|
-
page_lst = []
|
238
|
-
para_blocks = page_info.get(
|
239
|
-
pymu_raw_blocks = page_info.get(
|
240
|
-
|
270
|
+
page_lst = [] # 一个page内的段落列表
|
271
|
+
para_blocks = page_info.get('para_blocks')
|
272
|
+
pymu_raw_blocks = page_info.get('preproc_blocks')
|
273
|
+
|
241
274
|
all_page_images = []
|
242
|
-
all_page_images.extend(page_info.get(
|
243
|
-
all_page_images.extend(page_info.get(
|
275
|
+
all_page_images.extend(page_info.get('images', []))
|
276
|
+
all_page_images.extend(page_info.get('image_backup', []))
|
244
277
|
# all_page_images.extend(page_info.get("tables",[]))
|
245
278
|
# all_page_images.extend(page_info.get("table_backup",[]) )
|
246
279
|
all_page_tables = []
|
247
|
-
all_page_tables.extend(page_info.get(
|
280
|
+
all_page_tables.extend(page_info.get('tables', []))
|
248
281
|
|
249
|
-
if not para_blocks or not pymu_raw_blocks:
|
282
|
+
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
|
250
283
|
for img in all_page_images:
|
251
284
|
content_node = {
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
285
|
+
'type': 'image',
|
286
|
+
'img_path': join_path(img_buket_path, img['image_path']),
|
287
|
+
'img_alt': '',
|
288
|
+
'img_title': '',
|
289
|
+
'img_caption': '',
|
257
290
|
}
|
258
|
-
page_lst.append(content_node)
|
291
|
+
page_lst.append(content_node) # TODO 图片顺序
|
259
292
|
for table in all_page_tables:
|
260
293
|
content_node = {
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
294
|
+
'type': 'table',
|
295
|
+
'img_path': join_path(img_buket_path, table['image_path']),
|
296
|
+
'table_latex': table.get('text'),
|
297
|
+
'table_title': '',
|
298
|
+
'table_caption': '',
|
299
|
+
'table_quality': table.get('quality'),
|
267
300
|
}
|
268
|
-
page_lst.append(content_node)
|
301
|
+
page_lst.append(content_node) # TODO 图片顺序
|
269
302
|
else:
|
270
303
|
for block in para_blocks:
|
271
|
-
item = block[
|
304
|
+
item = block['paras']
|
272
305
|
for _, p in item.items():
|
273
|
-
font_type = p[
|
306
|
+
font_type = p[
|
307
|
+
'para_font_type'
|
308
|
+
] # 对于文本来说,要么是普通文本,要么是个行间公式
|
274
309
|
if font_type == TYPE_INTERLINE_EQUATION:
|
275
|
-
content_node = {
|
276
|
-
"type": "equation",
|
277
|
-
"latex": p["para_text"]
|
278
|
-
}
|
310
|
+
content_node = {'type': 'equation', 'latex': p['para_text']}
|
279
311
|
page_lst.append(content_node)
|
280
312
|
else:
|
281
|
-
para_text = p[
|
282
|
-
is_title = p[
|
313
|
+
para_text = p['para_text']
|
314
|
+
is_title = p['is_para_title']
|
283
315
|
title_level = p['para_title_level']
|
284
|
-
|
316
|
+
|
285
317
|
if is_title:
|
286
318
|
content_node = {
|
287
|
-
|
288
|
-
|
319
|
+
'type': f'h{title_level}',
|
320
|
+
'text': para_text,
|
289
321
|
}
|
290
322
|
page_lst.append(content_node)
|
291
323
|
else:
|
292
|
-
content_node = {
|
293
|
-
"type": "text",
|
294
|
-
"text": para_text
|
295
|
-
}
|
324
|
+
content_node = {'type': 'text', 'text': para_text}
|
296
325
|
page_lst.append(content_node)
|
297
|
-
|
326
|
+
|
298
327
|
content_lst.extend(page_lst)
|
299
|
-
|
328
|
+
|
300
329
|
"""插入图片"""
|
301
330
|
for img in all_page_images:
|
302
|
-
insert_img_or_table(
|
331
|
+
insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
|
303
332
|
|
304
333
|
"""插入表格"""
|
305
334
|
for table in all_page_tables:
|
306
|
-
insert_img_or_table(
|
335
|
+
insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
|
307
336
|
# end for
|
308
337
|
return content_lst
|
309
338
|
|
@@ -313,13 +342,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
|
|
313
342
|
# 先看在哪个block内
|
314
343
|
for block in pymu_raw_blocks:
|
315
344
|
bbox = block['bbox']
|
316
|
-
if
|
317
|
-
|
318
|
-
|
345
|
+
if (
|
346
|
+
bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
|
347
|
+
and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
|
348
|
+
): # 确定在这个大的block内,然后进入逐行比较距离
|
349
|
+
for l in block['lines']: # noqa: E741
|
319
350
|
line_box = l['bbox']
|
320
|
-
if
|
321
|
-
|
322
|
-
|
351
|
+
if (
|
352
|
+
line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
|
353
|
+
and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
|
354
|
+
): # 在line内的,插入line前面
|
355
|
+
line_txt = ''.join([s['text'] for s in l['spans']])
|
323
356
|
__insert_before_para(line_txt, type, element, content_lst)
|
324
357
|
break
|
325
358
|
break
|
@@ -327,14 +360,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
|
|
327
360
|
# 找到图片x0,y0与line的x0,y0最近的line
|
328
361
|
min_distance = 100000
|
329
362
|
min_line = None
|
330
|
-
for l in block['lines']:
|
363
|
+
for l in block['lines']: # noqa: E741
|
331
364
|
line_box = l['bbox']
|
332
|
-
distance = math.sqrt(
|
365
|
+
distance = math.sqrt(
|
366
|
+
(line_box[0] - element_bbox[0]) ** 2
|
367
|
+
+ (line_box[1] - element_bbox[1]) ** 2
|
368
|
+
)
|
333
369
|
if distance < min_distance:
|
334
370
|
min_distance = distance
|
335
371
|
min_line = l
|
336
372
|
if min_line:
|
337
|
-
line_txt =
|
373
|
+
line_txt = ''.join([s['text'] for s in min_line['spans']])
|
338
374
|
img_h = element_bbox[3] - element_bbox[1]
|
339
375
|
if min_distance < img_h: # 文字在图片前面
|
340
376
|
__insert_after_para(line_txt, type, element, content_lst)
|
@@ -342,56 +378,61 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
|
|
342
378
|
__insert_before_para(line_txt, type, element, content_lst)
|
343
379
|
break
|
344
380
|
else:
|
345
|
-
logger.error(
|
381
|
+
logger.error(
|
382
|
+
f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
|
383
|
+
)
|
346
384
|
else: # 应当在两个block之间
|
347
385
|
# 找到上方最近的block,如果上方没有就找大下方最近的block
|
348
386
|
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
|
349
387
|
if top_txt_block:
|
350
|
-
line_txt =
|
388
|
+
line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
|
351
389
|
__insert_after_para(line_txt, type, element, content_lst)
|
352
390
|
else:
|
353
|
-
bottom_txt_block = find_bottom_nearest_text_bbox(
|
391
|
+
bottom_txt_block = find_bottom_nearest_text_bbox(
|
392
|
+
pymu_raw_blocks, element_bbox
|
393
|
+
)
|
354
394
|
if bottom_txt_block:
|
355
|
-
line_txt =
|
395
|
+
line_txt = ''.join(
|
396
|
+
[s['text'] for s in bottom_txt_block['lines'][0]['spans']]
|
397
|
+
)
|
356
398
|
__insert_before_para(line_txt, type, element, content_lst)
|
357
399
|
else: # TODO ,图片可能独占一列,这种情况上下是没有图片的
|
358
|
-
logger.error(
|
400
|
+
logger.error(
|
401
|
+
f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
|
402
|
+
)
|
359
403
|
|
360
404
|
|
361
405
|
def mk_mm_markdown(content_list):
|
362
|
-
"""
|
363
|
-
基于同一格式的内容列表,构造markdown,含图片
|
364
|
-
"""
|
406
|
+
"""基于同一格式的内容列表,构造markdown,含图片."""
|
365
407
|
content_md = []
|
366
408
|
for c in content_list:
|
367
|
-
content_type = c.get(
|
368
|
-
if content_type ==
|
369
|
-
content_md.append(c.get(
|
370
|
-
elif content_type ==
|
371
|
-
content = c.get(
|
372
|
-
if content.startswith(
|
409
|
+
content_type = c.get('type')
|
410
|
+
if content_type == 'text':
|
411
|
+
content_md.append(c.get('text'))
|
412
|
+
elif content_type == 'equation':
|
413
|
+
content = c.get('latex')
|
414
|
+
if content.startswith('$$') and content.endswith('$$'):
|
373
415
|
content_md.append(content)
|
374
416
|
else:
|
375
417
|
content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
|
376
418
|
elif content_type in UNI_FORMAT_TEXT_TYPE:
|
377
419
|
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
|
378
|
-
elif content_type ==
|
420
|
+
elif content_type == 'image':
|
379
421
|
content_md.append(f"})")
|
380
|
-
return
|
422
|
+
return '\n\n'.join(content_md)
|
423
|
+
|
381
424
|
|
382
425
|
def mk_nlp_markdown(content_list):
|
383
|
-
"""
|
384
|
-
基于同一格式的内容列表,构造markdown,不含图片
|
385
|
-
"""
|
426
|
+
"""基于同一格式的内容列表,构造markdown,不含图片."""
|
386
427
|
content_md = []
|
387
428
|
for c in content_list:
|
388
|
-
content_type = c.get(
|
389
|
-
if content_type ==
|
390
|
-
content_md.append(c.get(
|
391
|
-
elif content_type ==
|
429
|
+
content_type = c.get('type')
|
430
|
+
if content_type == 'text':
|
431
|
+
content_md.append(c.get('text'))
|
432
|
+
elif content_type == 'equation':
|
392
433
|
content_md.append(f"$$\n{c.get('latex')}\n$$")
|
393
|
-
elif content_type ==
|
434
|
+
elif content_type == 'table':
|
394
435
|
content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
|
395
436
|
elif content_type in UNI_FORMAT_TEXT_TYPE:
|
396
437
|
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
|
397
|
-
return
|
438
|
+
return '\n\n'.join(content_md)
|