magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +143 -0
- magic_pdf/data/data_reader_writer/s3.py +73 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +6 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +19 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +106 -244
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +35 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +331 -15
- magic_pdf/model/pdf_extract_kit.py +170 -83
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +40 -16
- magic_pdf/model/ppTableModel.py +8 -6
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +322 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +18 -8
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/LICENSE.md +1 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/METADATA +124 -78
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/RECORD +57 -33
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
|
|
1
1
|
import re
|
2
2
|
|
3
|
-
import wordninja
|
4
3
|
from loguru import logger
|
5
4
|
|
6
5
|
from magic_pdf.libs.commons import join_path
|
@@ -8,6 +7,7 @@ from magic_pdf.libs.language import detect_lang
|
|
8
7
|
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
9
8
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
10
9
|
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
|
10
|
+
from magic_pdf.para.para_split_v3 import ListLineTag
|
11
11
|
|
12
12
|
|
13
13
|
def __is_hyphen_at_line_end(line):
|
@@ -24,37 +24,6 @@ def __is_hyphen_at_line_end(line):
|
|
24
24
|
return bool(re.search(r'[A-Za-z]+-\s*$', line))
|
25
25
|
|
26
26
|
|
27
|
-
def split_long_words(text):
|
28
|
-
segments = text.split(' ')
|
29
|
-
for i in range(len(segments)):
|
30
|
-
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
|
31
|
-
for j in range(len(words)):
|
32
|
-
if len(words[j]) > 10:
|
33
|
-
words[j] = ' '.join(wordninja.split(words[j]))
|
34
|
-
segments[i] = ''.join(words)
|
35
|
-
return ' '.join(segments)
|
36
|
-
|
37
|
-
|
38
|
-
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
|
39
|
-
markdown = []
|
40
|
-
for page_info in pdf_info_list:
|
41
|
-
paras_of_layout = page_info.get('para_blocks')
|
42
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
43
|
-
paras_of_layout, 'mm', img_buket_path)
|
44
|
-
markdown.extend(page_markdown)
|
45
|
-
return '\n\n'.join(markdown)
|
46
|
-
|
47
|
-
|
48
|
-
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
|
49
|
-
markdown = []
|
50
|
-
for page_info in pdf_info_dict:
|
51
|
-
paras_of_layout = page_info.get('para_blocks')
|
52
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
53
|
-
paras_of_layout, 'nlp')
|
54
|
-
markdown.extend(page_markdown)
|
55
|
-
return '\n\n'.join(markdown)
|
56
|
-
|
57
|
-
|
58
27
|
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
59
28
|
img_buket_path):
|
60
29
|
markdown_with_para_and_pagination = []
|
@@ -67,61 +36,23 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
|
67
36
|
paras_of_layout, 'mm', img_buket_path)
|
68
37
|
markdown_with_para_and_pagination.append({
|
69
38
|
'page_no':
|
70
|
-
|
39
|
+
page_no,
|
71
40
|
'md_content':
|
72
|
-
|
41
|
+
'\n\n'.join(page_markdown)
|
73
42
|
})
|
74
43
|
page_no += 1
|
75
44
|
return markdown_with_para_and_pagination
|
76
45
|
|
77
46
|
|
78
|
-
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
|
79
|
-
page_markdown = []
|
80
|
-
for paras in paras_of_layout:
|
81
|
-
for para in paras:
|
82
|
-
para_text = ''
|
83
|
-
for line in para:
|
84
|
-
for span in line['spans']:
|
85
|
-
span_type = span.get('type')
|
86
|
-
content = ''
|
87
|
-
language = ''
|
88
|
-
if span_type == ContentType.Text:
|
89
|
-
content = span['content']
|
90
|
-
language = detect_lang(content)
|
91
|
-
if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
|
92
|
-
content = ocr_escape_special_markdown_char(
|
93
|
-
split_long_words(content))
|
94
|
-
else:
|
95
|
-
content = ocr_escape_special_markdown_char(content)
|
96
|
-
elif span_type == ContentType.InlineEquation:
|
97
|
-
content = f"${span['content']}$"
|
98
|
-
elif span_type == ContentType.InterlineEquation:
|
99
|
-
content = f"\n$$\n{span['content']}\n$$\n"
|
100
|
-
elif span_type in [ContentType.Image, ContentType.Table]:
|
101
|
-
if mode == 'mm':
|
102
|
-
content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
|
103
|
-
elif mode == 'nlp':
|
104
|
-
pass
|
105
|
-
if content != '':
|
106
|
-
if language == 'en': # 英文语境下 content间需要空格分隔
|
107
|
-
para_text += content + ' '
|
108
|
-
else: # 中文语境下,content间不需要空格分隔
|
109
|
-
para_text += content
|
110
|
-
if para_text.strip() == '':
|
111
|
-
continue
|
112
|
-
else:
|
113
|
-
page_markdown.append(para_text.strip() + ' ')
|
114
|
-
return page_markdown
|
115
|
-
|
116
|
-
|
117
47
|
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
118
48
|
mode,
|
119
|
-
img_buket_path=''
|
49
|
+
img_buket_path='',
|
50
|
+
):
|
120
51
|
page_markdown = []
|
121
52
|
for para_block in paras_of_layout:
|
122
53
|
para_text = ''
|
123
54
|
para_type = para_block['type']
|
124
|
-
if para_type
|
55
|
+
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
|
125
56
|
para_text = merge_para_with_text(para_block)
|
126
57
|
elif para_type == BlockType.Title:
|
127
58
|
para_text = f'# {merge_para_with_text(para_block)}'
|
@@ -136,20 +67,21 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
136
67
|
for line in block['lines']:
|
137
68
|
for span in line['spans']:
|
138
69
|
if span['type'] == ContentType.Image:
|
139
|
-
|
70
|
+
if span.get('image_path', ''):
|
71
|
+
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
|
140
72
|
for block in para_block['blocks']: # 2nd.拼image_caption
|
141
73
|
if block['type'] == BlockType.ImageCaption:
|
142
|
-
para_text += merge_para_with_text(block)
|
143
|
-
for block in para_block['blocks']: #
|
74
|
+
para_text += merge_para_with_text(block) + ' \n'
|
75
|
+
for block in para_block['blocks']: # 3rd.拼image_footnote
|
144
76
|
if block['type'] == BlockType.ImageFootnote:
|
145
|
-
para_text += merge_para_with_text(block)
|
77
|
+
para_text += merge_para_with_text(block) + ' \n'
|
146
78
|
elif para_type == BlockType.Table:
|
147
79
|
if mode == 'nlp':
|
148
80
|
continue
|
149
81
|
elif mode == 'mm':
|
150
82
|
for block in para_block['blocks']: # 1st.拼table_caption
|
151
83
|
if block['type'] == BlockType.TableCaption:
|
152
|
-
para_text += merge_para_with_text(block)
|
84
|
+
para_text += merge_para_with_text(block) + ' \n'
|
153
85
|
for block in para_block['blocks']: # 2nd.拼table_body
|
154
86
|
if block['type'] == BlockType.TableBody:
|
155
87
|
for line in block['lines']:
|
@@ -160,11 +92,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
160
92
|
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
|
161
93
|
elif span.get('html', ''):
|
162
94
|
para_text += f"\n\n{span['html']}\n\n"
|
163
|
-
|
95
|
+
elif span.get('image_path', ''):
|
164
96
|
para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
|
165
97
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
166
98
|
if block['type'] == BlockType.TableFootnote:
|
167
|
-
para_text += merge_para_with_text(block)
|
99
|
+
para_text += merge_para_with_text(block) + ' \n'
|
168
100
|
|
169
101
|
if para_text.strip() == '':
|
170
102
|
continue
|
@@ -174,22 +106,36 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
174
106
|
return page_markdown
|
175
107
|
|
176
108
|
|
177
|
-
def
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
en_length
|
183
|
-
|
184
|
-
if en_length / len(text) >= 0.5:
|
185
|
-
return 'en'
|
186
|
-
else:
|
187
|
-
return 'unknown'
|
109
|
+
def detect_language(text):
|
110
|
+
en_pattern = r'[a-zA-Z]+'
|
111
|
+
en_matches = re.findall(en_pattern, text)
|
112
|
+
en_length = sum(len(match) for match in en_matches)
|
113
|
+
if len(text) > 0:
|
114
|
+
if en_length / len(text) >= 0.5:
|
115
|
+
return 'en'
|
188
116
|
else:
|
189
|
-
return '
|
117
|
+
return 'unknown'
|
118
|
+
else:
|
119
|
+
return 'empty'
|
120
|
+
|
121
|
+
|
122
|
+
# 连写字符拆分
|
123
|
+
def __replace_ligatures(text: str):
|
124
|
+
text = re.sub(r'ﬁ', 'fi', text) # 替换 fi 连写符
|
125
|
+
text = re.sub(r'ﬂ', 'fl', text) # 替换 fl 连写符
|
126
|
+
text = re.sub(r'ﬀ', 'ff', text) # 替换 ff 连写符
|
127
|
+
text = re.sub(r'ﬃ', 'ffi', text) # 替换 ffi 连写符
|
128
|
+
text = re.sub(r'ﬄ', 'ffl', text) # 替换 ffl 连写符
|
129
|
+
return text
|
190
130
|
|
131
|
+
|
132
|
+
def merge_para_with_text(para_block):
|
191
133
|
para_text = ''
|
192
|
-
for line in para_block['lines']:
|
134
|
+
for i, line in enumerate(para_block['lines']):
|
135
|
+
|
136
|
+
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
|
137
|
+
para_text += ' \n'
|
138
|
+
|
193
139
|
line_text = ''
|
194
140
|
line_lang = ''
|
195
141
|
for span in line['spans']:
|
@@ -199,208 +145,120 @@ def merge_para_with_text(para_block):
|
|
199
145
|
if line_text != '':
|
200
146
|
line_lang = detect_lang(line_text)
|
201
147
|
for span in line['spans']:
|
148
|
+
|
202
149
|
span_type = span['type']
|
203
150
|
content = ''
|
204
151
|
if span_type == ContentType.Text:
|
205
|
-
content = span['content']
|
206
|
-
# language = detect_lang(content)
|
207
|
-
language = detect_language(content)
|
208
|
-
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
209
|
-
content = ocr_escape_special_markdown_char(
|
210
|
-
split_long_words(content))
|
211
|
-
else:
|
212
|
-
content = ocr_escape_special_markdown_char(content)
|
152
|
+
content = ocr_escape_special_markdown_char(span['content'])
|
213
153
|
elif span_type == ContentType.InlineEquation:
|
214
|
-
content = f"
|
154
|
+
content = f"${span['content']}$"
|
215
155
|
elif span_type == ContentType.InterlineEquation:
|
216
156
|
content = f"\n$$\n{span['content']}\n$$\n"
|
217
157
|
|
158
|
+
content = content.strip()
|
218
159
|
if content != '':
|
219
160
|
langs = ['zh', 'ja', 'ko']
|
220
161
|
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
para_text += content[:-1]
|
226
|
-
else:
|
227
|
-
para_text += content + ' '
|
162
|
+
if span_type in [ContentType.Text, ContentType.InterlineEquation]:
|
163
|
+
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
|
164
|
+
elif span_type == ContentType.InlineEquation:
|
165
|
+
para_text += f" {content} "
|
228
166
|
else:
|
229
|
-
|
230
|
-
|
231
|
-
|
167
|
+
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
168
|
+
# 如果是前一行带有-连字符,那么末尾不应该加空格
|
169
|
+
if __is_hyphen_at_line_end(content):
|
170
|
+
para_text += content[:-1]
|
171
|
+
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
|
172
|
+
para_text += content
|
173
|
+
else: # 西方文本语境下 content间需要空格分隔
|
174
|
+
para_text += f"{content} "
|
175
|
+
elif span_type == ContentType.InterlineEquation:
|
176
|
+
para_text += content
|
177
|
+
else:
|
178
|
+
continue
|
179
|
+
# 连写字符拆分
|
180
|
+
para_text = __replace_ligatures(para_text)
|
232
181
|
|
233
|
-
|
234
|
-
para_content = {}
|
235
|
-
if len(para) == 1:
|
236
|
-
para_content = line_to_standard_format(para[0], img_buket_path)
|
237
|
-
elif len(para) > 1:
|
238
|
-
para_text = ''
|
239
|
-
inline_equation_num = 0
|
240
|
-
for line in para:
|
241
|
-
for span in line['spans']:
|
242
|
-
language = ''
|
243
|
-
span_type = span.get('type')
|
244
|
-
content = ''
|
245
|
-
if span_type == ContentType.Text:
|
246
|
-
content = span['content']
|
247
|
-
language = detect_lang(content)
|
248
|
-
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
249
|
-
content = ocr_escape_special_markdown_char(
|
250
|
-
split_long_words(content))
|
251
|
-
else:
|
252
|
-
content = ocr_escape_special_markdown_char(content)
|
253
|
-
elif span_type == ContentType.InlineEquation:
|
254
|
-
content = f"${span['content']}$"
|
255
|
-
inline_equation_num += 1
|
256
|
-
if language == 'en': # 英文语境下 content间需要空格分隔
|
257
|
-
para_text += content + ' '
|
258
|
-
else: # 中文语境下,content间不需要空格分隔
|
259
|
-
para_text += content
|
260
|
-
para_content = {
|
261
|
-
'type': 'text',
|
262
|
-
'text': para_text,
|
263
|
-
'inline_equation_num': inline_equation_num,
|
264
|
-
}
|
265
|
-
return para_content
|
182
|
+
return para_text
|
266
183
|
|
267
184
|
|
268
|
-
def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
185
|
+
def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
|
269
186
|
para_type = para_block['type']
|
270
|
-
|
187
|
+
para_content = {}
|
188
|
+
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
|
271
189
|
para_content = {
|
272
190
|
'type': 'text',
|
273
191
|
'text': merge_para_with_text(para_block),
|
274
|
-
'page_idx': page_idx,
|
275
192
|
}
|
276
193
|
elif para_type == BlockType.Title:
|
277
194
|
para_content = {
|
278
195
|
'type': 'text',
|
279
196
|
'text': merge_para_with_text(para_block),
|
280
197
|
'text_level': 1,
|
281
|
-
'page_idx': page_idx,
|
282
198
|
}
|
283
199
|
elif para_type == BlockType.InterlineEquation:
|
284
200
|
para_content = {
|
285
201
|
'type': 'equation',
|
286
202
|
'text': merge_para_with_text(para_block),
|
287
203
|
'text_format': 'latex',
|
288
|
-
'page_idx': page_idx,
|
289
204
|
}
|
290
205
|
elif para_type == BlockType.Image:
|
291
|
-
para_content = {'type': 'image', '
|
206
|
+
para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
|
292
207
|
for block in para_block['blocks']:
|
293
208
|
if block['type'] == BlockType.ImageBody:
|
294
|
-
|
295
|
-
|
296
|
-
|
209
|
+
for line in block['lines']:
|
210
|
+
for span in line['spans']:
|
211
|
+
if span['type'] == ContentType.Image:
|
212
|
+
if span.get('image_path', ''):
|
213
|
+
para_content['img_path'] = join_path(img_buket_path, span['image_path'])
|
297
214
|
if block['type'] == BlockType.ImageCaption:
|
298
|
-
para_content['img_caption']
|
215
|
+
para_content['img_caption'].append(merge_para_with_text(block))
|
299
216
|
if block['type'] == BlockType.ImageFootnote:
|
300
|
-
para_content['img_footnote']
|
217
|
+
para_content['img_footnote'].append(merge_para_with_text(block))
|
301
218
|
elif para_type == BlockType.Table:
|
302
|
-
para_content = {'type': 'table', '
|
219
|
+
para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
|
303
220
|
for block in para_block['blocks']:
|
304
221
|
if block['type'] == BlockType.TableBody:
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
222
|
+
for line in block['lines']:
|
223
|
+
for span in line['spans']:
|
224
|
+
if span['type'] == ContentType.Table:
|
225
|
+
|
226
|
+
if span.get('latex', ''):
|
227
|
+
para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
|
228
|
+
elif span.get('html', ''):
|
229
|
+
para_content['table_body'] = f"\n\n{span['html']}\n\n"
|
230
|
+
|
231
|
+
if span.get('image_path', ''):
|
232
|
+
para_content['img_path'] = join_path(img_buket_path, span['image_path'])
|
233
|
+
|
310
234
|
if block['type'] == BlockType.TableCaption:
|
311
|
-
para_content['table_caption']
|
235
|
+
para_content['table_caption'].append(merge_para_with_text(block))
|
312
236
|
if block['type'] == BlockType.TableFootnote:
|
313
|
-
para_content['table_footnote']
|
237
|
+
para_content['table_footnote'].append(merge_para_with_text(block))
|
314
238
|
|
315
|
-
|
239
|
+
para_content['page_idx'] = page_idx
|
316
240
|
|
241
|
+
if drop_reason is not None:
|
242
|
+
para_content['drop_reason'] = drop_reason
|
317
243
|
|
318
|
-
|
319
|
-
content_list = []
|
320
|
-
for page_info in pdf_info_dict:
|
321
|
-
paras_of_layout = page_info.get('para_blocks')
|
322
|
-
if not paras_of_layout:
|
323
|
-
continue
|
324
|
-
for para_block in paras_of_layout:
|
325
|
-
para_content = para_to_standard_format_v2(para_block,
|
326
|
-
img_buket_path)
|
327
|
-
content_list.append(para_content)
|
328
|
-
return content_list
|
329
|
-
|
330
|
-
|
331
|
-
def line_to_standard_format(line, img_buket_path):
|
332
|
-
line_text = ''
|
333
|
-
inline_equation_num = 0
|
334
|
-
for span in line['spans']:
|
335
|
-
if not span.get('content'):
|
336
|
-
if not span.get('image_path'):
|
337
|
-
continue
|
338
|
-
else:
|
339
|
-
if span['type'] == ContentType.Image:
|
340
|
-
content = {
|
341
|
-
'type': 'image',
|
342
|
-
'img_path': join_path(img_buket_path,
|
343
|
-
span['image_path']),
|
344
|
-
}
|
345
|
-
return content
|
346
|
-
elif span['type'] == ContentType.Table:
|
347
|
-
content = {
|
348
|
-
'type': 'table',
|
349
|
-
'img_path': join_path(img_buket_path,
|
350
|
-
span['image_path']),
|
351
|
-
}
|
352
|
-
return content
|
353
|
-
else:
|
354
|
-
if span['type'] == ContentType.InterlineEquation:
|
355
|
-
interline_equation = span['content']
|
356
|
-
content = {
|
357
|
-
'type': 'equation',
|
358
|
-
'latex': f'$$\n{interline_equation}\n$$'
|
359
|
-
}
|
360
|
-
return content
|
361
|
-
elif span['type'] == ContentType.InlineEquation:
|
362
|
-
inline_equation = span['content']
|
363
|
-
line_text += f'${inline_equation}$'
|
364
|
-
inline_equation_num += 1
|
365
|
-
elif span['type'] == ContentType.Text:
|
366
|
-
text_content = ocr_escape_special_markdown_char(
|
367
|
-
span['content']) # 转义特殊符号
|
368
|
-
line_text += text_content
|
369
|
-
content = {
|
370
|
-
'type': 'text',
|
371
|
-
'text': line_text,
|
372
|
-
'inline_equation_num': inline_equation_num,
|
373
|
-
}
|
374
|
-
return content
|
375
|
-
|
376
|
-
|
377
|
-
def ocr_mk_mm_standard_format(pdf_info_dict: list):
|
378
|
-
"""content_list type string
|
379
|
-
image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
|
380
|
-
latex文本字段。 text string 纯文本格式的文本数据。 md string
|
381
|
-
markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
|
382
|
-
content_list = []
|
383
|
-
for page_info in pdf_info_dict:
|
384
|
-
blocks = page_info.get('preproc_blocks')
|
385
|
-
if not blocks:
|
386
|
-
continue
|
387
|
-
for block in blocks:
|
388
|
-
for line in block['lines']:
|
389
|
-
content = line_to_standard_format(line)
|
390
|
-
content_list.append(content)
|
391
|
-
return content_list
|
244
|
+
return para_content
|
392
245
|
|
393
246
|
|
394
247
|
def union_make(pdf_info_dict: list,
|
395
248
|
make_mode: str,
|
396
249
|
drop_mode: str,
|
397
|
-
img_buket_path: str = ''
|
250
|
+
img_buket_path: str = '',
|
251
|
+
):
|
398
252
|
output_content = []
|
399
253
|
for page_info in pdf_info_dict:
|
254
|
+
drop_reason_flag = False
|
255
|
+
drop_reason = None
|
400
256
|
if page_info.get('need_drop', False):
|
401
257
|
drop_reason = page_info.get('drop_reason')
|
402
258
|
if drop_mode == DropMode.NONE:
|
403
259
|
pass
|
260
|
+
elif drop_mode == DropMode.NONE_WITH_REASON:
|
261
|
+
drop_reason_flag = True
|
404
262
|
elif drop_mode == DropMode.WHOLE_PDF:
|
405
263
|
raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
|
406
264
|
f'drop_reason is {drop_reason}'))
|
@@ -425,8 +283,12 @@ def union_make(pdf_info_dict: list,
|
|
425
283
|
output_content.extend(page_markdown)
|
426
284
|
elif make_mode == MakeMode.STANDARD_FORMAT:
|
427
285
|
for para_block in paras_of_layout:
|
428
|
-
|
429
|
-
|
286
|
+
if drop_reason_flag:
|
287
|
+
para_content = para_to_standard_format_v2(
|
288
|
+
para_block, img_buket_path, page_idx)
|
289
|
+
else:
|
290
|
+
para_content = para_to_standard_format_v2(
|
291
|
+
para_block, img_buket_path, page_idx)
|
430
292
|
output_content.append(para_content)
|
431
293
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
432
294
|
return '\n\n'.join(output_content)
|
magic_pdf/libs/Constants.py
CHANGED
@@ -10,18 +10,12 @@ block维度自定义字段
|
|
10
10
|
# block中lines是否被删除
|
11
11
|
LINES_DELETED = "lines_deleted"
|
12
12
|
|
13
|
-
# struct eqtable
|
14
|
-
STRUCT_EQTABLE = "struct_eqtable"
|
15
|
-
|
16
13
|
# table recognition max time default value
|
17
14
|
TABLE_MAX_TIME_VALUE = 400
|
18
15
|
|
19
16
|
# pp_table_result_max_length
|
20
17
|
TABLE_MAX_LEN = 480
|
21
18
|
|
22
|
-
# pp table structure algorithm
|
23
|
-
TABLE_MASTER = "TableMaster"
|
24
|
-
|
25
19
|
# table master structure dict
|
26
20
|
TABLE_MASTER_DICT = "table_master_structure_dict.txt"
|
27
21
|
|
@@ -29,12 +23,31 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
|
|
29
23
|
TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
|
30
24
|
|
31
25
|
# pp detect model dir
|
32
|
-
DETECT_MODEL_DIR = "ch_PP-
|
26
|
+
DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
|
33
27
|
|
34
28
|
# pp rec model dir
|
35
|
-
REC_MODEL_DIR = "ch_PP-
|
29
|
+
REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
|
36
30
|
|
37
31
|
# pp rec char dict path
|
38
32
|
REC_CHAR_DICT = "ppocr_keys_v1.txt"
|
39
33
|
|
34
|
+
# pp rec copy rec directory
|
35
|
+
PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
|
36
|
+
|
37
|
+
# pp rec copy det directory
|
38
|
+
PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
|
39
|
+
|
40
|
+
|
41
|
+
class MODEL_NAME:
|
42
|
+
# pp table structure algorithm
|
43
|
+
TABLE_MASTER = "tablemaster"
|
44
|
+
# struct eqtable
|
45
|
+
STRUCT_EQTABLE = "struct_eqtable"
|
46
|
+
|
47
|
+
DocLayout_YOLO = "doclayout_yolo"
|
48
|
+
|
49
|
+
LAYOUTLMv3 = "layoutlmv3"
|
50
|
+
|
51
|
+
YOLO_V8_MFD = "yolo_v8_mfd"
|
40
52
|
|
53
|
+
UniMerNet_v2_Small = "unimernet_small"
|
magic_pdf/libs/boxbase.py
CHANGED
@@ -445,3 +445,38 @@ def get_overlap_area(bbox1, bbox2):
|
|
445
445
|
|
446
446
|
# The area of overlap area
|
447
447
|
return (x_right - x_left) * (y_bottom - y_top)
|
448
|
+
|
449
|
+
|
450
|
+
def calculate_vertical_projection_overlap_ratio(block1, block2):
|
451
|
+
"""
|
452
|
+
Calculate the proportion of the x-axis covered by the vertical projection of two blocks.
|
453
|
+
|
454
|
+
Args:
|
455
|
+
block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
|
456
|
+
block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).
|
457
|
+
|
458
|
+
Returns:
|
459
|
+
float: The proportion of the x-axis covered by the vertical projection of the two blocks.
|
460
|
+
"""
|
461
|
+
x0_1, _, x1_1, _ = block1
|
462
|
+
x0_2, _, x1_2, _ = block2
|
463
|
+
|
464
|
+
# Calculate the intersection of the x-coordinates
|
465
|
+
x_left = max(x0_1, x0_2)
|
466
|
+
x_right = min(x1_1, x1_2)
|
467
|
+
|
468
|
+
if x_right < x_left:
|
469
|
+
return 0.0
|
470
|
+
|
471
|
+
# Length of the intersection
|
472
|
+
intersection_length = x_right - x_left
|
473
|
+
|
474
|
+
# Length of the x-axis projection of the first block
|
475
|
+
block1_length = x1_1 - x0_1
|
476
|
+
|
477
|
+
if block1_length == 0:
|
478
|
+
return 0.0
|
479
|
+
|
480
|
+
# Proportion of the x-axis covered by the intersection
|
481
|
+
# logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}")
|
482
|
+
return intersection_length / block1_length
|