magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +134 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/Constants.py +27 -1
- magic_pdf/libs/boxbase.py +169 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +230 -161
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +135 -22
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
- magic_pdf/model/ppTableModel.py +67 -0
- magic_pdf/para/para_split_v2.py +76 -74
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/resources/model_config/model_configs.yaml +3 -1
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,27 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
import wordninja
|
1
4
|
from loguru import logger
|
2
5
|
|
3
|
-
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
4
6
|
from magic_pdf.libs.commons import join_path
|
5
7
|
from magic_pdf.libs.language import detect_lang
|
8
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
6
9
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
7
|
-
from magic_pdf.libs.ocr_content_type import
|
8
|
-
|
9
|
-
|
10
|
+
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
|
11
|
+
|
12
|
+
|
13
|
+
def __is_hyphen_at_line_end(line):
    """Tell whether ``line`` finishes with a hyphenated word fragment.

    The match is: one or more ASCII letters, a single hyphen, then
    optional trailing whitespace up to the end of the string.

    Args:
        line (str): The text to inspect.

    Returns:
        bool: True when the tail of ``line`` matches, False otherwise.
    """
    hyphen_tail = re.search(r'[A-Za-z]+-\s*$', line)
    return hyphen_tail is not None
|
10
25
|
|
11
26
|
|
12
27
|
def split_long_words(text):
|
@@ -14,7 +29,7 @@ def split_long_words(text):
|
|
14
29
|
for i in range(len(segments)):
|
15
30
|
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
|
16
31
|
for j in range(len(words)):
|
17
|
-
if len(words[j]) >
|
32
|
+
if len(words[j]) > 10:
|
18
33
|
words[j] = ' '.join(wordninja.split(words[j]))
|
19
34
|
segments[i] = ''.join(words)
|
20
35
|
return ' '.join(segments)
|
@@ -23,8 +38,9 @@ def split_long_words(text):
|
|
23
38
|
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Render every page of a document as multimodal ('mm') markdown.

    Args:
        pdf_info_list (list): Per-page dicts; each page's blocks are read
            from its 'para_blocks' entry.
        img_buket_path: Forwarded unchanged to the page renderer.

    Returns:
        str: The rendered fragments of all pages, joined by blank lines.
    """
    markdown = []
    for page_info in pdf_info_list:
        paras_of_layout = page_info.get('para_blocks')
        # The renderer iterates its first argument, so a page without
        # 'para_blocks' (None) would raise TypeError. Skip such pages, as
        # the paginated variant of this function already does.
        if not paras_of_layout:
            continue
        markdown.extend(
            ocr_mk_markdown_with_para_core_v2(paras_of_layout, 'mm',
                                              img_buket_path))
    return '\n\n'.join(markdown)
|
30
46
|
|
@@ -32,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
|
|
32
48
|
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Render every page of a document as text-only ('nlp') markdown.

    Args:
        pdf_info_dict (list): Per-page dicts; each page's blocks are read
            from its 'para_blocks' entry.

    Returns:
        str: The rendered fragments of all pages, joined by blank lines.
    """
    markdown = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        # The renderer iterates its first argument, so a page without
        # 'para_blocks' (None) would raise TypeError. Skip such pages, as
        # the paginated variant in this module already does.
        if not paras_of_layout:
            continue
        markdown.extend(
            ocr_mk_markdown_with_para_core_v2(paras_of_layout, 'nlp'))
    return '\n\n'.join(markdown)
|
39
56
|
|
40
57
|
|
41
|
-
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
58
|
+
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """Render a document page by page as multimodal ('mm') markdown.

    Args:
        pdf_info_dict (list): Per-page dicts; each page's blocks are read
            from its 'para_blocks' entry.
        img_buket_path: Forwarded unchanged to the page renderer.

    Returns:
        list[dict]: One ``{'page_no', 'md_content'}`` dict per page that has
        blocks. Pages without blocks are skipped and do not consume a
        number, so ``page_no`` counts emitted pages only.
    """
    pages = []
    for page_info in pdf_info_dict:
        blocks = page_info.get('para_blocks')
        if blocks:
            rendered = ocr_mk_markdown_with_para_core_v2(
                blocks, 'mm', img_buket_path)
            # len(pages) is read before the append, so numbering starts at 0
            # and grows by one per emitted page.
            pages.append({
                'page_no': len(pages),
                'md_content': '\n\n'.join(rendered),
            })
    return pages
|
55
76
|
|
56
77
|
|
57
|
-
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=
|
78
|
+
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
|
58
79
|
page_markdown = []
|
59
80
|
for paras in paras_of_layout:
|
60
81
|
for para in paras:
|
@@ -67,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
|
|
67
88
|
if span_type == ContentType.Text:
|
68
89
|
content = span['content']
|
69
90
|
language = detect_lang(content)
|
70
|
-
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
71
|
-
content = ocr_escape_special_markdown_char(
|
91
|
+
if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
|
92
|
+
content = ocr_escape_special_markdown_char(
|
93
|
+
split_long_words(content))
|
72
94
|
else:
|
73
95
|
content = ocr_escape_special_markdown_char(content)
|
74
96
|
elif span_type == ContentType.InlineEquation:
|
@@ -92,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
|
|
92
114
|
return page_markdown
|
93
115
|
|
94
116
|
|
95
|
-
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
117
|
+
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
118
|
+
mode,
|
119
|
+
img_buket_path=''):
|
96
120
|
page_markdown = []
|
97
121
|
for para_block in paras_of_layout:
|
98
122
|
para_text = ''
|
@@ -100,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
100
124
|
if para_type == BlockType.Text:
|
101
125
|
para_text = merge_para_with_text(para_block)
|
102
126
|
elif para_type == BlockType.Title:
|
103
|
-
para_text = f
|
127
|
+
para_text = f'# {merge_para_with_text(para_block)}'
|
104
128
|
elif para_type == BlockType.InterlineEquation:
|
105
129
|
para_text = merge_para_with_text(para_block)
|
106
130
|
elif para_type == BlockType.Image:
|
@@ -116,14 +140,16 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
116
140
|
for block in para_block['blocks']: # 2nd.拼image_caption
|
117
141
|
if block['type'] == BlockType.ImageCaption:
|
118
142
|
para_text += merge_para_with_text(block)
|
143
|
+
for block in para_block['blocks']: # 2nd.拼image_caption
|
144
|
+
if block['type'] == BlockType.ImageFootnote:
|
145
|
+
para_text += merge_para_with_text(block)
|
119
146
|
elif para_type == BlockType.Table:
|
120
147
|
if mode == 'nlp':
|
121
148
|
continue
|
122
149
|
elif mode == 'mm':
|
123
|
-
table_caption = ''
|
124
150
|
for block in para_block['blocks']: # 1st.拼table_caption
|
125
151
|
if block['type'] == BlockType.TableCaption:
|
126
|
-
|
152
|
+
para_text += merge_para_with_text(block)
|
127
153
|
for block in para_block['blocks']: # 2nd.拼table_body
|
128
154
|
if block['type'] == BlockType.TableBody:
|
129
155
|
for line in block['lines']:
|
@@ -132,8 +158,10 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
132
158
|
# if processed by table model
|
133
159
|
if span.get('latex', ''):
|
134
160
|
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
|
161
|
+
elif span.get('html', ''):
|
162
|
+
para_text += f"\n\n{span['html']}\n\n"
|
135
163
|
else:
|
136
|
-
para_text += f"\n}) \n"
|
137
165
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
138
166
|
if block['type'] == BlockType.TableFootnote:
|
139
167
|
para_text += merge_para_with_text(block)
|
@@ -147,24 +175,39 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
147
175
|
|
148
176
|
|
149
177
|
def merge_para_with_text(para_block):
|
178
|
+
|
179
|
+
def detect_language(text):
    """Classify ``text`` by its share of ASCII letters.

    Returns:
        str: 'empty' for a zero-length string, 'en' when ASCII letters make
        up at least half of all characters, 'unknown' otherwise.
    """
    total = len(text)
    if total <= 0:
        return 'empty'
    # Sum the lengths of every run of ASCII letters.
    letter_count = sum(map(len, re.findall(r'[a-zA-Z]+', text)))
    return 'en' if letter_count / total >= 0.5 else 'unknown'
|
190
|
+
|
150
191
|
para_text = ''
|
151
192
|
for line in para_block['lines']:
|
152
|
-
line_text =
|
153
|
-
line_lang =
|
193
|
+
line_text = ''
|
194
|
+
line_lang = ''
|
154
195
|
for span in line['spans']:
|
155
196
|
span_type = span['type']
|
156
197
|
if span_type == ContentType.Text:
|
157
198
|
line_text += span['content'].strip()
|
158
|
-
if line_text !=
|
199
|
+
if line_text != '':
|
159
200
|
line_lang = detect_lang(line_text)
|
160
201
|
for span in line['spans']:
|
161
202
|
span_type = span['type']
|
162
203
|
content = ''
|
163
204
|
if span_type == ContentType.Text:
|
164
205
|
content = span['content']
|
165
|
-
language = detect_lang(content)
|
206
|
+
# language = detect_lang(content)
|
207
|
+
language = detect_language(content)
|
166
208
|
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
167
|
-
content = ocr_escape_special_markdown_char(
|
209
|
+
content = ocr_escape_special_markdown_char(
|
210
|
+
split_long_words(content))
|
168
211
|
else:
|
169
212
|
content = ocr_escape_special_markdown_char(content)
|
170
213
|
elif span_type == ContentType.InlineEquation:
|
@@ -173,10 +216,17 @@ def merge_para_with_text(para_block):
|
|
173
216
|
content = f"\n$$\n{span['content']}\n$$\n"
|
174
217
|
|
175
218
|
if content != '':
|
176
|
-
|
177
|
-
|
219
|
+
langs = ['zh', 'ja', 'ko']
|
220
|
+
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
221
|
+
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
|
222
|
+
elif line_lang == 'en':
|
223
|
+
# 如果是前一行带有-连字符,那么末尾不应该加空格
|
224
|
+
if __is_hyphen_at_line_end(content):
|
225
|
+
para_text += content[:-1]
|
226
|
+
else:
|
227
|
+
para_text += content + ' '
|
178
228
|
else:
|
179
|
-
para_text += content + ' ' #
|
229
|
+
para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
|
180
230
|
return para_text
|
181
231
|
|
182
232
|
|
@@ -191,18 +241,18 @@ def para_to_standard_format(para, img_buket_path):
|
|
191
241
|
for span in line['spans']:
|
192
242
|
language = ''
|
193
243
|
span_type = span.get('type')
|
194
|
-
content =
|
244
|
+
content = ''
|
195
245
|
if span_type == ContentType.Text:
|
196
246
|
content = span['content']
|
197
247
|
language = detect_lang(content)
|
198
248
|
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
199
|
-
content = ocr_escape_special_markdown_char(
|
249
|
+
content = ocr_escape_special_markdown_char(
|
250
|
+
split_long_words(content))
|
200
251
|
else:
|
201
252
|
content = ocr_escape_special_markdown_char(content)
|
202
253
|
elif span_type == ContentType.InlineEquation:
|
203
254
|
content = f"${span['content']}$"
|
204
255
|
inline_equation_num += 1
|
205
|
-
|
206
256
|
if language == 'en': # 英文语境下 content间需要空格分隔
|
207
257
|
para_text += content + ' '
|
208
258
|
else: # 中文语境下,content间不需要空格分隔
|
@@ -210,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
|
|
210
260
|
para_content = {
|
211
261
|
'type': 'text',
|
212
262
|
'text': para_text,
|
213
|
-
'inline_equation_num': inline_equation_num
|
263
|
+
'inline_equation_num': inline_equation_num,
|
214
264
|
}
|
215
265
|
return para_content
|
216
266
|
|
@@ -221,41 +271,41 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
|
221
271
|
para_content = {
|
222
272
|
'type': 'text',
|
223
273
|
'text': merge_para_with_text(para_block),
|
224
|
-
'page_idx': page_idx
|
274
|
+
'page_idx': page_idx,
|
225
275
|
}
|
226
276
|
elif para_type == BlockType.Title:
|
227
277
|
para_content = {
|
228
278
|
'type': 'text',
|
229
279
|
'text': merge_para_with_text(para_block),
|
230
280
|
'text_level': 1,
|
231
|
-
'page_idx': page_idx
|
281
|
+
'page_idx': page_idx,
|
232
282
|
}
|
233
283
|
elif para_type == BlockType.InterlineEquation:
|
234
284
|
para_content = {
|
235
285
|
'type': 'equation',
|
236
286
|
'text': merge_para_with_text(para_block),
|
237
|
-
'text_format':
|
238
|
-
'page_idx': page_idx
|
287
|
+
'text_format': 'latex',
|
288
|
+
'page_idx': page_idx,
|
239
289
|
}
|
240
290
|
elif para_type == BlockType.Image:
|
241
|
-
para_content = {
|
242
|
-
'type': 'image',
|
243
|
-
'page_idx': page_idx
|
244
|
-
}
|
291
|
+
para_content = {'type': 'image', 'page_idx': page_idx}
|
245
292
|
for block in para_block['blocks']:
|
246
293
|
if block['type'] == BlockType.ImageBody:
|
247
|
-
para_content['img_path'] = join_path(
|
294
|
+
para_content['img_path'] = join_path(
|
295
|
+
img_buket_path,
|
296
|
+
block['lines'][0]['spans'][0]['image_path'])
|
248
297
|
if block['type'] == BlockType.ImageCaption:
|
249
298
|
para_content['img_caption'] = merge_para_with_text(block)
|
299
|
+
if block['type'] == BlockType.ImageFootnote:
|
300
|
+
para_content['img_footnote'] = merge_para_with_text(block)
|
250
301
|
elif para_type == BlockType.Table:
|
251
|
-
para_content = {
|
252
|
-
'type': 'table',
|
253
|
-
'page_idx': page_idx
|
254
|
-
}
|
302
|
+
para_content = {'type': 'table', 'page_idx': page_idx}
|
255
303
|
for block in para_block['blocks']:
|
256
304
|
if block['type'] == BlockType.TableBody:
|
257
305
|
if block["lines"][0]["spans"][0].get('latex', ''):
|
258
306
|
para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
|
307
|
+
elif block["lines"][0]["spans"][0].get('html', ''):
|
308
|
+
para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
|
259
309
|
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
|
260
310
|
if block['type'] == BlockType.TableCaption:
|
261
311
|
para_content['table_caption'] = merge_para_with_text(block)
|
@@ -268,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
|
268
318
|
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Convert every paragraph block of a document to standard-format dicts.

    Args:
        pdf_info_dict (list): Per-page dicts; blocks are read from
            'para_blocks' and the page index from 'page_idx'.
        img_buket_path (str): Forwarded to the per-block converter.

    Returns:
        list: One converted entry per paragraph block, in page order. Pages
        without blocks contribute nothing.
    """
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
        # para_to_standard_format_v2 requires page_idx as its third
        # positional argument; omitting it raises TypeError on the first
        # block. Read it from the page dict the same way union_make does.
        page_idx = page_info.get('page_idx')
        for para_block in paras_of_layout:
            content_list.append(
                para_to_standard_format_v2(para_block, img_buket_path,
                                           page_idx))
    return content_list
|
278
329
|
|
279
330
|
|
280
331
|
def line_to_standard_format(line, img_buket_path):
|
281
|
-
line_text =
|
332
|
+
line_text = ''
|
282
333
|
inline_equation_num = 0
|
283
334
|
for span in line['spans']:
|
284
335
|
if not span.get('content'):
|
@@ -288,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
|
|
288
339
|
if span['type'] == ContentType.Image:
|
289
340
|
content = {
|
290
341
|
'type': 'image',
|
291
|
-
'img_path': join_path(img_buket_path,
|
342
|
+
'img_path': join_path(img_buket_path,
|
343
|
+
span['image_path']),
|
292
344
|
}
|
293
345
|
return content
|
294
346
|
elif span['type'] == ContentType.Table:
|
295
347
|
content = {
|
296
348
|
'type': 'table',
|
297
|
-
'img_path': join_path(img_buket_path,
|
349
|
+
'img_path': join_path(img_buket_path,
|
350
|
+
span['image_path']),
|
298
351
|
}
|
299
352
|
return content
|
300
353
|
else:
|
@@ -302,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
|
|
302
355
|
interline_equation = span['content']
|
303
356
|
content = {
|
304
357
|
'type': 'equation',
|
305
|
-
'latex': f
|
358
|
+
'latex': f'$$\n{interline_equation}\n$$'
|
306
359
|
}
|
307
360
|
return content
|
308
361
|
elif span['type'] == ContentType.InlineEquation:
|
309
362
|
inline_equation = span['content']
|
310
|
-
line_text += f
|
363
|
+
line_text += f'${inline_equation}$'
|
311
364
|
inline_equation_num += 1
|
312
365
|
elif span['type'] == ContentType.Text:
|
313
|
-
text_content = ocr_escape_special_markdown_char(
|
366
|
+
text_content = ocr_escape_special_markdown_char(
|
367
|
+
span['content']) # 转义特殊符号
|
314
368
|
line_text += text_content
|
315
369
|
content = {
|
316
370
|
'type': 'text',
|
317
371
|
'text': line_text,
|
318
|
-
'inline_equation_num': inline_equation_num
|
372
|
+
'inline_equation_num': inline_equation_num,
|
319
373
|
}
|
320
374
|
return content
|
321
375
|
|
322
376
|
|
323
377
|
def ocr_mk_mm_standard_format(pdf_info_dict: list):
|
324
|
-
"""
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
text string 纯文本格式的文本数据。
|
329
|
-
md string markdown格式的文本数据。
|
330
|
-
img_path string s3://full/path/to/img.jpg
|
331
|
-
"""
|
378
|
+
"""content_list type string
|
379
|
+
image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
|
380
|
+
latex文本字段。 text string 纯文本格式的文本数据。 md string
|
381
|
+
markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
|
332
382
|
content_list = []
|
333
383
|
for page_info in pdf_info_dict:
|
334
|
-
blocks = page_info.get(
|
384
|
+
blocks = page_info.get('preproc_blocks')
|
335
385
|
if not blocks:
|
336
386
|
continue
|
337
387
|
for block in blocks:
|
@@ -341,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
|
|
341
391
|
return content_list
|
342
392
|
|
343
393
|
|
344
|
-
def union_make(pdf_info_dict: list,
|
394
|
+
def union_make(pdf_info_dict: list,
|
395
|
+
make_mode: str,
|
396
|
+
drop_mode: str,
|
397
|
+
img_buket_path: str = ''):
|
345
398
|
output_content = []
|
346
399
|
for page_info in pdf_info_dict:
|
347
|
-
if page_info.get(
|
348
|
-
drop_reason = page_info.get(
|
400
|
+
if page_info.get('need_drop', False):
|
401
|
+
drop_reason = page_info.get('drop_reason')
|
349
402
|
if drop_mode == DropMode.NONE:
|
350
403
|
pass
|
351
404
|
elif drop_mode == DropMode.WHOLE_PDF:
|
352
|
-
raise Exception(f
|
405
|
+
raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
|
406
|
+
f'drop_reason is {drop_reason}'))
|
353
407
|
elif drop_mode == DropMode.SINGLE_PAGE:
|
354
|
-
logger.warning(f
|
408
|
+
logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
|
409
|
+
f'drop_reason is {drop_reason}'))
|
355
410
|
continue
|
356
411
|
else:
|
357
|
-
raise Exception(
|
412
|
+
raise Exception('drop_mode can not be null')
|
358
413
|
|
359
|
-
paras_of_layout = page_info.get(
|
360
|
-
page_idx = page_info.get(
|
414
|
+
paras_of_layout = page_info.get('para_blocks')
|
415
|
+
page_idx = page_info.get('page_idx')
|
361
416
|
if not paras_of_layout:
|
362
417
|
continue
|
363
418
|
if make_mode == MakeMode.MM_MD:
|
364
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
419
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
420
|
+
paras_of_layout, 'mm', img_buket_path)
|
365
421
|
output_content.extend(page_markdown)
|
366
422
|
elif make_mode == MakeMode.NLP_MD:
|
367
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
423
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
424
|
+
paras_of_layout, 'nlp')
|
368
425
|
output_content.extend(page_markdown)
|
369
426
|
elif make_mode == MakeMode.STANDARD_FORMAT:
|
370
427
|
for para_block in paras_of_layout:
|
371
|
-
para_content = para_to_standard_format_v2(
|
428
|
+
para_content = para_to_standard_format_v2(
|
429
|
+
para_block, img_buket_path, page_idx)
|
372
430
|
output_content.append(para_content)
|
373
431
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
374
432
|
return '\n\n'.join(output_content)
|
File without changes
|
File without changes
|
@@ -0,0 +1,82 @@
|
|
1
|
+
import os
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
|
7
|
+
Node)
|
8
|
+
from magic_pdf.integrations.rag.utils import inference
|
9
|
+
|
10
|
+
|
11
|
+
class RagPageReader:
    """Iterable view over one page's layout elements.

    Each entry of ``pagedata.layout_dets`` is converted to a ``Node``
    carrying its category, text, image path, annotation id, latex and html.
    """

    def __init__(self, pagedata: LayoutElements):
        """Build the node list from ``pagedata`` and keep a reference to it."""
        self.pagedata = pagedata
        nodes = []
        for det in pagedata.layout_dets:
            nodes.append(
                Node(
                    category_type=det.category_type,
                    text=det.text,
                    image_path=det.image_path,
                    anno_id=det.anno_id,
                    latex=det.latex,
                    html=det.html,
                ))
        self.o = nodes

    def __iter__(self):
        """Iterate over the page's nodes in ``layout_dets`` order."""
        return iter(self.o)

    def get_rel_map(self) -> list[ElementRelation]:
        """Return the element relations stored in the page's extra data."""
        extra = self.pagedata.extra
        return extra.element_relation
|
32
|
+
|
33
|
+
|
34
|
+
class RagDocumentReader:
    """Iterable over a document's pages, each wrapped in a RagPageReader."""

    def __init__(self, ragdata: list[LayoutElements]):
        """Wrap every page in ``ragdata``, preserving order."""
        self.o = list(map(RagPageReader, ragdata))

    def __iter__(self):
        """Iterate over the page readers in document order."""
        return iter(self.o)
|
41
|
+
|
42
|
+
|
43
|
+
class DataReader:
    """Collect PDF paths and run inference on them by index.

    ``path_or_directory`` is either a directory, whose direct ``*.pdf``
    children are collected, or the path of a single ``.pdf`` file.
    """

    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        """
        Args:
            path_or_directory (str): A directory of PDFs or one PDF path.
            method (str): Forwarded to ``inference`` for every document.
            output_dir (str): Forwarded to ``inference`` for every document.

        Raises:
            AssertionError: If the path is not a directory and does not end
                with '.pdf'.
        """
        self.path_or_directory = path_or_directory
        self.method = method
        self.output_dir = output_dir
        if os.path.isdir(path_or_directory):
            # Path.glob yields entries in arbitrary order. Sort so that a
            # given index refers to the same document on every run.
            self.pdfs = sorted(Path(path_or_directory).glob('*.pdf'))
        else:
            assert path_or_directory.endswith('.pdf'), (
                f'expected a directory or a .pdf file: {path_or_directory}')
            self.pdfs = [Path(path_or_directory)]

    def get_documents_count(self) -> int:
        """Returns the number of documents in the directory."""
        return len(self.pdfs)

    def get_document_result(self, idx: int) -> 'RagDocumentReader | None':
        """
        Args:
            idx (int): the index of documents under the
                directory path_or_directory

        Returns:
            RagDocumentReader | None: RagDocumentReader is an iterable object,
            more details @RagDocumentReader. None is returned (and a message
            logged) when ``idx`` is out of range or inference yields None.
        """
        if idx >= self.get_documents_count() or idx < 0:
            logger.error(f'invalid idx: {idx}')
            return None
        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
        if res is None:
            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
            return None
        return RagDocumentReader(res)

    def get_document_filename(self, idx: int) -> Path:
        """get the filename of the document.

        Unlike ``get_document_result`` this performs no range check; it is
        plain list indexing on the collected paths.
        """
        return self.pdfs[idx]
|
@@ -0,0 +1,82 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
# rag
|
7
|
+
class CategoryType(Enum):
    # Plain Enum with string values; py310 does not support StrEnum.
    # Kinds of layout element. Each member's value equals its name.

    # Running text and headings.
    text = 'text'
    title = 'title'
    interline_equation = 'interline_equation'

    # Image container and its parts.
    image = 'image'
    image_body = 'image_body'
    image_caption = 'image_caption'

    # Table container and its parts.
    table = 'table'
    table_body = 'table_body'
    table_caption = 'table_caption'
    table_footnote = 'table_footnote'
|
18
|
+
|
19
|
+
|
20
|
+
class ElementRelType(Enum):
    # Kinds of relation between two layout elements.
    # 'sibling' is the only kind defined.
    sibling = 'sibling'
|
22
|
+
|
23
|
+
|
24
|
+
class PageInfo(BaseModel):
    # Index and dimensions of one page, plus an optional page image path.

    # Zero-based page index; negative values are rejected.
    page_no: int = Field(ge=0,
                         description='the index of page, start from zero')
    # Must be strictly positive.
    height: int = Field(gt=0, description='the height of page')
    # NOTE(review): width accepts 0 (ge) while height requires > 0 (gt);
    # confirm whether this difference is intended.
    width: int = Field(ge=0, description='the width of page')
    image_path: str | None = Field(default=None,
                                   description='the image of this page')
|
31
|
+
|
32
|
+
|
33
|
+
class ContentObject(BaseModel):
    # One layout object on a page. Only category_type and poly are required.

    category_type: CategoryType = Field(description='类别')
    # Flat list of floats; the description gives the corner order.
    poly: list[float] = Field(description=(
        'Coordinates, need to convert back to PDF coordinates,'
        ' order is top-left, top-right, bottom-right, bottom-left'
        ' x,y coordinates'))
    ignore: bool = Field(default=False,
                         description='whether ignore this object')
    text: str | None = Field(default=None,
                             description='text content of the object')
    image_path: str | None = Field(default=None,
                                   description='path of embedded image')
    # order and anno_id default to -1.
    order: int = Field(default=-1,
                       description='the order of this object within a page')
    anno_id: int = Field(default=-1, description='unique id')
    latex: str | None = Field(default=None, description='latex result')
    html: str | None = Field(default=None, description='html result')
|
50
|
+
|
51
|
+
|
52
|
+
class ElementRelation(BaseModel):
    # A relation between two objects, identified by their anno_id values.
    # Both ids default to -1; relation is required.

    source_anno_id: int = Field(default=-1,
                                description='unique id of the source object')
    target_anno_id: int = Field(default=-1,
                                description='unique id of the target object')
    relation: ElementRelType = Field(
        description='the relation between source and target element')
|
59
|
+
|
60
|
+
|
61
|
+
class LayoutElementsExtra(BaseModel):
    # Extra per-page data: the list of relations between elements.
    element_relation: list[ElementRelation] = Field(
        description='the relation between source and target element')
|
64
|
+
|
65
|
+
|
66
|
+
class LayoutElements(BaseModel):
    # Everything recorded for one page: layout objects, page info and
    # extra data. All three fields are required.
    layout_dets: list[ContentObject] = Field(
        description='layout element details')
    page_info: PageInfo = Field(description='page info')
    extra: LayoutElementsExtra = Field(description='extra information')
|
71
|
+
|
72
|
+
|
73
|
+
# iter data format
|
74
|
+
class Node(BaseModel):
    # Item format used when iterating results. It carries the same
    # category/text/image_path/anno_id/latex/html fields as ContentObject.
    # Only category_type is required.

    category_type: CategoryType = Field(description='类别')
    text: str | None = Field(default=None,
                             description='text content of the object')
    image_path: str | None = Field(default=None,
                                   description='path of embedded image')
    anno_id: int = Field(default=-1, description='unique id')
    latex: str | None = Field(default=None, description='latex result')
    html: str | None = Field(default=None, description='html result')
|