magic-pdf 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +130 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/boxbase.py +169 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +227 -161
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +105 -15
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/para/para_split_v2.py +26 -27
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +68 -26
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +34 -29
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,27 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
import wordninja
|
1
4
|
from loguru import logger
|
2
5
|
|
3
|
-
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
4
6
|
from magic_pdf.libs.commons import join_path
|
5
7
|
from magic_pdf.libs.language import detect_lang
|
8
|
+
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
6
9
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
7
|
-
from magic_pdf.libs.ocr_content_type import
|
8
|
-
|
9
|
-
|
10
|
+
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
|
11
|
+
|
12
|
+
|
13
|
+
def __is_hyphen_at_line_end(line):
|
14
|
+
"""
|
15
|
+
Check if a line ends with one or more letters followed by a hyphen.
|
16
|
+
|
17
|
+
Args:
|
18
|
+
line (str): The line of text to check.
|
19
|
+
|
20
|
+
Returns:
|
21
|
+
bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
|
22
|
+
"""
|
23
|
+
# Use regex to check if the line ends with one or more letters followed by a hyphen
|
24
|
+
return bool(re.search(r'[A-Za-z]+-\s*$', line))
|
10
25
|
|
11
26
|
|
12
27
|
def split_long_words(text):
|
@@ -14,7 +29,7 @@ def split_long_words(text):
|
|
14
29
|
for i in range(len(segments)):
|
15
30
|
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
|
16
31
|
for j in range(len(words)):
|
17
|
-
if len(words[j]) >
|
32
|
+
if len(words[j]) > 10:
|
18
33
|
words[j] = ' '.join(wordninja.split(words[j]))
|
19
34
|
segments[i] = ''.join(words)
|
20
35
|
return ' '.join(segments)
|
@@ -23,8 +38,9 @@ def split_long_words(text):
|
|
23
38
|
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
|
24
39
|
markdown = []
|
25
40
|
for page_info in pdf_info_list:
|
26
|
-
paras_of_layout = page_info.get(
|
27
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
41
|
+
paras_of_layout = page_info.get('para_blocks')
|
42
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
43
|
+
paras_of_layout, 'mm', img_buket_path)
|
28
44
|
markdown.extend(page_markdown)
|
29
45
|
return '\n\n'.join(markdown)
|
30
46
|
|
@@ -32,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
|
|
32
48
|
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
|
33
49
|
markdown = []
|
34
50
|
for page_info in pdf_info_dict:
|
35
|
-
paras_of_layout = page_info.get(
|
36
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
51
|
+
paras_of_layout = page_info.get('para_blocks')
|
52
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
53
|
+
paras_of_layout, 'nlp')
|
37
54
|
markdown.extend(page_markdown)
|
38
55
|
return '\n\n'.join(markdown)
|
39
56
|
|
40
57
|
|
41
|
-
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
58
|
+
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
59
|
+
img_buket_path):
|
42
60
|
markdown_with_para_and_pagination = []
|
43
61
|
page_no = 0
|
44
62
|
for page_info in pdf_info_dict:
|
45
|
-
paras_of_layout = page_info.get(
|
63
|
+
paras_of_layout = page_info.get('para_blocks')
|
46
64
|
if not paras_of_layout:
|
47
65
|
continue
|
48
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
66
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
67
|
+
paras_of_layout, 'mm', img_buket_path)
|
49
68
|
markdown_with_para_and_pagination.append({
|
50
|
-
'page_no':
|
51
|
-
|
69
|
+
'page_no':
|
70
|
+
page_no,
|
71
|
+
'md_content':
|
72
|
+
'\n\n'.join(page_markdown)
|
52
73
|
})
|
53
74
|
page_no += 1
|
54
75
|
return markdown_with_para_and_pagination
|
55
76
|
|
56
77
|
|
57
|
-
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=
|
78
|
+
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
|
58
79
|
page_markdown = []
|
59
80
|
for paras in paras_of_layout:
|
60
81
|
for para in paras:
|
@@ -67,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
|
|
67
88
|
if span_type == ContentType.Text:
|
68
89
|
content = span['content']
|
69
90
|
language = detect_lang(content)
|
70
|
-
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
71
|
-
content = ocr_escape_special_markdown_char(
|
91
|
+
if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
|
92
|
+
content = ocr_escape_special_markdown_char(
|
93
|
+
split_long_words(content))
|
72
94
|
else:
|
73
95
|
content = ocr_escape_special_markdown_char(content)
|
74
96
|
elif span_type == ContentType.InlineEquation:
|
@@ -92,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
|
|
92
114
|
return page_markdown
|
93
115
|
|
94
116
|
|
95
|
-
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
117
|
+
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
118
|
+
mode,
|
119
|
+
img_buket_path=''):
|
96
120
|
page_markdown = []
|
97
121
|
for para_block in paras_of_layout:
|
98
122
|
para_text = ''
|
@@ -100,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
100
124
|
if para_type == BlockType.Text:
|
101
125
|
para_text = merge_para_with_text(para_block)
|
102
126
|
elif para_type == BlockType.Title:
|
103
|
-
para_text = f
|
127
|
+
para_text = f'# {merge_para_with_text(para_block)}'
|
104
128
|
elif para_type == BlockType.InterlineEquation:
|
105
129
|
para_text = merge_para_with_text(para_block)
|
106
130
|
elif para_type == BlockType.Image:
|
@@ -116,14 +140,16 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
116
140
|
for block in para_block['blocks']: # 2nd.拼image_caption
|
117
141
|
if block['type'] == BlockType.ImageCaption:
|
118
142
|
para_text += merge_para_with_text(block)
|
143
|
+
for block in para_block['blocks']: # 2nd.拼image_caption
|
144
|
+
if block['type'] == BlockType.ImageFootnote:
|
145
|
+
para_text += merge_para_with_text(block)
|
119
146
|
elif para_type == BlockType.Table:
|
120
147
|
if mode == 'nlp':
|
121
148
|
continue
|
122
149
|
elif mode == 'mm':
|
123
|
-
table_caption = ''
|
124
150
|
for block in para_block['blocks']: # 1st.拼table_caption
|
125
151
|
if block['type'] == BlockType.TableCaption:
|
126
|
-
|
152
|
+
para_text += merge_para_with_text(block)
|
127
153
|
for block in para_block['blocks']: # 2nd.拼table_body
|
128
154
|
if block['type'] == BlockType.TableBody:
|
129
155
|
for line in block['lines']:
|
@@ -135,7 +161,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
135
161
|
elif span.get('html', ''):
|
136
162
|
para_text += f"\n\n{span['html']}\n\n"
|
137
163
|
else:
|
138
|
-
para_text += f"\n}) \n"
|
139
165
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
140
166
|
if block['type'] == BlockType.TableFootnote:
|
141
167
|
para_text += merge_para_with_text(block)
|
@@ -149,24 +175,39 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
149
175
|
|
150
176
|
|
151
177
|
def merge_para_with_text(para_block):
|
178
|
+
|
179
|
+
def detect_language(text):
|
180
|
+
en_pattern = r'[a-zA-Z]+'
|
181
|
+
en_matches = re.findall(en_pattern, text)
|
182
|
+
en_length = sum(len(match) for match in en_matches)
|
183
|
+
if len(text) > 0:
|
184
|
+
if en_length / len(text) >= 0.5:
|
185
|
+
return 'en'
|
186
|
+
else:
|
187
|
+
return 'unknown'
|
188
|
+
else:
|
189
|
+
return 'empty'
|
190
|
+
|
152
191
|
para_text = ''
|
153
192
|
for line in para_block['lines']:
|
154
|
-
line_text =
|
155
|
-
line_lang =
|
193
|
+
line_text = ''
|
194
|
+
line_lang = ''
|
156
195
|
for span in line['spans']:
|
157
196
|
span_type = span['type']
|
158
197
|
if span_type == ContentType.Text:
|
159
198
|
line_text += span['content'].strip()
|
160
|
-
if line_text !=
|
199
|
+
if line_text != '':
|
161
200
|
line_lang = detect_lang(line_text)
|
162
201
|
for span in line['spans']:
|
163
202
|
span_type = span['type']
|
164
203
|
content = ''
|
165
204
|
if span_type == ContentType.Text:
|
166
205
|
content = span['content']
|
167
|
-
language = detect_lang(content)
|
206
|
+
# language = detect_lang(content)
|
207
|
+
language = detect_language(content)
|
168
208
|
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
169
|
-
content = ocr_escape_special_markdown_char(
|
209
|
+
content = ocr_escape_special_markdown_char(
|
210
|
+
split_long_words(content))
|
170
211
|
else:
|
171
212
|
content = ocr_escape_special_markdown_char(content)
|
172
213
|
elif span_type == ContentType.InlineEquation:
|
@@ -175,10 +216,17 @@ def merge_para_with_text(para_block):
|
|
175
216
|
content = f"\n$$\n{span['content']}\n$$\n"
|
176
217
|
|
177
218
|
if content != '':
|
178
|
-
|
179
|
-
|
219
|
+
langs = ['zh', 'ja', 'ko']
|
220
|
+
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
221
|
+
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
|
222
|
+
elif line_lang == 'en':
|
223
|
+
# 如果是前一行带有-连字符,那么末尾不应该加空格
|
224
|
+
if __is_hyphen_at_line_end(content):
|
225
|
+
para_text += content[:-1]
|
226
|
+
else:
|
227
|
+
para_text += content + ' '
|
180
228
|
else:
|
181
|
-
para_text += content + ' ' #
|
229
|
+
para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
|
182
230
|
return para_text
|
183
231
|
|
184
232
|
|
@@ -193,18 +241,18 @@ def para_to_standard_format(para, img_buket_path):
|
|
193
241
|
for span in line['spans']:
|
194
242
|
language = ''
|
195
243
|
span_type = span.get('type')
|
196
|
-
content =
|
244
|
+
content = ''
|
197
245
|
if span_type == ContentType.Text:
|
198
246
|
content = span['content']
|
199
247
|
language = detect_lang(content)
|
200
248
|
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
201
|
-
content = ocr_escape_special_markdown_char(
|
249
|
+
content = ocr_escape_special_markdown_char(
|
250
|
+
split_long_words(content))
|
202
251
|
else:
|
203
252
|
content = ocr_escape_special_markdown_char(content)
|
204
253
|
elif span_type == ContentType.InlineEquation:
|
205
254
|
content = f"${span['content']}$"
|
206
255
|
inline_equation_num += 1
|
207
|
-
|
208
256
|
if language == 'en': # 英文语境下 content间需要空格分隔
|
209
257
|
para_text += content + ' '
|
210
258
|
else: # 中文语境下,content间不需要空格分隔
|
@@ -212,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
|
|
212
260
|
para_content = {
|
213
261
|
'type': 'text',
|
214
262
|
'text': para_text,
|
215
|
-
'inline_equation_num': inline_equation_num
|
263
|
+
'inline_equation_num': inline_equation_num,
|
216
264
|
}
|
217
265
|
return para_content
|
218
266
|
|
@@ -223,37 +271,35 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
|
223
271
|
para_content = {
|
224
272
|
'type': 'text',
|
225
273
|
'text': merge_para_with_text(para_block),
|
226
|
-
'page_idx': page_idx
|
274
|
+
'page_idx': page_idx,
|
227
275
|
}
|
228
276
|
elif para_type == BlockType.Title:
|
229
277
|
para_content = {
|
230
278
|
'type': 'text',
|
231
279
|
'text': merge_para_with_text(para_block),
|
232
280
|
'text_level': 1,
|
233
|
-
'page_idx': page_idx
|
281
|
+
'page_idx': page_idx,
|
234
282
|
}
|
235
283
|
elif para_type == BlockType.InterlineEquation:
|
236
284
|
para_content = {
|
237
285
|
'type': 'equation',
|
238
286
|
'text': merge_para_with_text(para_block),
|
239
|
-
'text_format':
|
240
|
-
'page_idx': page_idx
|
287
|
+
'text_format': 'latex',
|
288
|
+
'page_idx': page_idx,
|
241
289
|
}
|
242
290
|
elif para_type == BlockType.Image:
|
243
|
-
para_content = {
|
244
|
-
'type': 'image',
|
245
|
-
'page_idx': page_idx
|
246
|
-
}
|
291
|
+
para_content = {'type': 'image', 'page_idx': page_idx}
|
247
292
|
for block in para_block['blocks']:
|
248
293
|
if block['type'] == BlockType.ImageBody:
|
249
|
-
para_content['img_path'] = join_path(
|
294
|
+
para_content['img_path'] = join_path(
|
295
|
+
img_buket_path,
|
296
|
+
block['lines'][0]['spans'][0]['image_path'])
|
250
297
|
if block['type'] == BlockType.ImageCaption:
|
251
298
|
para_content['img_caption'] = merge_para_with_text(block)
|
299
|
+
if block['type'] == BlockType.ImageFootnote:
|
300
|
+
para_content['img_footnote'] = merge_para_with_text(block)
|
252
301
|
elif para_type == BlockType.Table:
|
253
|
-
para_content = {
|
254
|
-
'type': 'table',
|
255
|
-
'page_idx': page_idx
|
256
|
-
}
|
302
|
+
para_content = {'type': 'table', 'page_idx': page_idx}
|
257
303
|
for block in para_block['blocks']:
|
258
304
|
if block['type'] == BlockType.TableBody:
|
259
305
|
if block["lines"][0]["spans"][0].get('latex', ''):
|
@@ -272,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
|
272
318
|
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
|
273
319
|
content_list = []
|
274
320
|
for page_info in pdf_info_dict:
|
275
|
-
paras_of_layout = page_info.get(
|
321
|
+
paras_of_layout = page_info.get('para_blocks')
|
276
322
|
if not paras_of_layout:
|
277
323
|
continue
|
278
324
|
for para_block in paras_of_layout:
|
279
|
-
para_content = para_to_standard_format_v2(para_block,
|
325
|
+
para_content = para_to_standard_format_v2(para_block,
|
326
|
+
img_buket_path)
|
280
327
|
content_list.append(para_content)
|
281
328
|
return content_list
|
282
329
|
|
283
330
|
|
284
331
|
def line_to_standard_format(line, img_buket_path):
|
285
|
-
line_text =
|
332
|
+
line_text = ''
|
286
333
|
inline_equation_num = 0
|
287
334
|
for span in line['spans']:
|
288
335
|
if not span.get('content'):
|
@@ -292,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
|
|
292
339
|
if span['type'] == ContentType.Image:
|
293
340
|
content = {
|
294
341
|
'type': 'image',
|
295
|
-
'img_path': join_path(img_buket_path,
|
342
|
+
'img_path': join_path(img_buket_path,
|
343
|
+
span['image_path']),
|
296
344
|
}
|
297
345
|
return content
|
298
346
|
elif span['type'] == ContentType.Table:
|
299
347
|
content = {
|
300
348
|
'type': 'table',
|
301
|
-
'img_path': join_path(img_buket_path,
|
349
|
+
'img_path': join_path(img_buket_path,
|
350
|
+
span['image_path']),
|
302
351
|
}
|
303
352
|
return content
|
304
353
|
else:
|
@@ -306,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
|
|
306
355
|
interline_equation = span['content']
|
307
356
|
content = {
|
308
357
|
'type': 'equation',
|
309
|
-
'latex': f
|
358
|
+
'latex': f'$$\n{interline_equation}\n$$'
|
310
359
|
}
|
311
360
|
return content
|
312
361
|
elif span['type'] == ContentType.InlineEquation:
|
313
362
|
inline_equation = span['content']
|
314
|
-
line_text += f
|
363
|
+
line_text += f'${inline_equation}$'
|
315
364
|
inline_equation_num += 1
|
316
365
|
elif span['type'] == ContentType.Text:
|
317
|
-
text_content = ocr_escape_special_markdown_char(
|
366
|
+
text_content = ocr_escape_special_markdown_char(
|
367
|
+
span['content']) # 转义特殊符号
|
318
368
|
line_text += text_content
|
319
369
|
content = {
|
320
370
|
'type': 'text',
|
321
371
|
'text': line_text,
|
322
|
-
'inline_equation_num': inline_equation_num
|
372
|
+
'inline_equation_num': inline_equation_num,
|
323
373
|
}
|
324
374
|
return content
|
325
375
|
|
326
376
|
|
327
377
|
def ocr_mk_mm_standard_format(pdf_info_dict: list):
|
328
|
-
"""
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
text string 纯文本格式的文本数据。
|
333
|
-
md string markdown格式的文本数据。
|
334
|
-
img_path string s3://full/path/to/img.jpg
|
335
|
-
"""
|
378
|
+
"""content_list type string
|
379
|
+
image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
|
380
|
+
latex文本字段。 text string 纯文本格式的文本数据。 md string
|
381
|
+
markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
|
336
382
|
content_list = []
|
337
383
|
for page_info in pdf_info_dict:
|
338
|
-
blocks = page_info.get(
|
384
|
+
blocks = page_info.get('preproc_blocks')
|
339
385
|
if not blocks:
|
340
386
|
continue
|
341
387
|
for block in blocks:
|
@@ -345,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
|
|
345
391
|
return content_list
|
346
392
|
|
347
393
|
|
348
|
-
def union_make(pdf_info_dict: list,
|
394
|
+
def union_make(pdf_info_dict: list,
|
395
|
+
make_mode: str,
|
396
|
+
drop_mode: str,
|
397
|
+
img_buket_path: str = ''):
|
349
398
|
output_content = []
|
350
399
|
for page_info in pdf_info_dict:
|
351
|
-
if page_info.get(
|
352
|
-
drop_reason = page_info.get(
|
400
|
+
if page_info.get('need_drop', False):
|
401
|
+
drop_reason = page_info.get('drop_reason')
|
353
402
|
if drop_mode == DropMode.NONE:
|
354
403
|
pass
|
355
404
|
elif drop_mode == DropMode.WHOLE_PDF:
|
356
|
-
raise Exception(f
|
405
|
+
raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
|
406
|
+
f'drop_reason is {drop_reason}'))
|
357
407
|
elif drop_mode == DropMode.SINGLE_PAGE:
|
358
|
-
logger.warning(f
|
408
|
+
logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
|
409
|
+
f'drop_reason is {drop_reason}'))
|
359
410
|
continue
|
360
411
|
else:
|
361
|
-
raise Exception(
|
412
|
+
raise Exception('drop_mode can not be null')
|
362
413
|
|
363
|
-
paras_of_layout = page_info.get(
|
364
|
-
page_idx = page_info.get(
|
414
|
+
paras_of_layout = page_info.get('para_blocks')
|
415
|
+
page_idx = page_info.get('page_idx')
|
365
416
|
if not paras_of_layout:
|
366
417
|
continue
|
367
418
|
if make_mode == MakeMode.MM_MD:
|
368
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
419
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
420
|
+
paras_of_layout, 'mm', img_buket_path)
|
369
421
|
output_content.extend(page_markdown)
|
370
422
|
elif make_mode == MakeMode.NLP_MD:
|
371
|
-
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
423
|
+
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
424
|
+
paras_of_layout, 'nlp')
|
372
425
|
output_content.extend(page_markdown)
|
373
426
|
elif make_mode == MakeMode.STANDARD_FORMAT:
|
374
427
|
for para_block in paras_of_layout:
|
375
|
-
para_content = para_to_standard_format_v2(
|
428
|
+
para_content = para_to_standard_format_v2(
|
429
|
+
para_block, img_buket_path, page_idx)
|
376
430
|
output_content.append(para_content)
|
377
431
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
378
432
|
return '\n\n'.join(output_content)
|
File without changes
|
File without changes
|
@@ -0,0 +1,82 @@
|
|
1
|
+
import os
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
|
7
|
+
Node)
|
8
|
+
from magic_pdf.integrations.rag.utils import inference
|
9
|
+
|
10
|
+
|
11
|
+
class RagPageReader:
|
12
|
+
|
13
|
+
def __init__(self, pagedata: LayoutElements):
|
14
|
+
self.o = [
|
15
|
+
Node(
|
16
|
+
category_type=v.category_type,
|
17
|
+
text=v.text,
|
18
|
+
image_path=v.image_path,
|
19
|
+
anno_id=v.anno_id,
|
20
|
+
latex=v.latex,
|
21
|
+
html=v.html,
|
22
|
+
) for v in pagedata.layout_dets
|
23
|
+
]
|
24
|
+
|
25
|
+
self.pagedata = pagedata
|
26
|
+
|
27
|
+
def __iter__(self):
|
28
|
+
return iter(self.o)
|
29
|
+
|
30
|
+
def get_rel_map(self) -> list[ElementRelation]:
|
31
|
+
return self.pagedata.extra.element_relation
|
32
|
+
|
33
|
+
|
34
|
+
class RagDocumentReader:
|
35
|
+
|
36
|
+
def __init__(self, ragdata: list[LayoutElements]):
|
37
|
+
self.o = [RagPageReader(v) for v in ragdata]
|
38
|
+
|
39
|
+
def __iter__(self):
|
40
|
+
return iter(self.o)
|
41
|
+
|
42
|
+
|
43
|
+
class DataReader:
|
44
|
+
|
45
|
+
def __init__(self, path_or_directory: str, method: str, output_dir: str):
|
46
|
+
self.path_or_directory = path_or_directory
|
47
|
+
self.method = method
|
48
|
+
self.output_dir = output_dir
|
49
|
+
self.pdfs = []
|
50
|
+
if os.path.isdir(path_or_directory):
|
51
|
+
for doc_path in Path(path_or_directory).glob('*.pdf'):
|
52
|
+
self.pdfs.append(doc_path)
|
53
|
+
else:
|
54
|
+
assert path_or_directory.endswith('.pdf')
|
55
|
+
self.pdfs.append(Path(path_or_directory))
|
56
|
+
|
57
|
+
def get_documents_count(self) -> int:
|
58
|
+
"""Returns the number of documents in the directory."""
|
59
|
+
return len(self.pdfs)
|
60
|
+
|
61
|
+
def get_document_result(self, idx: int) -> RagDocumentReader | None:
|
62
|
+
"""
|
63
|
+
Args:
|
64
|
+
idx (int): the index of documents under the
|
65
|
+
directory path_or_directory
|
66
|
+
|
67
|
+
Returns:
|
68
|
+
RagDocumentReader | None: RagDocumentReader is an iterable object,
|
69
|
+
more details @RagDocumentReader
|
70
|
+
"""
|
71
|
+
if idx >= self.get_documents_count() or idx < 0:
|
72
|
+
logger.error(f'invalid idx: {idx}')
|
73
|
+
return None
|
74
|
+
res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
|
75
|
+
if res is None:
|
76
|
+
logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
|
77
|
+
return None
|
78
|
+
return RagDocumentReader(res)
|
79
|
+
|
80
|
+
def get_document_filename(self, idx: int) -> Path:
|
81
|
+
"""get the filename of the document."""
|
82
|
+
return self.pdfs[idx]
|
@@ -0,0 +1,82 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
# rag
|
7
|
+
class CategoryType(Enum): # py310 not support StrEnum
|
8
|
+
text = 'text'
|
9
|
+
title = 'title'
|
10
|
+
interline_equation = 'interline_equation'
|
11
|
+
image = 'image'
|
12
|
+
image_body = 'image_body'
|
13
|
+
image_caption = 'image_caption'
|
14
|
+
table = 'table'
|
15
|
+
table_body = 'table_body'
|
16
|
+
table_caption = 'table_caption'
|
17
|
+
table_footnote = 'table_footnote'
|
18
|
+
|
19
|
+
|
20
|
+
class ElementRelType(Enum):
|
21
|
+
sibling = 'sibling'
|
22
|
+
|
23
|
+
|
24
|
+
class PageInfo(BaseModel):
|
25
|
+
page_no: int = Field(description='the index of page, start from zero',
|
26
|
+
ge=0)
|
27
|
+
height: int = Field(description='the height of page', gt=0)
|
28
|
+
width: int = Field(description='the width of page', ge=0)
|
29
|
+
image_path: str | None = Field(description='the image of this page',
|
30
|
+
default=None)
|
31
|
+
|
32
|
+
|
33
|
+
class ContentObject(BaseModel):
|
34
|
+
category_type: CategoryType = Field(description='类别')
|
35
|
+
poly: list[float] = Field(
|
36
|
+
description=('Coordinates, need to convert back to PDF coordinates,'
|
37
|
+
' order is top-left, top-right, bottom-right, bottom-left'
|
38
|
+
' x,y coordinates'))
|
39
|
+
ignore: bool = Field(description='whether ignore this object',
|
40
|
+
default=False)
|
41
|
+
text: str | None = Field(description='text content of the object',
|
42
|
+
default=None)
|
43
|
+
image_path: str | None = Field(description='path of embedded image',
|
44
|
+
default=None)
|
45
|
+
order: int = Field(description='the order of this object within a page',
|
46
|
+
default=-1)
|
47
|
+
anno_id: int = Field(description='unique id', default=-1)
|
48
|
+
latex: str | None = Field(description='latex result', default=None)
|
49
|
+
html: str | None = Field(description='html result', default=None)
|
50
|
+
|
51
|
+
|
52
|
+
class ElementRelation(BaseModel):
|
53
|
+
source_anno_id: int = Field(description='unique id of the source object',
|
54
|
+
default=-1)
|
55
|
+
target_anno_id: int = Field(description='unique id of the target object',
|
56
|
+
default=-1)
|
57
|
+
relation: ElementRelType = Field(
|
58
|
+
description='the relation between source and target element')
|
59
|
+
|
60
|
+
|
61
|
+
class LayoutElementsExtra(BaseModel):
|
62
|
+
element_relation: list[ElementRelation] = Field(
|
63
|
+
description='the relation between source and target element')
|
64
|
+
|
65
|
+
|
66
|
+
class LayoutElements(BaseModel):
|
67
|
+
layout_dets: list[ContentObject] = Field(
|
68
|
+
description='layout element details')
|
69
|
+
page_info: PageInfo = Field(description='page info')
|
70
|
+
extra: LayoutElementsExtra = Field(description='extra information')
|
71
|
+
|
72
|
+
|
73
|
+
# iter data format
|
74
|
+
class Node(BaseModel):
|
75
|
+
category_type: CategoryType = Field(description='类别')
|
76
|
+
text: str | None = Field(description='text content of the object',
|
77
|
+
default=None)
|
78
|
+
image_path: str | None = Field(description='path of embedded image',
|
79
|
+
default=None)
|
80
|
+
anno_id: int = Field(description='unique id', default=-1)
|
81
|
+
latex: str | None = Field(description='latex result', default=None)
|
82
|
+
html: str | None = Field(description='html result', default=None)
|