magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +134 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/Constants.py +27 -1
  9. magic_pdf/libs/boxbase.py +169 -149
  10. magic_pdf/libs/draw_bbox.py +113 -87
  11. magic_pdf/libs/ocr_content_type.py +21 -18
  12. magic_pdf/libs/version.py +1 -1
  13. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  14. magic_pdf/model/magic_model.py +230 -161
  15. magic_pdf/model/model_list.py +8 -0
  16. magic_pdf/model/pdf_extract_kit.py +135 -22
  17. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  18. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
  19. magic_pdf/model/ppTableModel.py +67 -0
  20. magic_pdf/para/para_split_v2.py +76 -74
  21. magic_pdf/pdf_parse_union_core.py +34 -6
  22. magic_pdf/pipe/AbsPipe.py +4 -1
  23. magic_pdf/pipe/OCRPipe.py +7 -4
  24. magic_pdf/pipe/TXTPipe.py +7 -4
  25. magic_pdf/pipe/UNIPipe.py +11 -6
  26. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  27. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  28. magic_pdf/resources/model_config/model_configs.yaml +3 -1
  29. magic_pdf/tools/cli.py +56 -29
  30. magic_pdf/tools/cli_dev.py +61 -64
  31. magic_pdf/tools/common.py +57 -37
  32. magic_pdf/user_api.py +17 -9
  33. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
  34. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
  35. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
  36. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
  37. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
  38. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,27 @@
1
+ import re
2
+
3
+ import wordninja
1
4
  from loguru import logger
2
5
 
3
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4
6
  from magic_pdf.libs.commons import join_path
5
7
  from magic_pdf.libs.language import detect_lang
8
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
6
9
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
7
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
8
- import wordninja
9
- import re
10
+ from magic_pdf.libs.ocr_content_type import BlockType, ContentType
11
+
12
+
13
def __is_hyphen_at_line_end(line):
    """Tell whether ``line`` finishes with letters directly followed by a hyphen.

    Whitespace after the hyphen is tolerated.

    Args:
        line (str): The text to inspect.

    Returns:
        bool: True when the tail of ``line`` is one or more ASCII letters,
            then a hyphen, then optional whitespace; False otherwise.
    """
    # Anchored at the end of the string: letters, '-', optional whitespace.
    trailing_hyphen = re.search(r'[A-Za-z]+-\s*$', line)
    return trailing_hyphen is not None
10
25
 
11
26
 
12
27
  def split_long_words(text):
@@ -14,7 +29,7 @@ def split_long_words(text):
14
29
  for i in range(len(segments)):
15
30
  words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
16
31
  for j in range(len(words)):
17
- if len(words[j]) > 15:
32
+ if len(words[j]) > 10:
18
33
  words[j] = ' '.join(wordninja.split(words[j]))
19
34
  segments[i] = ''.join(words)
20
35
  return ' '.join(segments)
@@ -23,8 +38,9 @@ def split_long_words(text):
23
38
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Build one markdown string from every page, using 'mm' mode.

    Each page's ``para_blocks`` entry is handed to
    ``ocr_mk_markdown_with_para_core_v2`` together with ``img_buket_path``;
    the returned fragments of all pages are joined with blank lines.

    Args:
        pdf_info_list (list): per-page dicts; ``para_blocks`` is read via
            ``.get`` and passed on unchanged (including ``None``).
        img_buket_path: forwarded to the core renderer.

    Returns:
        str: all fragments joined by ``'\\n\\n'``.
    """
    fragments = []
    for page in pdf_info_list:
        fragments += ocr_mk_markdown_with_para_core_v2(
            page.get('para_blocks'), 'mm', img_buket_path)
    return '\n\n'.join(fragments)
30
46
 
@@ -32,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
32
48
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Build one markdown string from every page, using 'nlp' mode.

    Each page's ``para_blocks`` entry is handed to
    ``ocr_mk_markdown_with_para_core_v2`` with mode ``'nlp'`` (no image
    path is supplied, so the renderer's default is used).

    Args:
        pdf_info_dict (list): per-page dicts; ``para_blocks`` is read via
            ``.get`` and passed on unchanged (including ``None``).

    Returns:
        str: all fragments joined by ``'\\n\\n'``.
    """
    fragments = []
    for page in pdf_info_dict:
        fragments += ocr_mk_markdown_with_para_core_v2(
            page.get('para_blocks'), 'nlp')
    return '\n\n'.join(fragments)
39
56
 
40
57
 
41
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """Render 'mm' markdown page by page, keeping pages separate.

    Pages whose ``para_blocks`` entry is missing or empty are skipped.

    Args:
        pdf_info_dict (list): per-page dicts carrying ``para_blocks``.
        img_buket_path: forwarded to ``ocr_mk_markdown_with_para_core_v2``.

    Returns:
        list[dict]: one ``{'page_no', 'md_content'}`` dict per rendered page.
    """
    paginated = []
    for page in pdf_info_dict:
        blocks = page.get('para_blocks')
        if blocks:
            md_parts = ocr_mk_markdown_with_para_core_v2(
                blocks, 'mm', img_buket_path)
            paginated.append({
                # page_no advances only for pages that were rendered, so it
                # always equals the number of entries emitted so far.
                'page_no': len(paginated),
                'md_content': '\n\n'.join(md_parts),
            })
    return paginated
55
76
 
56
77
 
57
- def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
78
+ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
58
79
  page_markdown = []
59
80
  for paras in paras_of_layout:
60
81
  for para in paras:
@@ -67,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
67
88
  if span_type == ContentType.Text:
68
89
  content = span['content']
69
90
  language = detect_lang(content)
70
- if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
71
- content = ocr_escape_special_markdown_char(split_long_words(content))
91
+ if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
92
+ content = ocr_escape_special_markdown_char(
93
+ split_long_words(content))
72
94
  else:
73
95
  content = ocr_escape_special_markdown_char(content)
74
96
  elif span_type == ContentType.InlineEquation:
@@ -92,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
92
114
  return page_markdown
93
115
 
94
116
 
95
- def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
117
+ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
118
+ mode,
119
+ img_buket_path=''):
96
120
  page_markdown = []
97
121
  for para_block in paras_of_layout:
98
122
  para_text = ''
@@ -100,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
100
124
  if para_type == BlockType.Text:
101
125
  para_text = merge_para_with_text(para_block)
102
126
  elif para_type == BlockType.Title:
103
- para_text = f"# {merge_para_with_text(para_block)}"
127
+ para_text = f'# {merge_para_with_text(para_block)}'
104
128
  elif para_type == BlockType.InterlineEquation:
105
129
  para_text = merge_para_with_text(para_block)
106
130
  elif para_type == BlockType.Image:
@@ -116,14 +140,16 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
116
140
  for block in para_block['blocks']: # 2nd.拼image_caption
117
141
  if block['type'] == BlockType.ImageCaption:
118
142
  para_text += merge_para_with_text(block)
143
+ for block in para_block['blocks']: # 3rd. append image_footnote
144
+ if block['type'] == BlockType.ImageFootnote:
145
+ para_text += merge_para_with_text(block)
119
146
  elif para_type == BlockType.Table:
120
147
  if mode == 'nlp':
121
148
  continue
122
149
  elif mode == 'mm':
123
- table_caption = ''
124
150
  for block in para_block['blocks']: # 1st.拼table_caption
125
151
  if block['type'] == BlockType.TableCaption:
126
- table_caption = merge_para_with_text(block)
152
+ para_text += merge_para_with_text(block)
127
153
  for block in para_block['blocks']: # 2nd.拼table_body
128
154
  if block['type'] == BlockType.TableBody:
129
155
  for line in block['lines']:
@@ -132,8 +158,10 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
132
158
  # if processed by table model
133
159
  if span.get('latex', ''):
134
160
  para_text += f"\n\n$\n {span['latex']}\n$\n\n"
161
+ elif span.get('html', ''):
162
+ para_text += f"\n\n{span['html']}\n\n"
135
163
  else:
136
- para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])}) \n"
164
+ para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
137
165
  for block in para_block['blocks']: # 3rd.拼table_footnote
138
166
  if block['type'] == BlockType.TableFootnote:
139
167
  para_text += merge_para_with_text(block)
@@ -147,24 +175,39 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
147
175
 
148
176
 
149
177
  def merge_para_with_text(para_block):
178
+
179
+ def detect_language(text):
180
+ en_pattern = r'[a-zA-Z]+'
181
+ en_matches = re.findall(en_pattern, text)
182
+ en_length = sum(len(match) for match in en_matches)
183
+ if len(text) > 0:
184
+ if en_length / len(text) >= 0.5:
185
+ return 'en'
186
+ else:
187
+ return 'unknown'
188
+ else:
189
+ return 'empty'
190
+
150
191
  para_text = ''
151
192
  for line in para_block['lines']:
152
- line_text = ""
153
- line_lang = ""
193
+ line_text = ''
194
+ line_lang = ''
154
195
  for span in line['spans']:
155
196
  span_type = span['type']
156
197
  if span_type == ContentType.Text:
157
198
  line_text += span['content'].strip()
158
- if line_text != "":
199
+ if line_text != '':
159
200
  line_lang = detect_lang(line_text)
160
201
  for span in line['spans']:
161
202
  span_type = span['type']
162
203
  content = ''
163
204
  if span_type == ContentType.Text:
164
205
  content = span['content']
165
- language = detect_lang(content)
206
+ # language = detect_lang(content)
207
+ language = detect_language(content)
166
208
  if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
167
- content = ocr_escape_special_markdown_char(split_long_words(content))
209
+ content = ocr_escape_special_markdown_char(
210
+ split_long_words(content))
168
211
  else:
169
212
  content = ocr_escape_special_markdown_char(content)
170
213
  elif span_type == ContentType.InlineEquation:
@@ -173,10 +216,17 @@ def merge_para_with_text(para_block):
173
216
  content = f"\n$$\n{span['content']}\n$$\n"
174
217
 
175
218
  if content != '':
176
- if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
177
- para_text += content # 中文语境下,content间不需要空格分隔
219
+ langs = ['zh', 'ja', 'ko']
220
+ if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
221
+ para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
222
+ elif line_lang == 'en':
223
+ # 如果是前一行带有-连字符,那么末尾不应该加空格
224
+ if __is_hyphen_at_line_end(content):
225
+ para_text += content[:-1]
226
+ else:
227
+ para_text += content + ' '
178
228
  else:
179
- para_text += content + ' ' # 英文语境下 content间需要空格分隔
229
+ para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
180
230
  return para_text
181
231
 
182
232
 
@@ -191,18 +241,18 @@ def para_to_standard_format(para, img_buket_path):
191
241
  for span in line['spans']:
192
242
  language = ''
193
243
  span_type = span.get('type')
194
- content = ""
244
+ content = ''
195
245
  if span_type == ContentType.Text:
196
246
  content = span['content']
197
247
  language = detect_lang(content)
198
248
  if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
199
- content = ocr_escape_special_markdown_char(split_long_words(content))
249
+ content = ocr_escape_special_markdown_char(
250
+ split_long_words(content))
200
251
  else:
201
252
  content = ocr_escape_special_markdown_char(content)
202
253
  elif span_type == ContentType.InlineEquation:
203
254
  content = f"${span['content']}$"
204
255
  inline_equation_num += 1
205
-
206
256
  if language == 'en': # 英文语境下 content间需要空格分隔
207
257
  para_text += content + ' '
208
258
  else: # 中文语境下,content间不需要空格分隔
@@ -210,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
210
260
  para_content = {
211
261
  'type': 'text',
212
262
  'text': para_text,
213
- 'inline_equation_num': inline_equation_num
263
+ 'inline_equation_num': inline_equation_num,
214
264
  }
215
265
  return para_content
216
266
 
@@ -221,41 +271,41 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
221
271
  para_content = {
222
272
  'type': 'text',
223
273
  'text': merge_para_with_text(para_block),
224
- 'page_idx': page_idx
274
+ 'page_idx': page_idx,
225
275
  }
226
276
  elif para_type == BlockType.Title:
227
277
  para_content = {
228
278
  'type': 'text',
229
279
  'text': merge_para_with_text(para_block),
230
280
  'text_level': 1,
231
- 'page_idx': page_idx
281
+ 'page_idx': page_idx,
232
282
  }
233
283
  elif para_type == BlockType.InterlineEquation:
234
284
  para_content = {
235
285
  'type': 'equation',
236
286
  'text': merge_para_with_text(para_block),
237
- 'text_format': "latex",
238
- 'page_idx': page_idx
287
+ 'text_format': 'latex',
288
+ 'page_idx': page_idx,
239
289
  }
240
290
  elif para_type == BlockType.Image:
241
- para_content = {
242
- 'type': 'image',
243
- 'page_idx': page_idx
244
- }
291
+ para_content = {'type': 'image', 'page_idx': page_idx}
245
292
  for block in para_block['blocks']:
246
293
  if block['type'] == BlockType.ImageBody:
247
- para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
294
+ para_content['img_path'] = join_path(
295
+ img_buket_path,
296
+ block['lines'][0]['spans'][0]['image_path'])
248
297
  if block['type'] == BlockType.ImageCaption:
249
298
  para_content['img_caption'] = merge_para_with_text(block)
299
+ if block['type'] == BlockType.ImageFootnote:
300
+ para_content['img_footnote'] = merge_para_with_text(block)
250
301
  elif para_type == BlockType.Table:
251
- para_content = {
252
- 'type': 'table',
253
- 'page_idx': page_idx
254
- }
302
+ para_content = {'type': 'table', 'page_idx': page_idx}
255
303
  for block in para_block['blocks']:
256
304
  if block['type'] == BlockType.TableBody:
257
305
  if block["lines"][0]["spans"][0].get('latex', ''):
258
306
  para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
307
+ elif block["lines"][0]["spans"][0].get('html', ''):
308
+ para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
259
309
  para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
260
310
  if block['type'] == BlockType.TableCaption:
261
311
  para_content['table_caption'] = merge_para_with_text(block)
@@ -268,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
268
318
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Convert every paragraph block of every page to the standard format.

    Pages whose ``para_blocks`` entry is missing or empty are skipped.

    Args:
        pdf_info_dict (list): per-page dicts; ``para_blocks`` and
            ``page_idx`` are read with ``.get``.
        img_buket_path (str): forwarded to ``para_to_standard_format_v2``.

    Returns:
        list: the values returned by ``para_to_standard_format_v2``, in
            page order and then block order.
    """
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
        # para_to_standard_format_v2 takes page_idx as a required third
        # positional argument; omitting it raised TypeError on the first
        # non-empty page. Read it the same way union_make does.
        page_idx = page_info.get('page_idx')
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path, page_idx)
            for para_block in paras_of_layout)
    return content_list
278
329
 
279
330
 
280
331
  def line_to_standard_format(line, img_buket_path):
281
- line_text = ""
332
+ line_text = ''
282
333
  inline_equation_num = 0
283
334
  for span in line['spans']:
284
335
  if not span.get('content'):
@@ -288,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
288
339
  if span['type'] == ContentType.Image:
289
340
  content = {
290
341
  'type': 'image',
291
- 'img_path': join_path(img_buket_path, span['image_path'])
342
+ 'img_path': join_path(img_buket_path,
343
+ span['image_path']),
292
344
  }
293
345
  return content
294
346
  elif span['type'] == ContentType.Table:
295
347
  content = {
296
348
  'type': 'table',
297
- 'img_path': join_path(img_buket_path, span['image_path'])
349
+ 'img_path': join_path(img_buket_path,
350
+ span['image_path']),
298
351
  }
299
352
  return content
300
353
  else:
@@ -302,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
302
355
  interline_equation = span['content']
303
356
  content = {
304
357
  'type': 'equation',
305
- 'latex': f"$$\n{interline_equation}\n$$"
358
+ 'latex': f'$$\n{interline_equation}\n$$'
306
359
  }
307
360
  return content
308
361
  elif span['type'] == ContentType.InlineEquation:
309
362
  inline_equation = span['content']
310
- line_text += f"${inline_equation}$"
363
+ line_text += f'${inline_equation}$'
311
364
  inline_equation_num += 1
312
365
  elif span['type'] == ContentType.Text:
313
- text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
366
+ text_content = ocr_escape_special_markdown_char(
367
+ span['content']) # 转义特殊符号
314
368
  line_text += text_content
315
369
  content = {
316
370
  'type': 'text',
317
371
  'text': line_text,
318
- 'inline_equation_num': inline_equation_num
372
+ 'inline_equation_num': inline_equation_num,
319
373
  }
320
374
  return content
321
375
 
322
376
 
323
377
  def ocr_mk_mm_standard_format(pdf_info_dict: list):
324
- """
325
- content_list
326
- type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
327
- latex string latex文本字段。
328
- text string 纯文本格式的文本数据。
329
- md string markdown格式的文本数据。
330
- img_path string s3://full/path/to/img.jpg
331
- """
378
+ """content_list type string
379
+ image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
380
+ latex文本字段。 text string 纯文本格式的文本数据。 md string
381
+ markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
332
382
  content_list = []
333
383
  for page_info in pdf_info_dict:
334
- blocks = page_info.get("preproc_blocks")
384
+ blocks = page_info.get('preproc_blocks')
335
385
  if not blocks:
336
386
  continue
337
387
  for block in blocks:
@@ -341,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
341
391
  return content_list
342
392
 
343
393
 
344
- def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
394
+ def union_make(pdf_info_dict: list,
395
+ make_mode: str,
396
+ drop_mode: str,
397
+ img_buket_path: str = ''):
345
398
  output_content = []
346
399
  for page_info in pdf_info_dict:
347
- if page_info.get("need_drop", False):
348
- drop_reason = page_info.get("drop_reason")
400
+ if page_info.get('need_drop', False):
401
+ drop_reason = page_info.get('drop_reason')
349
402
  if drop_mode == DropMode.NONE:
350
403
  pass
351
404
  elif drop_mode == DropMode.WHOLE_PDF:
352
- raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
405
+ raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
406
+ f'drop_reason is {drop_reason}'))
353
407
  elif drop_mode == DropMode.SINGLE_PAGE:
354
- logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
408
+ logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
409
+ f'drop_reason is {drop_reason}'))
355
410
  continue
356
411
  else:
357
- raise Exception(f"drop_mode can not be null")
412
+ raise Exception('drop_mode can not be null')
358
413
 
359
- paras_of_layout = page_info.get("para_blocks")
360
- page_idx = page_info.get("page_idx")
414
+ paras_of_layout = page_info.get('para_blocks')
415
+ page_idx = page_info.get('page_idx')
361
416
  if not paras_of_layout:
362
417
  continue
363
418
  if make_mode == MakeMode.MM_MD:
364
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
419
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
420
+ paras_of_layout, 'mm', img_buket_path)
365
421
  output_content.extend(page_markdown)
366
422
  elif make_mode == MakeMode.NLP_MD:
367
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
423
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
424
+ paras_of_layout, 'nlp')
368
425
  output_content.extend(page_markdown)
369
426
  elif make_mode == MakeMode.STANDARD_FORMAT:
370
427
  for para_block in paras_of_layout:
371
- para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
428
+ para_content = para_to_standard_format_v2(
429
+ para_block, img_buket_path, page_idx)
372
430
  output_content.append(para_content)
373
431
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
374
432
  return '\n\n'.join(output_content)
File without changes
File without changes
@@ -0,0 +1,82 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from loguru import logger
5
+
6
+ from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
7
+ Node)
8
+ from magic_pdf.integrations.rag.utils import inference
9
+
10
+
11
class RagPageReader:
    """Iterable over the ``Node`` objects built from one page's layout.

    Every entry of ``pagedata.layout_dets`` is copied into a ``Node``
    (category_type, text, image_path, anno_id, latex, html).
    """

    def __init__(self, pagedata: LayoutElements):
        self.pagedata = pagedata

        nodes = []
        for det in pagedata.layout_dets:
            nodes.append(
                Node(
                    category_type=det.category_type,
                    text=det.text,
                    image_path=det.image_path,
                    anno_id=det.anno_id,
                    latex=det.latex,
                    html=det.html,
                ))
        self.o = nodes

    def __iter__(self):
        """Iterate over the page's nodes in ``layout_dets`` order."""
        return iter(self.o)

    def get_rel_map(self) -> list[ElementRelation]:
        """Return ``pagedata.extra.element_relation`` as stored."""
        extra = self.pagedata.extra
        return extra.element_relation
32
+
33
+
34
class RagDocumentReader:
    """Iterable over the pages of one document, as ``RagPageReader`` objects."""

    def __init__(self, ragdata: list[LayoutElements]):
        # One page reader per LayoutElements entry, in input order.
        self.o = list(map(RagPageReader, ragdata))

    def __iter__(self):
        """Iterate over the page readers in input order."""
        return iter(self.o)
41
+
42
+
43
class DataReader:
    """Collects PDF paths and runs inference on them on request.

    ``path_or_directory`` is either a directory, whose direct ``*.pdf``
    children are collected, or the path of a single ``.pdf`` file.
    """

    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        """
        Args:
            path_or_directory (str): a directory containing pdf files, or
                the path of one pdf file.
            method (str): forwarded to ``inference``.
            output_dir (str): forwarded to ``inference``.

        Raises:
            ValueError: if ``path_or_directory`` is not a directory and
                does not end with ``.pdf``.
        """
        self.path_or_directory = path_or_directory
        self.method = method
        self.output_dir = output_dir
        if os.path.isdir(path_or_directory):
            # glob yields entries in arbitrary order; sort so that a given
            # idx refers to the same document on every run.
            self.pdfs = sorted(Path(path_or_directory).glob('*.pdf'))
        else:
            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would let any path through.
            if not str(path_or_directory).endswith('.pdf'):
                raise ValueError(
                    f'expected a directory or a .pdf file, got: '
                    f'{path_or_directory}')
            self.pdfs = [Path(path_or_directory)]

    def get_documents_count(self) -> int:
        """Returns the number of documents in the directory."""
        return len(self.pdfs)

    def get_document_result(self, idx: int) -> 'RagDocumentReader | None':
        """
        Args:
            idx (int): the index of documents under the
                directory path_or_directory

        Returns:
            RagDocumentReader | None: RagDocumentReader is an iterable object,
            more details @RagDocumentReader. None is returned when idx is
            out of range or when inference returns None.
        """
        if idx >= self.get_documents_count() or idx < 0:
            logger.error(f'invalid idx: {idx}')
            return None
        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
        if res is None:
            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
            return None
        return RagDocumentReader(res)

    def get_document_filename(self, idx: int) -> Path:
        """get the filename of the document."""
        return self.pdfs[idx]
@@ -0,0 +1,82 @@
1
+ from enum import Enum
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
+ # rag
7
class CategoryType(Enum):  # py310 not support StrEnum
    """Category label of a layout element; each value equals its name."""

    # text-like elements
    text = 'text'
    title = 'title'
    interline_equation = 'interline_equation'

    # image and its parts
    image = 'image'
    image_body = 'image_body'
    image_caption = 'image_caption'

    # table and its parts
    table = 'table'
    table_body = 'table_body'
    table_caption = 'table_caption'
    table_footnote = 'table_footnote'
18
+
19
+
20
class ElementRelType(Enum):
    """Kind of relation between two layout elements."""

    # the only relation kind declared here
    sibling = 'sibling'
22
+
23
+
24
class PageInfo(BaseModel):
    # Page-level metadata: zero-based page index, page size, and an
    # optional image of the page.
    page_no: int = Field(
        ge=0,
        description='the index of page, start from zero',
    )
    height: int = Field(gt=0, description='the height of page')
    # NOTE(review): width accepts 0 (ge=0) while height must be > 0 (gt=0);
    # looks unintentional - confirm before tightening.
    width: int = Field(ge=0, description='the width of page')
    image_path: str | None = Field(
        default=None,
        description='the image of this page',
    )
31
+
32
+
33
class ContentObject(BaseModel):
    # One detected layout object on a page. Only category_type and poly
    # are required; every other field has a default.
    category_type: CategoryType = Field(description='类别')
    poly: list[float] = Field(
        description=('Coordinates, need to convert back to PDF coordinates,'
                     ' order is top-left, top-right, bottom-right, bottom-left'
                     ' x,y coordinates'))
    ignore: bool = Field(
        default=False,
        description='whether ignore this object',
    )
    text: str | None = Field(
        default=None,
        description='text content of the object',
    )
    image_path: str | None = Field(
        default=None,
        description='path of embedded image',
    )
    # -1 is the default for order and anno_id.
    order: int = Field(
        default=-1,
        description='the order of this object within a page',
    )
    anno_id: int = Field(default=-1, description='unique id')
    latex: str | None = Field(default=None, description='latex result')
    html: str | None = Field(default=None, description='html result')
50
+
51
+
52
class ElementRelation(BaseModel):
    # A typed relation from one object (source_anno_id) to another
    # (target_anno_id); both ids default to -1, relation is required.
    source_anno_id: int = Field(
        default=-1,
        description='unique id of the source object',
    )
    target_anno_id: int = Field(
        default=-1,
        description='unique id of the target object',
    )
    relation: ElementRelType = Field(
        description='the relation between source and target element')
59
+
60
+
61
class LayoutElementsExtra(BaseModel):
    # Extra page-level data: the list of relations between elements.
    element_relation: list[ElementRelation] = Field(
        description='the relation between source and target element',
    )
64
+
65
+
66
class LayoutElements(BaseModel):
    # Everything recorded for one page: detected objects, page metadata,
    # and extra information (element relations). All fields are required.
    layout_dets: list[ContentObject] = Field(
        description='layout element details',
    )
    page_info: PageInfo = Field(description='page info')
    extra: LayoutElementsExtra = Field(description='extra information')
71
+
72
+
73
+ # iter data format
74
class Node(BaseModel):
    # Item yielded when iterating a page: a subset of ContentObject's
    # fields (category, text, image path, id, latex, html).
    category_type: CategoryType = Field(description='类别')
    text: str | None = Field(
        default=None,
        description='text content of the object',
    )
    image_path: str | None = Field(
        default=None,
        description='path of embedded image',
    )
    anno_id: int = Field(default=-1, description='unique id')
    latex: str | None = Field(default=None, description='latex result')
    html: str | None = Field(default=None, description='html result')