magic-pdf 0.7.1 (py3-none-any wheel) → 0.8.1 (py3-none-any wheel)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,27 @@
1
+ import re
2
+
3
+ import wordninja
1
4
  from loguru import logger
2
5
 
3
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4
6
  from magic_pdf.libs.commons import join_path
5
7
  from magic_pdf.libs.language import detect_lang
8
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
6
9
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
7
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
8
- import wordninja
9
- import re
10
+ from magic_pdf.libs.ocr_content_type import BlockType, ContentType
11
+
12
+
13
+ def __is_hyphen_at_line_end(line):
14
+ """
15
+ Check if a line ends with one or more letters followed by a hyphen.
16
+
17
+ Args:
18
+ line (str): The line of text to check.
19
+
20
+ Returns:
21
+ bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
22
+ """
23
+ # Use regex to check if the line ends with one or more letters followed by a hyphen
24
+ return bool(re.search(r'[A-Za-z]+-\s*$', line))
10
25
 
11
26
 
12
27
  def split_long_words(text):
@@ -14,7 +29,7 @@ def split_long_words(text):
14
29
  for i in range(len(segments)):
15
30
  words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
16
31
  for j in range(len(words)):
17
- if len(words[j]) > 15:
32
+ if len(words[j]) > 10:
18
33
  words[j] = ' '.join(wordninja.split(words[j]))
19
34
  segments[i] = ''.join(words)
20
35
  return ' '.join(segments)
@@ -23,8 +38,9 @@ def split_long_words(text):
23
38
  def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
24
39
  markdown = []
25
40
  for page_info in pdf_info_list:
26
- paras_of_layout = page_info.get("para_blocks")
27
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
41
+ paras_of_layout = page_info.get('para_blocks')
42
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
43
+ paras_of_layout, 'mm', img_buket_path)
28
44
  markdown.extend(page_markdown)
29
45
  return '\n\n'.join(markdown)
30
46
 
@@ -32,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
32
48
  def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
33
49
  markdown = []
34
50
  for page_info in pdf_info_dict:
35
- paras_of_layout = page_info.get("para_blocks")
36
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
51
+ paras_of_layout = page_info.get('para_blocks')
52
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
53
+ paras_of_layout, 'nlp')
37
54
  markdown.extend(page_markdown)
38
55
  return '\n\n'.join(markdown)
39
56
 
40
57
 
41
- def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
58
+ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
59
+ img_buket_path):
42
60
  markdown_with_para_and_pagination = []
43
61
  page_no = 0
44
62
  for page_info in pdf_info_dict:
45
- paras_of_layout = page_info.get("para_blocks")
63
+ paras_of_layout = page_info.get('para_blocks')
46
64
  if not paras_of_layout:
47
65
  continue
48
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
66
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
67
+ paras_of_layout, 'mm', img_buket_path)
49
68
  markdown_with_para_and_pagination.append({
50
- 'page_no': page_no,
51
- 'md_content': '\n\n'.join(page_markdown)
69
+ 'page_no':
70
+ page_no,
71
+ 'md_content':
72
+ '\n\n'.join(page_markdown)
52
73
  })
53
74
  page_no += 1
54
75
  return markdown_with_para_and_pagination
55
76
 
56
77
 
57
- def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
78
+ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
58
79
  page_markdown = []
59
80
  for paras in paras_of_layout:
60
81
  for para in paras:
@@ -67,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
67
88
  if span_type == ContentType.Text:
68
89
  content = span['content']
69
90
  language = detect_lang(content)
70
- if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
71
- content = ocr_escape_special_markdown_char(split_long_words(content))
91
+ if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
92
+ content = ocr_escape_special_markdown_char(
93
+ split_long_words(content))
72
94
  else:
73
95
  content = ocr_escape_special_markdown_char(content)
74
96
  elif span_type == ContentType.InlineEquation:
@@ -92,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
92
114
  return page_markdown
93
115
 
94
116
 
95
- def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
117
+ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
118
+ mode,
119
+ img_buket_path=''):
96
120
  page_markdown = []
97
121
  for para_block in paras_of_layout:
98
122
  para_text = ''
@@ -100,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
100
124
  if para_type == BlockType.Text:
101
125
  para_text = merge_para_with_text(para_block)
102
126
  elif para_type == BlockType.Title:
103
- para_text = f"# {merge_para_with_text(para_block)}"
127
+ para_text = f'# {merge_para_with_text(para_block)}'
104
128
  elif para_type == BlockType.InterlineEquation:
105
129
  para_text = merge_para_with_text(para_block)
106
130
  elif para_type == BlockType.Image:
@@ -116,14 +140,16 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
116
140
  for block in para_block['blocks']: # 2nd.拼image_caption
117
141
  if block['type'] == BlockType.ImageCaption:
118
142
  para_text += merge_para_with_text(block)
143
+ for block in para_block['blocks']: # 2nd.拼image_caption
144
+ if block['type'] == BlockType.ImageFootnote:
145
+ para_text += merge_para_with_text(block)
119
146
  elif para_type == BlockType.Table:
120
147
  if mode == 'nlp':
121
148
  continue
122
149
  elif mode == 'mm':
123
- table_caption = ''
124
150
  for block in para_block['blocks']: # 1st.拼table_caption
125
151
  if block['type'] == BlockType.TableCaption:
126
- table_caption = merge_para_with_text(block)
152
+ para_text += merge_para_with_text(block)
127
153
  for block in para_block['blocks']: # 2nd.拼table_body
128
154
  if block['type'] == BlockType.TableBody:
129
155
  for line in block['lines']:
@@ -135,7 +161,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
135
161
  elif span.get('html', ''):
136
162
  para_text += f"\n\n{span['html']}\n\n"
137
163
  else:
138
- para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])}) \n"
164
+ para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
139
165
  for block in para_block['blocks']: # 3rd.拼table_footnote
140
166
  if block['type'] == BlockType.TableFootnote:
141
167
  para_text += merge_para_with_text(block)
@@ -149,24 +175,39 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
149
175
 
150
176
 
151
177
  def merge_para_with_text(para_block):
178
+
179
+ def detect_language(text):
180
+ en_pattern = r'[a-zA-Z]+'
181
+ en_matches = re.findall(en_pattern, text)
182
+ en_length = sum(len(match) for match in en_matches)
183
+ if len(text) > 0:
184
+ if en_length / len(text) >= 0.5:
185
+ return 'en'
186
+ else:
187
+ return 'unknown'
188
+ else:
189
+ return 'empty'
190
+
152
191
  para_text = ''
153
192
  for line in para_block['lines']:
154
- line_text = ""
155
- line_lang = ""
193
+ line_text = ''
194
+ line_lang = ''
156
195
  for span in line['spans']:
157
196
  span_type = span['type']
158
197
  if span_type == ContentType.Text:
159
198
  line_text += span['content'].strip()
160
- if line_text != "":
199
+ if line_text != '':
161
200
  line_lang = detect_lang(line_text)
162
201
  for span in line['spans']:
163
202
  span_type = span['type']
164
203
  content = ''
165
204
  if span_type == ContentType.Text:
166
205
  content = span['content']
167
- language = detect_lang(content)
206
+ # language = detect_lang(content)
207
+ language = detect_language(content)
168
208
  if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
169
- content = ocr_escape_special_markdown_char(split_long_words(content))
209
+ content = ocr_escape_special_markdown_char(
210
+ split_long_words(content))
170
211
  else:
171
212
  content = ocr_escape_special_markdown_char(content)
172
213
  elif span_type == ContentType.InlineEquation:
@@ -175,10 +216,17 @@ def merge_para_with_text(para_block):
175
216
  content = f"\n$$\n{span['content']}\n$$\n"
176
217
 
177
218
  if content != '':
178
- if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
179
- para_text += content # 中文语境下,content间不需要空格分隔
219
+ langs = ['zh', 'ja', 'ko']
220
+ if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
221
+ para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
222
+ elif line_lang == 'en':
223
+ # 如果是前一行带有-连字符,那么末尾不应该加空格
224
+ if __is_hyphen_at_line_end(content):
225
+ para_text += content[:-1]
226
+ else:
227
+ para_text += content + ' '
180
228
  else:
181
- para_text += content + ' ' # 英文语境下 content间需要空格分隔
229
+ para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
182
230
  return para_text
183
231
 
184
232
 
@@ -193,18 +241,18 @@ def para_to_standard_format(para, img_buket_path):
193
241
  for span in line['spans']:
194
242
  language = ''
195
243
  span_type = span.get('type')
196
- content = ""
244
+ content = ''
197
245
  if span_type == ContentType.Text:
198
246
  content = span['content']
199
247
  language = detect_lang(content)
200
248
  if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
201
- content = ocr_escape_special_markdown_char(split_long_words(content))
249
+ content = ocr_escape_special_markdown_char(
250
+ split_long_words(content))
202
251
  else:
203
252
  content = ocr_escape_special_markdown_char(content)
204
253
  elif span_type == ContentType.InlineEquation:
205
254
  content = f"${span['content']}$"
206
255
  inline_equation_num += 1
207
-
208
256
  if language == 'en': # 英文语境下 content间需要空格分隔
209
257
  para_text += content + ' '
210
258
  else: # 中文语境下,content间不需要空格分隔
@@ -212,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
212
260
  para_content = {
213
261
  'type': 'text',
214
262
  'text': para_text,
215
- 'inline_equation_num': inline_equation_num
263
+ 'inline_equation_num': inline_equation_num,
216
264
  }
217
265
  return para_content
218
266
 
@@ -223,37 +271,35 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
223
271
  para_content = {
224
272
  'type': 'text',
225
273
  'text': merge_para_with_text(para_block),
226
- 'page_idx': page_idx
274
+ 'page_idx': page_idx,
227
275
  }
228
276
  elif para_type == BlockType.Title:
229
277
  para_content = {
230
278
  'type': 'text',
231
279
  'text': merge_para_with_text(para_block),
232
280
  'text_level': 1,
233
- 'page_idx': page_idx
281
+ 'page_idx': page_idx,
234
282
  }
235
283
  elif para_type == BlockType.InterlineEquation:
236
284
  para_content = {
237
285
  'type': 'equation',
238
286
  'text': merge_para_with_text(para_block),
239
- 'text_format': "latex",
240
- 'page_idx': page_idx
287
+ 'text_format': 'latex',
288
+ 'page_idx': page_idx,
241
289
  }
242
290
  elif para_type == BlockType.Image:
243
- para_content = {
244
- 'type': 'image',
245
- 'page_idx': page_idx
246
- }
291
+ para_content = {'type': 'image', 'page_idx': page_idx}
247
292
  for block in para_block['blocks']:
248
293
  if block['type'] == BlockType.ImageBody:
249
- para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
294
+ para_content['img_path'] = join_path(
295
+ img_buket_path,
296
+ block['lines'][0]['spans'][0]['image_path'])
250
297
  if block['type'] == BlockType.ImageCaption:
251
298
  para_content['img_caption'] = merge_para_with_text(block)
299
+ if block['type'] == BlockType.ImageFootnote:
300
+ para_content['img_footnote'] = merge_para_with_text(block)
252
301
  elif para_type == BlockType.Table:
253
- para_content = {
254
- 'type': 'table',
255
- 'page_idx': page_idx
256
- }
302
+ para_content = {'type': 'table', 'page_idx': page_idx}
257
303
  for block in para_block['blocks']:
258
304
  if block['type'] == BlockType.TableBody:
259
305
  if block["lines"][0]["spans"][0].get('latex', ''):
@@ -272,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
272
318
  def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
273
319
  content_list = []
274
320
  for page_info in pdf_info_dict:
275
- paras_of_layout = page_info.get("para_blocks")
321
+ paras_of_layout = page_info.get('para_blocks')
276
322
  if not paras_of_layout:
277
323
  continue
278
324
  for para_block in paras_of_layout:
279
- para_content = para_to_standard_format_v2(para_block, img_buket_path)
325
+ para_content = para_to_standard_format_v2(para_block,
326
+ img_buket_path)
280
327
  content_list.append(para_content)
281
328
  return content_list
282
329
 
283
330
 
284
331
  def line_to_standard_format(line, img_buket_path):
285
- line_text = ""
332
+ line_text = ''
286
333
  inline_equation_num = 0
287
334
  for span in line['spans']:
288
335
  if not span.get('content'):
@@ -292,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
292
339
  if span['type'] == ContentType.Image:
293
340
  content = {
294
341
  'type': 'image',
295
- 'img_path': join_path(img_buket_path, span['image_path'])
342
+ 'img_path': join_path(img_buket_path,
343
+ span['image_path']),
296
344
  }
297
345
  return content
298
346
  elif span['type'] == ContentType.Table:
299
347
  content = {
300
348
  'type': 'table',
301
- 'img_path': join_path(img_buket_path, span['image_path'])
349
+ 'img_path': join_path(img_buket_path,
350
+ span['image_path']),
302
351
  }
303
352
  return content
304
353
  else:
@@ -306,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
306
355
  interline_equation = span['content']
307
356
  content = {
308
357
  'type': 'equation',
309
- 'latex': f"$$\n{interline_equation}\n$$"
358
+ 'latex': f'$$\n{interline_equation}\n$$'
310
359
  }
311
360
  return content
312
361
  elif span['type'] == ContentType.InlineEquation:
313
362
  inline_equation = span['content']
314
- line_text += f"${inline_equation}$"
363
+ line_text += f'${inline_equation}$'
315
364
  inline_equation_num += 1
316
365
  elif span['type'] == ContentType.Text:
317
- text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
366
+ text_content = ocr_escape_special_markdown_char(
367
+ span['content']) # 转义特殊符号
318
368
  line_text += text_content
319
369
  content = {
320
370
  'type': 'text',
321
371
  'text': line_text,
322
- 'inline_equation_num': inline_equation_num
372
+ 'inline_equation_num': inline_equation_num,
323
373
  }
324
374
  return content
325
375
 
326
376
 
327
377
  def ocr_mk_mm_standard_format(pdf_info_dict: list):
328
- """
329
- content_list
330
- type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
331
- latex string latex文本字段。
332
- text string 纯文本格式的文本数据。
333
- md string markdown格式的文本数据。
334
- img_path string s3://full/path/to/img.jpg
335
- """
378
+ """content_list type string
379
+ image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
380
+ latex文本字段。 text string 纯文本格式的文本数据。 md string
381
+ markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
336
382
  content_list = []
337
383
  for page_info in pdf_info_dict:
338
- blocks = page_info.get("preproc_blocks")
384
+ blocks = page_info.get('preproc_blocks')
339
385
  if not blocks:
340
386
  continue
341
387
  for block in blocks:
@@ -345,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
345
391
  return content_list
346
392
 
347
393
 
348
- def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
394
+ def union_make(pdf_info_dict: list,
395
+ make_mode: str,
396
+ drop_mode: str,
397
+ img_buket_path: str = ''):
349
398
  output_content = []
350
399
  for page_info in pdf_info_dict:
351
- if page_info.get("need_drop", False):
352
- drop_reason = page_info.get("drop_reason")
400
+ if page_info.get('need_drop', False):
401
+ drop_reason = page_info.get('drop_reason')
353
402
  if drop_mode == DropMode.NONE:
354
403
  pass
355
404
  elif drop_mode == DropMode.WHOLE_PDF:
356
- raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
405
+ raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
406
+ f'drop_reason is {drop_reason}'))
357
407
  elif drop_mode == DropMode.SINGLE_PAGE:
358
- logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
408
+ logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
409
+ f'drop_reason is {drop_reason}'))
359
410
  continue
360
411
  else:
361
- raise Exception(f"drop_mode can not be null")
412
+ raise Exception('drop_mode can not be null')
362
413
 
363
- paras_of_layout = page_info.get("para_blocks")
364
- page_idx = page_info.get("page_idx")
414
+ paras_of_layout = page_info.get('para_blocks')
415
+ page_idx = page_info.get('page_idx')
365
416
  if not paras_of_layout:
366
417
  continue
367
418
  if make_mode == MakeMode.MM_MD:
368
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
419
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
420
+ paras_of_layout, 'mm', img_buket_path)
369
421
  output_content.extend(page_markdown)
370
422
  elif make_mode == MakeMode.NLP_MD:
371
- page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
423
+ page_markdown = ocr_mk_markdown_with_para_core_v2(
424
+ paras_of_layout, 'nlp')
372
425
  output_content.extend(page_markdown)
373
426
  elif make_mode == MakeMode.STANDARD_FORMAT:
374
427
  for para_block in paras_of_layout:
375
- para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
428
+ para_content = para_to_standard_format_v2(
429
+ para_block, img_buket_path, page_idx)
376
430
  output_content.append(para_content)
377
431
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
378
432
  return '\n\n'.join(output_content)
File without changes
File without changes
@@ -0,0 +1,82 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from loguru import logger
5
+
6
+ from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
7
+ Node)
8
+ from magic_pdf.integrations.rag.utils import inference
9
+
10
+
11
+ class RagPageReader:
12
+
13
+ def __init__(self, pagedata: LayoutElements):
14
+ self.o = [
15
+ Node(
16
+ category_type=v.category_type,
17
+ text=v.text,
18
+ image_path=v.image_path,
19
+ anno_id=v.anno_id,
20
+ latex=v.latex,
21
+ html=v.html,
22
+ ) for v in pagedata.layout_dets
23
+ ]
24
+
25
+ self.pagedata = pagedata
26
+
27
+ def __iter__(self):
28
+ return iter(self.o)
29
+
30
+ def get_rel_map(self) -> list[ElementRelation]:
31
+ return self.pagedata.extra.element_relation
32
+
33
+
34
+ class RagDocumentReader:
35
+
36
+ def __init__(self, ragdata: list[LayoutElements]):
37
+ self.o = [RagPageReader(v) for v in ragdata]
38
+
39
+ def __iter__(self):
40
+ return iter(self.o)
41
+
42
+
43
+ class DataReader:
44
+
45
+ def __init__(self, path_or_directory: str, method: str, output_dir: str):
46
+ self.path_or_directory = path_or_directory
47
+ self.method = method
48
+ self.output_dir = output_dir
49
+ self.pdfs = []
50
+ if os.path.isdir(path_or_directory):
51
+ for doc_path in Path(path_or_directory).glob('*.pdf'):
52
+ self.pdfs.append(doc_path)
53
+ else:
54
+ assert path_or_directory.endswith('.pdf')
55
+ self.pdfs.append(Path(path_or_directory))
56
+
57
+ def get_documents_count(self) -> int:
58
+ """Returns the number of documents in the directory."""
59
+ return len(self.pdfs)
60
+
61
+ def get_document_result(self, idx: int) -> RagDocumentReader | None:
62
+ """
63
+ Args:
64
+ idx (int): the index of documents under the
65
+ directory path_or_directory
66
+
67
+ Returns:
68
+ RagDocumentReader | None: RagDocumentReader is an iterable object,
69
+ more details @RagDocumentReader
70
+ """
71
+ if idx >= self.get_documents_count() or idx < 0:
72
+ logger.error(f'invalid idx: {idx}')
73
+ return None
74
+ res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
75
+ if res is None:
76
+ logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
77
+ return None
78
+ return RagDocumentReader(res)
79
+
80
+ def get_document_filename(self, idx: int) -> Path:
81
+ """get the filename of the document."""
82
+ return self.pdfs[idx]
@@ -0,0 +1,82 @@
1
+ from enum import Enum
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
+ # rag
7
+ class CategoryType(Enum): # py310 not support StrEnum
8
+ text = 'text'
9
+ title = 'title'
10
+ interline_equation = 'interline_equation'
11
+ image = 'image'
12
+ image_body = 'image_body'
13
+ image_caption = 'image_caption'
14
+ table = 'table'
15
+ table_body = 'table_body'
16
+ table_caption = 'table_caption'
17
+ table_footnote = 'table_footnote'
18
+
19
+
20
+ class ElementRelType(Enum):
21
+ sibling = 'sibling'
22
+
23
+
24
+ class PageInfo(BaseModel):
25
+ page_no: int = Field(description='the index of page, start from zero',
26
+ ge=0)
27
+ height: int = Field(description='the height of page', gt=0)
28
+ width: int = Field(description='the width of page', ge=0)
29
+ image_path: str | None = Field(description='the image of this page',
30
+ default=None)
31
+
32
+
33
+ class ContentObject(BaseModel):
34
+ category_type: CategoryType = Field(description='类别')
35
+ poly: list[float] = Field(
36
+ description=('Coordinates, need to convert back to PDF coordinates,'
37
+ ' order is top-left, top-right, bottom-right, bottom-left'
38
+ ' x,y coordinates'))
39
+ ignore: bool = Field(description='whether ignore this object',
40
+ default=False)
41
+ text: str | None = Field(description='text content of the object',
42
+ default=None)
43
+ image_path: str | None = Field(description='path of embedded image',
44
+ default=None)
45
+ order: int = Field(description='the order of this object within a page',
46
+ default=-1)
47
+ anno_id: int = Field(description='unique id', default=-1)
48
+ latex: str | None = Field(description='latex result', default=None)
49
+ html: str | None = Field(description='html result', default=None)
50
+
51
+
52
+ class ElementRelation(BaseModel):
53
+ source_anno_id: int = Field(description='unique id of the source object',
54
+ default=-1)
55
+ target_anno_id: int = Field(description='unique id of the target object',
56
+ default=-1)
57
+ relation: ElementRelType = Field(
58
+ description='the relation between source and target element')
59
+
60
+
61
+ class LayoutElementsExtra(BaseModel):
62
+ element_relation: list[ElementRelation] = Field(
63
+ description='the relation between source and target element')
64
+
65
+
66
+ class LayoutElements(BaseModel):
67
+ layout_dets: list[ContentObject] = Field(
68
+ description='layout element details')
69
+ page_info: PageInfo = Field(description='page info')
70
+ extra: LayoutElementsExtra = Field(description='extra information')
71
+
72
+
73
+ # iter data format
74
+ class Node(BaseModel):
75
+ category_type: CategoryType = Field(description='类别')
76
+ text: str | None = Field(description='text content of the object',
77
+ default=None)
78
+ image_path: str | None = Field(description='path of embedded image',
79
+ default=None)
80
+ anno_id: int = Field(description='unique id', default=-1)
81
+ latex: str | None = Field(description='latex result', default=None)
82
+ html: str | None = Field(description='html result', default=None)