magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +143 -0
  9. magic_pdf/data/data_reader_writer/s3.py +73 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +6 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +19 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +106 -244
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +35 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +331 -15
  29. magic_pdf/model/pdf_extract_kit.py +170 -83
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +40 -16
  31. magic_pdf/model/ppTableModel.py +8 -6
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +322 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +18 -8
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/LICENSE.md +1 -0
  53. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/METADATA +124 -78
  54. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/RECORD +57 -33
  55. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/WHEEL +0 -0
  56. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/entry_points.txt +0 -0
  57. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/top_level.txt +0 -0
magic_pdf/dict2md/ocr_mkcontent.py CHANGED
@@ -1,6 +1,5 @@
  import re

- import wordninja
  from loguru import logger

  from magic_pdf.libs.commons import join_path
@@ -8,6 +7,7 @@ from magic_pdf.libs.language import detect_lang
  from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
  from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+ from magic_pdf.para.para_split_v3 import ListLineTag


  def __is_hyphen_at_line_end(line):
@@ -24,37 +24,6 @@ def __is_hyphen_at_line_end(line):
      return bool(re.search(r'[A-Za-z]+-\s*$', line))


- def split_long_words(text):
-     segments = text.split(' ')
-     for i in range(len(segments)):
-         words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
-         for j in range(len(words)):
-             if len(words[j]) > 10:
-                 words[j] = ' '.join(wordninja.split(words[j]))
-         segments[i] = ''.join(words)
-     return ' '.join(segments)
-
-
- def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
-     markdown = []
-     for page_info in pdf_info_list:
-         paras_of_layout = page_info.get('para_blocks')
-         page_markdown = ocr_mk_markdown_with_para_core_v2(
-             paras_of_layout, 'mm', img_buket_path)
-         markdown.extend(page_markdown)
-     return '\n\n'.join(markdown)
-
-
- def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
-     markdown = []
-     for page_info in pdf_info_dict:
-         paras_of_layout = page_info.get('para_blocks')
-         page_markdown = ocr_mk_markdown_with_para_core_v2(
-             paras_of_layout, 'nlp')
-         markdown.extend(page_markdown)
-     return '\n\n'.join(markdown)
-
-
  def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                  img_buket_path):
      markdown_with_para_and_pagination = []
@@ -67,61 +36,23 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
              paras_of_layout, 'mm', img_buket_path)
          markdown_with_para_and_pagination.append({
              'page_no':
-             page_no,
+                 page_no,
              'md_content':
-             '\n\n'.join(page_markdown)
+                 '\n\n'.join(page_markdown)
          })
          page_no += 1
      return markdown_with_para_and_pagination


- def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
-     page_markdown = []
-     for paras in paras_of_layout:
-         for para in paras:
-             para_text = ''
-             for line in para:
-                 for span in line['spans']:
-                     span_type = span.get('type')
-                     content = ''
-                     language = ''
-                     if span_type == ContentType.Text:
-                         content = span['content']
-                         language = detect_lang(content)
-                         if (language == 'en'):  # 只对英文长词进行分词处理,中文分词会丢失文本
-                             content = ocr_escape_special_markdown_char(
-                                 split_long_words(content))
-                         else:
-                             content = ocr_escape_special_markdown_char(content)
-                     elif span_type == ContentType.InlineEquation:
-                         content = f"${span['content']}$"
-                     elif span_type == ContentType.InterlineEquation:
-                         content = f"\n$$\n{span['content']}\n$$\n"
-                     elif span_type in [ContentType.Image, ContentType.Table]:
-                         if mode == 'mm':
-                             content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
-                         elif mode == 'nlp':
-                             pass
-                     if content != '':
-                         if language == 'en':  # 英文语境下 content间需要空格分隔
-                             para_text += content + ' '
-                         else:  # 中文语境下,content间不需要空格分隔
-                             para_text += content
-             if para_text.strip() == '':
-                 continue
-             else:
-                 page_markdown.append(para_text.strip() + ' ')
-     return page_markdown
-
-
  def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                        mode,
-                                       img_buket_path=''):
+                                       img_buket_path='',
+                                       ):
      page_markdown = []
      for para_block in paras_of_layout:
          para_text = ''
          para_type = para_block['type']
-         if para_type == BlockType.Text:
+         if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
              para_text = merge_para_with_text(para_block)
          elif para_type == BlockType.Title:
              para_text = f'# {merge_para_with_text(para_block)}'
@@ -136,20 +67,21 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                          for line in block['lines']:
                              for span in line['spans']:
                                  if span['type'] == ContentType.Image:
-                                     para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
+                                     if span.get('image_path', ''):
+                                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
                  for block in para_block['blocks']:  # 2nd.拼image_caption
                      if block['type'] == BlockType.ImageCaption:
-                         para_text += merge_para_with_text(block)
-                 for block in para_block['blocks']:  # 2nd.拼image_caption
+                         para_text += merge_para_with_text(block) + ' \n'
+                 for block in para_block['blocks']:  # 3rd.拼image_footnote
                      if block['type'] == BlockType.ImageFootnote:
-                         para_text += merge_para_with_text(block)
+                         para_text += merge_para_with_text(block) + ' \n'
          elif para_type == BlockType.Table:
              if mode == 'nlp':
                  continue
              elif mode == 'mm':
                  for block in para_block['blocks']:  # 1st.拼table_caption
                      if block['type'] == BlockType.TableCaption:
-                         para_text += merge_para_with_text(block)
+                         para_text += merge_para_with_text(block) + ' \n'
                  for block in para_block['blocks']:  # 2nd.拼table_body
                      if block['type'] == BlockType.TableBody:
                          for line in block['lines']:
@@ -160,11 +92,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                          para_text += f"\n\n$\n {span['latex']}\n$\n\n"
                                      elif span.get('html', ''):
                                          para_text += f"\n\n{span['html']}\n\n"
-                                     else:
+                                     elif span.get('image_path', ''):
                                          para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
                  for block in para_block['blocks']:  # 3rd.拼table_footnote
                      if block['type'] == BlockType.TableFootnote:
-                         para_text += merge_para_with_text(block)
+                         para_text += merge_para_with_text(block) + ' \n'

          if para_text.strip() == '':
              continue
@@ -174,22 +106,36 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
      return page_markdown


- def merge_para_with_text(para_block):
-
-     def detect_language(text):
-         en_pattern = r'[a-zA-Z]+'
-         en_matches = re.findall(en_pattern, text)
-         en_length = sum(len(match) for match in en_matches)
-         if len(text) > 0:
-             if en_length / len(text) >= 0.5:
-                 return 'en'
-             else:
-                 return 'unknown'
+ def detect_language(text):
+     en_pattern = r'[a-zA-Z]+'
+     en_matches = re.findall(en_pattern, text)
+     en_length = sum(len(match) for match in en_matches)
+     if len(text) > 0:
+         if en_length / len(text) >= 0.5:
+             return 'en'
          else:
-             return 'empty'
+             return 'unknown'
+     else:
+         return 'empty'
+
+
+ # 连写字符拆分
+ def __replace_ligatures(text: str):
+     text = re.sub(r'ﬁ', 'fi', text)  # 替换 fi 连写符
+     text = re.sub(r'ﬂ', 'fl', text)  # 替换 fl 连写符
+     text = re.sub(r'ﬀ', 'ff', text)  # 替换 ff 连写符
+     text = re.sub(r'ﬃ', 'ffi', text)  # 替换 ffi 连写符
+     text = re.sub(r'ﬄ', 'ffl', text)  # 替换 ffl 连写符
+     return text

+
+ def merge_para_with_text(para_block):
      para_text = ''
-     for line in para_block['lines']:
+     for i, line in enumerate(para_block['lines']):
+
+         if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
+             para_text += ' \n'
+
          line_text = ''
          line_lang = ''
          for span in line['spans']:
@@ -199,208 +145,120 @@ def merge_para_with_text(para_block):
          if line_text != '':
              line_lang = detect_lang(line_text)
          for span in line['spans']:
+
              span_type = span['type']
              content = ''
              if span_type == ContentType.Text:
-                 content = span['content']
-                 # language = detect_lang(content)
-                 language = detect_language(content)
-                 if language == 'en':  # 只对英文长词进行分词处理,中文分词会丢失文本
-                     content = ocr_escape_special_markdown_char(
-                         split_long_words(content))
-                 else:
-                     content = ocr_escape_special_markdown_char(content)
+                 content = ocr_escape_special_markdown_char(span['content'])
              elif span_type == ContentType.InlineEquation:
-                 content = f" ${span['content']}$ "
+                 content = f"${span['content']}$"
              elif span_type == ContentType.InterlineEquation:
                  content = f"\n$$\n{span['content']}\n$$\n"

+             content = content.strip()
              if content != '':
                  langs = ['zh', 'ja', 'ko']
                  if line_lang in langs:  # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
-                     para_text += content  # 中文/日语/韩文语境下,content间不需要空格分隔
-                 elif line_lang == 'en':
-                     # 如果是前一行带有-连字符,那么末尾不应该加空格
-                     if __is_hyphen_at_line_end(content):
-                         para_text += content[:-1]
-                     else:
-                         para_text += content + ' '
+                     if span_type in [ContentType.Text, ContentType.InterlineEquation]:
+                         para_text += content  # 中文/日语/韩文语境下,content间不需要空格分隔
+                     elif span_type == ContentType.InlineEquation:
+                         para_text += f" {content} "
                  else:
-                     para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
-     return para_text
-
+                     if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                         # 如果是前一行带有-连字符,那么末尾不应该加空格
+                         if __is_hyphen_at_line_end(content):
+                             para_text += content[:-1]
+                         elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
+                             para_text += content
+                         else:  # 西方文本语境下 content间需要空格分隔
+                             para_text += f"{content} "
+                     elif span_type == ContentType.InterlineEquation:
+                         para_text += content
+                     else:
+                         continue
+     # 连写字符拆分
+     para_text = __replace_ligatures(para_text)

- def para_to_standard_format(para, img_buket_path):
-     para_content = {}
-     if len(para) == 1:
-         para_content = line_to_standard_format(para[0], img_buket_path)
-     elif len(para) > 1:
-         para_text = ''
-         inline_equation_num = 0
-         for line in para:
-             for span in line['spans']:
-                 language = ''
-                 span_type = span.get('type')
-                 content = ''
-                 if span_type == ContentType.Text:
-                     content = span['content']
-                     language = detect_lang(content)
-                     if language == 'en':  # 只对英文长词进行分词处理,中文分词会丢失文本
-                         content = ocr_escape_special_markdown_char(
-                             split_long_words(content))
-                     else:
-                         content = ocr_escape_special_markdown_char(content)
-                 elif span_type == ContentType.InlineEquation:
-                     content = f"${span['content']}$"
-                     inline_equation_num += 1
-                 if language == 'en':  # 英文语境下 content间需要空格分隔
-                     para_text += content + ' '
-                 else:  # 中文语境下,content间不需要空格分隔
-                     para_text += content
-         para_content = {
-             'type': 'text',
-             'text': para_text,
-             'inline_equation_num': inline_equation_num,
-         }
-     return para_content
+     return para_text


- def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
+ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
      para_type = para_block['type']
-     if para_type == BlockType.Text:
+     para_content = {}
+     if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
          para_content = {
              'type': 'text',
              'text': merge_para_with_text(para_block),
-             'page_idx': page_idx,
          }
      elif para_type == BlockType.Title:
          para_content = {
              'type': 'text',
              'text': merge_para_with_text(para_block),
              'text_level': 1,
-             'page_idx': page_idx,
          }
      elif para_type == BlockType.InterlineEquation:
          para_content = {
              'type': 'equation',
              'text': merge_para_with_text(para_block),
              'text_format': 'latex',
-             'page_idx': page_idx,
          }
      elif para_type == BlockType.Image:
-         para_content = {'type': 'image', 'page_idx': page_idx}
+         para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
          for block in para_block['blocks']:
              if block['type'] == BlockType.ImageBody:
-                 para_content['img_path'] = join_path(
-                     img_buket_path,
-                     block['lines'][0]['spans'][0]['image_path'])
+                 for line in block['lines']:
+                     for span in line['spans']:
+                         if span['type'] == ContentType.Image:
+                             if span.get('image_path', ''):
+                                 para_content['img_path'] = join_path(img_buket_path, span['image_path'])
              if block['type'] == BlockType.ImageCaption:
-                 para_content['img_caption'] = merge_para_with_text(block)
+                 para_content['img_caption'].append(merge_para_with_text(block))
              if block['type'] == BlockType.ImageFootnote:
-                 para_content['img_footnote'] = merge_para_with_text(block)
+                 para_content['img_footnote'].append(merge_para_with_text(block))
      elif para_type == BlockType.Table:
-         para_content = {'type': 'table', 'page_idx': page_idx}
+         para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
          for block in para_block['blocks']:
              if block['type'] == BlockType.TableBody:
-                 if block["lines"][0]["spans"][0].get('latex', ''):
-                     para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
-                 elif block["lines"][0]["spans"][0].get('html', ''):
-                     para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
-                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
+                 for line in block['lines']:
+                     for span in line['spans']:
+                         if span['type'] == ContentType.Table:
+
+                             if span.get('latex', ''):
+                                 para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
+                             elif span.get('html', ''):
+                                 para_content['table_body'] = f"\n\n{span['html']}\n\n"
+
+                             if span.get('image_path', ''):
+                                 para_content['img_path'] = join_path(img_buket_path, span['image_path'])
+
              if block['type'] == BlockType.TableCaption:
-                 para_content['table_caption'] = merge_para_with_text(block)
+                 para_content['table_caption'].append(merge_para_with_text(block))
              if block['type'] == BlockType.TableFootnote:
-                 para_content['table_footnote'] = merge_para_with_text(block)
+                 para_content['table_footnote'].append(merge_para_with_text(block))

-     return para_content
+     para_content['page_idx'] = page_idx

+     if drop_reason is not None:
+         para_content['drop_reason'] = drop_reason

- def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
-     content_list = []
-     for page_info in pdf_info_dict:
-         paras_of_layout = page_info.get('para_blocks')
-         if not paras_of_layout:
-             continue
-         for para_block in paras_of_layout:
-             para_content = para_to_standard_format_v2(para_block,
-                                                        img_buket_path)
-             content_list.append(para_content)
-     return content_list
-
-
- def line_to_standard_format(line, img_buket_path):
-     line_text = ''
-     inline_equation_num = 0
-     for span in line['spans']:
-         if not span.get('content'):
-             if not span.get('image_path'):
-                 continue
-             else:
-                 if span['type'] == ContentType.Image:
-                     content = {
-                         'type': 'image',
-                         'img_path': join_path(img_buket_path,
-                                               span['image_path']),
-                     }
-                     return content
-                 elif span['type'] == ContentType.Table:
-                     content = {
-                         'type': 'table',
-                         'img_path': join_path(img_buket_path,
-                                               span['image_path']),
-                     }
-                     return content
-         else:
-             if span['type'] == ContentType.InterlineEquation:
-                 interline_equation = span['content']
-                 content = {
-                     'type': 'equation',
-                     'latex': f'$$\n{interline_equation}\n$$'
-                 }
-                 return content
-             elif span['type'] == ContentType.InlineEquation:
-                 inline_equation = span['content']
-                 line_text += f'${inline_equation}$'
-                 inline_equation_num += 1
-             elif span['type'] == ContentType.Text:
-                 text_content = ocr_escape_special_markdown_char(
-                     span['content'])  # 转义特殊符号
-                 line_text += text_content
-     content = {
-         'type': 'text',
-         'text': line_text,
-         'inline_equation_num': inline_equation_num,
-     }
-     return content
-
-
- def ocr_mk_mm_standard_format(pdf_info_dict: list):
-     """content_list type string
-     image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
-     latex文本字段。 text string 纯文本格式的文本数据。 md string
-     markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
-     content_list = []
-     for page_info in pdf_info_dict:
-         blocks = page_info.get('preproc_blocks')
-         if not blocks:
-             continue
-         for block in blocks:
-             for line in block['lines']:
-                 content = line_to_standard_format(line)
-                 content_list.append(content)
-     return content_list
+     return para_content


  def union_make(pdf_info_dict: list,
                 make_mode: str,
                 drop_mode: str,
-                img_buket_path: str = ''):
+                img_buket_path: str = '',
+                ):
      output_content = []
      for page_info in pdf_info_dict:
+         drop_reason_flag = False
+         drop_reason = None
          if page_info.get('need_drop', False):
              drop_reason = page_info.get('drop_reason')
              if drop_mode == DropMode.NONE:
                  pass
+             elif drop_mode == DropMode.NONE_WITH_REASON:
+                 drop_reason_flag = True
              elif drop_mode == DropMode.WHOLE_PDF:
                  raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
                                   f'drop_reason is {drop_reason}'))
@@ -425,8 +283,12 @@ def union_make(pdf_info_dict: list,
              output_content.extend(page_markdown)
          elif make_mode == MakeMode.STANDARD_FORMAT:
              for para_block in paras_of_layout:
-                 para_content = para_to_standard_format_v2(
-                     para_block, img_buket_path, page_idx)
+                 if drop_reason_flag:
+                     para_content = para_to_standard_format_v2(
+                         para_block, img_buket_path, page_idx, drop_reason)
+                 else:
+                     para_content = para_to_standard_format_v2(
+                         para_block, img_buket_path, page_idx)
                  output_content.append(para_content)
      if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
          return '\n\n'.join(output_content)
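Note on the new __replace_ligatures helper above: it expands single-character Unicode ligature glyphs (ﬁ, ﬂ, ﬀ, ﬃ, ﬄ), which PDF text extraction often emits, into their ASCII letter sequences before the markdown is assembled. A minimal standalone sketch of the same mapping (illustrative only, not an import of the package's private helper):

import re

# Ligature glyph -> ASCII expansion, mirroring the substitutions made
# by __replace_ligatures in the diff above.
LIGATURES = {'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl'}


def replace_ligatures(text: str) -> str:
    for glyph, ascii_form in LIGATURES.items():
        text = re.sub(glyph, ascii_form, text)
    return text


print(replace_ligatures('efﬁcient workﬂow'))  # -> efficient workflow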
magic_pdf/libs/Constants.py CHANGED
@@ -10,18 +10,12 @@ block维度自定义字段
  # block中lines是否被删除
  LINES_DELETED = "lines_deleted"

- # struct eqtable
- STRUCT_EQTABLE = "struct_eqtable"
-
  # table recognition max time default value
  TABLE_MAX_TIME_VALUE = 400

  # pp_table_result_max_length
  TABLE_MAX_LEN = 480

- # pp table structure algorithm
- TABLE_MASTER = "TableMaster"
-
  # table master structure dict
  TABLE_MASTER_DICT = "table_master_structure_dict.txt"

@@ -29,12 +23,31 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
  TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"

  # pp detect model dir
- DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
+ DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"

  # pp rec model dir
- REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
+ REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"

  # pp rec char dict path
  REC_CHAR_DICT = "ppocr_keys_v1.txt"

+ # pp rec copy rec directory
+ PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
+
+ # pp rec copy det directory
+ PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
+
+
+ class MODEL_NAME:
+     # pp table structure algorithm
+     TABLE_MASTER = "tablemaster"
+     # struct eqtable
+     STRUCT_EQTABLE = "struct_eqtable"
+
+     DocLayout_YOLO = "doclayout_yolo"
+
+     LAYOUTLMv3 = "layoutlmv3"
+
+     YOLO_V8_MFD = "yolo_v8_mfd"

+     UniMerNet_v2_Small = "unimernet_small"
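The new MODEL_NAME class groups the supported model identifiers as plain string constants. A hedged sketch of how calling code might branch on them (the layout_model selection below is hypothetical, not taken from the package):

from magic_pdf.libs.Constants import MODEL_NAME

layout_model = MODEL_NAME.DocLayout_YOLO  # hypothetical choice

if layout_model == MODEL_NAME.LAYOUTLMv3:
    print('configure the LayoutLMv3 layout detector')
elif layout_model == MODEL_NAME.DocLayout_YOLO:
    print('configure the DocLayout-YOLO layout detector')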
magic_pdf/libs/MakeContentConfig.py CHANGED
@@ -8,3 +8,4 @@ class DropMode:
      WHOLE_PDF = "whole_pdf"
      SINGLE_PAGE = "single_page"
      NONE = "none"
+     NONE_WITH_REASON = "none_with_reason"
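The new DropMode.NONE_WITH_REASON value works together with the drop_reason handling added to union_make and para_to_standard_format_v2 above: flagged pages stay in the output, and their content items can carry a drop_reason field. A hedged usage sketch (the empty pdf_info_dict is a placeholder for the parsed per-page list):

from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode

pdf_info_dict = []  # placeholder: normally the parsed per-page info list

# Keep flagged pages in the standard-format output and record why they
# were flagged instead of raising or silently dropping them.
content_list = union_make(pdf_info_dict, MakeMode.STANDARD_FORMAT,
                          DropMode.NONE_WITH_REASON, img_buket_path='')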
magic_pdf/libs/boxbase.py CHANGED
@@ -445,3 +445,38 @@ def get_overlap_area(bbox1, bbox2):

      # The area of overlap area
      return (x_right - x_left) * (y_bottom - y_top)
+
+
+ def calculate_vertical_projection_overlap_ratio(block1, block2):
+     """
+     Calculate the proportion of the x-axis covered by the vertical projection of two blocks.
+
+     Args:
+         block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
+         block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).
+
+     Returns:
+         float: The proportion of the x-axis covered by the vertical projection of the two blocks.
+     """
+     x0_1, _, x1_1, _ = block1
+     x0_2, _, x1_2, _ = block2
+
+     # Calculate the intersection of the x-coordinates
+     x_left = max(x0_1, x0_2)
+     x_right = min(x1_1, x1_2)
+
+     if x_right < x_left:
+         return 0.0
+
+     # Length of the intersection
+     intersection_length = x_right - x_left
+
+     # Length of the x-axis projection of the first block
+     block1_length = x1_1 - x0_1
+
+     if block1_length == 0:
+         return 0.0
+
+     # Proportion of the x-axis covered by the intersection
+     # logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}")
+     return intersection_length / block1_length
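A worked example for the new helper: with block1 spanning x in [0, 100] and block2 spanning x in [50, 150], the x-projections overlap over [50, 100], so the ratio relative to block1 is 50 / 100 = 0.5. A hedged usage sketch:

from magic_pdf.libs.boxbase import calculate_vertical_projection_overlap_ratio

block1 = (0, 0, 100, 10)   # x-projection covers [0, 100]
block2 = (50, 5, 150, 20)  # x-projection covers [50, 150]

# Overlap on the x-axis is [50, 100] -> length 50; block1 spans 100.
print(calculate_vertical_projection_overlap_ratio(block1, block2))  # 0.5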
magic_pdf/libs/clean_memory.py ADDED
@@ -0,0 +1,10 @@
+ # Copyright (c) Opendatalab. All rights reserved.
+ import torch
+ import gc
+
+
+ def clean_memory():
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         torch.cuda.ipc_collect()
+     gc.collect()
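clean_memory() releases cached CUDA allocations (when a GPU is present) and then forces a Python garbage-collection pass; it is intended to be called between memory-heavy pipeline stages. A hedged sketch (the tensor below just stands in for a model-inference stage):

import torch

from magic_pdf.libs.clean_memory import clean_memory

x = torch.zeros(1024, 1024)  # stand-in for a memory-heavy inference stage
del x

# Empty the CUDA cache and collect CUDA IPC handles (GPU only), then gc.collect().
clean_memory()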