magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
  9. magic_pdf/data/data_reader_writer/s3.py +69 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +0 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +15 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +74 -234
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +54 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +418 -51
  29. magic_pdf/model/pdf_extract_kit.py +164 -80
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
  31. magic_pdf/model/ppTableModel.py +2 -2
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +296 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +19 -9
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
  53. magic_pdf-0.9.0.dist-info/METADATA +507 -0
  54. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
  55. magic_pdf-0.8.0.dist-info/METADATA +0 -459
  56. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
  57. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
  58. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
1
1
  import re
2
2
 
3
- import wordninja
4
3
  from loguru import logger
5
4
 
6
5
  from magic_pdf.libs.commons import join_path
@@ -8,6 +7,7 @@ from magic_pdf.libs.language import detect_lang
8
7
  from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
9
8
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
10
9
  from magic_pdf.libs.ocr_content_type import BlockType, ContentType
10
+ from magic_pdf.para.para_split_v3 import ListLineTag
11
11
 
12
12
 
13
13
  def __is_hyphen_at_line_end(line):
@@ -24,37 +24,6 @@ def __is_hyphen_at_line_end(line):
24
24
  return bool(re.search(r'[A-Za-z]+-\s*$', line))
25
25
 
26
26
 
27
- def split_long_words(text):
28
- segments = text.split(' ')
29
- for i in range(len(segments)):
30
- words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
31
- for j in range(len(words)):
32
- if len(words[j]) > 10:
33
- words[j] = ' '.join(wordninja.split(words[j]))
34
- segments[i] = ''.join(words)
35
- return ' '.join(segments)
36
-
37
-
38
- def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
39
- markdown = []
40
- for page_info in pdf_info_list:
41
- paras_of_layout = page_info.get('para_blocks')
42
- page_markdown = ocr_mk_markdown_with_para_core_v2(
43
- paras_of_layout, 'mm', img_buket_path)
44
- markdown.extend(page_markdown)
45
- return '\n\n'.join(markdown)
46
-
47
-
48
- def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
49
- markdown = []
50
- for page_info in pdf_info_dict:
51
- paras_of_layout = page_info.get('para_blocks')
52
- page_markdown = ocr_mk_markdown_with_para_core_v2(
53
- paras_of_layout, 'nlp')
54
- markdown.extend(page_markdown)
55
- return '\n\n'.join(markdown)
56
-
57
-
58
27
  def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
59
28
  img_buket_path):
60
29
  markdown_with_para_and_pagination = []
@@ -67,61 +36,23 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
67
36
  paras_of_layout, 'mm', img_buket_path)
68
37
  markdown_with_para_and_pagination.append({
69
38
  'page_no':
70
- page_no,
39
+ page_no,
71
40
  'md_content':
72
- '\n\n'.join(page_markdown)
41
+ '\n\n'.join(page_markdown)
73
42
  })
74
43
  page_no += 1
75
44
  return markdown_with_para_and_pagination
76
45
 
77
46
 
78
- def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
79
- page_markdown = []
80
- for paras in paras_of_layout:
81
- for para in paras:
82
- para_text = ''
83
- for line in para:
84
- for span in line['spans']:
85
- span_type = span.get('type')
86
- content = ''
87
- language = ''
88
- if span_type == ContentType.Text:
89
- content = span['content']
90
- language = detect_lang(content)
91
- if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
92
- content = ocr_escape_special_markdown_char(
93
- split_long_words(content))
94
- else:
95
- content = ocr_escape_special_markdown_char(content)
96
- elif span_type == ContentType.InlineEquation:
97
- content = f"${span['content']}$"
98
- elif span_type == ContentType.InterlineEquation:
99
- content = f"\n$$\n{span['content']}\n$$\n"
100
- elif span_type in [ContentType.Image, ContentType.Table]:
101
- if mode == 'mm':
102
- content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
103
- elif mode == 'nlp':
104
- pass
105
- if content != '':
106
- if language == 'en': # 英文语境下 content间需要空格分隔
107
- para_text += content + ' '
108
- else: # 中文语境下,content间不需要空格分隔
109
- para_text += content
110
- if para_text.strip() == '':
111
- continue
112
- else:
113
- page_markdown.append(para_text.strip() + ' ')
114
- return page_markdown
115
-
116
-
117
47
  def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
118
48
  mode,
119
- img_buket_path=''):
49
+ img_buket_path='',
50
+ ):
120
51
  page_markdown = []
121
52
  for para_block in paras_of_layout:
122
53
  para_text = ''
123
54
  para_type = para_block['type']
124
- if para_type == BlockType.Text:
55
+ if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
125
56
  para_text = merge_para_with_text(para_block)
126
57
  elif para_type == BlockType.Title:
127
58
  para_text = f'# {merge_para_with_text(para_block)}'
@@ -136,20 +67,21 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
136
67
  for line in block['lines']:
137
68
  for span in line['spans']:
138
69
  if span['type'] == ContentType.Image:
139
- para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
70
+ if span.get('image_path', ''):
71
+ para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
140
72
  for block in para_block['blocks']: # 2nd.拼image_caption
141
73
  if block['type'] == BlockType.ImageCaption:
142
- para_text += merge_para_with_text(block)
143
- for block in para_block['blocks']: # 2nd.拼image_caption
74
+ para_text += merge_para_with_text(block) + ' \n'
75
+ for block in para_block['blocks']: # 3rd.拼image_footnote
144
76
  if block['type'] == BlockType.ImageFootnote:
145
- para_text += merge_para_with_text(block)
77
+ para_text += merge_para_with_text(block) + ' \n'
146
78
  elif para_type == BlockType.Table:
147
79
  if mode == 'nlp':
148
80
  continue
149
81
  elif mode == 'mm':
150
82
  for block in para_block['blocks']: # 1st.拼table_caption
151
83
  if block['type'] == BlockType.TableCaption:
152
- para_text += merge_para_with_text(block)
84
+ para_text += merge_para_with_text(block) + ' \n'
153
85
  for block in para_block['blocks']: # 2nd.拼table_body
154
86
  if block['type'] == BlockType.TableBody:
155
87
  for line in block['lines']:
@@ -160,11 +92,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
160
92
  para_text += f"\n\n$\n {span['latex']}\n$\n\n"
161
93
  elif span.get('html', ''):
162
94
  para_text += f"\n\n{span['html']}\n\n"
163
- else:
95
+ elif span.get('image_path', ''):
164
96
  para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
165
97
  for block in para_block['blocks']: # 3rd.拼table_footnote
166
98
  if block['type'] == BlockType.TableFootnote:
167
- para_text += merge_para_with_text(block)
99
+ para_text += merge_para_with_text(block) + ' \n'
168
100
 
169
101
  if para_text.strip() == '':
170
102
  continue
@@ -174,22 +106,26 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
174
106
  return page_markdown
175
107
 
176
108
 
177
- def merge_para_with_text(para_block):
178
-
179
- def detect_language(text):
180
- en_pattern = r'[a-zA-Z]+'
181
- en_matches = re.findall(en_pattern, text)
182
- en_length = sum(len(match) for match in en_matches)
183
- if len(text) > 0:
184
- if en_length / len(text) >= 0.5:
185
- return 'en'
186
- else:
187
- return 'unknown'
109
def detect_language(text):
    """Classify *text* by its share of ASCII letters.

    Returns:
        'empty'   when ``text`` has length zero,
        'en'      when ASCII letters (a-z, A-Z) make up at least half of
                  the characters in ``text``,
        'unknown' otherwise.
    """
    letter_runs = re.findall(r'[a-zA-Z]+', text)
    letter_count = sum(map(len, letter_runs))

    total = len(text)
    if not total > 0:
        return 'empty'
    if letter_count / total >= 0.5:
        return 'en'
    return 'unknown'
190
120
 
121
+
122
+ def merge_para_with_text(para_block):
191
123
  para_text = ''
192
- for line in para_block['lines']:
124
+ for i, line in enumerate(para_block['lines']):
125
+
126
+ if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
127
+ para_text += ' \n'
128
+
193
129
  line_text = ''
194
130
  line_lang = ''
195
131
  for span in line['spans']:
@@ -199,17 +135,11 @@ def merge_para_with_text(para_block):
199
135
  if line_text != '':
200
136
  line_lang = detect_lang(line_text)
201
137
  for span in line['spans']:
138
+
202
139
  span_type = span['type']
203
140
  content = ''
204
141
  if span_type == ContentType.Text:
205
- content = span['content']
206
- # language = detect_lang(content)
207
- language = detect_language(content)
208
- if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
209
- content = ocr_escape_special_markdown_char(
210
- split_long_words(content))
211
- else:
212
- content = ocr_escape_special_markdown_char(content)
142
+ content = ocr_escape_special_markdown_char(span['content'])
213
143
  elif span_type == ContentType.InlineEquation:
214
144
  content = f" ${span['content']}$ "
215
145
  elif span_type == ContentType.InterlineEquation:
@@ -230,177 +160,83 @@ def merge_para_with_text(para_block):
230
160
  return para_text
231
161
 
232
162
 
233
- def para_to_standard_format(para, img_buket_path):
234
- para_content = {}
235
- if len(para) == 1:
236
- para_content = line_to_standard_format(para[0], img_buket_path)
237
- elif len(para) > 1:
238
- para_text = ''
239
- inline_equation_num = 0
240
- for line in para:
241
- for span in line['spans']:
242
- language = ''
243
- span_type = span.get('type')
244
- content = ''
245
- if span_type == ContentType.Text:
246
- content = span['content']
247
- language = detect_lang(content)
248
- if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
249
- content = ocr_escape_special_markdown_char(
250
- split_long_words(content))
251
- else:
252
- content = ocr_escape_special_markdown_char(content)
253
- elif span_type == ContentType.InlineEquation:
254
- content = f"${span['content']}$"
255
- inline_equation_num += 1
256
- if language == 'en': # 英文语境下 content间需要空格分隔
257
- para_text += content + ' '
258
- else: # 中文语境下,content间不需要空格分隔
259
- para_text += content
260
- para_content = {
261
- 'type': 'text',
262
- 'text': para_text,
263
- 'inline_equation_num': inline_equation_num,
264
- }
265
- return para_content
266
-
267
-
268
- def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
163
+ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
269
164
  para_type = para_block['type']
270
- if para_type == BlockType.Text:
165
+ para_content = {}
166
+ if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
271
167
  para_content = {
272
168
  'type': 'text',
273
169
  'text': merge_para_with_text(para_block),
274
- 'page_idx': page_idx,
275
170
  }
276
171
  elif para_type == BlockType.Title:
277
172
  para_content = {
278
173
  'type': 'text',
279
174
  'text': merge_para_with_text(para_block),
280
175
  'text_level': 1,
281
- 'page_idx': page_idx,
282
176
  }
283
177
  elif para_type == BlockType.InterlineEquation:
284
178
  para_content = {
285
179
  'type': 'equation',
286
180
  'text': merge_para_with_text(para_block),
287
181
  'text_format': 'latex',
288
- 'page_idx': page_idx,
289
182
  }
290
183
  elif para_type == BlockType.Image:
291
- para_content = {'type': 'image', 'page_idx': page_idx}
184
+ para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
292
185
  for block in para_block['blocks']:
293
186
  if block['type'] == BlockType.ImageBody:
294
- para_content['img_path'] = join_path(
295
- img_buket_path,
296
- block['lines'][0]['spans'][0]['image_path'])
187
+ for line in block['lines']:
188
+ for span in line['spans']:
189
+ if span['type'] == ContentType.Image:
190
+ if span.get('image_path', ''):
191
+ para_content['img_path'] = join_path(img_buket_path, span['image_path'])
297
192
  if block['type'] == BlockType.ImageCaption:
298
- para_content['img_caption'] = merge_para_with_text(block)
193
+ para_content['img_caption'].append(merge_para_with_text(block))
299
194
  if block['type'] == BlockType.ImageFootnote:
300
- para_content['img_footnote'] = merge_para_with_text(block)
195
+ para_content['img_footnote'].append(merge_para_with_text(block))
301
196
  elif para_type == BlockType.Table:
302
- para_content = {'type': 'table', 'page_idx': page_idx}
197
+ para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
303
198
  for block in para_block['blocks']:
304
199
  if block['type'] == BlockType.TableBody:
305
- if block["lines"][0]["spans"][0].get('latex', ''):
306
- para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
307
- elif block["lines"][0]["spans"][0].get('html', ''):
308
- para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
309
- para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
310
- if block['type'] == BlockType.TableCaption:
311
- para_content['table_caption'] = merge_para_with_text(block)
312
- if block['type'] == BlockType.TableFootnote:
313
- para_content['table_footnote'] = merge_para_with_text(block)
314
-
315
- return para_content
200
+ for line in block['lines']:
201
+ for span in line['spans']:
202
+ if span['type'] == ContentType.Table:
316
203
 
204
+ if span.get('latex', ''):
205
+ para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
206
+ elif span.get('html', ''):
207
+ para_content['table_body'] = f"\n\n{span['html']}\n\n"
317
208
 
318
- def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
319
- content_list = []
320
- for page_info in pdf_info_dict:
321
- paras_of_layout = page_info.get('para_blocks')
322
- if not paras_of_layout:
323
- continue
324
- for para_block in paras_of_layout:
325
- para_content = para_to_standard_format_v2(para_block,
326
- img_buket_path)
327
- content_list.append(para_content)
328
- return content_list
209
+ if span.get('image_path', ''):
210
+ para_content['img_path'] = join_path(img_buket_path, span['image_path'])
329
211
 
212
+ if block['type'] == BlockType.TableCaption:
213
+ para_content['table_caption'].append(merge_para_with_text(block))
214
+ if block['type'] == BlockType.TableFootnote:
215
+ para_content['table_footnote'].append(merge_para_with_text(block))
330
216
 
331
- def line_to_standard_format(line, img_buket_path):
332
- line_text = ''
333
- inline_equation_num = 0
334
- for span in line['spans']:
335
- if not span.get('content'):
336
- if not span.get('image_path'):
337
- continue
338
- else:
339
- if span['type'] == ContentType.Image:
340
- content = {
341
- 'type': 'image',
342
- 'img_path': join_path(img_buket_path,
343
- span['image_path']),
344
- }
345
- return content
346
- elif span['type'] == ContentType.Table:
347
- content = {
348
- 'type': 'table',
349
- 'img_path': join_path(img_buket_path,
350
- span['image_path']),
351
- }
352
- return content
353
- else:
354
- if span['type'] == ContentType.InterlineEquation:
355
- interline_equation = span['content']
356
- content = {
357
- 'type': 'equation',
358
- 'latex': f'$$\n{interline_equation}\n$$'
359
- }
360
- return content
361
- elif span['type'] == ContentType.InlineEquation:
362
- inline_equation = span['content']
363
- line_text += f'${inline_equation}$'
364
- inline_equation_num += 1
365
- elif span['type'] == ContentType.Text:
366
- text_content = ocr_escape_special_markdown_char(
367
- span['content']) # 转义特殊符号
368
- line_text += text_content
369
- content = {
370
- 'type': 'text',
371
- 'text': line_text,
372
- 'inline_equation_num': inline_equation_num,
373
- }
374
- return content
217
+ para_content['page_idx'] = page_idx
375
218
 
219
+ if drop_reason is not None:
220
+ para_content['drop_reason'] = drop_reason
376
221
 
377
- def ocr_mk_mm_standard_format(pdf_info_dict: list):
378
- """content_list type string
379
- image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
380
- latex文本字段。 text string 纯文本格式的文本数据。 md string
381
- markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
382
- content_list = []
383
- for page_info in pdf_info_dict:
384
- blocks = page_info.get('preproc_blocks')
385
- if not blocks:
386
- continue
387
- for block in blocks:
388
- for line in block['lines']:
389
- content = line_to_standard_format(line)
390
- content_list.append(content)
391
- return content_list
222
+ return para_content
392
223
 
393
224
 
394
225
  def union_make(pdf_info_dict: list,
395
226
  make_mode: str,
396
227
  drop_mode: str,
397
- img_buket_path: str = ''):
228
+ img_buket_path: str = '',
229
+ ):
398
230
  output_content = []
399
231
  for page_info in pdf_info_dict:
232
+ drop_reason_flag = False
233
+ drop_reason = None
400
234
  if page_info.get('need_drop', False):
401
235
  drop_reason = page_info.get('drop_reason')
402
236
  if drop_mode == DropMode.NONE:
403
237
  pass
238
+ elif drop_mode == DropMode.NONE_WITH_REASON:
239
+ drop_reason_flag = True
404
240
  elif drop_mode == DropMode.WHOLE_PDF:
405
241
  raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
406
242
  f'drop_reason is {drop_reason}'))
@@ -425,8 +261,12 @@ def union_make(pdf_info_dict: list,
425
261
  output_content.extend(page_markdown)
426
262
  elif make_mode == MakeMode.STANDARD_FORMAT:
427
263
  for para_block in paras_of_layout:
428
- para_content = para_to_standard_format_v2(
429
- para_block, img_buket_path, page_idx)
264
+ if drop_reason_flag:
265
+ para_content = para_to_standard_format_v2(
266
+ para_block, img_buket_path, page_idx)
267
+ else:
268
+ para_content = para_to_standard_format_v2(
269
+ para_block, img_buket_path, page_idx)
430
270
  output_content.append(para_content)
431
271
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
432
272
  return '\n\n'.join(output_content)
@@ -10,18 +10,12 @@ block维度自定义字段
10
10
  # block中lines是否被删除
11
11
  LINES_DELETED = "lines_deleted"
12
12
 
13
- # struct eqtable
14
- STRUCT_EQTABLE = "struct_eqtable"
15
-
16
13
  # table recognition max time default value
17
14
  TABLE_MAX_TIME_VALUE = 400
18
15
 
19
16
  # pp_table_result_max_length
20
17
  TABLE_MAX_LEN = 480
21
18
 
22
- # pp table structure algorithm
23
- TABLE_MASTER = "TableMaster"
24
-
25
19
  # table master structure dict
26
20
  TABLE_MASTER_DICT = "table_master_structure_dict.txt"
27
21
 
@@ -29,12 +23,31 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
29
23
  TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
30
24
 
31
25
  # pp detect model dir
32
- DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
26
+ DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
33
27
 
34
28
  # pp rec model dir
35
- REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
29
+ REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
36
30
 
37
31
  # pp rec char dict path
38
32
  REC_CHAR_DICT = "ppocr_keys_v1.txt"
39
33
 
34
+ # pp rec copy rec directory
35
+ PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
36
+
37
+ # pp rec copy det directory
38
+ PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
39
+
40
+
41
class MODEL_NAME:
    """Namespace of model-name string constants."""

    # Table models
    TABLE_MASTER = "tablemaster"        # pp table structure algorithm
    STRUCT_EQTABLE = "struct_eqtable"   # struct eqtable

    # Remaining model names
    DocLayout_YOLO = "doclayout_yolo"
    LAYOUTLMv3 = "layoutlmv3"
    YOLO_V8_MFD = "yolo_v8_mfd"
    UniMerNet_v2_Small = "unimernet_small"
@@ -8,3 +8,4 @@ class DropMode:
8
8
  WHOLE_PDF = "whole_pdf"
9
9
  SINGLE_PAGE = "single_page"
10
10
  NONE = "none"
11
+ NONE_WITH_REASON = "none_with_reason"
magic_pdf/libs/boxbase.py CHANGED
@@ -426,3 +426,57 @@ def bbox_distance(bbox1, bbox2):
426
426
  elif top:
427
427
  return y2 - y1b
428
428
  return 0.0
429
+
430
+
431
def box_area(bbox):
    """Return the area of a box given as (x0, y0, x1, y1).

    The result is the product of the coordinate differences, so it is not
    clamped: a box whose corners are inverted on one axis yields a negative
    value.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
433
+
434
+
435
def get_overlap_area(bbox1, bbox2):
    """Return the area of the intersection of two boxes.

    Both boxes are indexed as (x0, y0, x1, y1). The value returned is the
    absolute overlap area, not a ratio relative to either box. Boxes that do
    not intersect give 0.0; boxes that merely touch along an edge give a
    zero-area product.
    """
    # Corners of the intersection rectangle.
    left = max(bbox1[0], bbox2[0])
    top = max(bbox1[1], bbox2[1])
    right = min(bbox1[2], bbox2[2])
    bottom = min(bbox1[3], bbox2[3])

    no_overlap = right < left or bottom < top
    return 0.0 if no_overlap else (right - left) * (bottom - top)
448
+
449
+
450
def calculate_vertical_projection_overlap_ratio(block1, block2):
    """
    Return the fraction of block1's x-extent that is shared with block2.

    Args:
        block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
        block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).

    Returns:
        float: Length of the overlap of the two x-ranges divided by the
        width of block1. 0.0 when the x-ranges do not overlap or when
        block1 has zero width.
    """
    left_1, _, right_1, _ = block1
    left_2, _, right_2, _ = block2

    # Shared x-range of the two blocks.
    shared_left = max(left_1, left_2)
    shared_right = min(right_1, right_2)
    if shared_right < shared_left:
        return 0.0

    shared_width = shared_right - shared_left
    width_1 = right_1 - left_1

    # Guard the division: a zero-width block1 has no measurable coverage.
    if width_1 == 0:
        return 0.0
    return shared_width / width_1
@@ -0,0 +1,10 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+ import torch
3
+ import gc
4
+
5
+
6
def clean_memory():
    """Release cached CUDA memory (when CUDA is available) and run the GC."""
    cuda = torch.cuda
    if cuda.is_available():
        cuda.empty_cache()
        cuda.ipc_collect()
    # NOTE(review): gc.collect() is placed outside the CUDA check; the nesting
    # could not be recovered from the flattened listing — confirm.
    gc.collect()