magic-pdf 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/data/utils.py CHANGED
@@ -10,22 +10,22 @@ from loguru import logger
10
10
 
11
11
 
12
12
 
13
- def fitz_doc_to_image(doc, dpi=200) -> dict:
13
+ def fitz_doc_to_image(page, dpi=200) -> dict:
14
14
  """Convert fitz.Document to image, Then convert the image to numpy array.
15
15
 
16
16
  Args:
17
- doc (_type_): pymudoc page
17
+ page (_type_): pymudoc page
18
18
  dpi (int, optional): reset the dpi of dpi. Defaults to 200.
19
19
 
20
20
  Returns:
21
21
  dict: {'img': numpy array, 'width': width, 'height': height }
22
22
  """
23
23
  mat = fitz.Matrix(dpi / 72, dpi / 72)
24
- pm = doc.get_pixmap(matrix=mat, alpha=False)
24
+ pm = page.get_pixmap(matrix=mat, alpha=False)
25
25
 
26
26
  # If the width or height exceeds 4500 after scaling, do not scale further.
27
27
  if pm.width > 4500 or pm.height > 4500:
28
- pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
28
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
29
29
 
30
30
  # Convert pixmap samples directly to numpy array
31
31
  img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
70
70
  if mode == 'nlp':
71
71
  continue
72
72
  elif mode == 'mm':
73
- for block in para_block['blocks']: # 1st.拼image_body
74
- if block['type'] == BlockType.ImageBody:
75
- for line in block['lines']:
76
- for span in line['spans']:
77
- if span['type'] == ContentType.Image:
78
- if span.get('image_path', ''):
79
- para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
80
- for block in para_block['blocks']: # 2nd.拼image_caption
81
- if block['type'] == BlockType.ImageCaption:
82
- para_text += merge_para_with_text(block) + ' \n'
83
- for block in para_block['blocks']: # 3rd.拼image_footnote
84
- if block['type'] == BlockType.ImageFootnote:
85
- para_text += merge_para_with_text(block) + ' \n'
73
+ # 检测是否存在图片脚注
74
+ has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
75
+ # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
76
+ if has_image_footnote:
77
+ for block in para_block['blocks']: # 1st.拼image_caption
78
+ if block['type'] == BlockType.ImageCaption:
79
+ para_text += merge_para_with_text(block) + ' \n'
80
+ for block in para_block['blocks']: # 2nd.拼image_body
81
+ if block['type'] == BlockType.ImageBody:
82
+ for line in block['lines']:
83
+ for span in line['spans']:
84
+ if span['type'] == ContentType.Image:
85
+ if span.get('image_path', ''):
86
+ para_text += f"![]({img_buket_path}/{span['image_path']})"
87
+ for block in para_block['blocks']: # 3rd.拼image_footnote
88
+ if block['type'] == BlockType.ImageFootnote:
89
+ para_text += ' \n' + merge_para_with_text(block)
90
+ else:
91
+ for block in para_block['blocks']: # 1st.拼image_body
92
+ if block['type'] == BlockType.ImageBody:
93
+ for line in block['lines']:
94
+ for span in line['spans']:
95
+ if span['type'] == ContentType.Image:
96
+ if span.get('image_path', ''):
97
+ para_text += f"![]({img_buket_path}/{span['image_path']})"
98
+ for block in para_block['blocks']: # 2nd.拼image_caption
99
+ if block['type'] == BlockType.ImageCaption:
100
+ para_text += ' \n' + merge_para_with_text(block)
86
101
  elif para_type == BlockType.Table:
87
102
  if mode == 'nlp':
88
103
  continue
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
96
111
  for span in line['spans']:
97
112
  if span['type'] == ContentType.Table:
98
113
  # if processed by table model
99
- if span.get('latex', ''):
100
- para_text += f"\n\n$\n {span['latex']}\n$\n\n"
101
- elif span.get('html', ''):
102
- para_text += f"\n\n{span['html']}\n\n"
114
+ if span.get('html', ''):
115
+ para_text += f"\n{span['html']}\n"
103
116
  elif span.get('image_path', ''):
104
- para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
117
+ para_text += f"![]({img_buket_path}/{span['image_path']})"
105
118
  for block in para_block['blocks']: # 3rd.拼table_footnote
106
119
  if block['type'] == BlockType.TableFootnote:
107
- para_text += merge_para_with_text(block) + ' \n'
120
+ para_text += '\n' + merge_para_with_text(block) + ' '
108
121
 
109
122
  if para_text.strip() == '':
110
123
  continue
111
124
  else:
112
- page_markdown.append(para_text.strip() + ' ')
125
+ # page_markdown.append(para_text.strip() + ' ')
126
+ page_markdown.append(para_text.strip())
113
127
 
114
128
  return page_markdown
115
129
 
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
257
271
  if span['type'] == ContentType.Table:
258
272
 
259
273
  if span.get('latex', ''):
260
- para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
274
+ para_content['table_body'] = f"{span['latex']}"
261
275
  elif span.get('html', ''):
262
- para_content['table_body'] = f"\n\n{span['html']}\n\n"
276
+ para_content['table_body'] = f"{span['html']}"
263
277
 
264
278
  if span.get('image_path', ''):
265
279
  para_content['img_path'] = join_path(img_buket_path, span['image_path'])
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.3.10"
1
+ __version__ = "1.3.12"
@@ -6,7 +6,7 @@ from tqdm import tqdm
6
6
  from magic_pdf.config.constants import MODEL_NAME
7
7
  from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
8
8
  from magic_pdf.model.sub_modules.model_utils import (
9
- clean_vram, crop_img, get_res_list_from_layout_res)
9
+ clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area)
10
10
  from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
11
11
  get_adjusted_mfdetrec_res, get_ocr_result_list)
12
12
 
@@ -148,6 +148,19 @@ class BatchAnalyze:
148
148
  # Integration results
149
149
  if ocr_res:
150
150
  ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
151
+
152
+ if res["category_id"] == 3:
153
+ # ocr_result_list中所有bbox的面积之和
154
+ ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
155
+ # 求ocr_res_area和res的面积的比值
156
+ res_area = get_coords_and_area(res)[4]
157
+ if res_area > 0:
158
+ ratio = ocr_res_area / res_area
159
+ if ratio > 0.25:
160
+ res["category_id"] = 1
161
+ else:
162
+ continue
163
+
151
164
  ocr_res_list_dict['layout_res'].extend(ocr_result_list)
152
165
 
153
166
  # det_count += len(ocr_res_list_dict['ocr_res_list'])
@@ -156,7 +156,10 @@ def doc_analyze(
156
156
  batch_images = [images_with_extra_info]
157
157
 
158
158
  results = []
159
- for batch_image in batch_images:
159
+ processed_images_count = 0
160
+ for index, batch_image in enumerate(batch_images):
161
+ processed_images_count += len(batch_image)
162
+ logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
160
163
  result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
161
164
  results.extend(result)
162
165
 
@@ -186,7 +189,7 @@ def batch_doc_analyze(
186
189
  formula_enable=None,
187
190
  table_enable=None,
188
191
  ):
189
- MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
192
+ MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
190
193
  batch_size = MIN_BATCH_INFERENCE_SIZE
191
194
  page_wh_list = []
192
195
 
@@ -66,9 +66,9 @@ LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
66
66
 
67
67
  def fix_latex_left_right(s):
68
68
  """
69
- 修复LaTeX中的\left和\right命令
69
+ 修复LaTeX中的\\left和\\right命令
70
70
  1. 确保它们后面跟有效分隔符
71
- 2. 平衡\left和\right的数量
71
+ 2. 平衡\\left和\\right的数量
72
72
  """
73
73
  # 白名单分隔符
74
74
  valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
@@ -106,7 +106,7 @@ def fix_latex_left_right(s):
106
106
 
107
107
  def fix_left_right_pairs(latex_formula):
108
108
  """
109
- 检测并修复LaTeX公式中\left和\right不在同一组的情况
109
+ 检测并修复LaTeX公式中\\left和\\right不在同一组的情况
110
110
 
111
111
  Args:
112
112
  latex_formula (str): 输入的LaTeX公式
@@ -308,9 +308,9 @@ ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') fo
308
308
 
309
309
  def fix_latex_environments(s):
310
310
  """
311
- 检测LaTeX中环境(如array)的\begin和\end是否匹配
312
- 1. 如果缺少\begin标签则在开头添加
313
- 2. 如果缺少\end标签则在末尾添加
311
+ 检测LaTeX中环境(如array)的\\begin和\\end是否匹配
312
+ 1. 如果缺少\\begin标签则在开头添加
313
+ 2. 如果缺少\\end标签则在末尾添加
314
314
  """
315
315
  for env in ENV_TYPES:
316
316
  begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
@@ -334,7 +334,7 @@ def fix_latex_environments(s):
334
334
 
335
335
  UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
336
336
  COMMANDS_TO_REMOVE_PATTERN = re.compile(
337
- r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph)')
337
+ r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
338
338
  REPLACEMENTS_PATTERNS = {
339
339
  re.compile(r'\\underbar'): r'\\underline',
340
340
  re.compile(r'\\Bar'): r'\\hat',
@@ -346,6 +346,9 @@ REPLACEMENTS_PATTERNS = {
346
346
  re.compile(r'\\textunderscore'): r'\\_',
347
347
  re.compile(r'\\fint'): r'⨏',
348
348
  re.compile(r'\\up '): r'\\ ',
349
+ re.compile(r'\\vline = '): r'\\models ',
350
+ re.compile(r'\\vDash '): r'\\models ',
351
+ re.compile(r'\\sq \\sqcup '): r'\\square ',
349
352
  }
350
353
  QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
351
354
 
@@ -31,10 +31,10 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
31
31
  return return_image, return_list
32
32
 
33
33
 
34
- def get_coords_and_area(table):
34
+ def get_coords_and_area(block_with_poly):
35
35
  """Extract coordinates and area from a table."""
36
- xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
37
- xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
36
+ xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
37
+ xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
38
38
  area = (xmax - xmin) * (ymax - ymin)
39
39
  return xmin, ymin, xmax, ymax, area
40
40
 
@@ -243,7 +243,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
243
243
  "bbox": [int(res['poly'][0]), int(res['poly'][1]),
244
244
  int(res['poly'][4]), int(res['poly'][5])],
245
245
  })
246
- elif category_id in [0, 2, 4, 6, 7]: # OCR regions
246
+ elif category_id in [0, 2, 4, 6, 7, 3]: # OCR regions
247
247
  ocr_res_list.append(res)
248
248
  elif category_id == 5: # Table regions
249
249
  table_res_list.append(res)
@@ -35,7 +35,7 @@ def build_backbone(config, model_type):
35
35
  from .rec_mobilenet_v3 import MobileNetV3
36
36
  from .rec_svtrnet import SVTRNet
37
37
  from .rec_mv1_enhance import MobileNetV1Enhance
38
-
38
+ from .rec_pphgnetv2 import PPHGNetV2_B4
39
39
  support_dict = [
40
40
  "MobileNetV1Enhance",
41
41
  "MobileNetV3",
@@ -48,6 +48,7 @@ def build_backbone(config, model_type):
48
48
  "DenseNet",
49
49
  "PPLCNetV3",
50
50
  "PPHGNet_small",
51
+ "PPHGNetV2_B4",
51
52
  ]
52
53
  else:
53
54
  raise NotImplementedError