magic-pdf 1.3.11__py3-none-any.whl → 1.3.12__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
magic_pdf/data/utils.py CHANGED
@@ -10,22 +10,22 @@ from loguru import logger
10
10
 
11
11
 
12
12
 
13
- def fitz_doc_to_image(doc, dpi=200) -> dict:
13
+ def fitz_doc_to_image(page, dpi=200) -> dict:
14
14
  """Convert fitz.Document to image, Then convert the image to numpy array.
15
15
 
16
16
  Args:
17
- doc (_type_): pymudoc page
17
+ page (_type_): pymudoc page
18
18
  dpi (int, optional): reset the dpi of dpi. Defaults to 200.
19
19
 
20
20
  Returns:
21
21
  dict: {'img': numpy array, 'width': width, 'height': height }
22
22
  """
23
23
  mat = fitz.Matrix(dpi / 72, dpi / 72)
24
- pm = doc.get_pixmap(matrix=mat, alpha=False)
24
+ pm = page.get_pixmap(matrix=mat, alpha=False)
25
25
 
26
26
  # If the width or height exceeds 4500 after scaling, do not scale further.
27
27
  if pm.width > 4500 or pm.height > 4500:
28
- pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
28
+ pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
29
29
 
30
30
  # Convert pixmap samples directly to numpy array
31
31
  img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
70
70
  if mode == 'nlp':
71
71
  continue
72
72
  elif mode == 'mm':
73
- for block in para_block['blocks']: # 1st.拼image_body
74
- if block['type'] == BlockType.ImageBody:
75
- for line in block['lines']:
76
- for span in line['spans']:
77
- if span['type'] == ContentType.Image:
78
- if span.get('image_path', ''):
79
- para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
80
- for block in para_block['blocks']: # 2nd.拼image_caption
81
- if block['type'] == BlockType.ImageCaption:
82
- para_text += merge_para_with_text(block) + ' \n'
83
- for block in para_block['blocks']: # 3rd.拼image_footnote
84
- if block['type'] == BlockType.ImageFootnote:
85
- para_text += merge_para_with_text(block) + ' \n'
73
+ # 检测是否存在图片脚注
74
+ has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
75
+ # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
76
+ if has_image_footnote:
77
+ for block in para_block['blocks']: # 1st.拼image_caption
78
+ if block['type'] == BlockType.ImageCaption:
79
+ para_text += merge_para_with_text(block) + ' \n'
80
+ for block in para_block['blocks']: # 2nd.拼image_body
81
+ if block['type'] == BlockType.ImageBody:
82
+ for line in block['lines']:
83
+ for span in line['spans']:
84
+ if span['type'] == ContentType.Image:
85
+ if span.get('image_path', ''):
86
+ para_text += f"![]({img_buket_path}/{span['image_path']})"
87
+ for block in para_block['blocks']: # 3rd.拼image_footnote
88
+ if block['type'] == BlockType.ImageFootnote:
89
+ para_text += ' \n' + merge_para_with_text(block)
90
+ else:
91
+ for block in para_block['blocks']: # 1st.拼image_body
92
+ if block['type'] == BlockType.ImageBody:
93
+ for line in block['lines']:
94
+ for span in line['spans']:
95
+ if span['type'] == ContentType.Image:
96
+ if span.get('image_path', ''):
97
+ para_text += f"![]({img_buket_path}/{span['image_path']})"
98
+ for block in para_block['blocks']: # 2nd.拼image_caption
99
+ if block['type'] == BlockType.ImageCaption:
100
+ para_text += ' \n' + merge_para_with_text(block)
86
101
  elif para_type == BlockType.Table:
87
102
  if mode == 'nlp':
88
103
  continue
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
96
111
  for span in line['spans']:
97
112
  if span['type'] == ContentType.Table:
98
113
  # if processed by table model
99
- if span.get('latex', ''):
100
- para_text += f"\n\n$\n {span['latex']}\n$\n\n"
101
- elif span.get('html', ''):
102
- para_text += f"\n\n{span['html']}\n\n"
114
+ if span.get('html', ''):
115
+ para_text += f"\n{span['html']}\n"
103
116
  elif span.get('image_path', ''):
104
- para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
117
+ para_text += f"![]({img_buket_path}/{span['image_path']})"
105
118
  for block in para_block['blocks']: # 3rd.拼table_footnote
106
119
  if block['type'] == BlockType.TableFootnote:
107
- para_text += merge_para_with_text(block) + ' \n'
120
+ para_text += '\n' + merge_para_with_text(block) + ' '
108
121
 
109
122
  if para_text.strip() == '':
110
123
  continue
111
124
  else:
112
- page_markdown.append(para_text.strip() + ' ')
125
+ # page_markdown.append(para_text.strip() + ' ')
126
+ page_markdown.append(para_text.strip())
113
127
 
114
128
  return page_markdown
115
129
 
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
257
271
  if span['type'] == ContentType.Table:
258
272
 
259
273
  if span.get('latex', ''):
260
- para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
274
+ para_content['table_body'] = f"{span['latex']}"
261
275
  elif span.get('html', ''):
262
- para_content['table_body'] = f"\n\n{span['html']}\n\n"
276
+ para_content['table_body'] = f"{span['html']}"
263
277
 
264
278
  if span.get('image_path', ''):
265
279
  para_content['img_path'] = join_path(img_buket_path, span['image_path'])
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.3.11"
1
+ __version__ = "1.3.12"
@@ -6,7 +6,7 @@ from tqdm import tqdm
6
6
  from magic_pdf.config.constants import MODEL_NAME
7
7
  from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
8
8
  from magic_pdf.model.sub_modules.model_utils import (
9
- clean_vram, crop_img, get_res_list_from_layout_res)
9
+ clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area)
10
10
  from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
11
11
  get_adjusted_mfdetrec_res, get_ocr_result_list)
12
12
 
@@ -148,6 +148,19 @@ class BatchAnalyze:
148
148
  # Integration results
149
149
  if ocr_res:
150
150
  ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
151
+
152
+ if res["category_id"] == 3:
153
+ # ocr_result_list中所有bbox的面积之和
154
+ ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
155
+ # 求ocr_res_area和res的面积的比值
156
+ res_area = get_coords_and_area(res)[4]
157
+ if res_area > 0:
158
+ ratio = ocr_res_area / res_area
159
+ if ratio > 0.25:
160
+ res["category_id"] = 1
161
+ else:
162
+ continue
163
+
151
164
  ocr_res_list_dict['layout_res'].extend(ocr_result_list)
152
165
 
153
166
  # det_count += len(ocr_res_list_dict['ocr_res_list'])
@@ -189,7 +189,7 @@ def batch_doc_analyze(
189
189
  formula_enable=None,
190
190
  table_enable=None,
191
191
  ):
192
- MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
192
+ MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
193
193
  batch_size = MIN_BATCH_INFERENCE_SIZE
194
194
  page_wh_list = []
195
195
 
@@ -31,10 +31,10 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
31
31
  return return_image, return_list
32
32
 
33
33
 
34
- def get_coords_and_area(table):
34
+ def get_coords_and_area(block_with_poly):
35
35
  """Extract coordinates and area from a table."""
36
- xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
37
- xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
36
+ xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
37
+ xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
38
38
  area = (xmax - xmin) * (ymax - ymin)
39
39
  return xmin, ymin, xmax, ymax, area
40
40
 
@@ -243,7 +243,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
243
243
  "bbox": [int(res['poly'][0]), int(res['poly'][1]),
244
244
  int(res['poly'][4]), int(res['poly'][5])],
245
245
  })
246
- elif category_id in [0, 2, 4, 6, 7]: # OCR regions
246
+ elif category_id in [0, 2, 4, 6, 7, 3]: # OCR regions
247
247
  ocr_res_list.append(res)
248
248
  elif category_id == 5: # Table regions
249
249
  table_res_list.append(res)
@@ -35,7 +35,7 @@ def build_backbone(config, model_type):
35
35
  from .rec_mobilenet_v3 import MobileNetV3
36
36
  from .rec_svtrnet import SVTRNet
37
37
  from .rec_mv1_enhance import MobileNetV1Enhance
38
-
38
+ from .rec_pphgnetv2 import PPHGNetV2_B4
39
39
  support_dict = [
40
40
  "MobileNetV1Enhance",
41
41
  "MobileNetV3",
@@ -48,6 +48,7 @@ def build_backbone(config, model_type):
48
48
  "DenseNet",
49
49
  "PPLCNetV3",
50
50
  "PPHGNet_small",
51
+ "PPHGNetV2_B4",
51
52
  ]
52
53
  else:
53
54
  raise NotImplementedError