magic-pdf 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -4,22 +4,37 @@ from loguru import logger
4
4
  from magic_pdf.config.constants import MODEL_NAME
5
5
  from magic_pdf.model.model_list import AtomicModel
6
6
  from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
7
- from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \
8
- DocLayoutYOLOModel
9
- from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
10
- Layoutlmv3_Predictor
7
+ from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
8
+ from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
11
9
  from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
12
10
  from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel
13
- from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import \
14
- ModifiedPaddleOCR
15
- from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import \
16
- RapidTableModel
17
- # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
18
- from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
19
- StructTableModel
20
- from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
21
- TableMasterPaddleModel
22
11
 
12
+ try:
13
+ from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
14
+ from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
15
+ from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
16
+ license_key = load_license()
17
+ logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
18
+ f' License expired at {license_key["payload"]["date"]["end_date"]}')
19
+ except Exception as e:
20
+ if isinstance(e, ImportError):
21
+ pass
22
+ elif isinstance(e, LicenseFormatError):
23
+ logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
24
+ elif isinstance(e, LicenseSignatureError):
25
+ logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
26
+ elif isinstance(e, LicenseExpiredError):
27
+ logger.error("Ascend Plugin: License has expired. Please renew your license.")
28
+ elif isinstance(e, FileNotFoundError):
29
+ logger.error("Ascend Plugin: Not found License file.")
30
+ else:
31
+ logger.error(f"Ascend Plugin: {e}")
32
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
33
+ # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
34
+ from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
35
+
36
+ from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
37
+ from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
23
38
 
24
39
  def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
25
40
  if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
@@ -76,7 +91,6 @@ def ocr_model_init(show_log: bool = False,
76
91
  use_dilation=True,
77
92
  det_db_unclip_ratio=1.8,
78
93
  ):
79
-
80
94
  if lang is not None and lang != '':
81
95
  model = ModifiedPaddleOCR(
82
96
  show_log=show_log,
@@ -6,8 +6,10 @@ import statistics
6
6
  import time
7
7
  from typing import List
8
8
 
9
+ import cv2
9
10
  import fitz
10
11
  import torch
12
+ import numpy as np
11
13
  from loguru import logger
12
14
 
13
15
  from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -19,9 +21,12 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
19
21
  from magic_pdf.libs.convert_utils import dict_to_list
20
22
  from magic_pdf.libs.hash_utils import compute_md5
21
23
  from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
24
+ from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
22
25
  from magic_pdf.model.magic_model import MagicModel
23
26
  from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
24
27
 
28
+ from concurrent.futures import ThreadPoolExecutor
29
+
25
30
  try:
26
31
  import torchtext
27
32
 
@@ -127,16 +132,15 @@ def fill_char_in_spans(spans, all_chars):
127
132
  span['chars'].append(char)
128
133
  break
129
134
 
130
- empty_spans = []
131
-
135
+ need_ocr_spans = []
132
136
  for span in spans:
133
137
  chars_to_content(span)
134
138
  # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
135
139
  if len(span['content']) * span['height'] < span['width'] * 0.5:
136
140
  # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
137
- empty_spans.append(span)
141
+ need_ocr_spans.append(span)
138
142
  del span['height'], span['width']
139
- return empty_spans
143
+ return need_ocr_spans
140
144
 
141
145
 
142
146
  # 使用鲁棒性更强的中心点坐标判断
@@ -190,6 +194,31 @@ def remove_tilted_line(text_blocks):
190
194
  block['lines'].remove(line)
191
195
 
192
196
 
197
+ def calculate_contrast(img, img_mode) -> float:
198
+ """
199
+ 计算给定图像的对比度。
200
+ :param img: 图像,类型为numpy.ndarray
201
+ :Param img_mode = 图像的色彩通道,'rgb' 或 'bgr'
202
+ :return: 图像的对比度值
203
+ """
204
+ if img_mode == 'rgb':
205
+ # 将RGB图像转换为灰度图
206
+ gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
207
+ elif img_mode == 'bgr':
208
+ # 将BGR图像转换为灰度图
209
+ gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
210
+ else:
211
+ raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
212
+
213
+ # 计算均值和标准差
214
+ mean_value = np.mean(gray_img)
215
+ std_dev = np.std(gray_img)
216
+ # 对比度定义为标准差除以平均值(加上小常数避免除零错误)
217
+ contrast = std_dev / (mean_value + 1e-6)
218
+ # logger.info(f"contrast: {contrast}")
219
+ return round(contrast, 2)
220
+
221
+ # @measure_time
193
222
  def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
194
223
  # cid用0xfffd表示,连字符拆开
195
224
  # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -274,9 +303,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
274
303
  span['chars'] = []
275
304
  new_spans.append(span)
276
305
 
277
- empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
306
+ need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)
278
307
 
279
- if len(empty_spans) > 0:
308
+ if len(need_ocr_spans) > 0:
280
309
 
281
310
  # 初始化ocr模型
282
311
  atom_model_manager = AtomModelSingleton()
@@ -287,9 +316,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
287
316
  lang=lang
288
317
  )
289
318
 
290
- for span in empty_spans:
319
+ for span in need_ocr_spans:
291
320
  # 对span的bbox截图再ocr
292
321
  span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
322
+
323
+ # 计算span的对比度,低于0.20的span不进行ocr
324
+ if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
325
+ spans.remove(span)
326
+ continue
327
+
293
328
  ocr_res = ocr_model.ocr(span_img, det=False)
294
329
  if ocr_res and len(ocr_res) > 0:
295
330
  if len(ocr_res[0]) > 0:
@@ -306,24 +341,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
306
341
 
307
342
  def model_init(model_name: str):
308
343
  from transformers import LayoutLMv3ForTokenClassification
309
- device = get_device()
310
- if torch.cuda.is_available():
311
- device = torch.device('cuda')
312
- if torch.cuda.is_bf16_supported():
313
- supports_bfloat16 = True
314
- else:
315
- supports_bfloat16 = False
316
- elif str(device).startswith("npu"):
317
- import torch_npu
318
- if torch_npu.npu.is_available():
319
- device = torch.device('npu')
320
- supports_bfloat16 = False
321
- else:
322
- device = torch.device('cpu')
323
- supports_bfloat16 = False
324
- else:
325
- device = torch.device('cpu')
326
- supports_bfloat16 = False
344
+ device = torch.device(get_device())
327
345
 
328
346
  if model_name == 'layoutreader':
329
347
  # 检测modelscope的缓存目录是否存在
@@ -339,9 +357,6 @@ def model_init(model_name: str):
339
357
  model = LayoutLMv3ForTokenClassification.from_pretrained(
340
358
  'hantian/layoutreader'
341
359
  )
342
- # 检查设备是否支持 bfloat16
343
- if supports_bfloat16:
344
- model.bfloat16()
345
360
  model.to(device).eval()
346
361
  else:
347
362
  logger.error('model name not allow')
@@ -404,10 +419,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
404
419
  block_bboxes.append(block['bbox'])
405
420
 
406
421
  # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
407
- if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
408
- block['virtual_lines'] = copy.deepcopy(block['lines'])
409
- block['lines'] = copy.deepcopy(block['real_lines'])
410
- del block['real_lines']
422
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
423
+ if 'real_lines' in block:
424
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
425
+ block['lines'] = copy.deepcopy(block['real_lines'])
426
+ del block['real_lines']
411
427
 
412
428
  import numpy as np
413
429
 
@@ -476,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
476
492
  else:
477
493
  return [[x0, y0, x1, y1]]
478
494
 
479
-
495
+ # @measure_time
480
496
  def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
481
497
  page_line_list = []
482
498
 
@@ -910,7 +926,6 @@ def pdf_parse_union(
910
926
  magic_model = MagicModel(model_list, dataset)
911
927
 
912
928
  """根据输入的起始范围解析pdf"""
913
- # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
914
929
  end_page_id = (
915
930
  end_page_id
916
931
  if end_page_id is not None and end_page_id >= 0
@@ -947,6 +962,8 @@ def pdf_parse_union(
947
962
  )
948
963
  pdf_info_dict[f'page_{page_id}'] = page_info
949
964
 
965
+ # PerformanceStats.print_stats()
966
+
950
967
  """分段"""
951
968
  para_split(pdf_info_dict)
952
969
 
@@ -3,6 +3,7 @@ import json
3
3
  from loguru import logger
4
4
  from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
5
5
  from openai import OpenAI
6
+ import ast
6
7
 
7
8
 
8
9
  #@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
@@ -119,11 +120,12 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
119
120
  - 在完成初步分级后,仔细检查分级结果的合理性
120
121
  - 根据上下文关系和逻辑顺序,对不合理的分级进行微调
121
122
  - 确保最终的分级结果符合文档的实际结构和逻辑
123
+ - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
122
124
 
123
125
  IMPORTANT:
124
- 请直接返回优化过的由标题层级组成的json,格式如下:
125
- {{"0":1,"1":2,"2":2,"3":3}}
126
- 返回的json不需要格式化。
126
+ 请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
127
+ {{0:1,1:2,2:2,3:3}}
128
+ 不需要对字典格式化,不需要返回任何其他信息。
127
129
 
128
130
  Input title list:
129
131
  {title_dict}
@@ -133,7 +135,7 @@ Corrected title list:
133
135
 
134
136
  retry_count = 0
135
137
  max_retries = 3
136
- json_completion = None
138
+ dict_completion = None
137
139
 
138
140
  while retry_count < max_retries:
139
141
  try:
@@ -143,24 +145,20 @@ Corrected title list:
143
145
  {'role': 'user', 'content': title_optimize_prompt}],
144
146
  temperature=0.7,
145
147
  )
146
- json_completion = json.loads(completion.choices[0].message.content)
148
+ # logger.info(f"Title completion: {completion.choices[0].message.content}")
149
+ dict_completion = ast.literal_eval(completion.choices[0].message.content)
150
+ # logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
147
151
 
148
- # logger.info(f"Title completion: {json_completion}")
149
- # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
150
-
151
- if len(json_completion) == len(title_dict):
152
+ if len(dict_completion) == len(title_dict):
152
153
  for i, origin_title_block in enumerate(origin_title_list):
153
- origin_title_block["level"] = int(json_completion[str(i)])
154
+ origin_title_block["level"] = int(dict_completion[i])
154
155
  break
155
156
  else:
156
157
  logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
157
158
  retry_count += 1
158
159
  except Exception as e:
159
- if isinstance(e, json.decoder.JSONDecodeError):
160
- logger.warning(f"JSON decode error on attempt {retry_count + 1}: {e}")
161
- else:
162
- logger.exception(e)
160
+ logger.exception(e)
163
161
  retry_count += 1
164
162
 
165
- if json_completion is None:
166
- logger.error("Failed to decode JSON after maximum retries.")
163
+ if dict_completion is None:
164
+ logger.error("Failed to decode dict after maximum retries.")
@@ -60,6 +60,19 @@ def merge_spans_to_line(spans, threshold=0.6):
60
60
  return lines
61
61
 
62
62
 
63
+ def span_block_type_compatible(span_type, block_type):
64
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
65
+ return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
66
+ elif span_type == ContentType.InterlineEquation:
67
+ return block_type in [BlockType.InterlineEquation, BlockType.Text]
68
+ elif span_type == ContentType.Image:
69
+ return block_type in [BlockType.ImageBody]
70
+ elif span_type == ContentType.Table:
71
+ return block_type in [BlockType.TableBody]
72
+ else:
73
+ return False
74
+
75
+
63
76
  def fill_spans_in_blocks(blocks, spans, radio):
64
77
  """将allspans中的span按位置关系,放入blocks中."""
65
78
  block_with_spans = []
@@ -78,8 +91,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
78
91
  block_spans = []
79
92
  for span in spans:
80
93
  span_bbox = span['bbox']
81
- if calculate_overlap_area_in_bbox1_area_ratio(
82
- span_bbox, block_bbox) > radio:
94
+ if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
83
95
  block_spans.append(span)
84
96
 
85
97
  block_dict['spans'] = block_spans
@@ -1,49 +1,49 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.1.0
3
+ Version: 1.2.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE.md
9
- Requires-Dist: boto3 >=1.28.43
10
- Requires-Dist: Brotli >=1.1.0
11
- Requires-Dist: click >=8.1.7
12
- Requires-Dist: fast-langdetect >=0.2.3
13
- Requires-Dist: loguru >=0.6.0
14
- Requires-Dist: numpy <2.0.0,>=1.21.6
15
- Requires-Dist: pydantic >=2.7.2
16
- Requires-Dist: PyMuPDF <=1.24.14,>=1.24.9
17
- Requires-Dist: scikit-learn >=1.0.2
18
- Requires-Dist: torch >=2.2.2
9
+ Requires-Dist: boto3>=1.28.43
10
+ Requires-Dist: Brotli>=1.1.0
11
+ Requires-Dist: click>=8.1.7
12
+ Requires-Dist: fast-langdetect>=0.2.3
13
+ Requires-Dist: loguru>=0.6.0
14
+ Requires-Dist: numpy<2.0.0,>=1.21.6
15
+ Requires-Dist: pydantic>=2.7.2
16
+ Requires-Dist: PyMuPDF<=1.24.14,>=1.24.9
17
+ Requires-Dist: scikit-learn>=1.0.2
18
+ Requires-Dist: torch>=2.2.2
19
19
  Requires-Dist: transformers
20
- Requires-Dist: pdfminer.six ==20231228
20
+ Requires-Dist: pdfminer.six==20231228
21
21
  Provides-Extra: full
22
- Requires-Dist: unimernet ==0.2.3 ; extra == 'full'
23
- Requires-Dist: torch <=2.3.1,>=2.2.2 ; extra == 'full'
24
- Requires-Dist: torchvision <=0.18.1,>=0.17.2 ; extra == 'full'
25
- Requires-Dist: ultralytics >=8.3.48 ; extra == 'full'
26
- Requires-Dist: paddleocr ==2.7.3 ; extra == 'full'
27
- Requires-Dist: struct-eqtable ==0.3.2 ; extra == 'full'
28
- Requires-Dist: einops ; extra == 'full'
29
- Requires-Dist: accelerate ; extra == 'full'
30
- Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full'
31
- Requires-Dist: rapidocr-paddle ; extra == 'full'
32
- Requires-Dist: rapidocr-onnxruntime ; extra == 'full'
33
- Requires-Dist: rapid-table <2.0.0,>=1.0.3 ; extra == 'full'
34
- Requires-Dist: PyYAML ; extra == 'full'
35
- Requires-Dist: openai ; extra == 'full'
36
- Requires-Dist: detectron2 ; extra == 'full'
37
- Requires-Dist: paddlepaddle ==3.0.0b1 ; (platform_system == "Linux") and extra == 'full'
38
- Requires-Dist: matplotlib ; (platform_system == "Linux" or platform_system == "Darwin") and extra == 'full'
39
- Requires-Dist: matplotlib <=3.9.0 ; (platform_system == "Windows") and extra == 'full'
40
- Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_system == "Darwin") and extra == 'full'
22
+ Requires-Dist: unimernet==0.2.3; extra == "full"
23
+ Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
24
+ Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
25
+ Requires-Dist: ultralytics>=8.3.48; extra == "full"
26
+ Requires-Dist: paddleocr==2.7.3; extra == "full"
27
+ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
28
+ Requires-Dist: einops; extra == "full"
29
+ Requires-Dist: accelerate; extra == "full"
30
+ Requires-Dist: doclayout-yolo==0.0.2b1; extra == "full"
31
+ Requires-Dist: rapidocr-paddle<2.0.0,>=1.4.5; extra == "full"
32
+ Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.4; extra == "full"
33
+ Requires-Dist: rapid-table<2.0.0,>=1.0.3; extra == "full"
34
+ Requires-Dist: PyYAML; extra == "full"
35
+ Requires-Dist: openai; extra == "full"
36
+ Requires-Dist: detectron2; extra == "full"
37
+ Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
38
+ Requires-Dist: paddlepaddle==3.0.0rc1; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
39
+ Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
40
+ Requires-Dist: paddlepaddle==2.6.1; platform_system == "Windows" and extra == "full"
41
41
  Provides-Extra: lite
42
- Requires-Dist: paddleocr ==2.7.3 ; extra == 'lite'
43
- Requires-Dist: paddlepaddle ==3.0.0b1 ; (platform_system == "Linux") and extra == 'lite'
44
- Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_system == "Darwin") and extra == 'lite'
42
+ Requires-Dist: paddleocr==2.7.3; extra == "lite"
43
+ Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
44
+ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
45
45
  Provides-Extra: old_linux
46
- Requires-Dist: albumentations <=1.4.20 ; extra == 'old_linux'
46
+ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
47
47
 
48
48
  <div align="center" xmlns="http://www.w3.org/1999/html">
49
49
  <!-- logo -->
@@ -94,6 +94,19 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
94
94
  </div>
95
95
 
96
96
  # Changelog
97
+ - 2025/03/03 1.2.1 released, fixing several bugs:
98
+ - Fixed an issue where punctuation marks were affected when converting full-width letters and numbers to half-width
99
+ - Fixed caption matching inaccuracies in certain scenarios
100
+ - Fixed formula span loss issues in certain scenarios
101
+ - 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
102
+ - Performance Optimization
103
+ - Increased classification speed for PDF documents in auto mode.
104
+ - Parsing Optimization
105
+ - Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.
106
+ - Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.
107
+ - Bug Fixes
108
+ - Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.
109
+ - Resolved an issue where title blocks were empty in some cases.
97
110
  - 2025/01/22 1.1.0 released. In this version we have focused on improving parsing accuracy and efficiency:
98
111
  - Model capability upgrade (requires re-executing the [model download process](docs/how_to_download_models_en.md) to obtain incremental updates of model files)
99
112
  - The layout recognition model has been upgraded to the latest `doclayout_yolo(2501)` model, improving layout recognition accuracy.
@@ -280,10 +293,9 @@ There are three different ways to experience MinerU:
280
293
 
281
294
  ### Online Demo
282
295
 
283
- Stable Version (Stable version verified by QA):
284
- [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
296
+ Synced with dev branch updates:
285
297
 
286
- Test Version (Synced with dev branch updates, testing new features):
298
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
287
299
  [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
288
300
  [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
289
301
 
@@ -292,8 +304,8 @@ Test Version (Synced with dev branch updates, testing new features):
292
304
  #### 1. Install magic-pdf
293
305
 
294
306
  ```bash
295
- conda create -n MinerU python=3.10
296
- conda activate MinerU
307
+ conda create -n mineru python=3.10
308
+ conda activate mineru
297
309
  pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
298
310
  ```
299
311
 
@@ -353,7 +365,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
353
365
  ```bash
354
366
  wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
355
367
  docker build -t mineru:latest .
356
- docker run --rm -it --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
368
+ docker run -it --name mineru --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
357
369
  magic-pdf --help
358
370
  ```
359
371
 
@@ -1,5 +1,5 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_union_core_v2.py,sha256=qh-Vj7v8EenC_f_MNMa76i1DVuckulQo1QC1IOw8LRE,37723
2
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=Pt3UtPQgOrF2YudQqrwVVC767_271E-LRg2aUsiggXg,38435
3
3
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
5
5
  magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
@@ -24,10 +24,10 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
24
24
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
25
25
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
26
26
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=RQ47F2CT0Zgmg1rZoqYj5IW5msqoCTEF6GEHi3mVd8U,12989
28
- magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
29
- magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
30
- magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
27
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=12WeBVxnBzzruk8CfYqqsV2dpH-mDWmE4Osl1RlRoc8,13741
28
+ magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
29
+ magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
30
+ magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
31
31
  magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
@@ -47,23 +47,24 @@ magic_pdf/libs/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,14
47
47
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
48
48
  magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
49
49
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
50
- magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
50
+ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3396
51
51
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
52
+ magic_pdf/libs/performance_stats.py,sha256=BFi4NIsUYlanznYoTVq4hBpj4NOuShAlWBHzebBGVYM,1702
52
53
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
53
- magic_pdf/libs/version.py,sha256=LGVQyDsWifdACo7qztwb8RWWHds1E7uQ-ZqD8SAjyw4,22
54
+ magic_pdf/libs/version.py,sha256=Mlm4Gvmb_6yQxwUbv2Ksc-BJFXLPg9H1Vt2iV7wXrA4,22
54
55
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
55
56
  magic_pdf/model/batch_analyze.py,sha256=sbrgOJWycb1Ep6e62CPi6jEyG6VSeklIxc4PmrqaLhM,11933
56
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=zryHy3ljcEvDqOWivXZQrpau_jPtt6x1lLOZaOkk_tI,8153
57
- magic_pdf/model/magic_model.py,sha256=Nt74oZGYUcbm4qdOQtN-hbKhXxlWO2LVv3K9yXvteWY,25204
57
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=T0-h4QmSIDXRzgF5uWO4jQrwIot221l26PXU52xeKiA,7933
58
+ magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
58
59
  magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
59
- magic_pdf/model/pdf_extract_kit.py,sha256=S-UVZQroUe-eEAJzuOucdCh9FCAWy2DVAZow3dGUiWI,12520
60
+ magic_pdf/model/pdf_extract_kit.py,sha256=Rd51VNZPKRA_tUbDss-b44d84K6WDG2S87a37Ax7HUA,12224
60
61
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
61
62
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
- magic_pdf/model/sub_modules/model_init.py,sha256=RCv6BkRLEFBKrfVReRvIvbRQ21BZLz8jj-AKQhwHkhw,6520
63
+ magic_pdf/model/sub_modules/model_init.py,sha256=Ltwi3Nd5PdVVXRF9fto5nImFVg6w-twAMzOLV_F-c3g,7693
63
64
  magic_pdf/model/sub_modules/model_utils.py,sha256=2pI1Xcr2zCF3b64e4WoFtIbjSmTVYBE4zjyHB23gvmE,2488
64
65
  magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
65
66
  magic_pdf/model/sub_modules/language_detection/utils.py,sha256=5nec_loLyYCJ5o6n38AYLz2SKmRvHDCBdt6ka84EaGM,3096
66
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=GW_9WkqIzpJm1MFJexZ2ZvA6AjoqM-6yh8p4LupJhas,4762
67
+ magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=bl2i7kweoJNdj47FlE9h0B_-nNQrMcW9mCLQ1puMEH8,4893
67
68
  magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
68
69
  magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
70
  magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=gy7rc8poO-Zr8511NJjuBV8Uryq5k3JKrstLtCONg0c,2237
@@ -92,7 +93,7 @@ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
92
93
  magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
93
94
  magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
95
  magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=R05qw54QuLl2btNWdkxf4yCjDeEj8o0786e-gz_Xv8k,5290
96
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=HPNetRfQeHoHfRTzFEaIjLSHfjrxRvS-EaApMUebZuQ,8020
96
97
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
98
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
98
99
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -115,14 +116,13 @@ magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1R
115
116
  magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
116
117
  magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
117
118
  magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
118
- magic_pdf/post_proc/llm_aided.py,sha256=p-XwDObLkDv5rPxsdI7092MP-rHCMr1uAUq3fs7Zc-E,6334
119
- magic_pdf/post_proc/llm_aided_ocr.py,sha256=89kxzEQVqNGSUtmvgcg2AVDDmgb43bamdRxXbwS2FxQ,33557
119
+ magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
120
120
  magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
121
121
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
122
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
123
123
  magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
124
124
  magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
125
- magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
125
+ magic_pdf/pre_proc/ocr_dict_merge.py,sha256=4Z3aHZ9sxzijkVpOCENslvUcpp7DXgNID4Gl3pxwIg4,5512
126
126
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=xrgC9vR0poklZuY4Og41pZVdXzuaGFg3BnQ01X60dpo,3102
127
127
  magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
128
128
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
@@ -139,9 +139,9 @@ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,838
139
139
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
140
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
141
141
  magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
142
- magic_pdf-1.1.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-1.1.0.dist-info/METADATA,sha256=Ud48caL9BHS-ZuLN-3VpswLJFPqao7KqY0aqfF0ApOo,40958
144
- magic_pdf-1.1.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
145
- magic_pdf-1.1.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-1.1.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-1.1.0.dist-info/RECORD,,
142
+ magic_pdf-1.2.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-1.2.1.dist-info/METADATA,sha256=oMxODNFW4g154Rrh9g2sE9_irmB6x7j_5KmF7dRYEPQ,40994
144
+ magic_pdf-1.2.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-1.2.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-1.2.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-1.2.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.2)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5