magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. magic_pdf/data/batch_build_dataset.py +156 -0
  2. magic_pdf/data/dataset.py +44 -24
  3. magic_pdf/data/utils.py +108 -9
  4. magic_pdf/dict2md/ocr_mkcontent.py +4 -3
  5. magic_pdf/libs/pdf_image_tools.py +11 -6
  6. magic_pdf/libs/performance_stats.py +12 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/batch_analyze.py +175 -201
  9. magic_pdf/model/doc_analyze_by_custom_model.py +137 -92
  10. magic_pdf/model/pdf_extract_kit.py +5 -38
  11. magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
  12. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
  13. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
  14. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
  15. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
  16. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
  17. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
  18. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
  19. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
  20. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
  21. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
  22. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
  25. magic_pdf/model/sub_modules/model_init.py +50 -37
  26. magic_pdf/model/sub_modules/model_utils.py +17 -11
  27. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
  29. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
  32. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
  33. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
  34. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
  35. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
  36. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
  37. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
  38. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
  39. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
  40. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
  41. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
  42. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
  43. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
  44. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
  45. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
  46. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
  47. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
  48. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
  49. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
  50. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
  51. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
  52. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
  53. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
  54. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
  55. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
  56. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
  57. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
  58. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
  59. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
  60. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
  61. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
  62. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
  63. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
  64. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
  65. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
  66. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
  67. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
  68. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
  69. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
  70. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
  71. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
  72. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
  73. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
  74. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
  75. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
  76. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
  77. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
  78. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
  79. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +10 -18
  80. magic_pdf/pdf_parse_union_core_v2.py +112 -74
  81. magic_pdf/post_proc/para_split_v3.py +16 -13
  82. magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
  83. magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
  84. magic_pdf/resources/model_config/model_configs.yaml +1 -1
  85. magic_pdf/tools/cli.py +30 -12
  86. magic_pdf/tools/common.py +90 -12
  87. {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/METADATA +51 -41
  88. magic_pdf-1.3.0.dist-info/RECORD +202 -0
  89. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
  90. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
  91. magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
  92. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
  93. magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
  94. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
  95. magic_pdf-1.2.1.dist-info/RECORD +0 -147
  96. /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
  97. /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
  98. /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
  99. {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/LICENSE.md +0 -0
  100. {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/WHEEL +0 -0
  101. {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/entry_points.txt +0 -0
  102. {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/top_level.txt +0 -0
@@ -41,6 +41,57 @@ def check_chars_is_overlap_in_span(chars):
41
41
  return False
42
42
 
43
43
 
44
+ def remove_x_overlapping_chars(span, median_width):
45
+ """
46
+ Remove characters from a span that overlap significantly on the x-axis.
47
+
48
+ Args:
49
+ median_width:
50
+ span (dict): A span containing a list of chars, each with bbox coordinates
51
+ in the format [x0, y0, x1, y1]
52
+
53
+ Returns:
54
+ dict: The span with overlapping characters removed
55
+ """
56
+ if 'chars' not in span or len(span['chars']) < 2:
57
+ return span
58
+
59
+ overlap_threshold = median_width * 0.3
60
+
61
+ i = 0
62
+ while i < len(span['chars']) - 1:
63
+ char1 = span['chars'][i]
64
+ char2 = span['chars'][i + 1]
65
+
66
+ # Calculate overlap width
67
+ x_left = max(char1['bbox'][0], char2['bbox'][0])
68
+ x_right = min(char1['bbox'][2], char2['bbox'][2])
69
+
70
+ if x_right > x_left: # There is overlap
71
+ overlap_width = x_right - x_left
72
+
73
+ if overlap_width > overlap_threshold:
74
+ if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
75
+ # Determine which character to remove
76
+ width1 = char1['bbox'][2] - char1['bbox'][0]
77
+ width2 = char2['bbox'][2] - char2['bbox'][0]
78
+ if width1 < width2:
79
+ # Remove the narrower character
80
+ span['chars'].pop(i)
81
+ else:
82
+ span['chars'].pop(i + 1)
83
+ else:
84
+ i += 1
85
+
86
+ # Don't increment i since we need to check the new pair
87
+ else:
88
+ i += 1
89
+ else:
90
+ i += 1
91
+
92
+ return span
93
+
94
+
44
95
  def remove_overlaps_min_spans(spans):
45
96
  dropped_spans = []
46
97
  # 删除重叠spans中较小的那些
@@ -2,7 +2,7 @@ weights:
2
2
  layoutlmv3: Layout/LayoutLMv3/model_final.pth
3
3
  doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
4
4
  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
5
- unimernet_small: MFR/unimernet_small_2501
5
+ unimernet_small: MFR/unimernet_hf_small_2503
6
6
  struct_eqtable: TabRec/StructEqTable
7
7
  tablemaster: TabRec/TableMaster
8
8
  rapid_table: TabRec/RapidTable
magic_pdf/tools/cli.py CHANGED
@@ -1,15 +1,18 @@
1
1
  import os
2
2
  import shutil
3
3
  import tempfile
4
+ from pathlib import Path
5
+
4
6
  import click
5
7
  import fitz
6
8
  from loguru import logger
7
- from pathlib import Path
8
9
 
9
10
  import magic_pdf.model as model_config
11
+ from magic_pdf.data.batch_build_dataset import batch_build_dataset
10
12
  from magic_pdf.data.data_reader_writer import FileBasedDataReader
13
+ from magic_pdf.data.dataset import Dataset
11
14
  from magic_pdf.libs.version import __version__
12
- from magic_pdf.tools.common import do_parse, parse_pdf_methods
15
+ from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
13
16
  from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
14
17
 
15
18
  pdf_suffixes = ['.pdf']
@@ -87,37 +90,38 @@ without method specified, auto will be used by default.""",
87
90
  default=None,
88
91
  )
89
92
  def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
90
- model_config.__use_inside_model__ = True
91
- model_config.__model_mode__ = 'full'
92
93
  os.makedirs(output_dir, exist_ok=True)
93
94
  temp_dir = tempfile.mkdtemp()
94
95
  def read_fn(path: Path):
95
96
  if path.suffix in ms_office_suffixes:
96
97
  convert_file_to_pdf(str(path), temp_dir)
97
- fn = os.path.join(temp_dir, f"{path.stem}.pdf")
98
+ fn = os.path.join(temp_dir, f'{path.stem}.pdf')
98
99
  elif path.suffix in image_suffixes:
99
100
  with open(str(path), 'rb') as f:
100
101
  bits = f.read()
101
102
  pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
102
- fn = os.path.join(temp_dir, f"{path.stem}.pdf")
103
+ fn = os.path.join(temp_dir, f'{path.stem}.pdf')
103
104
  with open(fn, 'wb') as f:
104
105
  f.write(pdf_bytes)
105
106
  elif path.suffix in pdf_suffixes:
106
107
  fn = str(path)
107
108
  else:
108
- raise Exception(f"Unknown file suffix: {path.suffix}")
109
-
109
+ raise Exception(f'Unknown file suffix: {path.suffix}')
110
+
110
111
  disk_rw = FileBasedDataReader(os.path.dirname(fn))
111
112
  return disk_rw.read(os.path.basename(fn))
112
113
 
113
- def parse_doc(doc_path: Path):
114
+ def parse_doc(doc_path: Path, dataset: Dataset | None = None):
114
115
  try:
115
116
  file_name = str(Path(doc_path).stem)
116
- pdf_data = read_fn(doc_path)
117
+ if dataset is None:
118
+ pdf_data_or_dataset = read_fn(doc_path)
119
+ else:
120
+ pdf_data_or_dataset = dataset
117
121
  do_parse(
118
122
  output_dir,
119
123
  file_name,
120
- pdf_data,
124
+ pdf_data_or_dataset,
121
125
  [],
122
126
  method,
123
127
  debug_able,
@@ -130,9 +134,23 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
130
134
  logger.exception(e)
131
135
 
132
136
  if os.path.isdir(path):
137
+ doc_paths = []
133
138
  for doc_path in Path(path).glob('*'):
134
139
  if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
135
- parse_doc(doc_path)
140
+ if doc_path.suffix in ms_office_suffixes:
141
+ convert_file_to_pdf(str(doc_path), temp_dir)
142
+ doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
143
+ elif doc_path.suffix in image_suffixes:
144
+ with open(str(doc_path), 'rb') as f:
145
+ bits = f.read()
146
+ pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
147
+ fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
148
+ with open(fn, 'wb') as f:
149
+ f.write(pdf_bytes)
150
+ doc_path = Path(fn)
151
+ doc_paths.append(doc_path)
152
+ datasets = batch_build_dataset(doc_paths, 4, lang)
153
+ batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
136
154
  else:
137
155
  parse_doc(Path(path))
138
156
 
magic_pdf/tools/common.py CHANGED
@@ -8,10 +8,10 @@ import magic_pdf.model as model_config
8
8
  from magic_pdf.config.enums import SupportedPdfParseMethod
9
9
  from magic_pdf.config.make_content_config import DropMode, MakeMode
10
10
  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
11
- from magic_pdf.data.dataset import PymuDocDataset
11
+ from magic_pdf.data.dataset import Dataset, PymuDocDataset
12
12
  from magic_pdf.libs.draw_bbox import draw_char_bbox
13
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
14
- from magic_pdf.operators.models import InferenceResult
13
+ from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
14
+ doc_analyze)
15
15
 
16
16
  # from io import BytesIO
17
17
  # from pypdf import PdfReader, PdfWriter
@@ -67,13 +67,13 @@ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_i
67
67
  return output_bytes
68
68
 
69
69
 
70
- def do_parse(
70
+ def _do_parse(
71
71
  output_dir,
72
72
  pdf_file_name,
73
- pdf_bytes,
73
+ pdf_bytes_or_dataset,
74
74
  model_list,
75
75
  parse_method,
76
- debug_able,
76
+ debug_able=False,
77
77
  f_draw_span_bbox=True,
78
78
  f_draw_layout_bbox=True,
79
79
  f_dump_md=True,
@@ -92,16 +92,21 @@ def do_parse(
92
92
  formula_enable=None,
93
93
  table_enable=None,
94
94
  ):
95
+ from magic_pdf.operators.models import InferenceResult
95
96
  if debug_able:
96
97
  logger.warning('debug mode is on')
97
98
  f_draw_model_bbox = True
98
99
  f_draw_line_sort_bbox = True
99
100
  # f_draw_char_bbox = True
100
101
 
101
- pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
102
- pdf_bytes, start_page_id, end_page_id
103
- )
104
-
102
+ if isinstance(pdf_bytes_or_dataset, bytes):
103
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
104
+ pdf_bytes_or_dataset, start_page_id, end_page_id
105
+ )
106
+ ds = PymuDocDataset(pdf_bytes, lang=lang)
107
+ else:
108
+ ds = pdf_bytes_or_dataset
109
+ pdf_bytes = ds._raw_data
105
110
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
106
111
 
107
112
  image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
@@ -109,8 +114,6 @@ def do_parse(
109
114
  )
110
115
  image_dir = str(os.path.basename(local_image_dir))
111
116
 
112
- ds = PymuDocDataset(pdf_bytes, lang=lang)
113
-
114
117
  if len(model_list) == 0:
115
118
  if model_config.__use_inside_model__:
116
119
  if parse_method == 'auto':
@@ -241,5 +244,80 @@ def do_parse(
241
244
 
242
245
  logger.info(f'local output dir is {local_md_dir}')
243
246
 
247
+ def do_parse(
248
+ output_dir,
249
+ pdf_file_name,
250
+ pdf_bytes_or_dataset,
251
+ model_list,
252
+ parse_method,
253
+ debug_able=False,
254
+ f_draw_span_bbox=True,
255
+ f_draw_layout_bbox=True,
256
+ f_dump_md=True,
257
+ f_dump_middle_json=True,
258
+ f_dump_model_json=True,
259
+ f_dump_orig_pdf=True,
260
+ f_dump_content_list=True,
261
+ f_make_md_mode=MakeMode.MM_MD,
262
+ f_draw_model_bbox=False,
263
+ f_draw_line_sort_bbox=False,
264
+ f_draw_char_bbox=False,
265
+ start_page_id=0,
266
+ end_page_id=None,
267
+ lang=None,
268
+ layout_model=None,
269
+ formula_enable=None,
270
+ table_enable=None,
271
+ ):
272
+ parallel_count = 1
273
+ if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
274
+ parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])
275
+
276
+ if parallel_count > 1:
277
+ if isinstance(pdf_bytes_or_dataset, bytes):
278
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
279
+ pdf_bytes_or_dataset, start_page_id, end_page_id
280
+ )
281
+ ds = PymuDocDataset(pdf_bytes, lang=lang)
282
+ else:
283
+ ds = pdf_bytes_or_dataset
284
+ batch_do_parse(output_dir, [pdf_file_name], [ds], parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang)
285
+ else:
286
+ _do_parse(output_dir, pdf_file_name, pdf_bytes_or_dataset, model_list, parse_method, debug_able, start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox)
287
+
288
+
289
+ def batch_do_parse(
290
+ output_dir,
291
+ pdf_file_names: list[str],
292
+ pdf_bytes_or_datasets: list[bytes | Dataset],
293
+ parse_method,
294
+ debug_able=False,
295
+ f_draw_span_bbox=True,
296
+ f_draw_layout_bbox=True,
297
+ f_dump_md=True,
298
+ f_dump_middle_json=True,
299
+ f_dump_model_json=True,
300
+ f_dump_orig_pdf=True,
301
+ f_dump_content_list=True,
302
+ f_make_md_mode=MakeMode.MM_MD,
303
+ f_draw_model_bbox=False,
304
+ f_draw_line_sort_bbox=False,
305
+ f_draw_char_bbox=False,
306
+ lang=None,
307
+ layout_model=None,
308
+ formula_enable=None,
309
+ table_enable=None,
310
+ ):
311
+ dss = []
312
+ for v in pdf_bytes_or_datasets:
313
+ if isinstance(v, bytes):
314
+ dss.append(PymuDocDataset(v, lang=lang))
315
+ else:
316
+ dss.append(v)
317
+
318
+ infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
319
+ for idx, infer_result in enumerate(infer_results):
320
+ _do_parse(output_dir, pdf_file_names[idx], dss[idx], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang)
321
+
244
322
 
245
323
  parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.2.1
3
+ Version: 1.3.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,35 +9,30 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: fast-langdetect>=0.2.3
12
+ Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
13
13
  Requires-Dist: loguru>=0.6.0
14
- Requires-Dist: numpy<2.0.0,>=1.21.6
15
- Requires-Dist: pydantic>=2.7.2
16
- Requires-Dist: PyMuPDF<=1.24.14,>=1.24.9
14
+ Requires-Dist: numpy>=1.21.6
15
+ Requires-Dist: pydantic<2.11,>=2.7.2
16
+ Requires-Dist: PyMuPDF<1.25.0,>=1.24.9
17
17
  Requires-Dist: scikit-learn>=1.0.2
18
- Requires-Dist: torch>=2.2.2
19
- Requires-Dist: transformers
18
+ Requires-Dist: torch!=2.5.0,!=2.5.1,<=2.6.0,>=2.2.2
19
+ Requires-Dist: torchvision
20
+ Requires-Dist: transformers<5.0.0,>=4.49.0
20
21
  Requires-Dist: pdfminer.six==20231228
22
+ Requires-Dist: tqdm>=4.67.1
21
23
  Provides-Extra: full
22
- Requires-Dist: unimernet==0.2.3; extra == "full"
23
- Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
24
- Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
25
24
  Requires-Dist: ultralytics>=8.3.48; extra == "full"
26
- Requires-Dist: paddleocr==2.7.3; extra == "full"
27
- Requires-Dist: struct-eqtable==0.3.2; extra == "full"
28
- Requires-Dist: einops; extra == "full"
29
- Requires-Dist: accelerate; extra == "full"
30
25
  Requires-Dist: doclayout-yolo==0.0.2b1; extra == "full"
31
- Requires-Dist: rapidocr-paddle<2.0.0,>=1.4.5; extra == "full"
32
- Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.4; extra == "full"
26
+ Requires-Dist: dill<1,>=0.3.9; extra == "full"
33
27
  Requires-Dist: rapid-table<2.0.0,>=1.0.3; extra == "full"
34
- Requires-Dist: PyYAML; extra == "full"
35
- Requires-Dist: openai; extra == "full"
36
- Requires-Dist: detectron2; extra == "full"
37
- Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
38
- Requires-Dist: paddlepaddle==3.0.0rc1; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
28
+ Requires-Dist: PyYAML<7,>=6.0.2; extra == "full"
29
+ Requires-Dist: ftfy<7,>=6.3.1; extra == "full"
30
+ Requires-Dist: openai<2,>=1.70.0; extra == "full"
31
+ Requires-Dist: shapely<3,>=2.0.7; extra == "full"
32
+ Requires-Dist: pyclipper<2,>=1.3.0; extra == "full"
33
+ Requires-Dist: omegaconf<3,>=2.3.0; extra == "full"
34
+ Requires-Dist: matplotlib>=3.10; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
39
35
  Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
40
- Requires-Dist: paddlepaddle==2.6.1; platform_system == "Windows" and extra == "full"
41
36
  Provides-Extra: lite
42
37
  Requires-Dist: paddleocr==2.7.3; extra == "lite"
43
38
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
@@ -94,6 +89,23 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
94
89
  </div>
95
90
 
96
91
  # Changelog
92
+ - 2025/04/03 1.3.0 released. This version includes many optimizations and improvements:
93
+ - Installation and compatibility optimization
94
+ - By removing the use of `layoutlmv3` in layout, resolved compatibility issues caused by `detectron2`.
95
+ - Torch version compatibility extended to 2.2~2.6 (excluding 2.5).
96
+ - CUDA compatibility supports 11.8/12.4/12.6 (CUDA version determined by torch), resolving compatibility issues for some users with 50-series and H-series GPUs.
97
+ - Python compatible versions expanded to 3.10~3.12, solving the problem of automatic downgrade to 0.6.1 during installation in non-3.10 environments.
98
+ - Offline deployment process optimized; after a successful deployment, no internet connection is required to download any model files.
99
+ - Performance optimization
100
+ - By supporting batch processing of multiple PDF files ([script example](demo/batch_demo.py)), improved parsing speed for small files in batches (compared to version 1.0.1, formula parsing speed increased by over 1400%, overall parsing speed increased by over 500%).
101
+ - Optimized loading and usage of the mfr model, reducing GPU memory usage and improving parsing speed (requires re-execution of the [model download process](docs/how_to_download_models_en.md) to obtain incremental updates of model files).
102
+ - Optimized GPU memory usage, requiring only a minimum of 6GB to run this project.
103
+ - Improved running speed on MPS devices.
104
+ - Parsing effect optimization
105
+ - Updated the mfr model to `unimernet(2503)`, solving the issue of lost line breaks in multi-line formulas.
106
+ - Usability Optimization
107
+ - By using `paddleocr2torch`, completely replaced the use of the `paddle` framework and `paddleocr` in the project, resolving conflicts between `paddle` and `torch`, as well as thread safety issues caused by the `paddle` framework.
108
+ - Added a real-time progress bar during the parsing process to accurately track progress, making the wait less painful.
97
109
  - 2025/03/03 1.2.1 released, fixed several bugs:
98
110
  - Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
99
111
  - Fixed caption matching inaccuracies in certain scenarios
@@ -262,7 +274,7 @@ There are three different ways to experience MinerU:
262
274
  </tr>
263
275
  <tr>
264
276
  <td colspan="3">Python Version</td>
265
- <td colspan="3">3.10(Please make sure to create a Python 3.10 virtual environment using conda)</td>
277
+ <td colspan="3">3.10~3.12</td>
266
278
  </tr>
267
279
  <tr>
268
280
  <td colspan="3">Nvidia Driver Version</td>
@@ -272,8 +284,8 @@ There are three different ways to experience MinerU:
272
284
  </tr>
273
285
  <tr>
274
286
  <td colspan="3">CUDA Environment</td>
275
- <td>Automatic installation [12.1 (pytorch) + 11.8 (paddle)]</td>
276
- <td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
287
+ <td>11.8/12.4/12.6</td>
288
+ <td>11.8/12.4/12.6</td>
277
289
  <td>None</td>
278
290
  </tr>
279
291
  <tr>
@@ -283,11 +295,11 @@ There are three different ways to experience MinerU:
283
295
  <td>None</td>
284
296
  </tr>
285
297
  <tr>
286
- <td rowspan="2">GPU Hardware Support List</td>
287
- <td colspan="2">GPU VRAM 8GB or more</td>
288
- <td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
289
- 8G VRAM can enable all acceleration features</td>
290
- <td rowspan="2">None</td>
298
+ <td rowspan="2">GPU/MPS Hardware Support List</td>
299
+ <td colspan="2">GPU VRAM 6GB or more</td>
300
+ <td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
301
+ More than 6GB VRAM </td>
302
+ <td rowspan="2">Apple silicon</td>
291
303
  </tr>
292
304
  </table>
293
305
 
@@ -304,9 +316,9 @@ Synced with dev branch updates:
304
316
  #### 1. Install magic-pdf
305
317
 
306
318
  ```bash
307
- conda create -n mineru python=3.10
319
+ conda create -n mineru 'python<3.13' -y
308
320
  conda activate mineru
309
- pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
321
+ pip install -U "magic-pdf[full]"
310
322
  ```
311
323
 
312
324
  #### 2. Download model weight files
@@ -331,7 +343,7 @@ You can modify certain configurations in this file to enable or disable features
331
343
  {
332
344
  // other config
333
345
  "layout-config": {
334
- "model": "doclayout_yolo" // Please change to "layoutlmv3" when using layoutlmv3.
346
+ "model": "doclayout_yolo"
335
347
  },
336
348
  "formula-config": {
337
349
  "mfd_model": "yolo_v8_mfd",
@@ -339,8 +351,8 @@ You can modify certain configurations in this file to enable or disable features
339
351
  "enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
340
352
  },
341
353
  "table-config": {
342
- "model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
343
- "sub_model": "slanet_plus", // When the model is "rapid_table", you can choose a sub_model. The options are "slanet_plus" and "unitable"
354
+ "model": "rapid_table",
355
+ "sub_model": "slanet_plus",
344
356
  "enable": true, // The table recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
345
357
  "max_time": 400
346
358
  }
@@ -355,7 +367,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
355
367
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
356
368
  - Quick Deployment with Docker
357
369
  > [!IMPORTANT]
358
- > Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
370
+ > Docker requires a GPU with at least 6GB of VRAM, and all acceleration features are enabled by default.
359
371
  >
360
372
  > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
361
373
  >
@@ -377,7 +389,7 @@ If your device has NPU acceleration hardware, you can follow the tutorial below
377
389
 
378
390
  ### Using MPS
379
391
 
380
- If your device uses Apple silicon chips, you can enable MPS acceleration for certain supported tasks (such as layout detection and formula detection).
392
+ If your device uses Apple silicon chips, you can enable MPS acceleration for your tasks.
381
393
 
382
394
  You can enable MPS acceleration by setting the `device-mode` parameter to `mps` in the `magic-pdf.json` configuration file.
383
395
 
@@ -388,10 +400,6 @@ You can enable MPS acceleration by setting the `device-mode` parameter to `mps`
388
400
  }
389
401
  ```
390
402
 
391
- > [!TIP]
392
- > Since the formula recognition task cannot utilize MPS acceleration, you can disable the formula recognition feature in tasks where it is not needed to achieve optimal performance.
393
- >
394
- > You can disable the formula recognition feature by setting the `enable` parameter in the `formula-config` section to `false`.
395
403
 
396
404
  ## Usage
397
405
 
@@ -465,6 +473,8 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
465
473
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
466
474
  - [RapidTable](https://github.com/RapidAI/RapidTable)
467
475
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
476
+ - [RapidOCR](https://github.com/RapidAI/RapidOCR)
477
+ - [PaddleOCR2Pytorch](https://github.com/frotms/PaddleOCR2Pytorch)
468
478
  - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
469
479
  - [layoutreader](https://github.com/ppaanngggg/layoutreader)
470
480
  - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)