magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +134 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/Constants.py +27 -1
  9. magic_pdf/libs/boxbase.py +169 -149
  10. magic_pdf/libs/draw_bbox.py +113 -87
  11. magic_pdf/libs/ocr_content_type.py +21 -18
  12. magic_pdf/libs/version.py +1 -1
  13. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  14. magic_pdf/model/magic_model.py +230 -161
  15. magic_pdf/model/model_list.py +8 -0
  16. magic_pdf/model/pdf_extract_kit.py +135 -22
  17. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  18. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
  19. magic_pdf/model/ppTableModel.py +67 -0
  20. magic_pdf/para/para_split_v2.py +76 -74
  21. magic_pdf/pdf_parse_union_core.py +34 -6
  22. magic_pdf/pipe/AbsPipe.py +4 -1
  23. magic_pdf/pipe/OCRPipe.py +7 -4
  24. magic_pdf/pipe/TXTPipe.py +7 -4
  25. magic_pdf/pipe/UNIPipe.py +11 -6
  26. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  27. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  28. magic_pdf/resources/model_config/model_configs.yaml +3 -1
  29. magic_pdf/tools/cli.py +56 -29
  30. magic_pdf/tools/cli_dev.py +61 -64
  31. magic_pdf/tools/common.py +57 -37
  32. magic_pdf/user_api.py +17 -9
  33. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
  34. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
  35. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
  36. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
  37. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
  38. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py CHANGED
@@ -1,22 +1,25 @@
1
- import os
2
- import json as json_parse
3
1
  import copy
2
+ import json as json_parse
3
+ import os
4
+
4
5
  import click
5
6
  from loguru import logger
7
+
8
+ import magic_pdf.model as model_config
9
+ from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
10
+ drow_model_bbox)
6
11
  from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
7
- from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
8
- from magic_pdf.pipe.UNIPipe import UNIPipe
9
12
  from magic_pdf.pipe.OCRPipe import OCRPipe
10
13
  from magic_pdf.pipe.TXTPipe import TXTPipe
11
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
+ from magic_pdf.pipe.UNIPipe import UNIPipe
12
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
- import magic_pdf.model as model_config
16
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
17
 
15
18
 
16
19
  def prepare_env(output_dir, pdf_file_name, method):
17
20
  local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
18
21
 
19
- local_image_dir = os.path.join(str(local_parent_dir), "images")
22
+ local_image_dir = os.path.join(str(local_parent_dir), 'images')
20
23
  local_md_dir = local_parent_dir
21
24
  os.makedirs(local_image_dir, exist_ok=True)
22
25
  os.makedirs(local_md_dir, exist_ok=True)
@@ -29,6 +32,7 @@ def do_parse(
29
32
  pdf_bytes,
30
33
  model_list,
31
34
  parse_method,
35
+ debug_able,
32
36
  f_draw_span_bbox=True,
33
37
  f_draw_layout_bbox=True,
34
38
  f_dump_md=True,
@@ -38,24 +42,34 @@ def do_parse(
38
42
  f_dump_content_list=False,
39
43
  f_make_md_mode=MakeMode.MM_MD,
40
44
  f_draw_model_bbox=False,
45
+ start_page_id=0,
46
+ end_page_id=None,
41
47
  ):
48
+ if debug_able:
49
+ logger.warning("debug mode is on")
50
+ f_dump_content_list = True
51
+ f_draw_model_bbox = True
52
+
42
53
  orig_model_list = copy.deepcopy(model_list)
43
- local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
54
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
55
+ parse_method)
44
56
 
45
- image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
46
- local_md_dir
47
- )
57
+ image_writer, md_writer = DiskReaderWriter(
58
+ local_image_dir), DiskReaderWriter(local_md_dir)
48
59
  image_dir = str(os.path.basename(local_image_dir))
49
60
 
50
- if parse_method == "auto":
51
- jso_useful_key = {"_pdf_type": "", "model_list": model_list}
52
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
53
- elif parse_method == "txt":
54
- pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
55
- elif parse_method == "ocr":
56
- pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
61
+ if parse_method == 'auto':
62
+ jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
63
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
64
+ start_page_id=start_page_id, end_page_id=end_page_id)
65
+ elif parse_method == 'txt':
66
+ pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
67
+ start_page_id=start_page_id, end_page_id=end_page_id)
68
+ elif parse_method == 'ocr':
69
+ pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
70
+ start_page_id=start_page_id, end_page_id=end_page_id)
57
71
  else:
58
- logger.error("unknown parse method")
72
+ logger.error('unknown parse method')
59
73
  exit(1)
60
74
 
61
75
  pipe.pipe_classify()
@@ -65,58 +79,64 @@ def do_parse(
65
79
  pipe.pipe_analyze()
66
80
  orig_model_list = copy.deepcopy(pipe.model_list)
67
81
  else:
68
- logger.error("need model list input")
82
+ logger.error('need model list input')
69
83
  exit(2)
70
84
 
71
85
  pipe.pipe_parse()
72
- pdf_info = pipe.pdf_mid_data["pdf_info"]
86
+ pdf_info = pipe.pdf_mid_data['pdf_info']
73
87
  if f_draw_layout_bbox:
74
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
88
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
75
89
  if f_draw_span_bbox:
76
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
90
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
77
91
  if f_draw_model_bbox:
78
- drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
92
+ drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
79
93
 
80
- md_content = pipe.pipe_mk_markdown(
81
- image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
82
- )
94
+ md_content = pipe.pipe_mk_markdown(image_dir,
95
+ drop_mode=DropMode.NONE,
96
+ md_make_mode=f_make_md_mode)
83
97
  if f_dump_md:
84
98
  md_writer.write(
85
99
  content=md_content,
86
- path=f"{pdf_file_name}.md",
100
+ path=f'{pdf_file_name}.md',
87
101
  mode=AbsReaderWriter.MODE_TXT,
88
102
  )
89
103
 
90
104
  if f_dump_middle_json:
91
105
  md_writer.write(
92
- content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
93
- path="middle.json",
106
+ content=json_parse.dumps(pipe.pdf_mid_data,
107
+ ensure_ascii=False,
108
+ indent=4),
109
+ path=f'{pdf_file_name}_middle.json',
94
110
  mode=AbsReaderWriter.MODE_TXT,
95
111
  )
96
112
 
97
113
  if f_dump_model_json:
98
114
  md_writer.write(
99
- content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
100
- path="model.json",
115
+ content=json_parse.dumps(orig_model_list,
116
+ ensure_ascii=False,
117
+ indent=4),
118
+ path=f'{pdf_file_name}_model.json',
101
119
  mode=AbsReaderWriter.MODE_TXT,
102
120
  )
103
121
 
104
122
  if f_dump_orig_pdf:
105
123
  md_writer.write(
106
124
  content=pdf_bytes,
107
- path="origin.pdf",
125
+ path=f'{pdf_file_name}_origin.pdf',
108
126
  mode=AbsReaderWriter.MODE_BIN,
109
127
  )
110
128
 
111
129
  content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
112
130
  if f_dump_content_list:
113
131
  md_writer.write(
114
- content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
115
- path="content_list.json",
132
+ content=json_parse.dumps(content_list,
133
+ ensure_ascii=False,
134
+ indent=4),
135
+ path=f'{pdf_file_name}_content_list.json',
116
136
  mode=AbsReaderWriter.MODE_TXT,
117
137
  )
118
138
 
119
- logger.info(f"local output dir is {local_md_dir}")
139
+ logger.info(f'local output dir is {local_md_dir}')
120
140
 
121
141
 
122
- parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
142
+ parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
magic_pdf/user_api.py CHANGED
@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
25
25
  PARSE_TYPE_OCR = "ocr"
26
26
 
27
27
 
28
- def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
29
- **kwargs):
28
+ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
29
+ start_page_id=0, end_page_id=None,
30
+ *args, **kwargs):
30
31
  """
31
32
  解析文本类pdf
32
33
  """
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
34
35
  pdf_bytes,
35
36
  pdf_models,
36
37
  imageWriter,
37
- start_page_id=start_page,
38
+ start_page_id=start_page_id,
39
+ end_page_id=end_page_id,
38
40
  debug_mode=is_debug,
39
41
  )
40
42
 
@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
45
47
  return pdf_info_dict
46
48
 
47
49
 
48
- def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
49
- **kwargs):
50
+ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
51
+ start_page_id=0, end_page_id=None,
52
+ *args, **kwargs):
50
53
  """
51
54
  解析ocr类pdf
52
55
  """
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
54
57
  pdf_bytes,
55
58
  pdf_models,
56
59
  imageWriter,
57
- start_page_id=start_page,
60
+ start_page_id=start_page_id,
61
+ end_page_id=end_page_id,
58
62
  debug_mode=is_debug,
59
63
  )
60
64
 
@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
65
69
  return pdf_info_dict
66
70
 
67
71
 
68
- def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
72
+ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
69
73
  input_model_is_empty: bool = False,
74
+ start_page_id=0, end_page_id=None,
70
75
  *args, **kwargs):
71
76
  """
72
77
  ocr和文本混合的pdf,全部解析出来
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
78
83
  pdf_bytes,
79
84
  pdf_models,
80
85
  imageWriter,
81
- start_page_id=start_page,
86
+ start_page_id=start_page_id,
87
+ end_page_id=end_page_id,
82
88
  debug_mode=is_debug,
83
89
  )
84
90
  except Exception as e:
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
89
95
  if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
90
96
  logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
91
97
  if input_model_is_empty:
92
- pdf_models = doc_analyze(pdf_bytes, ocr=True)
98
+ pdf_models = doc_analyze(pdf_bytes, ocr=True,
99
+ start_page_id=start_page_id,
100
+ end_page_id=end_page_id)
93
101
  pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
94
102
  if pdf_info_dict is None:
95
103
  raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.7.0b1
3
+ Version: 0.8.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,13 +9,14 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: PyMuPDF>=1.24.9
12
+ Requires-Dist: fast-langdetect==0.2.0
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
- Requires-Dist: fast-langdetect==0.2.0
16
- Requires-Dist: wordninja>=2.0.0
17
- Requires-Dist: scikit-learn>=1.0.2
18
15
  Requires-Dist: pdfminer.six==20231228
16
+ Requires-Dist: pydantic<2.8.0,>=2.7.2
17
+ Requires-Dist: PyMuPDF>=1.24.9
18
+ Requires-Dist: scikit-learn>=1.0.2
19
+ Requires-Dist: wordninja>=2.0.0
19
20
  Provides-Extra: full
20
21
  Requires-Dist: unimernet==0.1.6; extra == "full"
21
22
  Requires-Dist: ultralytics; extra == "full"
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
39
40
  </p>
40
41
 
41
42
  <!-- icon -->
43
+
42
44
  [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
43
45
  [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
44
46
  [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
@@ -46,17 +48,26 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
46
48
  [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
47
49
  [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
48
50
  [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
51
+
52
+ [![HuggingFace](https://img.shields.io/badge/HuggingFace-Demo-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValzjI
7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC)](https://huggingface.co/spaces/opendatalab/MinerU)
53
+ [![ModelScope](https://img.shields.io/badge/ModelScope-Demo-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
54
+ [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
55
+ [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](#)
56
+
49
57
  <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
50
58
 
51
59
  <!-- language -->
60
+
52
61
  [English](README.md) | [简体中文](README_zh-CN.md)
53
62
 
54
63
  <!-- hot link -->
64
+
55
65
  <p align="center">
56
66
  <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
57
67
  </p>
58
68
 
59
69
  <!-- join us -->
70
+
60
71
  <p align="center">
61
72
  👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
62
73
  </p>
@@ -64,11 +75,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
64
75
  </div>
65
76
 
66
77
  # Changelog
78
+ - 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
79
+ - 2024/08/30: Version 0.7.1 released, added Paddle TableMaster table recognition option
67
80
  - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
68
81
  - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
69
82
  - 2024/07/05: Initial open-source release
70
83
 
71
84
  <!-- TABLE OF CONTENT -->
85
+
72
86
  <details open="open">
73
87
  <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
74
88
  <ol>
@@ -107,10 +121,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
107
121
  </ol>
108
122
  </details>
109
123
 
110
-
111
-
112
124
  # MinerU
125
+
113
126
  ## Project Introduction
127
+
114
128
  MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
115
129
  MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
116
130
  Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an [issue](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
@@ -134,6 +148,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
134
148
  If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. </br>
135
149
  If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. </br>
136
150
  There are three different ways to experience MinerU:
151
+
137
152
  - [Online Demo (No Installation Required)](#online-demo)
138
153
  - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
139
154
  - [Linux/Windows + CUDA](#Using-GPU)
@@ -191,7 +206,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
191
206
  <tr>
192
207
  <td colspan="2">Recommended Configuration 16G+ VRAM</td>
193
208
  <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
194
- 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously</td>
209
+ 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
210
+ 24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
211
+ </td>
195
212
  </tr>
196
213
  </table>
197
214
 
@@ -202,51 +219,73 @@ In non-mainline environments, due to the diversity of hardware and software conf
202
219
  ### Quick CPU Demo
203
220
 
204
221
  #### 1. Install magic-pdf
222
+
205
223
  ```bash
206
224
  conda create -n MinerU python=3.10
207
225
  conda activate MinerU
208
- pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
226
+ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
209
227
  ```
228
+
210
229
  #### 2. Download model weight files
211
230
 
212
231
  Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
232
+
213
233
  > ❗️After downloading the models, please make sure to verify the completeness of the model files.
214
- >
234
+ >
215
235
  > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
216
236
 
217
237
  #### 3. Copy and configure the template file
238
+
218
239
  You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
240
+
219
241
  > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
220
- >
242
+ >
221
243
  > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
244
+
222
245
  ```bash
223
246
  cp magic-pdf.template.json ~/magic-pdf.json
224
247
  ```
225
248
 
226
249
  Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
250
+
227
251
  > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
228
252
  >
229
253
  > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
230
- >
254
+ >
231
255
  > For example: If the models are stored in the "models" directory at the root of the D drive, the "models-dir" value should be `D:/models`.
256
+
232
257
  ```json
233
258
  {
234
259
  // other config
235
260
  "models-dir": "D:/models",
236
261
  "table-config": {
262
+ "model": "TableMaster", // Another option of this value is 'struct_eqtable'
237
263
  "is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
238
264
  "max_time": 400
239
265
  }
240
266
  }
241
267
  ```
242
268
 
243
-
244
269
  ### Using GPU
270
+
245
271
  If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
246
272
 
247
273
  - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
248
274
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
249
-
275
+ - Quick Deployment with Docker
276
+ > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
277
+ >
278
+ > Before running this Docker image, you can use the following command to check if your device supports CUDA acceleration on Docker.
279
+ >
280
+ > ```bash
281
+ > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
282
+ > ```
283
+ ```bash
284
+ wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
285
+ docker build -t mineru:latest .
286
+ docker run --rm -it --gpus=all mineru:latest /bin/bash
287
+ magic-pdf --help
288
+ ```
250
289
 
251
290
  ## Usage
252
291
 
@@ -260,12 +299,12 @@ Options:
260
299
  -v, --version display the version and exit
261
300
  -p, --path PATH local pdf filepath or directory [required]
262
301
  -o, --output-dir TEXT output local directory
263
- -m, --method [ocr|txt|auto] the method for parsing pdf.
302
+ -m, --method [ocr|txt|auto] the method for parsing pdf.
264
303
  ocr: using ocr technique to extract information from pdf,
265
304
  txt: suitable for the text-based pdf only and outperform ocr,
266
305
  auto: automatically choose the best method for parsing pdf
267
306
  from ocr and txt.
268
- without method specified, auto will be used by default.
307
+ without method specified, auto will be used by default.
269
308
  --help Show this message and exit.
270
309
 
271
310
 
@@ -280,13 +319,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
280
319
  The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
281
320
 
282
321
  ```text
283
- ├── some_pdf.md # markdown file
284
- ├── images # directory for storing images
285
- ├── layout.pdf # layout diagram
286
- ├── middle.json # MinerU intermediate processing result
287
- ├── model.json # model inference result
288
- ├── origin.pdf # original PDF file
289
- └── spans.pdf # smallest granularity bbox position information diagram
322
+ ├── some_pdf.md # markdown file
323
+ ├── images # directory for storing images
324
+ ├── some_pdf_layout.pdf # layout diagram
325
+ ├── some_pdf_middle.json # MinerU intermediate processing result
326
+ ├── some_pdf_model.json # model inference result
327
+ ├── some_pdf_origin.pdf # original PDF file
328
+ └── some_pdf_spans.pdf # smallest granularity bbox position information diagram
290
329
  ```
291
330
 
292
331
  For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
@@ -294,6 +333,7 @@ For more information about the output files, please refer to the [Output File De
294
333
  ### API
295
334
 
296
335
  Processing files from local disk
336
+
297
337
  ```python
298
338
  image_writer = DiskReaderWriter(local_image_dir)
299
339
  image_dir = str(os.path.basename(local_image_dir))
@@ -306,6 +346,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
306
346
  ```
307
347
 
308
348
  Processing files from object storage
349
+
309
350
  ```python
310
351
  s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
311
352
  image_dir = "s3://img_bucket/"
@@ -320,10 +361,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
320
361
  ```
321
362
 
322
363
  For detailed implementation, refer to:
364
+
323
365
  - [demo.py Simplest Processing Method](demo/demo.py)
324
366
  - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
325
367
 
326
-
327
368
  ### Development Guide
328
369
 
329
370
  TODO
@@ -335,30 +376,25 @@ TODO
335
376
  - [ ] Code block recognition within the text
336
377
  - [ ] Table of contents recognition
337
378
  - [x] Table recognition
338
- - [ ] Chemical formula recognition
379
+ - [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
339
380
  - [ ] Geometric shape recognition
340
381
 
341
382
  # Known Issues
383
+
342
384
  - Reading order is segmented based on rules, which can cause disordered sequences in some cases
343
385
  - Vertical text is not supported
344
386
  - Lists, code blocks, and table of contents are not yet supported in the layout model
345
387
  - Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
346
388
  - Enabling OCR may produce better results in PDFs with a high density of formulas
347
389
  - If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
348
- - **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
349
390
 
350
- | Table Size | Parsing Time |
351
- |---------------|----------------------------|
352
- | 6\*5 55kb | 37s |
353
- | 16\*12 284kb | 3m18s |
354
- | 44\*7 559kb | 4m12s |
355
391
 
356
392
  # FAQ
393
+
357
394
  [FAQ in Chinese](docs/FAQ_zh_cn.md)
358
395
 
359
396
  [FAQ in English](docs/FAQ_en_us.md)
360
397
 
361
-
362
398
  # All Thanks To Our Contributors
363
399
 
364
400
  <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
@@ -371,8 +407,8 @@ TODO
371
407
 
372
408
  This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
373
409
 
374
-
375
410
  # Acknowledgments
411
+
376
412
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
377
413
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
378
414
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
@@ -409,9 +445,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
409
445
  </a>
410
446
 
411
447
  # Magic-doc
448
+
412
449
  [Magic-Doc](https://github.com/InternLM/magic-doc) Fast ppt/pptx/doc/docx/pdf extraction tool
413
450
 
414
451
  # Magic-html
452
+
415
453
  [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
416
454
 
417
455
  # Links