magic-pdf 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +169 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +227 -161
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +68 -26
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py CHANGED
@@ -1,22 +1,25 @@
1
- import os
2
- import json as json_parse
3
1
  import copy
2
+ import json as json_parse
3
+ import os
4
+
4
5
  import click
5
6
  from loguru import logger
7
+
8
+ import magic_pdf.model as model_config
9
+ from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
10
+ drow_model_bbox)
6
11
  from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
7
- from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
8
- from magic_pdf.pipe.UNIPipe import UNIPipe
9
12
  from magic_pdf.pipe.OCRPipe import OCRPipe
10
13
  from magic_pdf.pipe.TXTPipe import TXTPipe
11
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
+ from magic_pdf.pipe.UNIPipe import UNIPipe
12
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
- import magic_pdf.model as model_config
16
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
17
 
15
18
 
16
19
  def prepare_env(output_dir, pdf_file_name, method):
17
20
  local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
18
21
 
19
- local_image_dir = os.path.join(str(local_parent_dir), "images")
22
+ local_image_dir = os.path.join(str(local_parent_dir), 'images')
20
23
  local_md_dir = local_parent_dir
21
24
  os.makedirs(local_image_dir, exist_ok=True)
22
25
  os.makedirs(local_md_dir, exist_ok=True)
@@ -29,6 +32,7 @@ def do_parse(
29
32
  pdf_bytes,
30
33
  model_list,
31
34
  parse_method,
35
+ debug_able,
32
36
  f_draw_span_bbox=True,
33
37
  f_draw_layout_bbox=True,
34
38
  f_dump_md=True,
@@ -38,24 +42,34 @@ def do_parse(
38
42
  f_dump_content_list=False,
39
43
  f_make_md_mode=MakeMode.MM_MD,
40
44
  f_draw_model_bbox=False,
45
+ start_page_id=0,
46
+ end_page_id=None,
41
47
  ):
48
+ if debug_able:
49
+ logger.warning("debug mode is on")
50
+ f_dump_content_list = True
51
+ f_draw_model_bbox = True
52
+
42
53
  orig_model_list = copy.deepcopy(model_list)
43
- local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
54
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
55
+ parse_method)
44
56
 
45
- image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
46
- local_md_dir
47
- )
57
+ image_writer, md_writer = DiskReaderWriter(
58
+ local_image_dir), DiskReaderWriter(local_md_dir)
48
59
  image_dir = str(os.path.basename(local_image_dir))
49
60
 
50
- if parse_method == "auto":
51
- jso_useful_key = {"_pdf_type": "", "model_list": model_list}
52
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
53
- elif parse_method == "txt":
54
- pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
55
- elif parse_method == "ocr":
56
- pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
61
+ if parse_method == 'auto':
62
+ jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
63
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
64
+ start_page_id=start_page_id, end_page_id=end_page_id)
65
+ elif parse_method == 'txt':
66
+ pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
67
+ start_page_id=start_page_id, end_page_id=end_page_id)
68
+ elif parse_method == 'ocr':
69
+ pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
70
+ start_page_id=start_page_id, end_page_id=end_page_id)
57
71
  else:
58
- logger.error("unknown parse method")
72
+ logger.error('unknown parse method')
59
73
  exit(1)
60
74
 
61
75
  pipe.pipe_classify()
@@ -65,58 +79,64 @@ def do_parse(
65
79
  pipe.pipe_analyze()
66
80
  orig_model_list = copy.deepcopy(pipe.model_list)
67
81
  else:
68
- logger.error("need model list input")
82
+ logger.error('need model list input')
69
83
  exit(2)
70
84
 
71
85
  pipe.pipe_parse()
72
- pdf_info = pipe.pdf_mid_data["pdf_info"]
86
+ pdf_info = pipe.pdf_mid_data['pdf_info']
73
87
  if f_draw_layout_bbox:
74
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
88
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
75
89
  if f_draw_span_bbox:
76
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
90
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
77
91
  if f_draw_model_bbox:
78
- drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
92
+ drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
79
93
 
80
- md_content = pipe.pipe_mk_markdown(
81
- image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
82
- )
94
+ md_content = pipe.pipe_mk_markdown(image_dir,
95
+ drop_mode=DropMode.NONE,
96
+ md_make_mode=f_make_md_mode)
83
97
  if f_dump_md:
84
98
  md_writer.write(
85
99
  content=md_content,
86
- path=f"{pdf_file_name}.md",
100
+ path=f'{pdf_file_name}.md',
87
101
  mode=AbsReaderWriter.MODE_TXT,
88
102
  )
89
103
 
90
104
  if f_dump_middle_json:
91
105
  md_writer.write(
92
- content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
93
- path="middle.json",
106
+ content=json_parse.dumps(pipe.pdf_mid_data,
107
+ ensure_ascii=False,
108
+ indent=4),
109
+ path=f'{pdf_file_name}_middle.json',
94
110
  mode=AbsReaderWriter.MODE_TXT,
95
111
  )
96
112
 
97
113
  if f_dump_model_json:
98
114
  md_writer.write(
99
- content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
100
- path="model.json",
115
+ content=json_parse.dumps(orig_model_list,
116
+ ensure_ascii=False,
117
+ indent=4),
118
+ path=f'{pdf_file_name}_model.json',
101
119
  mode=AbsReaderWriter.MODE_TXT,
102
120
  )
103
121
 
104
122
  if f_dump_orig_pdf:
105
123
  md_writer.write(
106
124
  content=pdf_bytes,
107
- path="origin.pdf",
125
+ path=f'{pdf_file_name}_origin.pdf',
108
126
  mode=AbsReaderWriter.MODE_BIN,
109
127
  )
110
128
 
111
129
  content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
112
130
  if f_dump_content_list:
113
131
  md_writer.write(
114
- content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
115
- path="content_list.json",
132
+ content=json_parse.dumps(content_list,
133
+ ensure_ascii=False,
134
+ indent=4),
135
+ path=f'{pdf_file_name}_content_list.json',
116
136
  mode=AbsReaderWriter.MODE_TXT,
117
137
  )
118
138
 
119
- logger.info(f"local output dir is {local_md_dir}")
139
+ logger.info(f'local output dir is {local_md_dir}')
120
140
 
121
141
 
122
- parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
142
+ parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
magic_pdf/user_api.py CHANGED
@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
25
25
  PARSE_TYPE_OCR = "ocr"
26
26
 
27
27
 
28
- def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
29
- **kwargs):
28
+ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
29
+ start_page_id=0, end_page_id=None,
30
+ *args, **kwargs):
30
31
  """
31
32
  解析文本类pdf
32
33
  """
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
34
35
  pdf_bytes,
35
36
  pdf_models,
36
37
  imageWriter,
37
- start_page_id=start_page,
38
+ start_page_id=start_page_id,
39
+ end_page_id=end_page_id,
38
40
  debug_mode=is_debug,
39
41
  )
40
42
 
@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
45
47
  return pdf_info_dict
46
48
 
47
49
 
48
- def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
49
- **kwargs):
50
+ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
51
+ start_page_id=0, end_page_id=None,
52
+ *args, **kwargs):
50
53
  """
51
54
  解析ocr类pdf
52
55
  """
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
54
57
  pdf_bytes,
55
58
  pdf_models,
56
59
  imageWriter,
57
- start_page_id=start_page,
60
+ start_page_id=start_page_id,
61
+ end_page_id=end_page_id,
58
62
  debug_mode=is_debug,
59
63
  )
60
64
 
@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
65
69
  return pdf_info_dict
66
70
 
67
71
 
68
- def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
72
+ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
69
73
  input_model_is_empty: bool = False,
74
+ start_page_id=0, end_page_id=None,
70
75
  *args, **kwargs):
71
76
  """
72
77
  ocr和文本混合的pdf,全部解析出来
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
78
83
  pdf_bytes,
79
84
  pdf_models,
80
85
  imageWriter,
81
- start_page_id=start_page,
86
+ start_page_id=start_page_id,
87
+ end_page_id=end_page_id,
82
88
  debug_mode=is_debug,
83
89
  )
84
90
  except Exception as e:
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
89
95
  if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
90
96
  logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
91
97
  if input_model_is_empty:
92
- pdf_models = doc_analyze(pdf_bytes, ocr=True)
98
+ pdf_models = doc_analyze(pdf_bytes, ocr=True,
99
+ start_page_id=start_page_id,
100
+ end_page_id=end_page_id)
93
101
  pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
94
102
  if pdf_info_dict is None:
95
103
  raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.7.1
3
+ Version: 0.8.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,13 +9,14 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: PyMuPDF>=1.24.9
12
+ Requires-Dist: fast-langdetect==0.2.0
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
- Requires-Dist: fast-langdetect==0.2.0
16
- Requires-Dist: wordninja>=2.0.0
17
- Requires-Dist: scikit-learn>=1.0.2
18
15
  Requires-Dist: pdfminer.six==20231228
16
+ Requires-Dist: pydantic<2.8.0,>=2.7.2
17
+ Requires-Dist: PyMuPDF>=1.24.9
18
+ Requires-Dist: scikit-learn>=1.0.2
19
+ Requires-Dist: wordninja>=2.0.0
19
20
  Provides-Extra: full
20
21
  Requires-Dist: unimernet==0.1.6; extra == "full"
21
22
  Requires-Dist: ultralytics; extra == "full"
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
39
40
  </p>
40
41
 
41
42
  <!-- icon -->
43
+
42
44
  [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
43
45
  [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
44
46
  [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
@@ -46,17 +48,26 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
46
48
  [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
47
49
  [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
48
50
  [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
51
+
52
+ [![HuggingFace](https://img.shields.io/badge/HuggingFace-Demo-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValzjI
7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC)](https://huggingface.co/spaces/opendatalab/MinerU)
53
+ [![ModelScope](https://img.shields.io/badge/ModelScope-Demo-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
54
+ [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
55
+ [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](#)
56
+
49
57
  <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
50
58
 
51
59
  <!-- language -->
60
+
52
61
  [English](README.md) | [简体中文](README_zh-CN.md)
53
62
 
54
63
  <!-- hot link -->
64
+
55
65
  <p align="center">
56
66
  <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
57
67
  </p>
58
68
 
59
69
  <!-- join us -->
70
+
60
71
  <p align="center">
61
72
  👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
62
73
  </p>
@@ -64,12 +75,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
64
75
  </div>
65
76
 
66
77
  # Changelog
78
+ - 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
67
79
  - 2024/08/30: Version 0.7.1 released, added the Paddle TableMaster table recognition option
68
80
  - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
69
81
  - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
70
82
  - 2024/07/05: Initial open-source release
71
83
 
72
84
  <!-- TABLE OF CONTENT -->
85
+
73
86
  <details open="open">
74
87
  <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
75
88
  <ol>
@@ -108,10 +121,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
108
121
  </ol>
109
122
  </details>
110
123
 
111
-
112
-
113
124
  # MinerU
125
+
114
126
  ## Project Introduction
127
+
115
128
  MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
116
129
  MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
117
130
  Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on the [issue tracker](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
@@ -135,6 +148,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
135
148
  If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. <br>
136
149
  If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. <br>
137
150
  There are three different ways to experience MinerU:
151
+
138
152
  - [Online Demo (No Installation Required)](#online-demo)
139
153
  - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
140
154
  - [Linux/Windows + CUDA](#Using-GPU)
@@ -192,7 +206,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
192
206
  <tr>
193
207
  <td colspan="2">Recommended Configuration 16G+ VRAM</td>
194
208
  <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
195
- 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously</td>
209
+ 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
210
+ 24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
211
+ </td>
196
212
  </tr>
197
213
  </table>
198
214
 
@@ -203,33 +219,41 @@ In non-mainline environments, due to the diversity of hardware and software conf
203
219
  ### Quick CPU Demo
204
220
 
205
221
  #### 1. Install magic-pdf
222
+
206
223
  ```bash
207
224
  conda create -n MinerU python=3.10
208
225
  conda activate MinerU
209
226
  pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
210
227
  ```
228
+
211
229
  #### 2. Download model weight files
212
230
 
213
231
  Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
232
+
214
233
  > ❗️After downloading the models, please make sure to verify the completeness of the model files.
215
- >
234
+ >
216
235
  > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
217
236
 
218
237
  #### 3. Copy and configure the template file
238
+
219
239
  You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
240
+
220
241
  > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
221
- >
242
+ >
222
243
  > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
244
+
223
245
  ```bash
224
246
  cp magic-pdf.template.json ~/magic-pdf.json
225
247
  ```
226
248
 
227
249
  Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
250
+
228
251
  > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
229
252
  >
230
253
  > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
231
- >
254
+ >
232
255
  > For example: If the models are stored in the "models" directory at the root of the D drive, the "models-dir" value should be `D:/models`.
256
+
233
257
  ```json
234
258
  {
235
259
  // other config
@@ -242,13 +266,26 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
242
266
  }
243
267
  ```
244
268
 
245
-
246
269
  ### Using GPU
270
+
247
271
  If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
248
272
 
249
273
  - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
250
274
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
251
-
275
+ - Quick Deployment with Docker
276
+ > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
277
+ >
278
+ > Before running this Docker image, you can use the following command to check if your device supports CUDA acceleration on Docker.
279
+ >
280
+ > ```bash
281
+ > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
282
+ > ```
283
+ ```bash
284
+ wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
285
+ docker build -t mineru:latest .
286
+ docker run --rm -it --gpus=all mineru:latest /bin/bash
287
+ magic-pdf --help
288
+ ```
252
289
 
253
290
  ## Usage
254
291
 
@@ -262,12 +299,12 @@ Options:
262
299
  -v, --version display the version and exit
263
300
  -p, --path PATH local pdf filepath or directory [required]
264
301
  -o, --output-dir TEXT output local directory
265
- -m, --method [ocr|txt|auto] the method for parsing pdf.
302
+ -m, --method [ocr|txt|auto] the method for parsing pdf.
266
303
  ocr: using ocr technique to extract information from pdf,
267
304
  txt: suitable for the text-based pdf only and outperform ocr,
268
305
  auto: automatically choose the best method for parsing pdf
269
306
  from ocr and txt.
270
- without method specified, auto will be used by default.
307
+ without method specified, auto will be used by default.
271
308
  --help Show this message and exit.
272
309
 
273
310
 
@@ -282,13 +319,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
282
319
  The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
283
320
 
284
321
  ```text
285
- ├── some_pdf.md # markdown file
286
- ├── images # directory for storing images
287
- ├── layout.pdf # layout diagram
288
- ├── middle.json # MinerU intermediate processing result
289
- ├── model.json # model inference result
290
- ├── origin.pdf # original PDF file
291
- └── spans.pdf # smallest granularity bbox position information diagram
322
+ ├── some_pdf.md # markdown file
323
+ ├── images # directory for storing images
324
+ ├── some_pdf_layout.pdf # layout diagram
325
+ ├── some_pdf_middle.json # MinerU intermediate processing result
326
+ ├── some_pdf_model.json # model inference result
327
+ ├── some_pdf_origin.pdf # original PDF file
328
+ └── some_pdf_spans.pdf # smallest granularity bbox position information diagram
292
329
  ```
293
330
 
294
331
  For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
@@ -296,6 +333,7 @@ For more information about the output files, please refer to the [Output File De
296
333
  ### API
297
334
 
298
335
  Processing files from local disk
336
+
299
337
  ```python
300
338
  image_writer = DiskReaderWriter(local_image_dir)
301
339
  image_dir = str(os.path.basename(local_image_dir))
@@ -308,6 +346,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
308
346
  ```
309
347
 
310
348
  Processing files from object storage
349
+
311
350
  ```python
312
351
  s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
313
352
  image_dir = "s3://img_bucket/"
@@ -322,10 +361,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
322
361
  ```
323
362
 
324
363
  For detailed implementation, refer to:
364
+
325
365
  - [demo.py Simplest Processing Method](demo/demo.py)
326
366
  - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
327
367
 
328
-
329
368
  ### Development Guide
330
369
 
331
370
  TODO
@@ -337,10 +376,11 @@ TODO
337
376
  - [ ] Code block recognition within the text
338
377
  - [ ] Table of contents recognition
339
378
  - [x] Table recognition
340
- - [ ] Chemical formula recognition
379
+ - [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
341
380
  - [ ] Geometric shape recognition
342
381
 
343
382
  # Known Issues
383
+
344
384
  - Reading order is segmented based on rules, which can cause disordered sequences in some cases
345
385
  - Vertical text is not supported
346
386
  - Lists, code blocks, and table of contents are not yet supported in the layout model
@@ -350,11 +390,11 @@ TODO
350
390
 
351
391
 
352
392
  # FAQ
393
+
353
394
  [FAQ in Chinese](docs/FAQ_zh_cn.md)
354
395
 
355
396
  [FAQ in English](docs/FAQ_en_us.md)
356
397
 
357
-
358
398
  # All Thanks To Our Contributors
359
399
 
360
400
  <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
@@ -367,8 +407,8 @@ TODO
367
407
 
368
408
  This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
369
409
 
370
-
371
410
  # Acknowledgments
411
+
372
412
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
373
413
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
374
414
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
@@ -405,9 +445,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
405
445
  </a>
406
446
 
407
447
  # Magic-doc
448
+
408
449
  [Magic-Doc](https://github.com/InternLM/magic-doc) A fast extraction tool for ppt/pptx/doc/docx/pdf files
409
450
 
410
451
  # Magic-html
452
+
411
453
  [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
412
454
 
413
455
  # Links