magic-pdf 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ import os
1
2
  from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
2
3
  from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
3
4
  from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
7
8
 
8
9
 
9
10
  class MultiS3Mixin:
10
- def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
11
+ def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
11
12
  """Initialized with multiple s3 configs.
12
13
 
13
14
  Args:
14
- default_bucket (str): the default bucket name of the relative path
15
+ default_prefix (str): the default prefix of the relative path, for example {some_bucket}/{some_prefix} or {some_bucket}.
15
16
  s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
16
17
 
17
18
  Raises:
18
- InvalidConfig: default bucket config not in s3_configs
19
- InvalidConfig: bucket name not unique in s3_configs
20
- InvalidConfig: default bucket must be provided
19
+ InvalidConfig: default bucket config not in s3_configs.
20
+ InvalidConfig: bucket name not unique in s3_configs.
21
+ InvalidConfig: default bucket must be provided.
21
22
  """
22
- if len(default_bucket) == 0:
23
- raise InvalidConfig('default_bucket must be provided')
23
+ if len(default_prefix) == 0:
24
+ raise InvalidConfig('default_prefix must be provided')
25
+
26
+ arr = default_prefix.strip("/").split("/")
27
+ self.default_bucket = arr[0]
28
+ self.default_prefix = "/".join(arr[1:])
24
29
 
25
30
  found_default_bucket_config = False
26
31
  for conf in s3_configs:
27
- if conf.bucket_name == default_bucket:
32
+ if conf.bucket_name == self.default_bucket:
28
33
  found_default_bucket_config = True
29
34
  break
30
35
 
31
36
  if not found_default_bucket_config:
32
37
  raise InvalidConfig(
33
- f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
38
+ f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
34
39
  )
35
40
 
36
41
  uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
39
44
  f'the bucket_name in s3_configs: {s3_configs} must be unique'
40
45
  )
41
46
 
42
- self.default_bucket = default_bucket
43
47
  self.s3_configs = s3_configs
44
48
  self._s3_clients_h: dict = {}
45
49
 
@@ -47,14 +51,14 @@ class MultiS3Mixin:
47
51
  class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
48
52
  def read(self, path: str) -> bytes:
49
53
  """Read the path from s3, select different bucket client for each request
50
- based on the path, also support range read.
54
+ based on the bucket, also support range read.
51
55
 
52
56
  Args:
53
- path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
54
- for example: s3://bucket_name/path?0,100
57
+ path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
58
+ for example: s3://bucket_name/path?0,100.
55
59
 
56
60
  Returns:
57
- bytes: the content of s3 file
61
+ bytes: the content of s3 file.
58
62
  """
59
63
  may_range_params = parse_s3_range_params(path)
60
64
  if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
84
88
 
85
89
  def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
86
90
  """Read the file with offset and limit, select different bucket client
87
- for each request based on the path.
91
+ for each request based on the bucket.
88
92
 
89
93
  Args:
90
- path (str): the file path
94
+ path (str): the file path.
91
95
  offset (int, optional): the number of bytes skipped. Defaults to 0.
92
96
  limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
93
97
 
94
98
  Returns:
95
- bytes: the file content
99
+ bytes: the file content.
96
100
  """
97
101
  if path.startswith('s3://'):
98
102
  bucket_name, path = parse_s3path(path)
99
103
  s3_reader = self.__get_s3_client(bucket_name)
100
104
  else:
101
105
  s3_reader = self.__get_s3_client(self.default_bucket)
106
+ path = os.path.join(self.default_prefix, path)
102
107
  return s3_reader.read_at(path, offset, limit)
103
108
 
104
109
 
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
123
128
 
124
129
  def write(self, path: str, data: bytes) -> None:
125
130
  """Write file with data, also select different bucket client for each
126
- request based on the path.
131
+ request based on the bucket.
127
132
 
128
133
  Args:
129
134
  path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
130
- data (bytes): the data want to write
135
+ data (bytes): the data to write.
131
136
  """
132
137
  if path.startswith('s3://'):
133
138
  bucket_name, path = parse_s3path(path)
134
139
  s3_writer = self.__get_s3_client(bucket_name)
135
140
  else:
136
141
  s3_writer = self.__get_s3_client(self.default_bucket)
142
+ path = os.path.join(self.default_prefix, path)
137
143
  return s3_writer.write(path, data)
@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
6
6
  class S3DataReader(MultiBucketS3DataReader):
7
7
  def __init__(
8
8
  self,
9
+ default_prefix_without_bucket: str,
9
10
  bucket: str,
10
11
  ak: str,
11
12
  sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
15
16
  """s3 reader client.
16
17
 
17
18
  Args:
19
+ default_prefix_without_bucket: prefix that does not contain the bucket name
18
20
  bucket (str): bucket name
19
21
  ak (str): access key
20
22
  sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
23
25
  refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
24
26
  """
25
27
  super().__init__(
26
- bucket,
28
+ f'{bucket}/{default_prefix_without_bucket}',
27
29
  [
28
30
  S3Config(
29
31
  bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
39
41
  class S3DataWriter(MultiBucketS3DataWriter):
40
42
  def __init__(
41
43
  self,
44
+ default_prefix_without_bucket: str,
42
45
  bucket: str,
43
46
  ak: str,
44
47
  sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
48
51
  """s3 writer client.
49
52
 
50
53
  Args:
54
+ default_prefix_without_bucket: prefix that does not contain the bucket name
51
55
  bucket (str): bucket name
52
56
  ak (str): access key
53
57
  sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
56
60
  refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
57
61
  """
58
62
  super().__init__(
59
- bucket,
63
+ f'{bucket}/{default_prefix_without_bucket}',
60
64
  [
61
65
  S3Config(
62
66
  bucket_name=bucket,
@@ -0,0 +1,6 @@
1
+
2
+ from magic_pdf.data.io.base import IOReader, IOWriter # noqa: F401
3
+ from magic_pdf.data.io.http import HttpReader, HttpWriter # noqa: F401
4
+ from magic_pdf.data.io.s3 import S3Reader, S3Writer # noqa: F401
5
+
6
+ __all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
magic_pdf/data/io/base.py CHANGED
@@ -29,7 +29,7 @@ class IOReader(ABC):
29
29
  pass
30
30
 
31
31
 
32
- class IOWriter:
32
+ class IOWriter(ABC):
33
33
 
34
34
  @abstractmethod
35
35
  def write(self, path: str, data: bytes) -> None:
magic_pdf/data/schemas.py CHANGED
@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
3
3
 
4
4
 
5
5
  class S3Config(BaseModel):
6
+ """S3 config
7
+ """
6
8
  bucket_name: str = Field(description='s3 bucket name', min_length=1)
7
9
  access_key: str = Field(description='s3 access key', min_length=1)
8
10
  secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
11
13
 
12
14
 
13
15
  class PageInfo(BaseModel):
16
+ """The width and height of page
17
+ """
14
18
  w: float = Field(description='the width of page')
15
19
  h: float = Field(description='the height of page')
@@ -119,6 +119,16 @@ def detect_language(text):
119
119
  return 'empty'
120
120
 
121
121
 
122
+ # 连写字符拆分
123
+ def __replace_ligatures(text: str):
124
+ text = re.sub(r'ﬁ', 'fi', text) # 替换 fi 连写符
125
+ text = re.sub(r'ﬂ', 'fl', text) # 替换 fl 连写符
126
+ text = re.sub(r'ﬀ', 'ff', text) # 替换 ff 连写符
127
+ text = re.sub(r'ﬃ', 'ffi', text) # 替换 ffi 连写符
128
+ text = re.sub(r'ﬄ', 'ffl', text) # 替换 ffl 连写符
129
+ return text
130
+
131
+
122
132
  def merge_para_with_text(para_block):
123
133
  para_text = ''
124
134
  for i, line in enumerate(para_block['lines']):
@@ -141,22 +151,34 @@ def merge_para_with_text(para_block):
141
151
  if span_type == ContentType.Text:
142
152
  content = ocr_escape_special_markdown_char(span['content'])
143
153
  elif span_type == ContentType.InlineEquation:
144
- content = f" ${span['content']}$ "
154
+ content = f"${span['content']}$"
145
155
  elif span_type == ContentType.InterlineEquation:
146
156
  content = f"\n$$\n{span['content']}\n$$\n"
147
157
 
158
+ content = content.strip()
148
159
  if content != '':
149
160
  langs = ['zh', 'ja', 'ko']
150
161
  if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
151
- para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
152
- elif line_lang == 'en':
153
- # 如果是前一行带有-连字符,那么末尾不应该加空格
154
- if __is_hyphen_at_line_end(content):
155
- para_text += content[:-1]
156
- else:
157
- para_text += content + ' '
162
+ if span_type in [ContentType.Text, ContentType.InterlineEquation]:
163
+ para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
164
+ elif span_type == ContentType.InlineEquation:
165
+ para_text += f" {content} "
158
166
  else:
159
- para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
167
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
168
+ # 如果是前一行带有-连字符,那么末尾不应该加空格
169
+ if __is_hyphen_at_line_end(content):
170
+ para_text += content[:-1]
171
+ elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
172
+ para_text += content
173
+ else: # 西方文本语境下 content间需要空格分隔
174
+ para_text += f"{content} "
175
+ elif span_type == ContentType.InterlineEquation:
176
+ para_text += content
177
+ else:
178
+ continue
179
+ # 连写字符拆分
180
+ para_text = __replace_ligatures(para_text)
181
+
160
182
  return para_text
161
183
 
162
184
 
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.9.0"
1
+ __version__ = "0.9.2"
@@ -38,15 +38,13 @@ except ImportError as e:
38
38
  from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
39
39
  from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
40
40
  from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
41
- # from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
41
+ from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
42
42
  from magic_pdf.model.ppTableModel import ppTableModel
43
43
 
44
44
 
45
45
  def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
46
46
  if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
47
- # table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
48
- logger.error("StructEqTable is under upgrade, the current version does not support it.")
49
- exit(1)
47
+ table_model = StructTableModel(model_path, max_time=max_time)
50
48
  elif table_model_type == MODEL_NAME.TABLE_MASTER:
51
49
  config = {
52
50
  "model_dir": model_path,
@@ -284,8 +282,6 @@ class CustomPEKModel:
284
282
  )
285
283
  # 初始化ocr
286
284
  if self.apply_ocr:
287
-
288
- # self.ocr_model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=0.3)
289
285
  self.ocr_model = atom_model_manager.get_atom_model(
290
286
  atom_model_name=AtomicModel.OCR,
291
287
  ocr_show_log=show_log,
@@ -303,17 +299,6 @@ class CustomPEKModel:
303
299
  device=self.device
304
300
  )
305
301
 
306
- home_directory = Path.home()
307
- det_source = os.path.join(models_dir, table_model_dir, DETECT_MODEL_DIR)
308
- rec_source = os.path.join(models_dir, table_model_dir, REC_MODEL_DIR)
309
- det_dest_dir = os.path.join(home_directory, PP_DET_DIRECTORY)
310
- rec_dest_dir = os.path.join(home_directory, PP_REC_DIRECTORY)
311
-
312
- if not os.path.exists(det_dest_dir):
313
- shutil.copytree(det_source, det_dest_dir)
314
- if not os.path.exists(rec_dest_dir):
315
- shutil.copytree(rec_source, rec_dest_dir)
316
-
317
302
  logger.info('DocAnalysis init done!')
318
303
 
319
304
  def __call__(self, image):
@@ -393,7 +378,7 @@ class CustomPEKModel:
393
378
  elif int(res['category_id']) in [5]:
394
379
  table_res_list.append(res)
395
380
 
396
- if torch.cuda.is_available():
381
+ if torch.cuda.is_available() and self.device != 'cpu':
397
382
  properties = torch.cuda.get_device_properties(self.device)
398
383
  total_memory = properties.total_memory / (1024 ** 3) # 将字节转换为 GB
399
384
  if total_memory <= 10:
@@ -463,7 +448,9 @@ class CustomPEKModel:
463
448
  html_code = None
464
449
  if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
465
450
  with torch.no_grad():
466
- latex_code = self.table_model.image2latex(new_image)[0]
451
+ table_result = self.table_model.predict(new_image, "html")
452
+ if len(table_result) > 0:
453
+ html_code = table_result[0]
467
454
  else:
468
455
  html_code = self.table_model.img2html(new_image)
469
456
 
@@ -474,14 +461,17 @@ class CustomPEKModel:
474
461
  # 判断是否返回正常
475
462
 
476
463
  if latex_code:
477
- expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
478
- 'end{table}')
464
+ expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
479
465
  if expected_ending:
480
466
  res["latex"] = latex_code
481
467
  else:
482
468
  logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
483
469
  elif html_code:
484
- res["html"] = html_code
470
+ expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
471
+ if expected_ending:
472
+ res["html"] = html_code
473
+ else:
474
+ logger.warning(f"table recognition processing fails, not found expected HTML table end")
485
475
  else:
486
476
  logger.warning(f"table recognition processing fails, not get latex or html return")
487
477
  logger.info(f"table time: {round(time.time() - table_start, 2)}")
@@ -1,28 +1,45 @@
1
- from loguru import logger
1
+ import re
2
2
 
3
- try:
4
- from struct_eqtable.model import StructTable
5
- except ImportError:
6
- logger.error("StructEqTable is under upgrade, the current version does not support it.")
7
- from pypandoc import convert_text
3
+ import torch
4
+ from struct_eqtable import build_model
8
5
 
9
6
 
10
7
  class StructTableModel:
11
- def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
8
+ def __init__(self, model_path, max_new_tokens=1024, max_time=60):
12
9
  # init
13
- self.model_path = model_path
14
- self.max_new_tokens = max_new_tokens # maximum output tokens length
15
- self.max_time = max_time # timeout for processing in seconds
16
- if device == 'cuda':
17
- self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
10
+ assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
11
+ self.model = build_model(
12
+ model_ckpt=model_path,
13
+ max_new_tokens=max_new_tokens,
14
+ max_time=max_time,
15
+ lmdeploy=False,
16
+ flash_attn=False,
17
+ batch_size=1,
18
+ ).cuda()
19
+ self.default_format = "html"
20
+
21
+ def predict(self, images, output_format=None, **kwargs):
22
+
23
+ if output_format is None:
24
+ output_format = self.default_format
18
25
  else:
19
- self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
26
+ if output_format not in ['latex', 'markdown', 'html']:
27
+ raise ValueError(f"Output format {output_format} is not supported.")
28
+
29
+ results = self.model(
30
+ images, output_format=output_format
31
+ )
32
+
33
+ if output_format == "html":
34
+ results = [self.minify_html(html) for html in results]
20
35
 
21
- def image2latex(self, image) -> str:
22
- table_latex = self.model.forward(image)
23
- return table_latex
36
+ return results
24
37
 
25
- def image2html(self, image) -> str:
26
- table_latex = self.image2latex(image)
27
- table_html = convert_text(table_latex, 'html', format='latex')
28
- return table_html
38
+ def minify_html(self, html):
39
+ # 移除多余的空白字符
40
+ html = re.sub(r'\s+', ' ', html)
41
+ # 移除行尾的空白字符
42
+ html = re.sub(r'\s*>\s*', '>', html)
43
+ # 移除标签前的空白字符
44
+ html = re.sub(r'\s*<\s*', '<', html)
45
+ return html.strip()
@@ -1,3 +1,4 @@
1
+ import cv2
1
2
  from paddleocr.ppstructure.table.predict_table import TableSystem
2
3
  from paddleocr.ppstructure.utility import init_args
3
4
  from magic_pdf.libs.Constants import *
@@ -36,12 +37,13 @@ class ppTableModel(object):
36
37
  - HTML (str): A string representing the HTML structure with content of the table.
37
38
  """
38
39
  if isinstance(image, Image.Image):
39
- image = np.array(image)
40
+ image = np.asarray(image)
41
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
40
42
  pred_res, _ = self.table_sys(image)
41
43
  pred_html = pred_res["html"]
42
- res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
43
- "") + "</table></td>\n"
44
- return res
44
+ # res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
45
+ # "</table></body></html>","") + "</table></td>\n"
46
+ return pred_html
45
47
 
46
48
  def parse_args(self, **kwargs):
47
49
  parser = init_args()
@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
63
63
  first_line = block['lines'][0]
64
64
  line_height = first_line['bbox'][3] - first_line['bbox'][1]
65
65
  block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
66
+ block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
66
67
 
67
68
  left_close_num = 0
68
69
  left_not_close_num = 0
69
70
  right_not_close_num = 0
70
71
  right_close_num = 0
71
72
  lines_text_list = []
72
-
73
+ center_close_num = 0
74
+ external_sides_not_close_num = 0
73
75
  multiple_para_flag = False
74
76
  last_line = block['lines'][-1]
77
+
75
78
  # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
76
79
  if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
77
80
  # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):
82
85
 
83
86
  for line in block['lines']:
84
87
 
88
+ line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
89
+ block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
90
+ if (
91
+ line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
92
+ block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
93
+ ):
94
+ external_sides_not_close_num += 1
95
+ if abs(line_mid_x - block_mid_x) < line_height / 2:
96
+ center_close_num += 1
97
+
85
98
  line_text = ""
86
99
 
87
100
  for span in line['spans']:
@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
103
116
  right_close_num += 1
104
117
  else:
105
118
  # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
106
- closed_area = 0.3 * block_weight
119
+ closed_area = 0.26 * block_weight
107
120
  # closed_area = 5 * line_height
108
121
  if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
109
122
  right_not_close_num += 1
@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
132
145
  line_num_flag = True
133
146
 
134
147
  # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
135
- if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
148
+ if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
136
149
  and line_num_flag
137
150
  ):
138
151
  for line in block['lines']:
139
152
  line[ListLineTag.IS_LIST_START_LINE] = True
140
153
  return BlockType.Index
141
154
 
155
+ # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
156
+ # 补充条件block的长宽比有要求
157
+ elif (
158
+ external_sides_not_close_num >= 2 and
159
+ center_close_num == len(block['lines']) and
160
+ external_sides_not_close_num / len(block['lines']) >= 0.5 and
161
+ block_height / block_weight > 0.4
162
+ ):
163
+ for line in block['lines']:
164
+ line[ListLineTag.IS_LIST_START_LINE] = True
165
+ return BlockType.List
166
+
142
167
  elif left_close_num >= 2 and (
143
168
  right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
144
169
  # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
145
- if left_close_num / len(block['lines']) > 0.9:
170
+ if left_close_num / len(block['lines']) > 0.8:
146
171
  # 这种是每个item只有一行,且左边都贴边的短item list
147
172
  if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
148
173
  for line in block['lines']:
@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
154
179
  if lines_text_list[i][-1] in LIST_END_FLAG:
155
180
  line[ListLineTag.IS_LIST_END_LINE] = True
156
181
  if i + 1 < len(block['lines']):
157
- block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
182
+ block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
158
183
  # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
159
184
  else:
160
185
  line_start_flag = False
@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
162
187
  if line_start_flag:
163
188
  line[ListLineTag.IS_LIST_START_LINE] = True
164
189
  line_start_flag = False
165
- elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
190
+ # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
191
+ if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
166
192
  line[ListLineTag.IS_LIST_END_LINE] = True
167
193
  line_start_flag = True
168
194
  # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.9.0
3
+ Version: 0.9.2
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -22,8 +22,9 @@ Provides-Extra: full
22
22
  Requires-Dist: unimernet==0.2.1; extra == "full"
23
23
  Requires-Dist: ultralytics; extra == "full"
24
24
  Requires-Dist: paddleocr==2.7.3; extra == "full"
25
- Requires-Dist: pypandoc; extra == "full"
26
- Requires-Dist: struct-eqtable==0.1.0; extra == "full"
25
+ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
26
+ Requires-Dist: einops; extra == "full"
27
+ Requires-Dist: accelerate; extra == "full"
27
28
  Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
28
29
  Requires-Dist: detectron2; extra == "full"
29
30
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
@@ -54,8 +55,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
54
55
  [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMzAiIGhlaWdodD0iMzAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgZmlsbD0ibm9uZSI+CiA8ZGVmcz4KICA8bGluZWFyR3JhZGllbnQgeTI9IjAuNTMzNjciIHgyPSIxLjAwMDQiIHkxPSIwLjI5MjE5IiB4MT0iLTAuMTEyNjgiIGlkPSJhIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogIDxsaW5lYXJHcmFkaWVudCB5Mj0iMC41OTc1NyIgeDI9IjEuMDExMzciIHkxPSIwLjExMDIzIiB4MT0iLTAuMDg0NzQiIGlkPSJiIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogPC9kZWZzPgogPGc+CiAgPHRpdGxlPkxheWVyIDE8L3RpdGxlPgogIDxwYXRoIGlkPSJzdmdfMSIgZmlsbD0idXJsKCNhKSIgZD0ibTEuNjIzLDEyLjA2N2EwLjQ4NCwwLjQ4NCAwIDAgMSAwLjA3LC0wLjM4NGw1LjMxLC03Ljg5NWMwLjA2OCwtMC4xIDAuMTcsLTAuMTcyIDAuMjg4LC0wLjJsMTQuMzc3LC0zLjQ3NGEwLjQ4NCwwLjQ4NCAwIDAgMSAwLjU4NCwwLjM1N2wzLjY2MiwxNS4xNTJjMS40NzcsNi4xMTQgLTIuMjgxLDEyLjI2NyAtOC4zOTQsMTMuNzQ1Yy02LjExNCwxLjQ3NyAtMTIuMjY3LC0yLjI4MSAtMTMuNzQ1LC04LjM5NWwtMi4xNTIsLTguOTA2eiIgb3BhY2l0eT0iMC40Ii8+CiAgPHBhdGggaWQ9InN2Z18yIiBmaWxsPSJ1cmwoI2IpIiBkPSJtNS44MjYsOC42NzNjMCwtMC4xMzYgMC4wNTcsLTAuMjY2IDAuMTU3LC0wLjM1OGw3LjAxNywtNi40MjVhMC40ODQsMC40ODQgMCAwIDEgMC4zMjcsLTAuMTI3bDE0Ljc5LDBjMC4yNjgsMCAwLjQ4NSwwLjIxNiAwLjQ4NSwwLjQ4NGwwLDE1LjU4OWMwLDYuMjkgLTUuMDk5LDExLjM4OCAtMTEuMzg4LDExLjM4OGMtNi4yOSwwIC0xMS4zODgsLTUuMDk5IC0xMS4zODgsLTExLjM4OGwwLC05LjE2M3oiLz4KICA8cGF0aCBpZD0ic3ZnXzMiIGZpbGw9IiM1RDc2RkYiIGQ9Im0xMi4zMzEsOC43NTNsLTYuMzgzLC0wLjM5OGw3LjEyMiwtNi41MmwwLjI5OSw1Ljg5MmEwLjk3OCwwLjk3OCAwIDAgMSAtMS4wMzgsMS4wMjZ6Ii8+CiAgPHBhdGggaWQ9InN2Z180IiBmaWxsPSIjMDAyOEZEIiBkPSJtMjAuNDE2LDE1LjAyMmwwLDEuNzExYTIuNDA0LDIuNDA0IDAgMCAxIC00LjgwOCwwbDAsLTQuMjc4bC0yLjgxLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDEgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEyLDB6IiBjbGlwLXJ1bGU9ImV2ZW5vZGQiIGZpbGwtcnVsZT0iZXZlbm9kZCIvPgogIDxwYXRoIGlkPSJzdmdfNSIgZmlsbD0iIzAwMjhGRCIgZD0ibTIzLjIyOCwxMy44ODFsMS4xN
CwwbDAsMS4xNDFsLTEuMTQsMGwwLC0xLjE0bDAsLTAuMDAxem0tMi44MTIsLTAuNjkybDEuODM0LDBsMCwxLjgzM2wtMS44MzQsMGwwLC0xLjgzMmwwLC0wLjAwMXptMS44MzQsLTAuOTc5bDAuOTc4LDBsMCwwLjk3OWwtMC45NzgsMGwwLC0wLjk3OGwwLC0wLjAwMXptMS41NDgsLTEuNjI5bDAuNjExLDBsMCwwLjYxMWwtMC42MTEsMGwwLC0wLjYxMXoiLz4KICA8cGF0aCBpZD0ic3ZnXzYiIGZpbGw9IiNmZmYiIGQ9Im0yMC4wODYsMTQuOTEybDAsMS43MTFhMi40MDQsMi40MDQgMCAxIDEgLTQuODA3LDBsMCwtNC4yNzhsLTIuODEyLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDAgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEsMGwtMC4wMDEsMHoiIGNsaXAtcnVsZT0iZXZlbm9kZCIgZmlsbC1ydWxlPSJldmVub2RkIi8+CiAgPHBhdGggaWQ9InN2Z183IiBmaWxsPSIjZmZmIiBkPSJtMjIuODk4LDEzLjc3MWwxLjE0LDBsMCwxLjE0MWwtMS4xNCwwbDAsLTEuMTRsMCwtMC4wMDF6bS0yLjgxMiwtMC42OTJsMS44MzQsMGwwLDEuODMzbC0xLjgzNCwwbDAsLTEuODMybDAsLTAuMDAxem0xLjgzNCwtMC45NzlsMC45NzgsMGwwLDAuOTc5bC0wLjk3OCwwbDAsLTAuOTc5em0xLjU0OCwtMS42MjlsMC42MTEsMGwwLDAuNjExbC0wLjYxLDBsMCwtMC42MWwtMC4wMDEsLTAuMDAxeiIvPgogPC9nPgo8L3N2Zz4=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
55
56
  [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUVal
zjI7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
56
57
  [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
57
- [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
58
-
58
+ [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
59
59
  [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](https://arxiv.org/abs/2409.18839)
60
60
 
61
61
 
@@ -80,6 +80,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
80
80
  </div>
81
81
 
82
82
  # Changelog
83
+ - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
83
84
  - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
84
85
  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
85
86
  - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
@@ -175,13 +176,14 @@ There are three different ways to experience MinerU:
175
176
  - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
176
177
  - [Linux/Windows + CUDA](#Using-GPU)
177
178
 
178
- **⚠️ Pre-installation Notice—Hardware and Software Environment Support**
179
-
180
- To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
181
-
182
- By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
183
-
184
- In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
179
+ > [!WARNING]
180
+ > **Pre-installation Notice—Hardware and Software Environment Support**
181
+ >
182
+ > To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
183
+ >
184
+ > By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
185
+ >
186
+ > In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
185
187
 
186
188
  <table>
187
189
  <tr>
@@ -261,11 +263,13 @@ Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for de
261
263
  After completing the [2. Download model weight files](#2-download-model-weight-files) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
262
264
  You can find the `magic-pdf.json` file in your 【user directory】.
263
265
 
266
+ > [!TIP]
264
267
  > The user directory for Windows is "C:\\Users\\username", for Linux it is "/home/username", and for macOS it is "/Users/username".
265
268
 
266
269
  You can modify certain configurations in this file to enable or disable features, such as table recognition:
267
270
 
268
271
 
272
+ > [!NOTE]
269
273
  > If the following items are not present in the JSON, please manually add the required items and remove the comment content (standard JSON does not support comments).
270
274
 
271
275
  ```json
@@ -294,13 +298,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
294
298
  - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
295
299
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
296
300
  - Quick Deployment with Docker
297
- > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
298
- >
299
- > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
300
- >
301
- > ```bash
302
- > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
303
- > ```
301
+ > [!IMPORTANT]
302
+ > The Docker image requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
303
+ >
304
+ > Before running this Docker image, you can use the following command to check whether your device supports CUDA acceleration in Docker.
305
+ >
306
+ > ```bash
307
+ > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
308
+ > ```
304
309
  ```bash
305
310
  wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
306
311
  docker build -t mineru:latest .
@@ -362,8 +367,8 @@ The results will be saved in the `{some_output_dir}` directory. The output file
362
367
  ├── some_pdf_spans.pdf # smallest granularity bbox position information diagram
363
368
  └── some_pdf_content_list.json # Rich text JSON arranged in reading order
364
369
  ```
365
-
366
- For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
370
+ > [!TIP]
371
+ > For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
367
372
 
368
373
  ### API
369
374
 
@@ -414,12 +419,12 @@ TODO
414
419
 
415
420
  # TODO
416
421
 
417
- - 🗹 Reading order based on the model
418
- - 🗹 Recognition of `index` and `list` in the main text
419
- - 🗹 Table recognition
420
- - Code block recognition in the main text
421
- - [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
422
- - Geometric shape recognition
422
+ - [x] Reading order based on the model
423
+ - [x] Recognition of `index` and `list` in the main text
424
+ - [x] Table recognition
425
+ - [ ] Code block recognition in the main text
426
+ - [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
427
+ - [ ] Geometric shape recognition
423
428
 
424
429
  # Known Issues
425
430
 
@@ -10,20 +10,20 @@ magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zk
10
10
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
12
12
  magic_pdf/data/read_api.py,sha256=3fKLsEYAow5RwAmGFMMgvcCh0-_WEEHem2uewukjXOA,3570
13
- magic_pdf/data/schemas.py,sha256=XSFNxyYbIWgU_Z4U0695elpGQP3J5dpq4Rlyr3S0O_s,595
13
+ magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
14
14
  magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
15
15
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
16
16
  magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
17
17
  magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
18
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=BY3faRfZTg27kfkaI4iXPjgFW_RecN0am9r9z2RuYgY,5582
19
- magic_pdf/data/data_reader_writer/s3.py,sha256=4tT_hcb5I1m-qojNP2CAUKGOoWBH2ripKQmBa9_dAfg,2096
20
- magic_pdf/data/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- magic_pdf/data/io/base.py,sha256=So3G_Kndunfs0f9nn3l9dRJG_7N09CX0JbFqYEvyaRI,1113
18
+ magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
19
+ magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
20
+ magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
21
+ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
22
22
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
23
23
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
24
24
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
26
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=eMd3qOIh21cZgTl-LMLGh42uxDMpHU2nwE6iA6b_qrA,11915
26
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=ClxKUwrK7wlXKCcDfuTryztKl5e8pzcnh5x_fODFm2U,12928
27
27
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
29
29
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -65,14 +65,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
65
65
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
66
66
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
67
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
- magic_pdf/libs/version.py,sha256=H9NWRZb7NbeRRPLP_V1fARmLNXranorVM-OOY-8_2ug,22
68
+ magic_pdf/libs/version.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
69
69
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
70
70
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
71
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
72
72
  magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
73
73
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
- magic_pdf/model/pdf_extract_kit.py,sha256=qlZANx8DErfSyaPHlOYNYW_Qp50dAEX_4jG8N3coDmM,21317
75
- magic_pdf/model/ppTableModel.py,sha256=Qm5vy6v5aw2wwO5aZTyVr-r1sr3Pi9ManG86WZvfvEo,2697
74
+ magic_pdf/model/pdf_extract_kit.py,sha256=9pdtcQgwn-XMvyQ7yMfzqKgjPfxEuNXR7juCPx-OM-M,20929
75
+ magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
76
76
  magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
77
77
  magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
78
  magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -97,7 +97,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
97
97
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
98
98
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
99
99
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
100
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qtAkShYlXBrrkRWHvgAy3y9SEBtMRYVIvI3CASTuLHU,1069
100
+ magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qQthlYDvDPah1mzzrnKXU4fYqlJdXOPBnJ8tYf-o_0k,1384
101
101
  magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
102
  magic_pdf/model/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
103
  magic_pdf/model/v3/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
@@ -112,7 +112,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
112
112
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
113
113
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
114
114
  magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
115
- magic_pdf/para/para_split_v3.py,sha256=vHHswSAcTpXqnaEAbGEbt2g96YLh9eh839HdRNilDT8,13378
115
+ magic_pdf/para/para_split_v3.py,sha256=k02I9Rdc8jfYr3bMT_Gm38b5ginkl-ZIU5C_XcfAcs8,14704
116
116
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
117
117
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
118
118
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -170,9 +170,9 @@ magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,41
170
170
  magic_pdf/tools/common.py,sha256=2S8N60pcA6bFqAmdchoEmn22l9ntQxEfyaKpxfCKJ-Y,5465
171
171
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
172
172
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
173
- magic_pdf-0.9.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
174
- magic_pdf-0.9.0.dist-info/METADATA,sha256=DPanG2IP5v1TNR6Qyto-UqZ53IOA09lNCQpMyjguJ_k,39420
175
- magic_pdf-0.9.0.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
176
- magic_pdf-0.9.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
177
- magic_pdf-0.9.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
178
- magic_pdf-0.9.0.dist-info/RECORD,,
173
+ magic_pdf-0.9.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
174
+ magic_pdf-0.9.2.dist-info/METADATA,sha256=CxyxzxwoOTK3GfaQCGAR8lcjQR3fK4teYf0pXLVDiNQ,39654
175
+ magic_pdf-0.9.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
176
+ magic_pdf-0.9.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
177
+ magic_pdf-0.9.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
178
+ magic_pdf-0.9.2.dist-info/RECORD,,