magic-pdf 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ import os
1
2
  from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
2
3
  from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
3
4
  from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
7
8
 
8
9
 
9
10
  class MultiS3Mixin:
10
- def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
11
+ def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
11
12
  """Initialized with multiple s3 configs.
12
13
 
13
14
  Args:
14
- default_bucket (str): the default bucket name of the relative path
15
+ default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
15
16
  s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
16
17
 
17
18
  Raises:
18
- InvalidConfig: default bucket config not in s3_configs
19
- InvalidConfig: bucket name not unique in s3_configs
20
- InvalidConfig: default bucket must be provided
19
+ InvalidConfig: default bucket config not in s3_configs.
20
+ InvalidConfig: bucket name not unique in s3_configs.
21
+ InvalidConfig: default bucket must be provided.
21
22
  """
22
- if len(default_bucket) == 0:
23
- raise InvalidConfig('default_bucket must be provided')
23
+ if len(default_prefix) == 0:
24
+ raise InvalidConfig('default_prefix must be provided')
25
+
26
+ arr = default_prefix.strip("/").split("/")
27
+ self.default_bucket = arr[0]
28
+ self.default_prefix = "/".join(arr[1:])
24
29
 
25
30
  found_default_bucket_config = False
26
31
  for conf in s3_configs:
27
- if conf.bucket_name == default_bucket:
32
+ if conf.bucket_name == self.default_bucket:
28
33
  found_default_bucket_config = True
29
34
  break
30
35
 
31
36
  if not found_default_bucket_config:
32
37
  raise InvalidConfig(
33
- f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
38
+ f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
34
39
  )
35
40
 
36
41
  uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
39
44
  f'the bucket_name in s3_configs: {s3_configs} must be unique'
40
45
  )
41
46
 
42
- self.default_bucket = default_bucket
43
47
  self.s3_configs = s3_configs
44
48
  self._s3_clients_h: dict = {}
45
49
 
@@ -47,14 +51,14 @@ class MultiS3Mixin:
47
51
  class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
48
52
  def read(self, path: str) -> bytes:
49
53
  """Read the path from s3, select diffect bucket client for each request
50
- based on the path, also support range read.
54
+ based on the bucket, also support range read.
51
55
 
52
56
  Args:
53
- path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
54
- for example: s3://bucket_name/path?0,100
57
+ path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
58
+ for example: s3://bucket_name/path?0,100.
55
59
 
56
60
  Returns:
57
- bytes: the content of s3 file
61
+ bytes: the content of s3 file.
58
62
  """
59
63
  may_range_params = parse_s3_range_params(path)
60
64
  if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
84
88
 
85
89
  def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
86
90
  """Read the file with offset and limit, select diffect bucket client
87
- for each request based on the path.
91
+ for each request based on the bucket.
88
92
 
89
93
  Args:
90
- path (str): the file path
94
+ path (str): the file path.
91
95
  offset (int, optional): the number of bytes skipped. Defaults to 0.
92
96
  limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
93
97
 
94
98
  Returns:
95
- bytes: the file content
99
+ bytes: the file content.
96
100
  """
97
101
  if path.startswith('s3://'):
98
102
  bucket_name, path = parse_s3path(path)
99
103
  s3_reader = self.__get_s3_client(bucket_name)
100
104
  else:
101
105
  s3_reader = self.__get_s3_client(self.default_bucket)
106
+ path = os.path.join(self.default_prefix, path)
102
107
  return s3_reader.read_at(path, offset, limit)
103
108
 
104
109
 
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
123
128
 
124
129
  def write(self, path: str, data: bytes) -> None:
125
130
  """Write file with data, also select diffect bucket client for each
126
- request based on the path.
131
+ request based on the bucket.
127
132
 
128
133
  Args:
129
134
  path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
130
- data (bytes): the data want to write
135
+ data (bytes): the data want to write.
131
136
  """
132
137
  if path.startswith('s3://'):
133
138
  bucket_name, path = parse_s3path(path)
134
139
  s3_writer = self.__get_s3_client(bucket_name)
135
140
  else:
136
141
  s3_writer = self.__get_s3_client(self.default_bucket)
142
+ path = os.path.join(self.default_prefix, path)
137
143
  return s3_writer.write(path, data)
@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
6
6
  class S3DataReader(MultiBucketS3DataReader):
7
7
  def __init__(
8
8
  self,
9
+ default_prefix_without_bucket: str,
9
10
  bucket: str,
10
11
  ak: str,
11
12
  sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
15
16
  """s3 reader client.
16
17
 
17
18
  Args:
19
+ default_prefix_without_bucket: prefix that not contains bucket
18
20
  bucket (str): bucket name
19
21
  ak (str): access key
20
22
  sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
23
25
  refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
24
26
  """
25
27
  super().__init__(
26
- bucket,
28
+ f'{bucket}/{default_prefix_without_bucket}',
27
29
  [
28
30
  S3Config(
29
31
  bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
39
41
  class S3DataWriter(MultiBucketS3DataWriter):
40
42
  def __init__(
41
43
  self,
44
+ default_prefix_without_bucket: str,
42
45
  bucket: str,
43
46
  ak: str,
44
47
  sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
48
51
  """s3 writer client.
49
52
 
50
53
  Args:
54
+ default_prefix_without_bucket: prefix that not contains bucket
51
55
  bucket (str): bucket name
52
56
  ak (str): access key
53
57
  sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
56
60
  refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
57
61
  """
58
62
  super().__init__(
59
- bucket,
63
+ f'{bucket}/{default_prefix_without_bucket}',
60
64
  [
61
65
  S3Config(
62
66
  bucket_name=bucket,
@@ -0,0 +1,6 @@
1
+
2
+ from magic_pdf.data.io.base import IOReader, IOWriter # noqa: F401
3
+ from magic_pdf.data.io.http import HttpReader, HttpWriter # noqa: F401
4
+ from magic_pdf.data.io.s3 import S3Reader, S3Writer # noqa: F401
5
+
6
+ __all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
magic_pdf/data/io/base.py CHANGED
@@ -29,7 +29,7 @@ class IOReader(ABC):
29
29
  pass
30
30
 
31
31
 
32
- class IOWriter:
32
+ class IOWriter(ABC):
33
33
 
34
34
  @abstractmethod
35
35
  def write(self, path: str, data: bytes) -> None:
magic_pdf/data/schemas.py CHANGED
@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
3
3
 
4
4
 
5
5
  class S3Config(BaseModel):
6
+ """S3 config
7
+ """
6
8
  bucket_name: str = Field(description='s3 bucket name', min_length=1)
7
9
  access_key: str = Field(description='s3 access key', min_length=1)
8
10
  secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
11
13
 
12
14
 
13
15
  class PageInfo(BaseModel):
16
+ """The width and height of page
17
+ """
14
18
  w: float = Field(description='the width of page')
15
19
  h: float = Field(description='the height of page')
@@ -119,6 +119,16 @@ def detect_language(text):
119
119
  return 'empty'
120
120
 
121
121
 
122
+ # 连写字符拆分
123
+ def __replace_ligatures(text: str):
124
+ text = re.sub(r'ﬁ', 'fi', text) # 替换 fi 连写符
125
+ text = re.sub(r'ﬂ', 'fl', text) # 替换 fl 连写符
126
+ text = re.sub(r'ﬀ', 'ff', text) # 替换 ff 连写符
127
+ text = re.sub(r'ﬃ', 'ffi', text) # 替换 ffi 连写符
128
+ text = re.sub(r'ﬄ', 'ffl', text) # 替换 ffl 连写符
129
+ return text
130
+
131
+
122
132
  def merge_para_with_text(para_block):
123
133
  para_text = ''
124
134
  for i, line in enumerate(para_block['lines']):
@@ -141,22 +151,34 @@ def merge_para_with_text(para_block):
141
151
  if span_type == ContentType.Text:
142
152
  content = ocr_escape_special_markdown_char(span['content'])
143
153
  elif span_type == ContentType.InlineEquation:
144
- content = f" ${span['content']}$ "
154
+ content = f"${span['content']}$"
145
155
  elif span_type == ContentType.InterlineEquation:
146
156
  content = f"\n$$\n{span['content']}\n$$\n"
147
157
 
158
+ content = content.strip()
148
159
  if content != '':
149
160
  langs = ['zh', 'ja', 'ko']
150
161
  if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
151
- para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
152
- elif line_lang == 'en':
153
- # 如果是前一行带有-连字符,那么末尾不应该加空格
154
- if __is_hyphen_at_line_end(content):
155
- para_text += content[:-1]
156
- else:
157
- para_text += content + ' '
162
+ if span_type in [ContentType.Text, ContentType.InterlineEquation]:
163
+ para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
164
+ elif span_type == ContentType.InlineEquation:
165
+ para_text += f" {content} "
158
166
  else:
159
- para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
167
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
168
+ # 如果是前一行带有-连字符,那么末尾不应该加空格
169
+ if __is_hyphen_at_line_end(content):
170
+ para_text += content[:-1]
171
+ elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
172
+ para_text += content
173
+ else: # 西方文本语境下 content间需要空格分隔
174
+ para_text += f"{content} "
175
+ elif span_type == ContentType.InterlineEquation:
176
+ para_text += content
177
+ else:
178
+ continue
179
+ # 连写字符拆分
180
+ para_text = __replace_ligatures(para_text)
181
+
160
182
  return para_text
161
183
 
162
184
 
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.9.0"
1
+ __version__ = "0.9.1"
@@ -38,15 +38,13 @@ except ImportError as e:
38
38
  from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
39
39
  from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
40
40
  from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
41
- # from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
41
+ from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
42
42
  from magic_pdf.model.ppTableModel import ppTableModel
43
43
 
44
44
 
45
45
  def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
46
46
  if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
47
- # table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
48
- logger.error("StructEqTable is under upgrade, the current version does not support it.")
49
- exit(1)
47
+ table_model = StructTableModel(model_path, max_time=max_time)
50
48
  elif table_model_type == MODEL_NAME.TABLE_MASTER:
51
49
  config = {
52
50
  "model_dir": model_path,
@@ -393,7 +391,7 @@ class CustomPEKModel:
393
391
  elif int(res['category_id']) in [5]:
394
392
  table_res_list.append(res)
395
393
 
396
- if torch.cuda.is_available():
394
+ if torch.cuda.is_available() and self.device != 'cpu':
397
395
  properties = torch.cuda.get_device_properties(self.device)
398
396
  total_memory = properties.total_memory / (1024 ** 3) # 将字节转换为 GB
399
397
  if total_memory <= 10:
@@ -463,7 +461,9 @@ class CustomPEKModel:
463
461
  html_code = None
464
462
  if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
465
463
  with torch.no_grad():
466
- latex_code = self.table_model.image2latex(new_image)[0]
464
+ table_result = self.table_model.predict(new_image, "html")
465
+ if len(table_result) > 0:
466
+ html_code = table_result[0]
467
467
  else:
468
468
  html_code = self.table_model.img2html(new_image)
469
469
 
@@ -474,14 +474,17 @@ class CustomPEKModel:
474
474
  # 判断是否返回正常
475
475
 
476
476
  if latex_code:
477
- expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
478
- 'end{table}')
477
+ expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
479
478
  if expected_ending:
480
479
  res["latex"] = latex_code
481
480
  else:
482
481
  logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
483
482
  elif html_code:
484
- res["html"] = html_code
483
+ expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
484
+ if expected_ending:
485
+ res["html"] = html_code
486
+ else:
487
+ logger.warning(f"table recognition processing fails, not found expected HTML table end")
485
488
  else:
486
489
  logger.warning(f"table recognition processing fails, not get latex or html return")
487
490
  logger.info(f"table time: {round(time.time() - table_start, 2)}")
@@ -1,28 +1,45 @@
1
- from loguru import logger
1
+ import re
2
2
 
3
- try:
4
- from struct_eqtable.model import StructTable
5
- except ImportError:
6
- logger.error("StructEqTable is under upgrade, the current version does not support it.")
7
- from pypandoc import convert_text
3
+ import torch
4
+ from struct_eqtable import build_model
8
5
 
9
6
 
10
7
  class StructTableModel:
11
- def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
8
+ def __init__(self, model_path, max_new_tokens=1024, max_time=60):
12
9
  # init
13
- self.model_path = model_path
14
- self.max_new_tokens = max_new_tokens # maximum output tokens length
15
- self.max_time = max_time # timeout for processing in seconds
16
- if device == 'cuda':
17
- self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
10
+ assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
11
+ self.model = build_model(
12
+ model_ckpt=model_path,
13
+ max_new_tokens=max_new_tokens,
14
+ max_time=max_time,
15
+ lmdeploy=False,
16
+ flash_attn=False,
17
+ batch_size=1,
18
+ ).cuda()
19
+ self.default_format = "html"
20
+
21
+ def predict(self, images, output_format=None, **kwargs):
22
+
23
+ if output_format is None:
24
+ output_format = self.default_format
18
25
  else:
19
- self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
26
+ if output_format not in ['latex', 'markdown', 'html']:
27
+ raise ValueError(f"Output format {output_format} is not supported.")
28
+
29
+ results = self.model(
30
+ images, output_format=output_format
31
+ )
32
+
33
+ if output_format == "html":
34
+ results = [self.minify_html(html) for html in results]
20
35
 
21
- def image2latex(self, image) -> str:
22
- table_latex = self.model.forward(image)
23
- return table_latex
36
+ return results
24
37
 
25
- def image2html(self, image) -> str:
26
- table_latex = self.image2latex(image)
27
- table_html = convert_text(table_latex, 'html', format='latex')
28
- return table_html
38
+ def minify_html(self, html):
39
+ # 移除多余的空白字符
40
+ html = re.sub(r'\s+', ' ', html)
41
+ # 移除行尾的空白字符
42
+ html = re.sub(r'\s*>\s*', '>', html)
43
+ # 移除标签前的空白字符
44
+ html = re.sub(r'\s*<\s*', '<', html)
45
+ return html.strip()
@@ -1,3 +1,4 @@
1
+ import cv2
1
2
  from paddleocr.ppstructure.table.predict_table import TableSystem
2
3
  from paddleocr.ppstructure.utility import init_args
3
4
  from magic_pdf.libs.Constants import *
@@ -36,12 +37,13 @@ class ppTableModel(object):
36
37
  - HTML (str): A string representing the HTML structure with content of the table.
37
38
  """
38
39
  if isinstance(image, Image.Image):
39
- image = np.array(image)
40
+ image = np.asarray(image)
41
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
40
42
  pred_res, _ = self.table_sys(image)
41
43
  pred_html = pred_res["html"]
42
- res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
43
- "") + "</table></td>\n"
44
- return res
44
+ # res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
45
+ # "</table></body></html>","") + "</table></td>\n"
46
+ return pred_html
45
47
 
46
48
  def parse_args(self, **kwargs):
47
49
  parser = init_args()
@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
63
63
  first_line = block['lines'][0]
64
64
  line_height = first_line['bbox'][3] - first_line['bbox'][1]
65
65
  block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
66
+ block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
66
67
 
67
68
  left_close_num = 0
68
69
  left_not_close_num = 0
69
70
  right_not_close_num = 0
70
71
  right_close_num = 0
71
72
  lines_text_list = []
72
-
73
+ center_close_num = 0
74
+ external_sides_not_close_num = 0
73
75
  multiple_para_flag = False
74
76
  last_line = block['lines'][-1]
77
+
75
78
  # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
76
79
  if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
77
80
  # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):
82
85
 
83
86
  for line in block['lines']:
84
87
 
88
+ line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
89
+ block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
90
+ if (
91
+ line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
92
+ block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
93
+ ):
94
+ external_sides_not_close_num += 1
95
+ if abs(line_mid_x - block_mid_x) < line_height / 2:
96
+ center_close_num += 1
97
+
85
98
  line_text = ""
86
99
 
87
100
  for span in line['spans']:
@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
103
116
  right_close_num += 1
104
117
  else:
105
118
  # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
106
- closed_area = 0.3 * block_weight
119
+ closed_area = 0.26 * block_weight
107
120
  # closed_area = 5 * line_height
108
121
  if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
109
122
  right_not_close_num += 1
@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
132
145
  line_num_flag = True
133
146
 
134
147
  # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
135
- if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
148
+ if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
136
149
  and line_num_flag
137
150
  ):
138
151
  for line in block['lines']:
139
152
  line[ListLineTag.IS_LIST_START_LINE] = True
140
153
  return BlockType.Index
141
154
 
155
+ # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
156
+ # 补充条件block的长宽比有要求
157
+ elif (
158
+ external_sides_not_close_num >= 2 and
159
+ center_close_num == len(block['lines']) and
160
+ external_sides_not_close_num / len(block['lines']) >= 0.5 and
161
+ block_height / block_weight > 0.4
162
+ ):
163
+ for line in block['lines']:
164
+ line[ListLineTag.IS_LIST_START_LINE] = True
165
+ return BlockType.List
166
+
142
167
  elif left_close_num >= 2 and (
143
168
  right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
144
169
  # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
145
- if left_close_num / len(block['lines']) > 0.9:
170
+ if left_close_num / len(block['lines']) > 0.8:
146
171
  # 这种是每个item只有一行,且左边都贴边的短item list
147
172
  if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
148
173
  for line in block['lines']:
@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
154
179
  if lines_text_list[i][-1] in LIST_END_FLAG:
155
180
  line[ListLineTag.IS_LIST_END_LINE] = True
156
181
  if i + 1 < len(block['lines']):
157
- block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
182
+ block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
158
183
  # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
159
184
  else:
160
185
  line_start_flag = False
@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
162
187
  if line_start_flag:
163
188
  line[ListLineTag.IS_LIST_START_LINE] = True
164
189
  line_start_flag = False
165
- elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
190
+ # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
191
+ if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
166
192
  line[ListLineTag.IS_LIST_END_LINE] = True
167
193
  line_start_flag = True
168
194
  # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.9.0
3
+ Version: 0.9.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -22,8 +22,9 @@ Provides-Extra: full
22
22
  Requires-Dist: unimernet==0.2.1; extra == "full"
23
23
  Requires-Dist: ultralytics; extra == "full"
24
24
  Requires-Dist: paddleocr==2.7.3; extra == "full"
25
- Requires-Dist: pypandoc; extra == "full"
26
- Requires-Dist: struct-eqtable==0.1.0; extra == "full"
25
+ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
26
+ Requires-Dist: einops; extra == "full"
27
+ Requires-Dist: accelerate; extra == "full"
27
28
  Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
28
29
  Requires-Dist: detectron2; extra == "full"
29
30
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
@@ -54,8 +55,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
54
55
  [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
55
56
  [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
56
57
  [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
57
- [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
58
-
58
+ [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
59
59
  [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](https://arxiv.org/abs/2409.18839)
60
60
 
61
61
 
@@ -80,6 +80,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
80
80
  </div>
81
81
 
82
82
  # Changelog
83
+ - 2024/11/06 0.9.1 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
83
84
  - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
84
85
  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
85
86
  - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
@@ -10,20 +10,20 @@ magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zk
10
10
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
12
12
  magic_pdf/data/read_api.py,sha256=3fKLsEYAow5RwAmGFMMgvcCh0-_WEEHem2uewukjXOA,3570
13
- magic_pdf/data/schemas.py,sha256=XSFNxyYbIWgU_Z4U0695elpGQP3J5dpq4Rlyr3S0O_s,595
13
+ magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
14
14
  magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
15
15
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
16
16
  magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
17
17
  magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
18
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=BY3faRfZTg27kfkaI4iXPjgFW_RecN0am9r9z2RuYgY,5582
19
- magic_pdf/data/data_reader_writer/s3.py,sha256=4tT_hcb5I1m-qojNP2CAUKGOoWBH2ripKQmBa9_dAfg,2096
20
- magic_pdf/data/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- magic_pdf/data/io/base.py,sha256=So3G_Kndunfs0f9nn3l9dRJG_7N09CX0JbFqYEvyaRI,1113
18
+ magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
19
+ magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
20
+ magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
21
+ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
22
22
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
23
23
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
24
24
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
26
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=eMd3qOIh21cZgTl-LMLGh42uxDMpHU2nwE6iA6b_qrA,11915
26
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=ClxKUwrK7wlXKCcDfuTryztKl5e8pzcnh5x_fODFm2U,12928
27
27
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
29
29
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -65,14 +65,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
65
65
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
66
66
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
67
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
- magic_pdf/libs/version.py,sha256=H9NWRZb7NbeRRPLP_V1fARmLNXranorVM-OOY-8_2ug,22
68
+ magic_pdf/libs/version.py,sha256=UwJXM8JY2T3tE2id0K2k_lEaVThbRTrGO1mNibyzIz8,22
69
69
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
70
70
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
71
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
72
72
  magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
73
73
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
- magic_pdf/model/pdf_extract_kit.py,sha256=qlZANx8DErfSyaPHlOYNYW_Qp50dAEX_4jG8N3coDmM,21317
75
- magic_pdf/model/ppTableModel.py,sha256=Qm5vy6v5aw2wwO5aZTyVr-r1sr3Pi9ManG86WZvfvEo,2697
74
+ magic_pdf/model/pdf_extract_kit.py,sha256=7BVcVkrIAI2aTAUHD_Xrq0yAuy4BEAAJEicOM8Hr0Xw,21593
75
+ magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
76
76
  magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
77
77
  magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
78
  magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -97,7 +97,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
97
97
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
98
98
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
99
99
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
100
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qtAkShYlXBrrkRWHvgAy3y9SEBtMRYVIvI3CASTuLHU,1069
100
+ magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qQthlYDvDPah1mzzrnKXU4fYqlJdXOPBnJ8tYf-o_0k,1384
101
101
  magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
102
  magic_pdf/model/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
103
  magic_pdf/model/v3/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
@@ -112,7 +112,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
112
112
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
113
113
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
114
114
  magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
115
- magic_pdf/para/para_split_v3.py,sha256=vHHswSAcTpXqnaEAbGEbt2g96YLh9eh839HdRNilDT8,13378
115
+ magic_pdf/para/para_split_v3.py,sha256=k02I9Rdc8jfYr3bMT_Gm38b5ginkl-ZIU5C_XcfAcs8,14704
116
116
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
117
117
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
118
118
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -170,9 +170,9 @@ magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,41
170
170
  magic_pdf/tools/common.py,sha256=2S8N60pcA6bFqAmdchoEmn22l9ntQxEfyaKpxfCKJ-Y,5465
171
171
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
172
172
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
173
- magic_pdf-0.9.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
174
- magic_pdf-0.9.0.dist-info/METADATA,sha256=DPanG2IP5v1TNR6Qyto-UqZ53IOA09lNCQpMyjguJ_k,39420
175
- magic_pdf-0.9.0.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
176
- magic_pdf-0.9.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
177
- magic_pdf-0.9.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
178
- magic_pdf-0.9.0.dist-info/RECORD,,
173
+ magic_pdf-0.9.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
174
+ magic_pdf-0.9.1.dist-info/METADATA,sha256=2NLbuQt-GzeMws3412i4A8XaDr8xuMZBymu7n3XY7S0,39624
175
+ magic_pdf-0.9.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
176
+ magic_pdf-0.9.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
177
+ magic_pdf-0.9.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
178
+ magic_pdf-0.9.1.dist-info/RECORD,,