magic-pdf 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +25 -19
- magic_pdf/data/data_reader_writer/s3.py +6 -2
- magic_pdf/data/io/__init__.py +6 -0
- magic_pdf/data/io/base.py +1 -1
- magic_pdf/data/schemas.py +4 -0
- magic_pdf/dict2md/ocr_mkcontent.py +31 -9
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/pdf_extract_kit.py +12 -22
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +37 -20
- magic_pdf/model/ppTableModel.py +6 -4
- magic_pdf/para/para_split_v3.py +32 -6
- {magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/METADATA +32 -27
- {magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/RECORD +17 -17
- {magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
import os
|
1
2
|
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
|
2
3
|
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
|
3
4
|
from magic_pdf.data.io.s3 import S3Reader, S3Writer
|
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
|
|
7
8
|
|
8
9
|
|
9
10
|
class MultiS3Mixin:
|
10
|
-
def __init__(self,
|
11
|
+
def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
|
11
12
|
"""Initialized with multiple s3 configs.
|
12
13
|
|
13
14
|
Args:
|
14
|
-
|
15
|
+
default_prefix (str): the default prefix for relative paths, for example {some_bucket}/{some_prefix} or {some_bucket}.
|
15
16
|
s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
|
16
17
|
|
17
18
|
Raises:
|
18
|
-
InvalidConfig: default bucket config not in s3_configs
|
19
|
-
InvalidConfig: bucket name not unique in s3_configs
|
20
|
-
InvalidConfig: default bucket must be provided
|
19
|
+
InvalidConfig: default bucket config not in s3_configs.
|
20
|
+
InvalidConfig: bucket name not unique in s3_configs.
|
21
|
+
InvalidConfig: default bucket must be provided.
|
21
22
|
"""
|
22
|
-
if len(
|
23
|
-
raise InvalidConfig('
|
23
|
+
if len(default_prefix) == 0:
|
24
|
+
raise InvalidConfig('default_prefix must be provided')
|
25
|
+
|
26
|
+
arr = default_prefix.strip("/").split("/")
|
27
|
+
self.default_bucket = arr[0]
|
28
|
+
self.default_prefix = "/".join(arr[1:])
|
24
29
|
|
25
30
|
found_default_bucket_config = False
|
26
31
|
for conf in s3_configs:
|
27
|
-
if conf.bucket_name == default_bucket:
|
32
|
+
if conf.bucket_name == self.default_bucket:
|
28
33
|
found_default_bucket_config = True
|
29
34
|
break
|
30
35
|
|
31
36
|
if not found_default_bucket_config:
|
32
37
|
raise InvalidConfig(
|
33
|
-
f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
|
38
|
+
f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
|
34
39
|
)
|
35
40
|
|
36
41
|
uniq_bucket = set([conf.bucket_name for conf in s3_configs])
|
@@ -39,7 +44,6 @@ class MultiS3Mixin:
|
|
39
44
|
f'the bucket_name in s3_configs: {s3_configs} must be unique'
|
40
45
|
)
|
41
46
|
|
42
|
-
self.default_bucket = default_bucket
|
43
47
|
self.s3_configs = s3_configs
|
44
48
|
self._s3_clients_h: dict = {}
|
45
49
|
|
@@ -47,14 +51,14 @@ class MultiS3Mixin:
|
|
47
51
|
class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
|
48
52
|
def read(self, path: str) -> bytes:
|
49
53
|
"""Read the path from s3, select different bucket client for each request
|
50
|
-
based on the
|
54
|
+
based on the bucket, also support range read.
|
51
55
|
|
52
56
|
Args:
|
53
|
-
path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
|
54
|
-
for example: s3://bucket_name/path?0,100
|
57
|
+
path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
|
58
|
+
for example: s3://bucket_name/path?0,100.
|
55
59
|
|
56
60
|
Returns:
|
57
|
-
bytes: the content of s3 file
|
61
|
+
bytes: the content of s3 file.
|
58
62
|
"""
|
59
63
|
may_range_params = parse_s3_range_params(path)
|
60
64
|
if may_range_params is None or 2 != len(may_range_params):
|
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
|
|
84
88
|
|
85
89
|
def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
|
86
90
|
"""Read the file with offset and limit, select different bucket client
|
87
|
-
for each request based on the
|
91
|
+
for each request based on the bucket.
|
88
92
|
|
89
93
|
Args:
|
90
|
-
path (str): the file path
|
94
|
+
path (str): the file path.
|
91
95
|
offset (int, optional): the number of bytes skipped. Defaults to 0.
|
92
96
|
limit (int, optional): the number of bytes to read. Defaults to -1, which means no limit.
|
93
97
|
|
94
98
|
Returns:
|
95
|
-
bytes: the file content
|
99
|
+
bytes: the file content.
|
96
100
|
"""
|
97
101
|
if path.startswith('s3://'):
|
98
102
|
bucket_name, path = parse_s3path(path)
|
99
103
|
s3_reader = self.__get_s3_client(bucket_name)
|
100
104
|
else:
|
101
105
|
s3_reader = self.__get_s3_client(self.default_bucket)
|
106
|
+
path = os.path.join(self.default_prefix, path)
|
102
107
|
return s3_reader.read_at(path, offset, limit)
|
103
108
|
|
104
109
|
|
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
|
|
123
128
|
|
124
129
|
def write(self, path: str, data: bytes) -> None:
|
125
130
|
"""Write file with data, also select different bucket client for each
|
126
|
-
request based on the
|
131
|
+
request based on the bucket.
|
127
132
|
|
128
133
|
Args:
|
129
134
|
path (str): the path of the file; if it is a relative path, it will be joined with the default prefix.
|
130
|
-
data (bytes): the data want to write
|
135
|
+
data (bytes): the data to write.
|
131
136
|
"""
|
132
137
|
if path.startswith('s3://'):
|
133
138
|
bucket_name, path = parse_s3path(path)
|
134
139
|
s3_writer = self.__get_s3_client(bucket_name)
|
135
140
|
else:
|
136
141
|
s3_writer = self.__get_s3_client(self.default_bucket)
|
142
|
+
path = os.path.join(self.default_prefix, path)
|
137
143
|
return s3_writer.write(path, data)
|
@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
|
|
6
6
|
class S3DataReader(MultiBucketS3DataReader):
|
7
7
|
def __init__(
|
8
8
|
self,
|
9
|
+
default_prefix_without_bucket: str,
|
9
10
|
bucket: str,
|
10
11
|
ak: str,
|
11
12
|
sk: str,
|
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
|
|
15
16
|
"""s3 reader client.
|
16
17
|
|
17
18
|
Args:
|
19
|
+
default_prefix_without_bucket (str): the default prefix, not including the bucket name
|
18
20
|
bucket (str): bucket name
|
19
21
|
ak (str): access key
|
20
22
|
sk (str): secret key
|
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
|
|
23
25
|
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
|
24
26
|
"""
|
25
27
|
super().__init__(
|
26
|
-
bucket,
|
28
|
+
f'{bucket}/{default_prefix_without_bucket}',
|
27
29
|
[
|
28
30
|
S3Config(
|
29
31
|
bucket_name=bucket,
|
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
|
|
39
41
|
class S3DataWriter(MultiBucketS3DataWriter):
|
40
42
|
def __init__(
|
41
43
|
self,
|
44
|
+
default_prefix_without_bucket: str,
|
42
45
|
bucket: str,
|
43
46
|
ak: str,
|
44
47
|
sk: str,
|
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
|
|
48
51
|
"""s3 writer client.
|
49
52
|
|
50
53
|
Args:
|
54
|
+
default_prefix_without_bucket (str): the default prefix, not including the bucket name
|
51
55
|
bucket (str): bucket name
|
52
56
|
ak (str): access key
|
53
57
|
sk (str): secret key
|
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
|
|
56
60
|
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
|
57
61
|
"""
|
58
62
|
super().__init__(
|
59
|
-
bucket,
|
63
|
+
f'{bucket}/{default_prefix_without_bucket}',
|
60
64
|
[
|
61
65
|
S3Config(
|
62
66
|
bucket_name=bucket,
|
magic_pdf/data/io/__init__.py
CHANGED
@@ -0,0 +1,6 @@
|
|
1
|
+
|
2
|
+
from magic_pdf.data.io.base import IOReader, IOWriter # noqa: F401
|
3
|
+
from magic_pdf.data.io.http import HttpReader, HttpWriter # noqa: F401
|
4
|
+
from magic_pdf.data.io.s3 import S3Reader, S3Writer # noqa: F401
|
5
|
+
|
6
|
+
__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
|
magic_pdf/data/io/base.py
CHANGED
magic_pdf/data/schemas.py
CHANGED
@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
|
|
3
3
|
|
4
4
|
|
5
5
|
class S3Config(BaseModel):
|
6
|
+
"""S3 config
|
7
|
+
"""
|
6
8
|
bucket_name: str = Field(description='s3 bucket name', min_length=1)
|
7
9
|
access_key: str = Field(description='s3 access key', min_length=1)
|
8
10
|
secret_key: str = Field(description='s3 secret key', min_length=1)
|
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
|
|
11
13
|
|
12
14
|
|
13
15
|
class PageInfo(BaseModel):
|
16
|
+
"""The width and height of page
|
17
|
+
"""
|
14
18
|
w: float = Field(description='the width of page')
|
15
19
|
h: float = Field(description='the height of page')
|
@@ -119,6 +119,16 @@ def detect_language(text):
|
|
119
119
|
return 'empty'
|
120
120
|
|
121
121
|
|
122
|
+
# 连写字符拆分
|
123
|
+
def __replace_ligatures(text: str):
|
124
|
+
text = re.sub(r'ﬁ', 'fi', text) # 替换 fi 连写符
|
125
|
+
text = re.sub(r'ﬂ', 'fl', text) # 替换 fl 连写符
|
126
|
+
text = re.sub(r'ﬀ', 'ff', text) # 替换 ff 连写符
|
127
|
+
text = re.sub(r'ﬃ', 'ffi', text) # 替换 ffi 连写符
|
128
|
+
text = re.sub(r'ﬄ', 'ffl', text) # 替换 ffl 连写符
|
129
|
+
return text
|
130
|
+
|
131
|
+
|
122
132
|
def merge_para_with_text(para_block):
|
123
133
|
para_text = ''
|
124
134
|
for i, line in enumerate(para_block['lines']):
|
@@ -141,22 +151,34 @@ def merge_para_with_text(para_block):
|
|
141
151
|
if span_type == ContentType.Text:
|
142
152
|
content = ocr_escape_special_markdown_char(span['content'])
|
143
153
|
elif span_type == ContentType.InlineEquation:
|
144
|
-
content = f"
|
154
|
+
content = f"${span['content']}$"
|
145
155
|
elif span_type == ContentType.InterlineEquation:
|
146
156
|
content = f"\n$$\n{span['content']}\n$$\n"
|
147
157
|
|
158
|
+
content = content.strip()
|
148
159
|
if content != '':
|
149
160
|
langs = ['zh', 'ja', 'ko']
|
150
161
|
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
para_text += content[:-1]
|
156
|
-
else:
|
157
|
-
para_text += content + ' '
|
162
|
+
if span_type in [ContentType.Text, ContentType.InterlineEquation]:
|
163
|
+
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
|
164
|
+
elif span_type == ContentType.InlineEquation:
|
165
|
+
para_text += f" {content} "
|
158
166
|
else:
|
159
|
-
|
167
|
+
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
168
|
+
# 如果是前一行带有-连字符,那么末尾不应该加空格
|
169
|
+
if __is_hyphen_at_line_end(content):
|
170
|
+
para_text += content[:-1]
|
171
|
+
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
|
172
|
+
para_text += content
|
173
|
+
else: # 西方文本语境下 content间需要空格分隔
|
174
|
+
para_text += f"{content} "
|
175
|
+
elif span_type == ContentType.InterlineEquation:
|
176
|
+
para_text += content
|
177
|
+
else:
|
178
|
+
continue
|
179
|
+
# 连写字符拆分
|
180
|
+
para_text = __replace_ligatures(para_text)
|
181
|
+
|
160
182
|
return para_text
|
161
183
|
|
162
184
|
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.9.0"
|
1
|
+
__version__ = "0.9.2"
|
@@ -38,15 +38,13 @@ except ImportError as e:
|
|
38
38
|
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
|
39
39
|
from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
|
40
40
|
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
|
41
|
-
|
41
|
+
from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
|
42
42
|
from magic_pdf.model.ppTableModel import ppTableModel
|
43
43
|
|
44
44
|
|
45
45
|
def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
|
46
46
|
if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
|
47
|
-
|
48
|
-
logger.error("StructEqTable is under upgrade, the current version does not support it.")
|
49
|
-
exit(1)
|
47
|
+
table_model = StructTableModel(model_path, max_time=max_time)
|
50
48
|
elif table_model_type == MODEL_NAME.TABLE_MASTER:
|
51
49
|
config = {
|
52
50
|
"model_dir": model_path,
|
@@ -284,8 +282,6 @@ class CustomPEKModel:
|
|
284
282
|
)
|
285
283
|
# 初始化ocr
|
286
284
|
if self.apply_ocr:
|
287
|
-
|
288
|
-
# self.ocr_model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=0.3)
|
289
285
|
self.ocr_model = atom_model_manager.get_atom_model(
|
290
286
|
atom_model_name=AtomicModel.OCR,
|
291
287
|
ocr_show_log=show_log,
|
@@ -303,17 +299,6 @@ class CustomPEKModel:
|
|
303
299
|
device=self.device
|
304
300
|
)
|
305
301
|
|
306
|
-
home_directory = Path.home()
|
307
|
-
det_source = os.path.join(models_dir, table_model_dir, DETECT_MODEL_DIR)
|
308
|
-
rec_source = os.path.join(models_dir, table_model_dir, REC_MODEL_DIR)
|
309
|
-
det_dest_dir = os.path.join(home_directory, PP_DET_DIRECTORY)
|
310
|
-
rec_dest_dir = os.path.join(home_directory, PP_REC_DIRECTORY)
|
311
|
-
|
312
|
-
if not os.path.exists(det_dest_dir):
|
313
|
-
shutil.copytree(det_source, det_dest_dir)
|
314
|
-
if not os.path.exists(rec_dest_dir):
|
315
|
-
shutil.copytree(rec_source, rec_dest_dir)
|
316
|
-
|
317
302
|
logger.info('DocAnalysis init done!')
|
318
303
|
|
319
304
|
def __call__(self, image):
|
@@ -393,7 +378,7 @@ class CustomPEKModel:
|
|
393
378
|
elif int(res['category_id']) in [5]:
|
394
379
|
table_res_list.append(res)
|
395
380
|
|
396
|
-
if torch.cuda.is_available():
|
381
|
+
if torch.cuda.is_available() and self.device != 'cpu':
|
397
382
|
properties = torch.cuda.get_device_properties(self.device)
|
398
383
|
total_memory = properties.total_memory / (1024 ** 3) # 将字节转换为 GB
|
399
384
|
if total_memory <= 10:
|
@@ -463,7 +448,9 @@ class CustomPEKModel:
|
|
463
448
|
html_code = None
|
464
449
|
if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
|
465
450
|
with torch.no_grad():
|
466
|
-
|
451
|
+
table_result = self.table_model.predict(new_image, "html")
|
452
|
+
if len(table_result) > 0:
|
453
|
+
html_code = table_result[0]
|
467
454
|
else:
|
468
455
|
html_code = self.table_model.img2html(new_image)
|
469
456
|
|
@@ -474,14 +461,17 @@ class CustomPEKModel:
|
|
474
461
|
# 判断是否返回正常
|
475
462
|
|
476
463
|
if latex_code:
|
477
|
-
expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
|
478
|
-
'end{table}')
|
464
|
+
expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
|
479
465
|
if expected_ending:
|
480
466
|
res["latex"] = latex_code
|
481
467
|
else:
|
482
468
|
logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
|
483
469
|
elif html_code:
|
484
|
-
|
470
|
+
expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
|
471
|
+
if expected_ending:
|
472
|
+
res["html"] = html_code
|
473
|
+
else:
|
474
|
+
logger.warning(f"table recognition processing fails, not found expected HTML table end")
|
485
475
|
else:
|
486
476
|
logger.warning(f"table recognition processing fails, not get latex or html return")
|
487
477
|
logger.info(f"table time: {round(time.time() - table_start, 2)}")
|
@@ -1,28 +1,45 @@
|
|
1
|
-
|
1
|
+
import re
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
except ImportError:
|
6
|
-
logger.error("StructEqTable is under upgrade, the current version does not support it.")
|
7
|
-
from pypandoc import convert_text
|
3
|
+
import torch
|
4
|
+
from struct_eqtable import build_model
|
8
5
|
|
9
6
|
|
10
7
|
class StructTableModel:
|
11
|
-
def __init__(self, model_path, max_new_tokens=
|
8
|
+
def __init__(self, model_path, max_new_tokens=1024, max_time=60):
|
12
9
|
# init
|
13
|
-
|
14
|
-
self.
|
15
|
-
|
16
|
-
|
17
|
-
|
10
|
+
assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
|
11
|
+
self.model = build_model(
|
12
|
+
model_ckpt=model_path,
|
13
|
+
max_new_tokens=max_new_tokens,
|
14
|
+
max_time=max_time,
|
15
|
+
lmdeploy=False,
|
16
|
+
flash_attn=False,
|
17
|
+
batch_size=1,
|
18
|
+
).cuda()
|
19
|
+
self.default_format = "html"
|
20
|
+
|
21
|
+
def predict(self, images, output_format=None, **kwargs):
|
22
|
+
|
23
|
+
if output_format is None:
|
24
|
+
output_format = self.default_format
|
18
25
|
else:
|
19
|
-
|
26
|
+
if output_format not in ['latex', 'markdown', 'html']:
|
27
|
+
raise ValueError(f"Output format {output_format} is not supported.")
|
28
|
+
|
29
|
+
results = self.model(
|
30
|
+
images, output_format=output_format
|
31
|
+
)
|
32
|
+
|
33
|
+
if output_format == "html":
|
34
|
+
results = [self.minify_html(html) for html in results]
|
20
35
|
|
21
|
-
|
22
|
-
table_latex = self.model.forward(image)
|
23
|
-
return table_latex
|
36
|
+
return results
|
24
37
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
-
|
38
|
+
def minify_html(self, html):
|
39
|
+
# 移除多余的空白字符
|
40
|
+
html = re.sub(r'\s+', ' ', html)
|
41
|
+
# 移除行尾的空白字符
|
42
|
+
html = re.sub(r'\s*>\s*', '>', html)
|
43
|
+
# 移除标签前的空白字符
|
44
|
+
html = re.sub(r'\s*<\s*', '<', html)
|
45
|
+
return html.strip()
|
magic_pdf/model/ppTableModel.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import cv2
|
1
2
|
from paddleocr.ppstructure.table.predict_table import TableSystem
|
2
3
|
from paddleocr.ppstructure.utility import init_args
|
3
4
|
from magic_pdf.libs.Constants import *
|
@@ -36,12 +37,13 @@ class ppTableModel(object):
|
|
36
37
|
- HTML (str): A string representing the HTML structure with content of the table.
|
37
38
|
"""
|
38
39
|
if isinstance(image, Image.Image):
|
39
|
-
image = np.
|
40
|
+
image = np.asarray(image)
|
41
|
+
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
|
40
42
|
pred_res, _ = self.table_sys(image)
|
41
43
|
pred_html = pred_res["html"]
|
42
|
-
res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
|
43
|
-
|
44
|
-
return
|
44
|
+
# res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
|
45
|
+
# "</table></body></html>","") + "</table></td>\n"
|
46
|
+
return pred_html
|
45
47
|
|
46
48
|
def parse_args(self, **kwargs):
|
47
49
|
parser = init_args()
|
magic_pdf/para/para_split_v3.py
CHANGED
@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
|
|
63
63
|
first_line = block['lines'][0]
|
64
64
|
line_height = first_line['bbox'][3] - first_line['bbox'][1]
|
65
65
|
block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
|
66
|
+
block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
|
66
67
|
|
67
68
|
left_close_num = 0
|
68
69
|
left_not_close_num = 0
|
69
70
|
right_not_close_num = 0
|
70
71
|
right_close_num = 0
|
71
72
|
lines_text_list = []
|
72
|
-
|
73
|
+
center_close_num = 0
|
74
|
+
external_sides_not_close_num = 0
|
73
75
|
multiple_para_flag = False
|
74
76
|
last_line = block['lines'][-1]
|
77
|
+
|
75
78
|
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
|
76
79
|
if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
|
77
80
|
# block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
|
@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):
|
|
82
85
|
|
83
86
|
for line in block['lines']:
|
84
87
|
|
88
|
+
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
89
|
+
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
90
|
+
if (
|
91
|
+
line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
|
92
|
+
block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
|
93
|
+
):
|
94
|
+
external_sides_not_close_num += 1
|
95
|
+
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
96
|
+
center_close_num += 1
|
97
|
+
|
85
98
|
line_text = ""
|
86
99
|
|
87
100
|
for span in line['spans']:
|
@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
|
|
103
116
|
right_close_num += 1
|
104
117
|
else:
|
105
118
|
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
|
106
|
-
closed_area = 0.
|
119
|
+
closed_area = 0.26 * block_weight
|
107
120
|
# closed_area = 5 * line_height
|
108
121
|
if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
|
109
122
|
right_not_close_num += 1
|
@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
|
|
132
145
|
line_num_flag = True
|
133
146
|
|
134
147
|
# 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
|
135
|
-
if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
|
148
|
+
if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
|
136
149
|
and line_num_flag
|
137
150
|
):
|
138
151
|
for line in block['lines']:
|
139
152
|
line[ListLineTag.IS_LIST_START_LINE] = True
|
140
153
|
return BlockType.Index
|
141
154
|
|
155
|
+
# 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
|
156
|
+
# 补充条件block的长宽比有要求
|
157
|
+
elif (
|
158
|
+
external_sides_not_close_num >= 2 and
|
159
|
+
center_close_num == len(block['lines']) and
|
160
|
+
external_sides_not_close_num / len(block['lines']) >= 0.5 and
|
161
|
+
block_height / block_weight > 0.4
|
162
|
+
):
|
163
|
+
for line in block['lines']:
|
164
|
+
line[ListLineTag.IS_LIST_START_LINE] = True
|
165
|
+
return BlockType.List
|
166
|
+
|
142
167
|
elif left_close_num >= 2 and (
|
143
168
|
right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
|
144
169
|
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
|
145
|
-
if left_close_num / len(block['lines']) > 0.
|
170
|
+
if left_close_num / len(block['lines']) > 0.8:
|
146
171
|
# 这种是每个item只有一行,且左边都贴边的短item list
|
147
172
|
if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
|
148
173
|
for line in block['lines']:
|
@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
|
|
154
179
|
if lines_text_list[i][-1] in LIST_END_FLAG:
|
155
180
|
line[ListLineTag.IS_LIST_END_LINE] = True
|
156
181
|
if i + 1 < len(block['lines']):
|
157
|
-
block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
|
182
|
+
block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
|
158
183
|
# line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
|
159
184
|
else:
|
160
185
|
line_start_flag = False
|
@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
|
|
162
187
|
if line_start_flag:
|
163
188
|
line[ListLineTag.IS_LIST_START_LINE] = True
|
164
189
|
line_start_flag = False
|
165
|
-
elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
|
190
|
+
# elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
|
191
|
+
if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
|
166
192
|
line[ListLineTag.IS_LIST_END_LINE] = True
|
167
193
|
line_start_flag = True
|
168
194
|
# 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.9.0
|
3
|
+
Version: 0.9.2
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -22,8 +22,9 @@ Provides-Extra: full
|
|
22
22
|
Requires-Dist: unimernet==0.2.1; extra == "full"
|
23
23
|
Requires-Dist: ultralytics; extra == "full"
|
24
24
|
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
25
|
-
Requires-Dist:
|
26
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
26
|
+
Requires-Dist: einops; extra == "full"
|
27
|
+
Requires-Dist: accelerate; extra == "full"
|
27
28
|
Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
|
28
29
|
Requires-Dist: detectron2; extra == "full"
|
29
30
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
@@ -54,8 +55,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
54
55
|
[](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
|
55
56
|
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
56
57
|
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
57
|
-
[](https://colab.research.google.com/gist/
|
58
|
-
|
58
|
+
[](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
|
59
59
|
[](https://arxiv.org/abs/2409.18839)
|
60
60
|
|
61
61
|
|
@@ -80,6 +80,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
80
80
|
</div>
|
81
81
|
|
82
82
|
# Changelog
|
83
|
+
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
|
83
84
|
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
|
84
85
|
- Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
|
85
86
|
- Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
|
@@ -175,13 +176,14 @@ There are three different ways to experience MinerU:
|
|
175
176
|
- [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
|
176
177
|
- [Linux/Windows + CUDA](#Using-GPU)
|
177
178
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
179
|
+
> [!WARNING]
|
180
|
+
> **Pre-installation Notice—Hardware and Software Environment Support**
|
181
|
+
>
|
182
|
+
> To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
|
183
|
+
>
|
184
|
+
> By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
|
185
|
+
>
|
186
|
+
> In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
|
185
187
|
|
186
188
|
<table>
|
187
189
|
<tr>
|
@@ -261,11 +263,13 @@ Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for de
|
|
261
263
|
After completing the [2. Download model weight files](#2-download-model-weight-files) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
|
262
264
|
You can find the `magic-pdf.json` file in your 【user directory】.
|
263
265
|
|
266
|
+
> [!TIP]
|
264
267
|
> The user directory for Windows is "C:\\Users\\username", for Linux it is "/home/username", and for macOS it is "/Users/username".
|
265
268
|
|
266
269
|
You can modify certain configurations in this file to enable or disable features, such as table recognition:
|
267
270
|
|
268
271
|
|
272
|
+
> [!NOTE]
|
269
273
|
> If the following items are not present in the JSON, please manually add the required items and remove the comment content (standard JSON does not support comments).
|
270
274
|
|
271
275
|
```json
|
@@ -294,13 +298,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
294
298
|
- [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
|
295
299
|
- [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
|
296
300
|
- Quick Deployment with Docker
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
301
|
+
> [!IMPORTANT]
|
302
|
+
> Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
|
303
|
+
>
|
304
|
+
> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
|
305
|
+
>
|
306
|
+
> ```bash
|
307
|
+
> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
|
308
|
+
> ```
|
304
309
|
```bash
|
305
310
|
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
|
306
311
|
docker build -t mineru:latest .
|
@@ -362,8 +367,8 @@ The results will be saved in the `{some_output_dir}` directory. The output file
|
|
362
367
|
├── some_pdf_spans.pdf # smallest granularity bbox position information diagram
|
363
368
|
└── some_pdf_content_list.json # Rich text JSON arranged in reading order
|
364
369
|
```
|
365
|
-
|
366
|
-
For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
370
|
+
> [!TIP]
|
371
|
+
> For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
367
372
|
|
368
373
|
### API
|
369
374
|
|
@@ -414,12 +419,12 @@ TODO
|
|
414
419
|
|
415
420
|
# TODO
|
416
421
|
|
417
|
-
-
|
418
|
-
-
|
419
|
-
-
|
420
|
-
-
|
421
|
-
-
|
422
|
-
-
|
422
|
+
- [x] Reading order based on the model
|
423
|
+
- [x] Recognition of `index` and `list` in the main text
|
424
|
+
- [x] Table recognition
|
425
|
+
- [ ] Code block recognition in the main text
|
426
|
+
- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
|
427
|
+
- [ ] Geometric shape recognition
|
423
428
|
|
424
429
|
# Known Issues
|
425
430
|
|
@@ -10,20 +10,20 @@ magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zk
|
|
10
10
|
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
|
12
12
|
magic_pdf/data/read_api.py,sha256=3fKLsEYAow5RwAmGFMMgvcCh0-_WEEHem2uewukjXOA,3570
|
13
|
-
magic_pdf/data/schemas.py,sha256=
|
13
|
+
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
14
14
|
magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
|
15
15
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
16
16
|
magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
|
17
17
|
magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
|
18
|
-
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=
|
19
|
-
magic_pdf/data/data_reader_writer/s3.py,sha256=
|
20
|
-
magic_pdf/data/io/__init__.py,sha256=
|
21
|
-
magic_pdf/data/io/base.py,sha256=
|
18
|
+
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
|
19
|
+
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
20
|
+
magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
|
21
|
+
magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
|
22
22
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
23
23
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
24
24
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
26
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
26
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=ClxKUwrK7wlXKCcDfuTryztKl5e8pzcnh5x_fODFm2U,12928
|
27
27
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
29
29
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -65,14 +65,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
65
65
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
66
66
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
67
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
68
|
-
magic_pdf/libs/version.py,sha256=
|
68
|
+
magic_pdf/libs/version.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
|
69
69
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
70
70
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
71
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
|
72
72
|
magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
|
73
73
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
75
|
-
magic_pdf/model/ppTableModel.py,sha256=
|
74
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=9pdtcQgwn-XMvyQ7yMfzqKgjPfxEuNXR7juCPx-OM-M,20929
|
75
|
+
magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
|
76
76
|
magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
|
77
77
|
magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
78
78
|
magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
|
@@ -97,7 +97,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
|
|
97
97
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
98
98
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
99
99
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
100
|
-
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=
|
100
|
+
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qQthlYDvDPah1mzzrnKXU4fYqlJdXOPBnJ8tYf-o_0k,1384
|
101
101
|
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
102
102
|
magic_pdf/model/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
103
103
|
magic_pdf/model/v3/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
|
@@ -112,7 +112,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
|
|
112
112
|
magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
|
113
113
|
magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
|
114
114
|
magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
|
115
|
-
magic_pdf/para/para_split_v3.py,sha256=
|
115
|
+
magic_pdf/para/para_split_v3.py,sha256=k02I9Rdc8jfYr3bMT_Gm38b5ginkl-ZIU5C_XcfAcs8,14704
|
116
116
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
117
117
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
118
118
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
@@ -170,9 +170,9 @@ magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,41
|
|
170
170
|
magic_pdf/tools/common.py,sha256=2S8N60pcA6bFqAmdchoEmn22l9ntQxEfyaKpxfCKJ-Y,5465
|
171
171
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
172
172
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
173
|
-
magic_pdf-0.9.
|
174
|
-
magic_pdf-0.9.
|
175
|
-
magic_pdf-0.9.
|
176
|
-
magic_pdf-0.9.
|
177
|
-
magic_pdf-0.9.
|
178
|
-
magic_pdf-0.9.
|
173
|
+
magic_pdf-0.9.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
174
|
+
magic_pdf-0.9.2.dist-info/METADATA,sha256=CxyxzxwoOTK3GfaQCGAR8lcjQR3fK4teYf0pXLVDiNQ,39654
|
175
|
+
magic_pdf-0.9.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
176
|
+
magic_pdf-0.9.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
177
|
+
magic_pdf-0.9.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
178
|
+
magic_pdf-0.9.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|