magic-pdf 0.7.0b1__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +4 -0
- magic_pdf/libs/Constants.py +27 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/magic_model.py +3 -0
- magic_pdf/model/pdf_extract_kit.py +35 -12
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
- magic_pdf/model/ppTableModel.py +67 -0
- magic_pdf/para/para_split_v2.py +50 -47
- magic_pdf/resources/model_config/model_configs.yaml +3 -1
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/METADATA +4 -8
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/RECORD +15 -14
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/top_level.txt +0 -0
@@ -132,6 +132,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
132
132
|
# if processed by table model
|
133
133
|
if span.get('latex', ''):
|
134
134
|
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
|
135
|
+
elif span.get('html', ''):
|
136
|
+
para_text += f"\n\n{span['html']}\n\n"
|
135
137
|
else:
|
136
138
|
para_text += f"\n}) \n"
|
137
139
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
@@ -256,6 +258,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
|
256
258
|
if block['type'] == BlockType.TableBody:
|
257
259
|
if block["lines"][0]["spans"][0].get('latex', ''):
|
258
260
|
para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
|
261
|
+
elif block["lines"][0]["spans"][0].get('html', ''):
|
262
|
+
para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
|
259
263
|
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
|
260
264
|
if block['type'] == BlockType.TableCaption:
|
261
265
|
para_content['table_caption'] = merge_para_with_text(block)
|
magic_pdf/libs/Constants.py
CHANGED
@@ -10,5 +10,31 @@ block维度自定义字段
|
|
10
10
|
# block中lines是否被删除
|
11
11
|
LINES_DELETED = "lines_deleted"
|
12
12
|
|
13
|
+
# struct eqtable
|
14
|
+
STRUCT_EQTABLE = "struct_eqtable"
|
15
|
+
|
13
16
|
# table recognition max time default value
|
14
|
-
TABLE_MAX_TIME_VALUE = 400
|
17
|
+
TABLE_MAX_TIME_VALUE = 400
|
18
|
+
|
19
|
+
# pp_table_result_max_length
|
20
|
+
TABLE_MAX_LEN = 480
|
21
|
+
|
22
|
+
# pp table structure algorithm
|
23
|
+
TABLE_MASTER = "TableMaster"
|
24
|
+
|
25
|
+
# table master structure dict
|
26
|
+
TABLE_MASTER_DICT = "table_master_structure_dict.txt"
|
27
|
+
|
28
|
+
# table master dir
|
29
|
+
TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
|
30
|
+
|
31
|
+
# pp detect model dir
|
32
|
+
DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
|
33
|
+
|
34
|
+
# pp rec model dir
|
35
|
+
REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
|
36
|
+
|
37
|
+
# pp rec char dict path
|
38
|
+
REC_CHAR_DICT = "ppocr_keys_v1.txt"
|
39
|
+
|
40
|
+
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.7.
|
1
|
+
__version__ = "0.7.1"
|
magic_pdf/model/magic_model.py
CHANGED
@@ -562,8 +562,11 @@ class MagicModel:
|
|
562
562
|
elif category_id == 5:
|
563
563
|
# 获取table模型结果
|
564
564
|
latex = layout_det.get("latex", None)
|
565
|
+
html = layout_det.get("html", None)
|
565
566
|
if latex:
|
566
567
|
span["latex"] = latex
|
568
|
+
elif html:
|
569
|
+
span["html"] = html
|
567
570
|
span["type"] = ContentType.Table
|
568
571
|
elif category_id == 13:
|
569
572
|
span["content"] = layout_det["latex"]
|
@@ -2,7 +2,7 @@ from loguru import logger
|
|
2
2
|
import os
|
3
3
|
import time
|
4
4
|
|
5
|
-
from magic_pdf.libs.Constants import
|
5
|
+
from magic_pdf.libs.Constants import *
|
6
6
|
|
7
7
|
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
8
8
|
try:
|
@@ -34,10 +34,18 @@ from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Pre
|
|
34
34
|
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
|
35
35
|
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
|
36
36
|
from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
from magic_pdf.model.ppTableModel import ppTableModel
|
38
|
+
|
39
|
+
|
40
|
+
def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
    """
    Build the table recognition model selected by ``table_model_type``.

    Parameters:
    - table_model_type: Model selector; STRUCT_EQTABLE picks StructTableModel,
      any other value picks ppTableModel.
    - model_path: Directory holding the weights of the selected model.
    - max_time: Time limit forwarded to StructTableModel only; it is not
      passed to ppTableModel.
    - _device_: Device name handed to the model (default 'cpu').

    Return:
    - The constructed StructTableModel or ppTableModel instance.
    """
    if table_model_type == STRUCT_EQTABLE:
        return StructTableModel(model_path, max_time=max_time, device=_device_)

    # Every other model type falls back to the Paddle table model.
    pp_config = dict(model_dir=model_path, device=_device_)
    return ppTableModel(pp_config)
|
42
50
|
|
43
51
|
|
@@ -104,9 +112,11 @@ class CustomPEKModel:
|
|
104
112
|
# 初始化解析配置
|
105
113
|
self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
|
106
114
|
self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
|
115
|
+
# table config
|
107
116
|
self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
|
108
117
|
self.apply_table = self.table_config.get("is_table_recog_enable", False)
|
109
118
|
self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
|
119
|
+
self.table_model_type = self.table_config.get("model", TABLE_MASTER)
|
110
120
|
self.apply_ocr = ocr
|
111
121
|
logger.info(
|
112
122
|
"DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
|
@@ -141,10 +151,11 @@ class CustomPEKModel:
|
|
141
151
|
if self.apply_ocr:
|
142
152
|
self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
|
143
153
|
|
144
|
-
# init
|
154
|
+
# init table model
|
145
155
|
if self.apply_table:
|
146
|
-
|
147
|
-
|
156
|
+
table_model_dir = self.configs["weights"][self.table_model_type]
|
157
|
+
self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
|
158
|
+
max_time=self.table_max_time, _device_=self.device)
|
148
159
|
logger.info('DocAnalysis init done!')
|
149
160
|
|
150
161
|
def __call__(self, image):
|
@@ -278,16 +289,28 @@ class CustomPEKModel:
|
|
278
289
|
new_image, _ = crop_img(res, pil_img)
|
279
290
|
single_table_start_time = time.time()
|
280
291
|
logger.info("------------------table recognition processing begins-----------------")
|
292
|
+
latex_code = None
|
293
|
+
html_code = None
|
281
294
|
with torch.no_grad():
|
282
|
-
|
295
|
+
if self.table_model_type == STRUCT_EQTABLE:
|
296
|
+
latex_code = self.table_model.image2latex(new_image)[0]
|
297
|
+
else:
|
298
|
+
html_code = self.table_model.img2html(new_image)
|
283
299
|
run_time = time.time() - single_table_start_time
|
284
300
|
logger.info(f"------------table recognition processing ends within {run_time}s-----")
|
285
301
|
if run_time > self.table_max_time:
|
286
302
|
logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
|
287
303
|
# 判断是否返回正常
|
288
|
-
|
289
|
-
if latex_code
|
290
|
-
|
304
|
+
|
305
|
+
if latex_code:
|
306
|
+
expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
|
307
|
+
'end{table}')
|
308
|
+
if expected_ending:
|
309
|
+
res["latex"] = latex_code
|
310
|
+
else:
|
311
|
+
logger.warning(f"------------table recognition processing fails----------")
|
312
|
+
elif html_code:
|
313
|
+
res["html"] = html_code
|
291
314
|
else:
|
292
315
|
logger.warning(f"------------table recognition processing fails----------")
|
293
316
|
table_cost = round(time.time() - table_start, 2)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from paddleocr.ppstructure.table.predict_table import TableSystem
|
2
|
+
from paddleocr.ppstructure.utility import init_args
|
3
|
+
from magic_pdf.libs.Constants import *
|
4
|
+
import os
|
5
|
+
from PIL import Image
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
|
9
|
+
class ppTableModel(object):
    """
    Convert an image of a table into an HTML string using a TableSystem predictor.

    Attributes:
    - table_sys: An instance of TableSystem initialized with the arguments
      produced by parse_args.

    Methods:
    - __init__(config): Initializes the model with configuration parameters.
    - img2html(image): Converts a PIL Image or NumPy array to an HTML string.
    - parse_args(**kwargs): Turns configuration values into TableSystem arguments.
    """

    def __init__(self, config):
        """
        Parameters:
        - config (dict): Configuration dictionary containing "model_dir" and,
          optionally, "device" and "table_max_len".
        """
        args = self.parse_args(**config)
        self.table_sys = TableSystem(args)

    @staticmethod
    def _device_uses_gpu(device):
        """
        Report whether ``device`` names a CUDA device.

        A prefix check is used so that indexed device strings such as
        "cuda:0" enable the GPU as well; an exact comparison with "cuda"
        would silently fall back to CPU for them.

        Parameters:
        - device: Device name, e.g. "cpu", "cuda" or "cuda:0".

        Return:
        - bool: True when the device name starts with "cuda".
        """
        return str(device).startswith("cuda")

    def img2html(self, image):
        """
        Parameters:
        - image (PIL.Image or np.ndarray): The image of the table to be converted.

        Return:
        - HTML (str): The predicted table markup wrapped as
          '<td><table border="1">...</table></td>' followed by a newline.
        """
        if isinstance(image, Image.Image):
            image = np.array(image)
        pred_res, _ = self.table_sys(image)
        pred_html = pred_res["html"]
        # Drop the outer document wrapper (if present) so that only the table
        # rows remain, then re-wrap them in a bordered table inside a <td>.
        table_rows = pred_html.replace("<html><body><table>", "").replace("</table></body></html>", "")
        return '<td><table border="1">' + table_rows + "</table></td>\n"

    def parse_args(self, **kwargs):
        """
        Build the TableSystem arguments from configuration values.

        Parameters:
        - model_dir: Root directory containing the table structure model, the
          detection and recognition models and their dictionary files.
        - device: Device name (default "cpu"); any name starting with "cuda"
          enables the GPU.
        - table_max_len: Optional override for TABLE_MAX_LEN.

        Return:
        - The result of parser.parse_args([]) with the defaults below applied.
        """
        parser = init_args()
        model_dir = kwargs.get("model_dir")
        config = {
            "use_gpu": self._device_uses_gpu(kwargs.get("device", "cpu")),
            "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
            "table_algorithm": TABLE_MASTER,
            "table_model_dir": os.path.join(model_dir, TABLE_MASTER_DIR),
            "table_char_dict_path": os.path.join(model_dir, TABLE_MASTER_DICT),
            "det_model_dir": os.path.join(model_dir, DETECT_MODEL_DIR),
            "rec_model_dir": os.path.join(model_dir, REC_MODEL_DIR),
            "rec_char_dict_path": os.path.join(model_dir, REC_CHAR_DICT),
        }
        # Install the values as parser defaults and parse an empty argv so
        # that the process's real command line is never consulted.
        parser.set_defaults(**config)
        return parser.parse_args([])
|
magic_pdf/para/para_split_v2.py
CHANGED
@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
|
|
100
100
|
|
101
101
|
if lang != 'en':
|
102
102
|
return lines, None
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
103
|
+
|
104
|
+
total_lines = len(lines)
|
105
|
+
line_fea_encode = []
|
106
|
+
"""
|
107
|
+
对每一行进行特征编码,编码规则如下:
|
108
|
+
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
|
109
|
+
2. 如果顶格,其他非大写开头编码为4
|
110
|
+
3. 如果非顶格,首字符大写,编码为2
|
111
|
+
4. 如果非顶格,首字符非大写编码为3
|
112
|
+
"""
|
113
|
+
if len(lines) > 0:
|
114
|
+
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
|
115
|
+
for l in lines:
|
116
|
+
span_text = __get_span_text(l['spans'][0])
|
117
|
+
if not span_text:
|
118
|
+
line_fea_encode.append(0)
|
119
|
+
continue
|
120
|
+
first_char = span_text[0]
|
121
|
+
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
|
122
|
+
if not layout:
|
123
|
+
line_fea_encode.append(0)
|
124
|
+
else:
|
125
|
+
#
|
126
|
+
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
|
127
|
+
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
|
128
|
+
if not first_char.isalnum() or if_match_reference_list(span_text):
|
129
|
+
line_fea_encode.append(1)
|
130
|
+
else:
|
131
|
+
line_fea_encode.append(4)
|
121
132
|
else:
|
122
|
-
|
123
|
-
|
124
|
-
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
|
125
|
-
if not first_char.isalnum() or if_match_reference_list(span_text):
|
126
|
-
line_fea_encode.append(1)
|
127
|
-
else:
|
128
|
-
line_fea_encode.append(4)
|
133
|
+
if first_char.isupper():
|
134
|
+
line_fea_encode.append(2)
|
129
135
|
else:
|
130
|
-
|
131
|
-
line_fea_encode.append(2)
|
132
|
-
else:
|
133
|
-
line_fea_encode.append(3)
|
136
|
+
line_fea_encode.append(3)
|
134
137
|
|
135
|
-
|
138
|
+
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
|
136
139
|
|
137
|
-
|
138
|
-
|
140
|
+
list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
|
141
|
+
if len(list_indice) > 0:
|
142
|
+
if debug_able:
|
143
|
+
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
|
144
|
+
|
145
|
+
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
|
146
|
+
segments = []
|
147
|
+
for start, end in list_indice:
|
148
|
+
for i in range(start, end + 1):
|
149
|
+
if i > 0:
|
150
|
+
if line_fea_encode[i] == 4:
|
151
|
+
if debug_able:
|
152
|
+
logger.info(f"列表行的第{i}行不是顶格的")
|
153
|
+
break
|
154
|
+
else:
|
139
155
|
if debug_able:
|
140
|
-
logger.info(f"
|
141
|
-
|
142
|
-
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
|
143
|
-
segments = []
|
144
|
-
for start, end in list_indice:
|
145
|
-
for i in range(start, end + 1):
|
146
|
-
if i > 0:
|
147
|
-
if line_fea_encode[i] == 4:
|
148
|
-
if debug_able:
|
149
|
-
logger.info(f"列表行的第{i}行不是顶格的")
|
150
|
-
break
|
151
|
-
else:
|
152
|
-
if debug_able:
|
153
|
-
logger.info(f"列表行的第{start}到第{end}行是列表")
|
156
|
+
logger.info(f"列表行的第{start}到第{end}行是列表")
|
154
157
|
|
155
|
-
|
158
|
+
return split_indices(total_lines, list_indice), list_start_idx
|
156
159
|
|
157
160
|
|
158
161
|
def cluster_line_x(lines: list) -> dict:
|
@@ -3,6 +3,7 @@ config:
|
|
3
3
|
layout: True
|
4
4
|
formula: True
|
5
5
|
table_config:
|
6
|
+
model: TableMaster
|
6
7
|
is_table_recog_enable: False
|
7
8
|
max_time: 400
|
8
9
|
|
@@ -10,4 +11,5 @@ weights:
|
|
10
11
|
layout: Layout/model_final.pth
|
11
12
|
mfd: MFD/weights.pt
|
12
13
|
mfr: MFR/UniMERNet
|
13
|
-
|
14
|
+
struct_eqtable: TabRec/StructEqTable
|
15
|
+
TableMaster: TabRec/TableMaster
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.7.
|
3
|
+
Version: 0.7.1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -64,6 +64,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
64
64
|
</div>
|
65
65
|
|
66
66
|
# Changelog
|
67
|
+
- 2024/08/30: Version 0.7.1 released, added Paddle TableMaster table recognition option
|
67
68
|
- 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
|
68
69
|
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
|
69
70
|
- 2024/07/05: Initial open-source release
|
@@ -205,7 +206,7 @@ In non-mainline environments, due to the diversity of hardware and software conf
|
|
205
206
|
```bash
|
206
207
|
conda create -n MinerU python=3.10
|
207
208
|
conda activate MinerU
|
208
|
-
pip install magic-pdf[full]
|
209
|
+
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
|
209
210
|
```
|
210
211
|
#### 2. Download model weight files
|
211
212
|
|
@@ -234,6 +235,7 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
|
|
234
235
|
// other config
|
235
236
|
"models-dir": "D:/models",
|
236
237
|
"table-config": {
|
238
|
+
"model": "TableMaster", // Another option of this value is 'struct_eqtable'
|
237
239
|
"is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
|
238
240
|
"max_time": 400
|
239
241
|
}
|
@@ -345,13 +347,7 @@ TODO
|
|
345
347
|
- Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
|
346
348
|
- Enabling OCR may produce better results in PDFs with a high density of formulas
|
347
349
|
- If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
|
348
|
-
- **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
|
349
350
|
|
350
|
-
| Table Size | Parsing Time |
|
351
|
-
|---------------|----------------------------|
|
352
|
-
| 6\*5 55kb | 37s |
|
353
|
-
| 16\*12 284kb | 3m18s |
|
354
|
-
| 44\*7 559kb | 4m12s |
|
355
351
|
|
356
352
|
# FAQ
|
357
353
|
[FAQ in Chinese](docs/FAQ_zh_cn.md)
|
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zx
|
|
5
5
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
6
6
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
8
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
8
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=jg_v2Bj62xBObg0LDayvqUVX_O9DrIBli5Z9_i7Qduw,16479
|
9
9
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
11
11
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -15,7 +15,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
|
|
15
15
|
magic_pdf/layout/layout_sort.py,sha256=ovqRX1xcRA7E7s8VvsI7ZNbaNSElJe07bApCh5hxwIE,33533
|
16
16
|
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
17
17
|
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
18
|
-
magic_pdf/libs/Constants.py,sha256=
|
18
|
+
magic_pdf/libs/Constants.py,sha256=rdJVadmgN0UlIB-xcMQ9j7Qk9q1Qahxt3KEY-vL7hSU,774
|
19
19
|
magic_pdf/libs/MakeContentConfig.py,sha256=UDZPpsv8q4DqTy8h0vRtrT2kHqWiVI205VnVhlUEQc0,206
|
20
20
|
magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
|
21
21
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -41,13 +41,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
41
41
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
42
42
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
43
43
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
44
|
-
magic_pdf/libs/version.py,sha256=
|
44
|
+
magic_pdf/libs/version.py,sha256=2KJZDSMOG7KS82AxYOrZ4ZihYxX0wjfUjDsIZh3L024,22
|
45
45
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
46
46
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
47
47
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=SoT21VHy6ICXoRfC9V3XS6BMiX8EZI6zaqSNgoE17oo,4347
|
48
|
-
magic_pdf/model/magic_model.py,sha256=
|
48
|
+
magic_pdf/model/magic_model.py,sha256=3eAfmglKFkmIVPoz3TG8xAzkNK2g_VLI5rRMQAb_cK4,25544
|
49
49
|
magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
|
50
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
50
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=WO54IoxX8XYXLGrjPts--84qRO1FQZm9f_yVyfpPi0s,14539
|
51
|
+
magic_pdf/model/ppTableModel.py,sha256=wWiui9VOjkKYlNX-viPqsWpzgkNJ-9_S2Se-j4oyLqU,2687
|
51
52
|
magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
|
52
53
|
magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
54
|
magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
|
@@ -72,7 +73,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
|
|
72
73
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
73
74
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
74
75
|
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
75
|
-
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=
|
76
|
+
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=C9WluPhwaqsFg154WsNxN2HlhFXVkAAw0prR7t8r5J4,918
|
76
77
|
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
78
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
78
79
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
@@ -84,7 +85,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
|
|
84
85
|
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
85
86
|
magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
|
86
87
|
magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
|
87
|
-
magic_pdf/para/para_split_v2.py,sha256=
|
88
|
+
magic_pdf/para/para_split_v2.py,sha256=jJnn8numhxVgojGwKGCqBNIIYn2AYsucO-q-eQgsPb4,36911
|
88
89
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
89
90
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
90
91
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
@@ -127,7 +128,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
|
|
127
128
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
128
129
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
129
130
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
130
|
-
magic_pdf/resources/model_config/model_configs.yaml,sha256=
|
131
|
+
magic_pdf/resources/model_config/model_configs.yaml,sha256=_gOSxK9jxe1bFwtH_uwovsyZnRi1sEVNYb1OAexDmF4,301
|
131
132
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
|
132
133
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
133
134
|
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
@@ -140,9 +141,9 @@ magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
140
141
|
magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
|
141
142
|
magic_pdf/tools/cli_dev.py,sha256=w-J4OixDzHjknnUuRW44PXsUlUqyiD4nPbBSSk9WkXM,4160
|
142
143
|
magic_pdf/tools/common.py,sha256=XoSs19DD-4ubbjrDFQer83T9O6O_MmgEO61NbjlP_2M,3939
|
143
|
-
magic_pdf-0.7.
|
144
|
-
magic_pdf-0.7.
|
145
|
-
magic_pdf-0.7.
|
146
|
-
magic_pdf-0.7.
|
147
|
-
magic_pdf-0.7.
|
148
|
-
magic_pdf-0.7.
|
144
|
+
magic_pdf-0.7.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
145
|
+
magic_pdf-0.7.1.dist-info/METADATA,sha256=SD5oVg3vUEuFg7IyAbwncQ_mtgXljhKiJCOwRCTSOVo,18232
|
146
|
+
magic_pdf-0.7.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
147
|
+
magic_pdf-0.7.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
148
|
+
magic_pdf-0.7.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
149
|
+
magic_pdf-0.7.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|