magic-pdf 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/utils.py +4 -4
- magic_pdf/dict2md/ocr_mkcontent.py +36 -22
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +14 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +5 -2
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +10 -7
- magic_pdf/model/sub_modules/model_utils.py +4 -4
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +2 -1
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py +810 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +18 -5
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +68 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +18383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +8 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.12.dist-info}/METADATA +22 -11
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.12.dist-info}/RECORD +19 -17
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.12.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.12.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.12.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.10.dist-info → magic_pdf-1.3.12.dist-info}/top_level.txt +0 -0
magic_pdf/data/utils.py
CHANGED
@@ -10,22 +10,22 @@ from loguru import logger
|
|
10
10
|
|
11
11
|
|
12
12
|
|
13
|
-
def fitz_doc_to_image(
|
13
|
+
def fitz_doc_to_image(page, dpi=200) -> dict:
|
14
14
|
"""Convert fitz.Document to image, Then convert the image to numpy array.
|
15
15
|
|
16
16
|
Args:
|
17
|
-
|
17
|
+
page (_type_): pymudoc page
|
18
18
|
dpi (int, optional): reset the dpi of dpi. Defaults to 200.
|
19
19
|
|
20
20
|
Returns:
|
21
21
|
dict: {'img': numpy array, 'width': width, 'height': height }
|
22
22
|
"""
|
23
23
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
24
|
-
pm =
|
24
|
+
pm = page.get_pixmap(matrix=mat, alpha=False)
|
25
25
|
|
26
26
|
# If the width or height exceeds 4500 after scaling, do not scale further.
|
27
27
|
if pm.width > 4500 or pm.height > 4500:
|
28
|
-
pm =
|
28
|
+
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
29
29
|
|
30
30
|
# Convert pixmap samples directly to numpy array
|
31
31
|
img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
|
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
70
70
|
if mode == 'nlp':
|
71
71
|
continue
|
72
72
|
elif mode == 'mm':
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
73
|
+
# 检测是否存在图片脚注
|
74
|
+
has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
|
75
|
+
# 如果存在图片脚注,则将图片脚注拼接到图片正文后面
|
76
|
+
if has_image_footnote:
|
77
|
+
for block in para_block['blocks']: # 1st.拼image_caption
|
78
|
+
if block['type'] == BlockType.ImageCaption:
|
79
|
+
para_text += merge_para_with_text(block) + ' \n'
|
80
|
+
for block in para_block['blocks']: # 2nd.拼image_body
|
81
|
+
if block['type'] == BlockType.ImageBody:
|
82
|
+
for line in block['lines']:
|
83
|
+
for span in line['spans']:
|
84
|
+
if span['type'] == ContentType.Image:
|
85
|
+
if span.get('image_path', ''):
|
86
|
+
para_text += f""
|
87
|
+
for block in para_block['blocks']: # 3rd.拼image_footnote
|
88
|
+
if block['type'] == BlockType.ImageFootnote:
|
89
|
+
para_text += ' \n' + merge_para_with_text(block)
|
90
|
+
else:
|
91
|
+
for block in para_block['blocks']: # 1st.拼image_body
|
92
|
+
if block['type'] == BlockType.ImageBody:
|
93
|
+
for line in block['lines']:
|
94
|
+
for span in line['spans']:
|
95
|
+
if span['type'] == ContentType.Image:
|
96
|
+
if span.get('image_path', ''):
|
97
|
+
para_text += f""
|
98
|
+
for block in para_block['blocks']: # 2nd.拼image_caption
|
99
|
+
if block['type'] == BlockType.ImageCaption:
|
100
|
+
para_text += ' \n' + merge_para_with_text(block)
|
86
101
|
elif para_type == BlockType.Table:
|
87
102
|
if mode == 'nlp':
|
88
103
|
continue
|
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
|
|
96
111
|
for span in line['spans']:
|
97
112
|
if span['type'] == ContentType.Table:
|
98
113
|
# if processed by table model
|
99
|
-
if span.get('
|
100
|
-
para_text += f"\n
|
101
|
-
elif span.get('html', ''):
|
102
|
-
para_text += f"\n\n{span['html']}\n\n"
|
114
|
+
if span.get('html', ''):
|
115
|
+
para_text += f"\n{span['html']}\n"
|
103
116
|
elif span.get('image_path', ''):
|
104
|
-
para_text += f"
|
117
|
+
para_text += f""
|
105
118
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
106
119
|
if block['type'] == BlockType.TableFootnote:
|
107
|
-
para_text += merge_para_with_text(block) + '
|
120
|
+
para_text += '\n' + merge_para_with_text(block) + ' '
|
108
121
|
|
109
122
|
if para_text.strip() == '':
|
110
123
|
continue
|
111
124
|
else:
|
112
|
-
page_markdown.append(para_text.strip() + ' ')
|
125
|
+
# page_markdown.append(para_text.strip() + ' ')
|
126
|
+
page_markdown.append(para_text.strip())
|
113
127
|
|
114
128
|
return page_markdown
|
115
129
|
|
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
|
|
257
271
|
if span['type'] == ContentType.Table:
|
258
272
|
|
259
273
|
if span.get('latex', ''):
|
260
|
-
para_content['table_body'] = f"
|
274
|
+
para_content['table_body'] = f"{span['latex']}"
|
261
275
|
elif span.get('html', ''):
|
262
|
-
para_content['table_body'] = f"
|
276
|
+
para_content['table_body'] = f"{span['html']}"
|
263
277
|
|
264
278
|
if span.get('image_path', ''):
|
265
279
|
para_content['img_path'] = join_path(img_buket_path, span['image_path'])
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.3.
|
1
|
+
__version__ = "1.3.12"
|
magic_pdf/model/batch_analyze.py
CHANGED
@@ -6,7 +6,7 @@ from tqdm import tqdm
|
|
6
6
|
from magic_pdf.config.constants import MODEL_NAME
|
7
7
|
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
8
8
|
from magic_pdf.model.sub_modules.model_utils import (
|
9
|
-
clean_vram, crop_img, get_res_list_from_layout_res)
|
9
|
+
clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area)
|
10
10
|
from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
|
11
11
|
get_adjusted_mfdetrec_res, get_ocr_result_list)
|
12
12
|
|
@@ -148,6 +148,19 @@ class BatchAnalyze:
|
|
148
148
|
# Integration results
|
149
149
|
if ocr_res:
|
150
150
|
ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
|
151
|
+
|
152
|
+
if res["category_id"] == 3:
|
153
|
+
# ocr_result_list中所有bbox的面积之和
|
154
|
+
ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
|
155
|
+
# 求ocr_res_area和res的面积的比值
|
156
|
+
res_area = get_coords_and_area(res)[4]
|
157
|
+
if res_area > 0:
|
158
|
+
ratio = ocr_res_area / res_area
|
159
|
+
if ratio > 0.25:
|
160
|
+
res["category_id"] = 1
|
161
|
+
else:
|
162
|
+
continue
|
163
|
+
|
151
164
|
ocr_res_list_dict['layout_res'].extend(ocr_result_list)
|
152
165
|
|
153
166
|
# det_count += len(ocr_res_list_dict['ocr_res_list'])
|
@@ -156,7 +156,10 @@ def doc_analyze(
|
|
156
156
|
batch_images = [images_with_extra_info]
|
157
157
|
|
158
158
|
results = []
|
159
|
-
|
159
|
+
processed_images_count = 0
|
160
|
+
for index, batch_image in enumerate(batch_images):
|
161
|
+
processed_images_count += len(batch_image)
|
162
|
+
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
|
160
163
|
result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
|
161
164
|
results.extend(result)
|
162
165
|
|
@@ -186,7 +189,7 @@ def batch_doc_analyze(
|
|
186
189
|
formula_enable=None,
|
187
190
|
table_enable=None,
|
188
191
|
):
|
189
|
-
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE',
|
192
|
+
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
|
190
193
|
batch_size = MIN_BATCH_INFERENCE_SIZE
|
191
194
|
page_wh_list = []
|
192
195
|
|
@@ -66,9 +66,9 @@ LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
|
|
66
66
|
|
67
67
|
def fix_latex_left_right(s):
|
68
68
|
"""
|
69
|
-
修复LaTeX
|
69
|
+
修复LaTeX中的\\left和\\right命令
|
70
70
|
1. 确保它们后面跟有效分隔符
|
71
|
-
2.
|
71
|
+
2. 平衡\\left和\\right的数量
|
72
72
|
"""
|
73
73
|
# 白名单分隔符
|
74
74
|
valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
|
@@ -106,7 +106,7 @@ def fix_latex_left_right(s):
|
|
106
106
|
|
107
107
|
def fix_left_right_pairs(latex_formula):
|
108
108
|
"""
|
109
|
-
检测并修复LaTeX
|
109
|
+
检测并修复LaTeX公式中\\left和\\right不在同一组的情况
|
110
110
|
|
111
111
|
Args:
|
112
112
|
latex_formula (str): 输入的LaTeX公式
|
@@ -308,9 +308,9 @@ ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') fo
|
|
308
308
|
|
309
309
|
def fix_latex_environments(s):
|
310
310
|
"""
|
311
|
-
检测LaTeX中环境(如array
|
312
|
-
1.
|
313
|
-
2.
|
311
|
+
检测LaTeX中环境(如array)的\\begin和\\end是否匹配
|
312
|
+
1. 如果缺少\\begin标签则在开头添加
|
313
|
+
2. 如果缺少\\end标签则在末尾添加
|
314
314
|
"""
|
315
315
|
for env in ENV_TYPES:
|
316
316
|
begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
|
@@ -334,7 +334,7 @@ def fix_latex_environments(s):
|
|
334
334
|
|
335
335
|
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
|
336
336
|
COMMANDS_TO_REMOVE_PATTERN = re.compile(
|
337
|
-
r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph)')
|
337
|
+
r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
|
338
338
|
REPLACEMENTS_PATTERNS = {
|
339
339
|
re.compile(r'\\underbar'): r'\\underline',
|
340
340
|
re.compile(r'\\Bar'): r'\\hat',
|
@@ -346,6 +346,9 @@ REPLACEMENTS_PATTERNS = {
|
|
346
346
|
re.compile(r'\\textunderscore'): r'\\_',
|
347
347
|
re.compile(r'\\fint'): r'⨏',
|
348
348
|
re.compile(r'\\up '): r'\\ ',
|
349
|
+
re.compile(r'\\vline = '): r'\\models ',
|
350
|
+
re.compile(r'\\vDash '): r'\\models ',
|
351
|
+
re.compile(r'\\sq \\sqcup '): r'\\square ',
|
349
352
|
}
|
350
353
|
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
|
351
354
|
|
@@ -31,10 +31,10 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
|
|
31
31
|
return return_image, return_list
|
32
32
|
|
33
33
|
|
34
|
-
def get_coords_and_area(
|
34
|
+
def get_coords_and_area(block_with_poly):
|
35
35
|
"""Extract coordinates and area from a table."""
|
36
|
-
xmin, ymin = int(
|
37
|
-
xmax, ymax = int(
|
36
|
+
xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
|
37
|
+
xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
|
38
38
|
area = (xmax - xmin) * (ymax - ymin)
|
39
39
|
return xmin, ymin, xmax, ymax, area
|
40
40
|
|
@@ -243,7 +243,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
|
|
243
243
|
"bbox": [int(res['poly'][0]), int(res['poly'][1]),
|
244
244
|
int(res['poly'][4]), int(res['poly'][5])],
|
245
245
|
})
|
246
|
-
elif category_id in [0, 2, 4, 6, 7]: # OCR regions
|
246
|
+
elif category_id in [0, 2, 4, 6, 7, 3]: # OCR regions
|
247
247
|
ocr_res_list.append(res)
|
248
248
|
elif category_id == 5: # Table regions
|
249
249
|
table_res_list.append(res)
|
@@ -35,7 +35,7 @@ def build_backbone(config, model_type):
|
|
35
35
|
from .rec_mobilenet_v3 import MobileNetV3
|
36
36
|
from .rec_svtrnet import SVTRNet
|
37
37
|
from .rec_mv1_enhance import MobileNetV1Enhance
|
38
|
-
|
38
|
+
from .rec_pphgnetv2 import PPHGNetV2_B4
|
39
39
|
support_dict = [
|
40
40
|
"MobileNetV1Enhance",
|
41
41
|
"MobileNetV3",
|
@@ -48,6 +48,7 @@ def build_backbone(config, model_type):
|
|
48
48
|
"DenseNet",
|
49
49
|
"PPLCNetV3",
|
50
50
|
"PPHGNet_small",
|
51
|
+
"PPHGNetV2_B4",
|
51
52
|
]
|
52
53
|
else:
|
53
54
|
raise NotImplementedError
|