magic-pdf 1.3.11__py3-none-any.whl → 1.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/utils.py +4 -4
- magic_pdf/dict2md/ocr_mkcontent.py +36 -22
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +14 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +1 -1
- magic_pdf/model/sub_modules/model_utils.py +4 -4
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +2 -1
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py +810 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +18 -5
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +68 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +18383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +8 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/METADATA +15 -1
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/RECORD +18 -16
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/top_level.txt +0 -0
magic_pdf/data/utils.py
CHANGED
@@ -10,22 +10,22 @@ from loguru import logger
 
 
 
-def fitz_doc_to_image(
+def fitz_doc_to_image(page, dpi=200) -> dict:
     """Convert fitz.Document to image, Then convert the image to numpy array.
 
     Args:
-
+        page (_type_): pymudoc page
         dpi (int, optional): reset the dpi of dpi. Defaults to 200.
 
     Returns:
         dict: {'img': numpy array, 'width': width, 'height': height }
     """
     mat = fitz.Matrix(dpi / 72, dpi / 72)
-    pm =
+    pm = page.get_pixmap(matrix=mat, alpha=False)
 
     # If the width or height exceeds 4500 after scaling, do not scale further.
     if pm.width > 4500 or pm.height > 4500:
-        pm =
+        pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
 
     # Convert pixmap samples directly to numpy array
     img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
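The net effect of this hunk is that fitz_doc_to_image now works directly on the page object. A minimal standalone sketch of the same conversion, assuming PyMuPDF and numpy are installed (page_to_ndarray and example.pdf are illustrative names, not part of the package):

import fitz  # PyMuPDF
import numpy as np

def page_to_ndarray(page, dpi=200):
    # Render at the requested DPI; 72 dpi is the PDF-native resolution.
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pm = page.get_pixmap(matrix=mat, alpha=False)
    # Fall back to 1x scale if the rendered bitmap would exceed 4500 px on a side.
    if pm.width > 4500 or pm.height > 4500:
        pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
    return np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)

doc = fitz.open("example.pdf")  # illustrative input file
img = page_to_ndarray(doc[0])
print(img.shape)  # (height, width, 3)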
magic_pdf/dict2md/ocr_mkcontent.py
CHANGED
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Check whether the image has a footnote
+                has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
+                # If a footnote exists, append it after the image body
+                if has_image_footnote:
+                    for block in para_block['blocks']:  # 1st: append image_caption
+                        if block['type'] == BlockType.ImageCaption:
+                            para_text += merge_para_with_text(block) + ' \n'
+                    for block in para_block['blocks']:  # 2nd: append image_body
+                        if block['type'] == BlockType.ImageBody:
+                            for line in block['lines']:
+                                for span in line['spans']:
+                                    if span['type'] == ContentType.Image:
+                                        if span.get('image_path', ''):
+                                            para_text += f""
+                    for block in para_block['blocks']:  # 3rd: append image_footnote
+                        if block['type'] == BlockType.ImageFootnote:
+                            para_text += ' \n' + merge_para_with_text(block)
+                else:
+                    for block in para_block['blocks']:  # 1st: append image_body
+                        if block['type'] == BlockType.ImageBody:
+                            for line in block['lines']:
+                                for span in line['spans']:
+                                    if span['type'] == ContentType.Image:
+                                        if span.get('image_path', ''):
+                                            para_text += f""
+                    for block in para_block['blocks']:  # 2nd: append image_caption
+                        if block['type'] == BlockType.ImageCaption:
+                            para_text += ' \n' + merge_para_with_text(block)
         elif para_type == BlockType.Table:
             if mode == 'nlp':
                 continue
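The added branch changes the order in which image sub-blocks are emitted: when a footnote is present, the markdown is caption, then image body, then footnote; otherwise it stays body first, then caption. A reduced sketch of that ordering rule, using plain dicts and string type tags in place of the package's BlockType constants:

def order_image_blocks(blocks):
    # Returns the sub-blocks in the order the markdown assembler now walks them.
    has_footnote = any(b['type'] == 'image_footnote' for b in blocks)
    if has_footnote:
        wanted = ['image_caption', 'image_body', 'image_footnote']
    else:
        wanted = ['image_body', 'image_caption']
    return [b for t in wanted for b in blocks if b['type'] == t]

blocks = [{'type': 'image_body'}, {'type': 'image_caption'}, {'type': 'image_footnote'}]
print([b['type'] for b in order_image_blocks(blocks)])
# ['image_caption', 'image_body', 'image_footnote']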
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                             for span in line['spans']:
                                 if span['type'] == ContentType.Table:
                                     # if processed by table model
-                                    if span.get('
-                                        para_text += f"\n
-                                    elif span.get('html', ''):
-                                        para_text += f"\n\n{span['html']}\n\n"
+                                    if span.get('html', ''):
+                                        para_text += f"\n{span['html']}\n"
                                     elif span.get('image_path', ''):
-                                        para_text += f"
+                                        para_text += f""
                 for block in para_block['blocks']:  # 3rd: append table_footnote
                     if block['type'] == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(block) + '
+                        para_text += '\n' + merge_para_with_text(block) + ' '
 
         if para_text.strip() == '':
            continue
         else:
-            page_markdown.append(para_text.strip() + ' ')
+            # page_markdown.append(para_text.strip() + ' ')
+            page_markdown.append(para_text.strip())
 
     return page_markdown
 
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
                 if span['type'] == ContentType.Table:
 
                     if span.get('latex', ''):
-                        para_content['table_body'] = f"
+                        para_content['table_body'] = f"{span['latex']}"
                     elif span.get('html', ''):
-                        para_content['table_body'] = f"
+                        para_content['table_body'] = f"{span['html']}"
 
                 if span.get('image_path', ''):
                     para_content['img_path'] = join_path(img_buket_path, span['image_path'])
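In the standard-format output, table_body now holds the raw LaTeX or HTML string with no surrounding newlines. An illustrative shape of the resulting entry (the 'type' key and all values are made up for the example; only table_body and img_path appear in this hunk):

para_content = {
    'type': 'table',
    'table_body': "<table><tr><td>1</td></tr></table>",
    'img_path': 'images/0001.jpg',  # joined from img_buket_path and span['image_path']
}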
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.3.11"
+__version__ = "1.3.12"
magic_pdf/model/batch_analyze.py
CHANGED
@@ -6,7 +6,7 @@ from tqdm import tqdm
 from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.model.sub_modules.model_utils import (
-    clean_vram, crop_img, get_res_list_from_layout_res)
+    clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area)
 from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
     get_adjusted_mfdetrec_res, get_ocr_result_list)
 
@@ -148,6 +148,19 @@ class BatchAnalyze:
                 # Integration results
                 if ocr_res:
                     ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
+
+                    if res["category_id"] == 3:
+                        # Sum of the areas of all bboxes in ocr_result_list
+                        ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
+                        # Ratio of ocr_res_area to the area of res
+                        res_area = get_coords_and_area(res)[4]
+                        if res_area > 0:
+                            ratio = ocr_res_area / res_area
+                            if ratio > 0.25:
+                                res["category_id"] = 1
+                            else:
+                                continue
+
                     ocr_res_list_dict['layout_res'].extend(ocr_result_list)
 
                 # det_count += len(ocr_res_list_dict['ocr_res_list'])
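The new block applies only to layout regions with category_id 3, which the reclassification to category_id 1 (plain text) suggests is the image-body class in this label set: the region is OCR'd, and if the detected text boxes cover more than a quarter of its area it is promoted to a text block; otherwise its OCR results are dropped. A hedged, self-contained sketch of that rule (the helper names are illustrative; the poly index convention matches get_coords_and_area shown further down):

def poly_area(poly):
    # poly is [x0, y0, x1, y1, x2, y2, x3, y3]; indices (0, 1) and (4, 5) are opposite corners.
    return (int(poly[4]) - int(poly[0])) * (int(poly[5]) - int(poly[1]))

def keep_ocr_for_image_block(res, ocr_result_list, threshold=0.25):
    if res['category_id'] != 3:
        return True  # not an image block, keep results unchanged
    block_area = poly_area(res['poly'])
    if block_area <= 0:
        return True
    text_area = sum(poly_area(item['poly']) for item in ocr_result_list if 'poly' in item)
    if text_area / block_area > threshold:
        res['category_id'] = 1  # mostly text, promote to a text block
        return True
    return False  # mostly pictorial, drop the OCR boxes for this block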
magic_pdf/model/doc_analyze_by_custom_model.py
CHANGED
@@ -189,7 +189,7 @@ def batch_doc_analyze(
     formula_enable=None,
     table_enable=None,
 ):
-    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE',
+    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
     batch_size = MIN_BATCH_INFERENCE_SIZE
     page_wh_list = []
 
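The only change here is the fallback value of the batching threshold; it can still be overridden through the environment. A short sketch of how the override is read (100 is the new default shown in this hunk):

import os

# Setting the variable before analysis changes the page-count threshold
# used for batch inference; unset, it falls back to 100.
os.environ['MINERU_MIN_BATCH_INFERENCE_SIZE'] = '200'
min_batch = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
print(min_batch)  # 200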
magic_pdf/model/sub_modules/model_utils.py
CHANGED
@@ -31,10 +31,10 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
     return return_image, return_list
 
 
-def get_coords_and_area(
+def get_coords_and_area(block_with_poly):
     """Extract coordinates and area from a table."""
-    xmin, ymin = int(
-    xmax, ymax = int(
+    xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
+    xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
     area = (xmax - xmin) * (ymax - ymin)
     return xmin, ymin, xmax, ymax, area
 
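The rewritten helper reads the bounding box from the flattened poly field, where the four corner points are stored as [x0, y0, x1, y1, x2, y2, x3, y3] and indices (0, 1) and (4, 5) are opposite corners, as the index choice implies. A worked example against the new signature:

def get_coords_and_area(block_with_poly):
    xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
    xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
    area = (xmax - xmin) * (ymax - ymin)
    return xmin, ymin, xmax, ymax, area

block = {'poly': [10, 20, 110, 20, 110, 70, 10, 70]}  # a 100 x 50 rectangle
print(get_coords_and_area(block))  # (10, 20, 110, 70, 5000)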
@@ -243,7 +243,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
                 "bbox": [int(res['poly'][0]), int(res['poly'][1]),
                          int(res['poly'][4]), int(res['poly'][5])],
             })
-        elif category_id in [0, 2, 4, 6, 7]:  # OCR regions
+        elif category_id in [0, 2, 4, 6, 7, 3]:  # OCR regions
             ocr_res_list.append(res)
         elif category_id == 5:  # Table regions
             table_res_list.append(res)
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py
CHANGED
@@ -35,7 +35,7 @@ def build_backbone(config, model_type):
         from .rec_mobilenet_v3 import MobileNetV3
         from .rec_svtrnet import SVTRNet
         from .rec_mv1_enhance import MobileNetV1Enhance
-
+        from .rec_pphgnetv2 import PPHGNetV2_B4
         support_dict = [
             "MobileNetV1Enhance",
             "MobileNetV3",
@@ -48,6 +48,7 @@ def build_backbone(config, model_type):
             "DenseNet",
             "PPLCNetV3",
             "PPHGNet_small",
+            "PPHGNetV2_B4",
         ]
     else:
         raise NotImplementedError
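These two hunks register the new PPHGNetV2_B4 recognition backbone: its class is imported and its name is added to the whitelist that build_backbone checks before instantiating a backbone by name. A hedged, self-contained sketch of that registry pattern (DummyBackbone stands in for the real class so the snippet runs on its own; the real function resolves the class from the config's 'name' field):

class DummyBackbone:
    def __init__(self, in_channels=3):
        self.in_channels = in_channels

SUPPORTED = {'DummyBackbone': DummyBackbone}

def build_backbone_sketch(config):
    # Pop the selector key and instantiate the whitelisted class with the rest of the config.
    name = config.pop('name')
    if name not in SUPPORTED:
        raise NotImplementedError(f'unsupported backbone: {name}')
    return SUPPORTED[name](**config)

backbone = build_backbone_sketch({'name': 'DummyBackbone', 'in_channels': 3})
print(backbone.in_channels)  # 3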