mineru 2.6.7__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/__init__.py +1 -0
- mineru/backend/hybrid/hybrid_analyze.py +526 -0
- mineru/backend/hybrid/hybrid_magic_model.py +617 -0
- mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
- mineru/backend/pipeline/batch_analyze.py +9 -1
- mineru/backend/pipeline/model_init.py +96 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -4
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
- mineru/backend/vlm/utils.py +3 -1
- mineru/backend/vlm/vlm_analyze.py +12 -12
- mineru/backend/vlm/vlm_magic_model.py +24 -89
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +118 -19
- mineru/cli/client.py +17 -17
- mineru/cli/common.py +170 -20
- mineru/cli/fast_api.py +39 -13
- mineru/cli/gradio_app.py +232 -206
- mineru/model/mfd/yolo_v8.py +12 -6
- mineru/model/mfr/unimernet/Unimernet.py +71 -3
- mineru/resources/header.html +5 -1
- mineru/utils/boxbase.py +23 -0
- mineru/utils/char_utils.py +55 -0
- mineru/utils/engine_utils.py +74 -0
- mineru/utils/enum_class.py +18 -1
- mineru/utils/magic_model_utils.py +85 -2
- mineru/utils/span_pre_proc.py +5 -3
- mineru/utils/table_merge.py +5 -21
- mineru/version.py +1 -1
- mineru-2.7.0.dist-info/METADATA +433 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
- mineru-2.6.7.dist-info/METADATA +0 -954
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import cv2
|
|
7
|
+
import numpy as np
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
from mineru.backend.hybrid.hybrid_magic_model import MagicModel
|
|
11
|
+
from mineru.backend.utils import cross_page_table_merge
|
|
12
|
+
from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
|
|
13
|
+
from mineru.utils.cut_image import cut_image_and_table
|
|
14
|
+
from mineru.utils.enum_class import ContentType
|
|
15
|
+
from mineru.utils.hash_utils import bytes_md5
|
|
16
|
+
from mineru.utils.ocr_utils import OcrConfidence
|
|
17
|
+
from mineru.utils.pdf_image_tools import get_crop_img
|
|
18
|
+
from mineru.version import __version__
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Optional LLM-aided heading-level support.
# `heading_level_import_success` stays False unless the `title_aided` section of
# the LLM-aided configuration is enabled AND both imports below succeed.
heading_level_import_success = False
llm_aided_config = get_llm_aided_config()
if llm_aided_config:
    title_aided_config = llm_aided_config.get('title_aided', {})
    if title_aided_config.get('enable', False):
        try:
            from mineru.utils.llm_aided import llm_aided_title
            from mineru.backend.pipeline.model_init import AtomModelSingleton
            heading_level_import_success = True
        except Exception as e:
            # The feature is optional, so a failed import must not abort module
            # loading; the flag simply stays False.
            logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
                           "please execute `pip install mineru[core]` to install the required packages.")
            # Report the underlying error as well, so a failure that is not a
            # missing package can still be diagnosed from the log.
            logger.warning("Heading level feature import failed with: {!r}", e)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def blocks_to_page_info(
        page_blocks,
        page_inline_formula,
        page_ocr_res,
        image_dict,
        page,
        image_writer,
        page_index,
        _ocr_enable,
        _vlm_ocr_enable,
) -> dict:
    """Convert the model output blocks of one page into a page-info dict.

    Args:
        page_blocks: Model output blocks of this page; forwarded to MagicModel.
        page_inline_formula: Inline-formula results of this page; forwarded to MagicModel.
        page_ocr_res: OCR results of this page; forwarded to MagicModel.
        image_dict: Mapping providing the rendered page under "img_pil" and
            the render scale under "scale".
        page: Page object; only ``page.get_size()`` is used here.
        image_writer: Writer handed to ``cut_image_and_table``.
        page_index: Index of the page; stored as "page_idx" in the result.
        _ocr_enable: Forwarded to MagicModel.
        _vlm_ocr_enable: Forwarded to MagicModel; also selects how the average
            title line height is measured (see below).

    Returns:
        A dict with the keys "para_blocks" (all non-discarded blocks sorted by
        their "index"), "discarded_blocks", "page_size" (``[width, height]``)
        and "page_idx".
    """
    scale = image_dict["scale"]
    page_pil_img = image_dict["img_pil"]
    page_img_md5 = bytes_md5(page_pil_img.tobytes())
    width, height = map(int, page.get_size())

    magic_model = MagicModel(
        page_blocks,
        page_inline_formula,
        page_ocr_res,
        page,
        scale,
        page_pil_img,
        width,
        height,
        _ocr_enable,
        _vlm_ocr_enable,
    )
    image_blocks = magic_model.get_image_blocks()
    table_blocks = magic_model.get_table_blocks()
    title_blocks = magic_model.get_title_blocks()
    discarded_blocks = magic_model.get_discarded_blocks()
    code_blocks = magic_model.get_code_blocks()
    ref_text_blocks = magic_model.get_ref_text_blocks()
    phonetic_blocks = magic_model.get_phonetic_blocks()
    list_blocks = magic_model.get_list_blocks()

    # When title optimisation is available, record the average line height of
    # every title block under 'line_avg_height'.
    if heading_level_import_success:
        if _vlm_ocr_enable:
            # VLM OCR yields no line information, so text detection is re-run
            # on each title crop to measure the line height.
            ocr_model = AtomModelSingleton().get_atom_model(
                atom_model_name='ocr',
                ocr_show_log=False,
                det_db_box_thresh=0.3,
                lang='ch_lite'
            )
            for title_block in title_blocks:
                title_pil_img = get_crop_img(title_block['bbox'], page_pil_img, scale)
                # Pad the crop with a 50-pixel white border on every side.
                title_np_img = cv2.copyMakeBorder(
                    np.array(title_pil_img), 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
                )
                title_img = cv2.cvtColor(title_np_img, cv2.COLOR_RGB2BGR)
                ocr_det_res = ocr_model.ocr(title_img, rec=False)[0]
                if ocr_det_res is not None and len(ocr_det_res) > 0:
                    # Presumably each box is a list of corner points with
                    # index 0 = top-left and index 2 = bottom-right -- TODO confirm.
                    avg_height = np.mean([box[2][1] - box[0][1] for box in ocr_det_res])
                    # Detection ran on the scaled render; convert back to page units.
                    title_block['line_avg_height'] = round(avg_height / scale)
                else:
                    # Nothing detected: fall back to the block height, exactly
                    # as the line-based branch below does for blocks without lines.
                    title_block['line_avg_height'] = title_block['bbox'][3] - title_block['bbox'][1]
        else:
            # Line information is available; average the line heights directly.
            for title_block in title_blocks:
                lines = title_block.get('lines', [])
                if lines:
                    avg_height = sum(line['bbox'][3] - line['bbox'][1] for line in lines) / len(lines)
                    title_block['line_avg_height'] = round(avg_height)
                else:
                    title_block['line_avg_height'] = title_block['bbox'][3] - title_block['bbox'][1]

    text_blocks = magic_model.get_text_blocks()
    interline_equation_blocks = magic_model.get_interline_equation_blocks()

    # Hand every image / table / interline-equation span to cut_image_and_table.
    # The return value is intentionally unused: rebinding the loop variable
    # would not change `all_spans` anyway.
    cut_types = [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]
    for span in magic_model.get_all_spans():
        if span["type"] in cut_types:
            cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)

    # Gather all paragraph-level blocks and order them by their "index" value.
    para_blocks = [
        *image_blocks,
        *table_blocks,
        *code_blocks,
        *ref_text_blocks,
        *phonetic_blocks,
        *title_blocks,
        *text_blocks,
        *interline_equation_blocks,
        *list_blocks,
    ]
    para_blocks.sort(key=lambda x: x["index"])

    page_info = {"para_blocks": para_blocks, "discarded_blocks": discarded_blocks, "page_size": [width, height], "page_idx": page_index}
    return page_info
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _recognize_pending_text_spans(pdf_info, hybrid_pipeline_model):
    """Run text recognition on every collected span that still carries an 'np_img' crop."""
    text_block_list = []
    for page_info in pdf_info:
        for block in page_info['para_blocks']:
            if block['type'] in ['table', 'image', 'list', 'code']:
                # Container blocks: only sub-blocks whose type does not end
                # with 'body' are collected.
                for sub_block in block['blocks']:
                    if not sub_block['type'].endswith('body'):
                        text_block_list.append(sub_block)
            elif block['type'] in ['text', 'title', 'ref_text']:
                text_block_list.append(block)
        text_block_list.extend(page_info['discarded_blocks'])

    need_ocr_list = []
    img_crop_list = []
    for block in text_block_list:
        for line in block['lines']:
            for span in line['spans']:
                if 'np_img' in span:
                    need_ocr_list.append(span)
                    # The crop is removed from the span once it is queued.
                    img_crop_list.append(span.pop('np_img'))

    if len(img_crop_list) == 0:
        return

    rec_res_list = hybrid_pipeline_model.ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
    assert len(rec_res_list) == len(
        need_ocr_list), f'ocr_res_list: {len(rec_res_list)}, need_ocr_list: {len(need_ocr_list)}'
    for span, (ocr_text, ocr_score) in zip(need_ocr_list, rec_res_list):
        if ocr_score > OcrConfidence.min_confidence:
            span['content'] = ocr_text
            span['score'] = float(f"{ocr_score:.3f}")
        else:
            # Results at or below the confidence threshold are blanked out.
            span['content'] = ''
            span['score'] = 0.0


def result_to_middle_json(
        model_output_blocks_list,
        inline_formula_list,
        ocr_res_list,
        images_list,
        pdf_doc,
        image_writer,
        _ocr_enable,
        _vlm_ocr_enable,
        hybrid_pipeline_model,
):
    """Assemble the hybrid backend's middle JSON for a whole document.

    Args:
        model_output_blocks_list: Per-page model output blocks.
        inline_formula_list: Per-page inline-formula results.
        ocr_res_list: Per-page OCR results. The three per-page lists are
            iterated in lockstep with ``zip``.
        images_list: Per-page image dicts, indexed by page number.
        pdf_doc: Document object indexed by page number. It is closed by this
            function, including when processing raises.
        image_writer: Writer forwarded to ``blocks_to_page_info``.
        _ocr_enable: Stored in the result and forwarded per page.
        _vlm_ocr_enable: Stored in the result and forwarded per page.
        hybrid_pipeline_model: Object whose ``ocr_model`` is used for the
            post-recognition step; only accessed when there are crops to read.

    Returns:
        A dict with "pdf_info" (one page-info dict per page) plus the
        "_backend", "_ocr_enable", "_vlm_ocr_enable" and "_version_name" fields.
    """
    middle_json = {
        "pdf_info": [],
        "_backend": "hybrid",
        "_ocr_enable": _ocr_enable,
        "_vlm_ocr_enable": _vlm_ocr_enable,
        "_version_name": __version__
    }

    # try/finally guarantees the document is released even if a page fails.
    try:
        per_page_inputs = zip(model_output_blocks_list, inline_formula_list, ocr_res_list)
        for index, (page_blocks, page_inline_formula, page_ocr_res) in enumerate(per_page_inputs):
            page = pdf_doc[index]
            image_dict = images_list[index]
            page_info = blocks_to_page_info(
                page_blocks, page_inline_formula, page_ocr_res,
                image_dict, page, image_writer, index,
                _ocr_enable, _vlm_ocr_enable
            )
            middle_json["pdf_info"].append(page_info)

        # Post OCR processing: only when neither OCR flag is set.
        if not (_vlm_ocr_enable or _ocr_enable):
            _recognize_pending_text_spans(middle_json["pdf_info"], hybrid_pipeline_model)

        # Merge tables that continue across pages.
        table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
        if table_enable:
            cross_page_table_merge(middle_json["pdf_info"])

        # LLM-aided heading level optimisation.
        if heading_level_import_success:
            llm_aided_title_start_time = time.time()
            llm_aided_title(middle_json["pdf_info"], title_aided_config)
            logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
    finally:
        # Close the pdf document.
        pdf_doc.close()

    return middle_json
|
|
@@ -420,7 +420,15 @@ class BatchAnalyze:
|
|
|
420
420
|
layout_res_item['poly'][4], layout_res_item['poly'][5]]
|
|
421
421
|
layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
|
|
422
422
|
layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
|
|
423
|
-
if
|
|
423
|
+
if (
|
|
424
|
+
ocr_text in [
|
|
425
|
+
'(204号', '(20', '(2', '(2号', '(20号', '号', '(204',
|
|
426
|
+
'(cid:)', '(ci:)', '(cd:1)', 'cd:)', 'c)', '(cd:)', 'c', 'id:)',
|
|
427
|
+
':)', '√:)', '√i:)', '−i:)', '−:', 'i:)',
|
|
428
|
+
]
|
|
429
|
+
and ocr_score < 0.8
|
|
430
|
+
and layout_res_width < layout_res_height
|
|
431
|
+
):
|
|
424
432
|
layout_res_item['category_id'] = 16
|
|
425
433
|
|
|
426
434
|
total_processed += len(img_crop_list)
|
|
@@ -14,6 +14,7 @@ from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
|
|
|
14
14
|
# from ...model.table.rec.RapidTable import RapidTableModel
|
|
15
15
|
from ...model.table.rec.slanet_plus.main import RapidTableModel
|
|
16
16
|
from ...model.table.rec.unet_table.main import UnetTableModel
|
|
17
|
+
from ...utils.config_reader import get_device
|
|
17
18
|
from ...utils.enum_class import ModelPath
|
|
18
19
|
from ...utils.models_download_utils import auto_download_and_get_model_root_path
|
|
19
20
|
|
|
@@ -267,4 +268,98 @@ class MineruPipelineModel:
|
|
|
267
268
|
lang=self.lang,
|
|
268
269
|
)
|
|
269
270
|
|
|
270
|
-
logger.info('DocAnalysis init done!')
|
|
271
|
+
logger.info('DocAnalysis init done!')
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class HybridModelSingleton:
    """Singleton that hands out cached MineruHybridModel objects.

    Every construction yields the same instance, and built models are kept in
    a class-level dict keyed by ``(lang, formula_enable)``.
    """
    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on the first call."""
        if cls._instance is not None:
            return cls._instance
        cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(
        self,
        lang=None,
        formula_enable=None,
    ):
        """Return the model cached for ``(lang, formula_enable)``, building it if absent."""
        cache_key = (lang, formula_enable)
        if cache_key in self._models:
            return self._models[cache_key]
        built_model = MineruHybridModel(
            lang=lang,
            formula_enable=formula_enable,
        )
        self._models[cache_key] = built_model
        return self._models[cache_key]
|
|
295
|
+
|
|
296
|
+
def ocr_det_batch_setting(device):
    """Tell whether batched OCR detection should be enabled for ``device``.

    Returns False when the installed torch version is 2.8.0 or newer, or when
    ``str(device)`` starts with 'mps'; returns True otherwise.
    """
    # Check the installed torch version.
    import torch
    from packaging import version

    if version.parse(torch.__version__) >= version.parse("2.8.0"):
        return False
    if str(device).startswith('mps'):
        return False
    return True
|
|
305
|
+
|
|
306
|
+
class MineruHybridModel:
    """Holds the atomic models loaded here: OCR, plus formula detection and recognition when enabled."""

    def __init__(
        self,
        device=None,
        lang=None,
        formula_enable=True,
    ):
        """Load the models.

        Args:
            device: Device identifier; when None, ``get_device()`` is used.
            lang: Language passed to the OCR model.
            formula_enable: When truthy, ``mfd_model`` and ``mfr_model`` are
                loaded as well; otherwise those attributes are not set.

        Raises:
            RuntimeError: The device starts with 'npu' but importing or
                configuring ``torch_npu`` failed.
            SystemExit: ``MFR_MODEL`` is not one of the supported names
                (exit status 1).
        """
        if device is not None:
            self.device = device
        else:
            self.device = get_device()

        self.lang = lang

        self.enable_ocr_det_batch = ocr_det_batch_setting(self.device)

        if str(self.device).startswith('npu'):
            try:
                import torch_npu
                if torch_npu.npu.is_available():
                    torch_npu.npu.set_compile_mode(jit_compile=False)
            except Exception as e:
                raise RuntimeError(
                    "NPU is selected as device, but torch_npu is not available. "
                    "Please ensure that the torch_npu package is installed correctly."
                ) from e

        self.atom_model_manager = AtomModelSingleton()

        # Initialise the OCR model.
        self.ocr_model = self.atom_model_manager.get_atom_model(
            atom_model_name=AtomicModel.OCR,
            det_db_box_thresh=0.3,
            lang=self.lang
        )

        if formula_enable:
            # Initialise the formula detection model.
            self.mfd_model = self.atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFD,
                mfd_weights=str(
                    os.path.join(auto_download_and_get_model_root_path(ModelPath.yolo_v8_mfd), ModelPath.yolo_v8_mfd)
                ),
                device=self.device,
            )

            # Initialise the formula recognition model.
            if MFR_MODEL == "unimernet_small":
                mfr_model_path = ModelPath.unimernet_small
            elif MFR_MODEL == "pp_formulanet_plus_m":
                mfr_model_path = ModelPath.pp_formulanet_plus_m
            else:
                logger.error('MFR model name not allow')
                # Raise SystemExit directly instead of calling the `exit`
                # builtin: that helper is installed by the `site` module for
                # interactive use and is not guaranteed to exist. The exit
                # status stays 1.
                raise SystemExit(1)

            self.mfr_model = self.atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFR,
                mfr_weight_dir=str(os.path.join(auto_download_and_get_model_root_path(mfr_model_path), mfr_model_path)),
                device=self.device,
            )
|
|
@@ -86,6 +86,7 @@ def doc_analyze(
|
|
|
86
86
|
all_image_lists = []
|
|
87
87
|
all_pdf_docs = []
|
|
88
88
|
ocr_enabled_list = []
|
|
89
|
+
load_images_start = time.time()
|
|
89
90
|
for pdf_idx, pdf_bytes in enumerate(pdf_bytes_list):
|
|
90
91
|
# 确定OCR设置
|
|
91
92
|
_ocr_enable = False
|
|
@@ -99,10 +100,7 @@ def doc_analyze(
|
|
|
99
100
|
_lang = lang_list[pdf_idx]
|
|
100
101
|
|
|
101
102
|
# 收集每个数据集中的页面
|
|
102
|
-
# load_images_start = time.time()
|
|
103
103
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
104
|
-
# load_images_time = round(time.time() - load_images_start, 2)
|
|
105
|
-
# logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
|
|
106
104
|
all_image_lists.append(images_list)
|
|
107
105
|
all_pdf_docs.append(pdf_doc)
|
|
108
106
|
for page_idx in range(len(images_list)):
|
|
@@ -111,6 +109,8 @@ def doc_analyze(
|
|
|
111
109
|
pdf_idx, page_idx,
|
|
112
110
|
img_dict['img_pil'], _ocr_enable, _lang,
|
|
113
111
|
))
|
|
112
|
+
load_images_time = round(time.time() - load_images_start, 2)
|
|
113
|
+
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(all_pages_info) / load_images_time, 3)} images/s")
|
|
114
114
|
|
|
115
115
|
# 准备批处理
|
|
116
116
|
images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
|
|
@@ -123,6 +123,7 @@ def doc_analyze(
|
|
|
123
123
|
# 执行批处理
|
|
124
124
|
results = []
|
|
125
125
|
processed_images_count = 0
|
|
126
|
+
infer_start = time.time()
|
|
126
127
|
for index, batch_image in enumerate(batch_images):
|
|
127
128
|
processed_images_count += len(batch_image)
|
|
128
129
|
logger.info(
|
|
@@ -131,6 +132,8 @@ def doc_analyze(
|
|
|
131
132
|
)
|
|
132
133
|
batch_results = batch_image_analyze(batch_image, formula_enable, table_enable)
|
|
133
134
|
results.extend(batch_results)
|
|
135
|
+
infer_time = round(time.time() - infer_start, 2)
|
|
136
|
+
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results) / infer_time, 3)} page/s")
|
|
134
137
|
|
|
135
138
|
# 构建返回结果
|
|
136
139
|
infer_results = []
|
|
@@ -185,7 +188,6 @@ def batch_image_analyze(
|
|
|
185
188
|
batch_ratio = 1
|
|
186
189
|
logger.info(
|
|
187
190
|
f'GPU Memory: {gpu_memory} GB, Batch Ratio: {batch_ratio}. '
|
|
188
|
-
f'You can set MINERU_VIRTUAL_VRAM_SIZE environment variable to adjust GPU memory allocation.'
|
|
189
191
|
)
|
|
190
192
|
|
|
191
193
|
# 检测torch的版本号
|
|
@@ -1,25 +1,12 @@
|
|
|
1
|
-
import re
|
|
2
1
|
from loguru import logger
|
|
3
2
|
|
|
3
|
+
from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
|
|
4
4
|
from mineru.utils.config_reader import get_latex_delimiter_config
|
|
5
5
|
from mineru.backend.pipeline.para_split import ListLineTag
|
|
6
6
|
from mineru.utils.enum_class import BlockType, ContentType, MakeMode
|
|
7
7
|
from mineru.utils.language import detect_lang
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
def __is_hyphen_at_line_end(line):
|
|
11
|
-
"""Check if a line ends with one or more letters followed by a hyphen.
|
|
12
|
-
|
|
13
|
-
Args:
|
|
14
|
-
line (str): The line of text to check.
|
|
15
|
-
|
|
16
|
-
Returns:
|
|
17
|
-
bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
|
|
18
|
-
"""
|
|
19
|
-
# Use regex to check if the line ends with one or more letters followed by a hyphen
|
|
20
|
-
return bool(re.search(r'[A-Za-z]+-\s*$', line))
|
|
21
|
-
|
|
22
|
-
|
|
23
10
|
def make_blocks_to_markdown(paras_of_layout,
|
|
24
11
|
mode,
|
|
25
12
|
img_buket_path='',
|
|
@@ -102,25 +89,6 @@ def make_blocks_to_markdown(paras_of_layout,
|
|
|
102
89
|
return page_markdown
|
|
103
90
|
|
|
104
91
|
|
|
105
|
-
def full_to_half(text: str) -> str:
|
|
106
|
-
"""Convert full-width characters to half-width characters using code point manipulation.
|
|
107
|
-
|
|
108
|
-
Args:
|
|
109
|
-
text: String containing full-width characters
|
|
110
|
-
|
|
111
|
-
Returns:
|
|
112
|
-
String with full-width characters converted to half-width
|
|
113
|
-
"""
|
|
114
|
-
result = []
|
|
115
|
-
for char in text:
|
|
116
|
-
code = ord(char)
|
|
117
|
-
# Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
|
|
118
|
-
if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
|
|
119
|
-
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
|
|
120
|
-
else:
|
|
121
|
-
result.append(char)
|
|
122
|
-
return ''.join(result)
|
|
123
|
-
|
|
124
92
|
latex_delimiters_config = get_latex_delimiter_config()
|
|
125
93
|
|
|
126
94
|
default_delimiters = {
|
|
@@ -140,7 +108,7 @@ def merge_para_with_text(para_block):
|
|
|
140
108
|
for line in para_block['lines']:
|
|
141
109
|
for span in line['spans']:
|
|
142
110
|
if span['type'] in [ContentType.TEXT]:
|
|
143
|
-
span['content'] =
|
|
111
|
+
span['content'] = full_to_half_exclude_marks(span['content'])
|
|
144
112
|
block_text += span['content']
|
|
145
113
|
block_lang = detect_lang(block_text)
|
|
146
114
|
|
|
@@ -166,22 +134,45 @@ def merge_para_with_text(para_block):
|
|
|
166
134
|
content = content.strip()
|
|
167
135
|
|
|
168
136
|
if content:
|
|
169
|
-
|
|
137
|
+
|
|
138
|
+
if span_type == ContentType.INTERLINE_EQUATION:
|
|
139
|
+
para_text += content
|
|
140
|
+
continue
|
|
141
|
+
|
|
142
|
+
# 定义CJK语言集合(中日韩)
|
|
143
|
+
cjk_langs = {'zh', 'ja', 'ko'}
|
|
170
144
|
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
|
171
|
-
|
|
172
|
-
|
|
145
|
+
|
|
146
|
+
# 判断是否为行末span
|
|
147
|
+
is_last_span = j == len(line['spans']) - 1
|
|
148
|
+
|
|
149
|
+
if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
|
|
150
|
+
if is_last_span and span_type not in [ContentType.INLINE_EQUATION]:
|
|
173
151
|
para_text += content
|
|
174
152
|
else:
|
|
175
153
|
para_text += f'{content} '
|
|
176
154
|
else:
|
|
155
|
+
# 西方文本语境下 每行的最后一个span判断是否要去除连字符
|
|
177
156
|
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
|
|
178
157
|
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
|
179
|
-
if
|
|
180
|
-
|
|
158
|
+
if (
|
|
159
|
+
is_last_span
|
|
160
|
+
and span_type == ContentType.TEXT
|
|
161
|
+
and is_hyphen_at_line_end(content)
|
|
162
|
+
):
|
|
163
|
+
# 如果下一行的第一个span是小写字母开头,删除连字符
|
|
164
|
+
if (
|
|
165
|
+
i + 1 < len(para_block['lines'])
|
|
166
|
+
and para_block['lines'][i + 1].get('spans')
|
|
167
|
+
and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
|
|
168
|
+
and para_block['lines'][i + 1]['spans'][0].get('content', '')
|
|
169
|
+
and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
|
|
170
|
+
):
|
|
171
|
+
para_text += content[:-1]
|
|
172
|
+
else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
|
|
173
|
+
para_text += content
|
|
181
174
|
else: # 西方文本语境下 content间需要空格分隔
|
|
182
175
|
para_text += f'{content} '
|
|
183
|
-
elif span_type == ContentType.INTERLINE_EQUATION:
|
|
184
|
-
para_text += content
|
|
185
176
|
else:
|
|
186
177
|
continue
|
|
187
178
|
|
mineru/backend/vlm/utils.py
CHANGED
|
@@ -72,7 +72,9 @@ def set_lmdeploy_backend(device_type: str) -> str:
|
|
|
72
72
|
|
|
73
73
|
def set_default_gpu_memory_utilization() -> float:
|
|
74
74
|
from vllm import __version__ as vllm_version
|
|
75
|
-
|
|
75
|
+
device = get_device()
|
|
76
|
+
gpu_memory = get_vram(device)
|
|
77
|
+
if version.parse(vllm_version) >= version.parse("0.11.0") and gpu_memory <= 8:
|
|
76
78
|
return 0.7
|
|
77
79
|
else:
|
|
78
80
|
return 0.5
|
|
@@ -202,16 +202,16 @@ def doc_analyze(
|
|
|
202
202
|
if predictor is None:
|
|
203
203
|
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
|
204
204
|
|
|
205
|
-
|
|
205
|
+
load_images_start = time.time()
|
|
206
206
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
207
207
|
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
|
208
|
-
|
|
209
|
-
|
|
208
|
+
load_images_time = round(time.time() - load_images_start, 2)
|
|
209
|
+
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
|
210
210
|
|
|
211
|
-
|
|
211
|
+
infer_start = time.time()
|
|
212
212
|
results = predictor.batch_two_step_extract(images=images_pil_list)
|
|
213
|
-
|
|
214
|
-
|
|
213
|
+
infer_time = round(time.time() - infer_start, 2)
|
|
214
|
+
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
|
215
215
|
|
|
216
216
|
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
|
|
217
217
|
return middle_json, results
|
|
@@ -229,15 +229,15 @@ async def aio_doc_analyze(
|
|
|
229
229
|
if predictor is None:
|
|
230
230
|
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
|
231
231
|
|
|
232
|
-
|
|
232
|
+
load_images_start = time.time()
|
|
233
233
|
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
234
234
|
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
|
235
|
-
|
|
236
|
-
|
|
235
|
+
load_images_time = round(time.time() - load_images_start, 2)
|
|
236
|
+
logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
|
|
237
237
|
|
|
238
|
-
|
|
238
|
+
infer_start = time.time()
|
|
239
239
|
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
|
240
|
-
|
|
241
|
-
|
|
240
|
+
infer_time = round(time.time() - infer_start, 2)
|
|
241
|
+
logger.debug(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
|
242
242
|
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
|
|
243
243
|
return middle_json, results
|