mineru 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
- mineru/backend/vlm/model_output_to_middle_json.py +123 -0
- mineru/backend/vlm/vlm_analyze.py +105 -16
- mineru/backend/vlm/vlm_magic_model.py +201 -135
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
- mineru/cli/client.py +6 -5
- mineru/cli/common.py +17 -16
- mineru/cli/fast_api.py +9 -7
- mineru/cli/gradio_app.py +15 -16
- mineru/cli/vlm_vllm_server.py +4 -0
- mineru/model/table/rec/unet_table/main.py +8 -0
- mineru/model/vlm_vllm_model/__init__.py +0 -0
- mineru/model/vlm_vllm_model/server.py +59 -0
- mineru/resources/header.html +10 -2
- mineru/utils/draw_bbox.py +32 -10
- mineru/utils/enum_class.py +16 -2
- mineru/utils/guess_suffix_or_lang.py +20 -0
- mineru/utils/span_block_fix.py +4 -2
- mineru/version.py +1 -1
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/METADATA +70 -25
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/RECORD +25 -38
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/entry_points.txt +1 -1
- mineru/backend/vlm/base_predictor.py +0 -186
- mineru/backend/vlm/hf_predictor.py +0 -217
- mineru/backend/vlm/predictor.py +0 -111
- mineru/backend/vlm/sglang_client_predictor.py +0 -443
- mineru/backend/vlm/sglang_engine_predictor.py +0 -246
- mineru/backend/vlm/token_to_middle_json.py +0 -122
- mineru/backend/vlm/utils.py +0 -40
- mineru/cli/vlm_sglang_server.py +0 -4
- mineru/model/vlm_hf_model/__init__.py +0 -9
- mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
- mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
- mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
- mineru/model/vlm_sglang_model/__init__.py +0 -14
- mineru/model/vlm_sglang_model/engine.py +0 -264
- mineru/model/vlm_sglang_model/image_processor.py +0 -213
- mineru/model/vlm_sglang_model/logit_processor.py +0 -90
- mineru/model/vlm_sglang_model/model.py +0 -453
- mineru/model/vlm_sglang_model/server.py +0 -75
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/WHEEL +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/top_level.txt +0 -0
|
@@ -245,14 +245,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
|
|
|
245
245
|
if block['type'] == BlockType.TABLE_FOOTNOTE:
|
|
246
246
|
para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
|
|
247
247
|
|
|
248
|
-
|
|
248
|
+
page_width, page_height = page_size
|
|
249
249
|
para_bbox = para_block.get('bbox')
|
|
250
250
|
if para_bbox:
|
|
251
251
|
x0, y0, x1, y1 = para_bbox
|
|
252
252
|
para_content['bbox'] = [
|
|
253
|
-
int(x0 * 1000 /
|
|
253
|
+
int(x0 * 1000 / page_width),
|
|
254
254
|
int(y0 * 1000 / page_height),
|
|
255
|
-
int(x1 * 1000 /
|
|
255
|
+
int(x1 * 1000 / page_width),
|
|
256
256
|
int(y1 * 1000 / page_height),
|
|
257
257
|
]
|
|
258
258
|
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
|
|
4
|
+
import cv2
|
|
5
|
+
import numpy as np
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from mineru.backend.vlm.vlm_magic_model import MagicModel
|
|
9
|
+
from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
|
|
10
|
+
from mineru.utils.cut_image import cut_image_and_table
|
|
11
|
+
from mineru.utils.enum_class import ContentType
|
|
12
|
+
from mineru.utils.hash_utils import bytes_md5
|
|
13
|
+
from mineru.utils.pdf_image_tools import get_crop_img
|
|
14
|
+
from mineru.utils.table_merge import merge_table
|
|
15
|
+
from mineru.version import __version__
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Optional heading-level refinement.
# The feature is switched on only when the LLM-aided config has
# `title_aided.enable` set AND both helper imports below succeed; otherwise
# `heading_level_import_success` stays False and the rest of the module
# skips the heading-level code paths.
heading_level_import_success = False
llm_aided_config = get_llm_aided_config()
if llm_aided_config:
    title_aided_config = llm_aided_config.get('title_aided', {})
    if title_aided_config.get('enable', False):
        try:
            from mineru.utils.llm_aided import llm_aided_title
            from mineru.backend.pipeline.model_init import AtomModelSingleton
        except Exception:
            # Any failure while importing the optional pieces disables the
            # feature instead of breaking module import.
            logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
                           "please execute `pip install mineru[core]` to install the required packages.")
        else:
            heading_level_import_success = True
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index) -> dict:
    """Convert the model output blocks of one page into a page-info dict.

    Args:
        page_blocks: Model output blocks for the page; handed to ``MagicModel``.
        image_dict: Mapping that provides the rendered page under ``"img_pil"``
            and the render scale under ``"scale"``.
        page: Page object; only ``page.get_size()`` is used here.
        image_writer: Writer forwarded to ``cut_image_and_table``.
        page_index: Index of the page, stored as ``page_idx`` and forwarded to
            ``cut_image_and_table``.

    Returns:
        A dict with the keys ``para_blocks`` (blocks sorted by their
        ``"index"`` value), ``discarded_blocks``, ``page_size`` and
        ``page_idx``.
    """
    scale = image_dict["scale"]
    page_pil_img = image_dict["img_pil"]
    page_img_md5 = bytes_md5(page_pil_img.tobytes())
    width, height = (int(dim) for dim in page.get_size())

    magic_model = MagicModel(page_blocks, width, height)
    image_blocks = magic_model.get_image_blocks()
    table_blocks = magic_model.get_table_blocks()
    title_blocks = magic_model.get_title_blocks()
    discarded_blocks = magic_model.get_discarded_blocks()
    code_blocks = magic_model.get_code_blocks()
    ref_text_blocks = magic_model.get_ref_text_blocks()
    phonetic_blocks = magic_model.get_phonetic_blocks()
    list_blocks = magic_model.get_list_blocks()

    # When heading-level refinement is enabled, run text detection on a crop
    # of every title block and record the average detected line height.
    if heading_level_import_success:
        ocr_model = AtomModelSingleton().get_atom_model(
            atom_model_name='ocr',
            ocr_show_log=False,
            det_db_box_thresh=0.3,
            lang='ch_lite'
        )
        for title_block in title_blocks:
            title_crop = np.array(get_crop_img(title_block['bbox'], page_pil_img, scale))
            # Pad the crop with a 50-pixel white border on every side.
            padded_crop = cv2.copyMakeBorder(
                title_crop, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
            )
            bgr_crop = cv2.cvtColor(padded_crop, cv2.COLOR_RGB2BGR)
            det_boxes = ocr_model.ocr(bgr_crop, rec=False)[0]
            if len(det_boxes) == 0:
                continue
            # Height of a box = y of point 2 minus y of point 0
            # (presumably bottom-right and top-left corners -- TODO confirm).
            box_heights = [box[2][1] - box[0][1] for box in det_boxes]
            title_block['line_avg_height'] = round(np.mean(box_heights) / scale)

    text_blocks = magic_model.get_text_blocks()
    interline_equation_blocks = magic_model.get_interline_equation_blocks()

    # Crop image / table / interline-equation spans out of the page image.
    cropped_span_types = [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]
    for span in magic_model.get_all_spans():
        if span["type"] in cropped_span_types:
            # The return value is not kept: rebinding the loop variable never
            # affected the span list.
            cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)

    # Collect every paragraph-level block and order them by "index".
    ordered_blocks = sorted(
        [
            *image_blocks,
            *table_blocks,
            *code_blocks,
            *ref_text_blocks,
            *phonetic_blocks,
            *title_blocks,
            *text_blocks,
            *interline_equation_blocks,
            *list_blocks,
        ],
        key=lambda block: block["index"],
    )

    return {
        "para_blocks": ordered_blocks,
        "discarded_blocks": discarded_blocks,
        "page_size": [width, height],
        "page_idx": page_index,
    }
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_writer):
    """Build the middle-JSON structure for a whole document.

    Each entry of ``model_output_blocks_list`` is converted with
    ``blocks_to_page_info`` using the page and image dict at the same index.
    Afterwards tables are merged across pages (when table handling is enabled)
    and, if the heading-level feature was imported successfully, titles are
    refined by ``llm_aided_title``.

    Args:
        model_output_blocks_list: Per-page model output blocks.
        images_list: Per-page image dicts, indexed in parallel with the blocks.
        pdf_doc: Open document, indexable by page number. It is closed before
            this function returns or raises.
        image_writer: Writer forwarded to ``blocks_to_page_info``.

    Returns:
        A dict with ``pdf_info`` (list of page-info dicts), ``_backend``
        (``"vlm"``) and ``_version_name``.
    """
    middle_json = {"pdf_info": [], "_backend": "vlm", "_version_name": __version__}
    try:
        for index, page_blocks in enumerate(model_output_blocks_list):
            page = pdf_doc[index]
            image_dict = images_list[index]
            page_info = blocks_to_page_info(page_blocks, image_dict, page, image_writer, index)
            middle_json["pdf_info"].append(page_info)

        # Merge tables that continue across pages. The MINERU_VLM_TABLE_ENABLE
        # environment variable (default 'True') is passed through
        # get_table_enable to decide whether merging runs.
        table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
        if table_enable:
            merge_table(middle_json["pdf_info"])

        # LLM-aided heading-level refinement (only if its imports succeeded).
        if heading_level_import_success:
            llm_aided_title_start_time = time.time()
            llm_aided_title(middle_json["pdf_info"], title_aided_config)
            logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
    finally:
        # Close the document on failure as well as success, so an exception
        # in any step above does not leave it open.
        pdf_doc.close()
    return middle_json
|
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
import os
|
|
2
3
|
import time
|
|
3
4
|
|
|
4
5
|
from loguru import logger
|
|
5
6
|
|
|
7
|
+
from .model_output_to_middle_json import result_to_middle_json
|
|
6
8
|
from ...data.data_reader_writer import DataWriter
|
|
7
9
|
from mineru.utils.pdf_image_tools import load_images_from_pdf
|
|
8
|
-
from .
|
|
9
|
-
|
|
10
|
-
from .token_to_middle_json import result_to_middle_json
|
|
10
|
+
from ...utils.config_reader import get_device
|
|
11
|
+
|
|
11
12
|
from ...utils.enum_class import ImageType
|
|
13
|
+
from ...utils.model_utils import get_vram
|
|
12
14
|
from ...utils.models_download_utils import auto_download_and_get_model_root_path
|
|
13
15
|
|
|
16
|
+
from mineru_vl_utils import MinerUClient
|
|
17
|
+
from packaging import version
|
|
18
|
+
|
|
14
19
|
|
|
15
20
|
class ModelSingleton:
|
|
16
21
|
_instance = None
|
|
@@ -27,24 +32,108 @@ class ModelSingleton:
|
|
|
27
32
|
model_path: str | None,
|
|
28
33
|
server_url: str | None,
|
|
29
34
|
**kwargs,
|
|
30
|
-
) ->
|
|
35
|
+
) -> MinerUClient:
|
|
31
36
|
key = (backend, model_path, server_url)
|
|
32
37
|
if key not in self._models:
|
|
33
|
-
|
|
38
|
+
start_time = time.time()
|
|
39
|
+
model = None
|
|
40
|
+
processor = None
|
|
41
|
+
vllm_llm = None
|
|
42
|
+
vllm_async_llm = None
|
|
43
|
+
batch_size = 0
|
|
44
|
+
if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
|
|
34
45
|
model_path = auto_download_and_get_model_root_path("/","vlm")
|
|
35
|
-
|
|
46
|
+
if backend == "transformers":
|
|
47
|
+
try:
|
|
48
|
+
from transformers import (
|
|
49
|
+
AutoProcessor,
|
|
50
|
+
Qwen2VLForConditionalGeneration,
|
|
51
|
+
)
|
|
52
|
+
from transformers import __version__ as transformers_version
|
|
53
|
+
except ImportError:
|
|
54
|
+
raise ImportError("Please install transformers to use the transformers backend.")
|
|
55
|
+
|
|
56
|
+
if version.parse(transformers_version) >= version.parse("4.56.0"):
|
|
57
|
+
dtype_key = "dtype"
|
|
58
|
+
else:
|
|
59
|
+
dtype_key = "torch_dtype"
|
|
60
|
+
device = get_device()
|
|
61
|
+
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
62
|
+
model_path,
|
|
63
|
+
device_map={"": device},
|
|
64
|
+
**{dtype_key: "auto"}, # type: ignore
|
|
65
|
+
)
|
|
66
|
+
processor = AutoProcessor.from_pretrained(
|
|
67
|
+
model_path,
|
|
68
|
+
use_fast=True,
|
|
69
|
+
)
|
|
70
|
+
try:
|
|
71
|
+
vram = get_vram(device)
|
|
72
|
+
if vram is not None:
|
|
73
|
+
gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
|
|
74
|
+
if gpu_memory >= 16:
|
|
75
|
+
batch_size = 8
|
|
76
|
+
elif gpu_memory >= 8:
|
|
77
|
+
batch_size = 4
|
|
78
|
+
else:
|
|
79
|
+
batch_size = 1
|
|
80
|
+
logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
|
|
81
|
+
else:
|
|
82
|
+
# Default batch_ratio when VRAM can't be determined
|
|
83
|
+
batch_size = 1
|
|
84
|
+
logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_size}')
|
|
85
|
+
except Exception as e:
|
|
86
|
+
logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
|
|
87
|
+
batch_size = 1
|
|
88
|
+
elif backend == "vllm-engine":
|
|
89
|
+
try:
|
|
90
|
+
import vllm
|
|
91
|
+
vllm_version = vllm.__version__
|
|
92
|
+
from mineru_vl_utils import MinerULogitsProcessor
|
|
93
|
+
except ImportError:
|
|
94
|
+
raise ImportError("Please install vllm to use the vllm-engine backend.")
|
|
95
|
+
if "gpu_memory_utilization" not in kwargs:
|
|
96
|
+
kwargs["gpu_memory_utilization"] = 0.5
|
|
97
|
+
if "model" not in kwargs:
|
|
98
|
+
kwargs["model"] = model_path
|
|
99
|
+
if version.parse(vllm_version) >= version.parse("0.10.1") and "logits_processors" not in kwargs:
|
|
100
|
+
kwargs["logits_processors"] = [MinerULogitsProcessor]
|
|
101
|
+
# 使用kwargs为 vllm初始化参数
|
|
102
|
+
vllm_llm = vllm.LLM(**kwargs)
|
|
103
|
+
elif backend == "vllm-async-engine":
|
|
104
|
+
try:
|
|
105
|
+
from vllm.engine.arg_utils import AsyncEngineArgs
|
|
106
|
+
from vllm.v1.engine.async_llm import AsyncLLM
|
|
107
|
+
from vllm import __version__ as vllm_version
|
|
108
|
+
from mineru_vl_utils import MinerULogitsProcessor
|
|
109
|
+
except ImportError:
|
|
110
|
+
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
|
|
111
|
+
if "gpu_memory_utilization" not in kwargs:
|
|
112
|
+
kwargs["gpu_memory_utilization"] = 0.5
|
|
113
|
+
if "model" not in kwargs:
|
|
114
|
+
kwargs["model"] = model_path
|
|
115
|
+
if version.parse(vllm_version) >= version.parse("0.10.1") and "logits_processors" not in kwargs:
|
|
116
|
+
kwargs["logits_processors"] = [MinerULogitsProcessor]
|
|
117
|
+
# 使用kwargs为 vllm初始化参数
|
|
118
|
+
vllm_async_llm = AsyncLLM.from_engine_args(AsyncEngineArgs(**kwargs))
|
|
119
|
+
self._models[key] = MinerUClient(
|
|
36
120
|
backend=backend,
|
|
37
|
-
|
|
121
|
+
model=model,
|
|
122
|
+
processor=processor,
|
|
123
|
+
vllm_llm=vllm_llm,
|
|
124
|
+
vllm_async_llm=vllm_async_llm,
|
|
38
125
|
server_url=server_url,
|
|
39
|
-
|
|
126
|
+
batch_size=batch_size,
|
|
40
127
|
)
|
|
128
|
+
elapsed = round(time.time() - start_time, 2)
|
|
129
|
+
logger.info(f"get {backend} predictor cost: {elapsed}s")
|
|
41
130
|
return self._models[key]
|
|
42
131
|
|
|
43
132
|
|
|
44
133
|
def doc_analyze(
|
|
45
134
|
pdf_bytes,
|
|
46
135
|
image_writer: DataWriter | None,
|
|
47
|
-
predictor:
|
|
136
|
+
predictor: MinerUClient | None = None,
|
|
48
137
|
backend="transformers",
|
|
49
138
|
model_path: str | None = None,
|
|
50
139
|
server_url: str | None = None,
|
|
@@ -54,13 +143,13 @@ def doc_analyze(
|
|
|
54
143
|
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
|
55
144
|
|
|
56
145
|
# load_images_start = time.time()
|
|
57
|
-
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.
|
|
58
|
-
|
|
146
|
+
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
147
|
+
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
|
59
148
|
# load_images_time = round(time.time() - load_images_start, 2)
|
|
60
149
|
# logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
|
|
61
150
|
|
|
62
151
|
# infer_start = time.time()
|
|
63
|
-
results = predictor.
|
|
152
|
+
results = predictor.batch_two_step_extract(images=images_pil_list)
|
|
64
153
|
# infer_time = round(time.time() - infer_start, 2)
|
|
65
154
|
# logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
|
66
155
|
|
|
@@ -71,7 +160,7 @@ def doc_analyze(
|
|
|
71
160
|
async def aio_doc_analyze(
|
|
72
161
|
pdf_bytes,
|
|
73
162
|
image_writer: DataWriter | None,
|
|
74
|
-
predictor:
|
|
163
|
+
predictor: MinerUClient | None = None,
|
|
75
164
|
backend="transformers",
|
|
76
165
|
model_path: str | None = None,
|
|
77
166
|
server_url: str | None = None,
|
|
@@ -81,13 +170,13 @@ async def aio_doc_analyze(
|
|
|
81
170
|
predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
|
|
82
171
|
|
|
83
172
|
# load_images_start = time.time()
|
|
84
|
-
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.
|
|
85
|
-
|
|
173
|
+
images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
|
|
174
|
+
images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
|
|
86
175
|
# load_images_time = round(time.time() - load_images_start, 2)
|
|
87
176
|
# logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
|
|
88
177
|
|
|
89
178
|
# infer_start = time.time()
|
|
90
|
-
results = await predictor.
|
|
179
|
+
results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
|
|
91
180
|
# infer_time = round(time.time() - infer_start, 2)
|
|
92
181
|
# logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
|
|
93
182
|
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
|