mineru-2.2.1-py3-none-any.whl → mineru-2.5.0-py3-none-any.whl
- mineru/backend/pipeline/batch_analyze.py +1 -1
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
- mineru/backend/vlm/model_output_to_middle_json.py +123 -0
- mineru/backend/vlm/vlm_analyze.py +97 -16
- mineru/backend/vlm/vlm_magic_model.py +201 -135
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
- mineru/cli/client.py +6 -5
- mineru/cli/common.py +17 -16
- mineru/cli/fast_api.py +9 -7
- mineru/cli/gradio_app.py +15 -16
- mineru/cli/vlm_vllm_server.py +4 -0
- mineru/model/table/rec/unet_table/main.py +10 -2
- mineru/model/vlm_vllm_model/__init__.py +0 -0
- mineru/model/vlm_vllm_model/server.py +51 -0
- mineru/resources/header.html +10 -2
- mineru/utils/draw_bbox.py +32 -10
- mineru/utils/enum_class.py +16 -2
- mineru/utils/guess_suffix_or_lang.py +20 -0
- mineru/utils/span_block_fix.py +4 -2
- mineru/version.py +1 -1
- {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/METADATA +71 -23
- {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/RECORD +26 -39
- {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
- mineru/backend/vlm/base_predictor.py +0 -186
- mineru/backend/vlm/hf_predictor.py +0 -217
- mineru/backend/vlm/predictor.py +0 -111
- mineru/backend/vlm/sglang_client_predictor.py +0 -443
- mineru/backend/vlm/sglang_engine_predictor.py +0 -246
- mineru/backend/vlm/token_to_middle_json.py +0 -122
- mineru/backend/vlm/utils.py +0 -40
- mineru/cli/vlm_sglang_server.py +0 -4
- mineru/model/vlm_hf_model/__init__.py +0 -9
- mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
- mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
- mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
- mineru/model/vlm_sglang_model/__init__.py +0 -14
- mineru/model/vlm_sglang_model/engine.py +0 -264
- mineru/model/vlm_sglang_model/image_processor.py +0 -213
- mineru/model/vlm_sglang_model/logit_processor.py +0 -90
- mineru/model/vlm_sglang_model/model.py +0 -453
- mineru/model/vlm_sglang_model/server.py +0 -75
- {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
- {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.2.1.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/hf_predictor.py
DELETED

@@ -1,217 +0,0 @@
-from io import BytesIO
-from typing import Iterable, List, Optional, Union
-
-import torch
-from PIL import Image
-from tqdm import tqdm
-from transformers import AutoTokenizer, BitsAndBytesConfig, __version__
-
-from ...model.vlm_hf_model import Mineru2QwenForCausalLM
-from ...model.vlm_hf_model.image_processing_mineru2 import process_images
-from .base_predictor import (
-    DEFAULT_MAX_NEW_TOKENS,
-    DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    DEFAULT_PRESENCE_PENALTY,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_K,
-    DEFAULT_TOP_P,
-    BasePredictor,
-)
-from .utils import load_resource
-
-
-class HuggingfacePredictor(BasePredictor):
-    def __init__(
-        self,
-        model_path: str,
-        device_map="auto",
-        device="cuda",
-        torch_dtype="auto",
-        load_in_8bit=False,
-        load_in_4bit=False,
-        use_flash_attn=False,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        top_k: int = DEFAULT_TOP_K,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-        no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-        **kwargs,
-    ):
-        super().__init__(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-
-        kwargs = {"device_map": device_map, **kwargs}
-
-        if device != "cuda":
-            kwargs["device_map"] = {"": device}
-
-        if load_in_8bit:
-            kwargs["load_in_8bit"] = True
-        elif load_in_4bit:
-            kwargs["load_in_4bit"] = True
-            kwargs["quantization_config"] = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4",
-            )
-        else:
-            from packaging import version
-            if version.parse(__version__) >= version.parse("4.56.0"):
-                kwargs["dtype"] = torch_dtype
-            else:
-                kwargs["torch_dtype"] = torch_dtype
-
-        if use_flash_attn:
-            kwargs["attn_implementation"] = "flash_attention_2"
-
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-        self.model = Mineru2QwenForCausalLM.from_pretrained(
-            model_path,
-            low_cpu_mem_usage=True,
-            **kwargs,
-        )
-        setattr(self.model.config, "_name_or_path", model_path)
-        self.model.eval()
-
-        vision_tower = self.model.get_model().vision_tower
-        if device_map != "auto":
-            vision_tower.to(device=device_map, dtype=self.model.dtype)
-
-        self.image_processor = vision_tower.image_processor
-        self.eos_token_id = self.model.config.eos_token_id
-
-    def predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        **kwargs,
-    ) -> str:
-        prompt = self.build_prompt(prompt)
-
-        if temperature is None:
-            temperature = self.temperature
-        if top_p is None:
-            top_p = self.top_p
-        if top_k is None:
-            top_k = self.top_k
-        if repetition_penalty is None:
-            repetition_penalty = self.repetition_penalty
-        if no_repeat_ngram_size is None:
-            no_repeat_ngram_size = self.no_repeat_ngram_size
-        if max_new_tokens is None:
-            max_new_tokens = self.max_new_tokens
-
-        do_sample = (temperature > 0.0) and (top_k > 1)
-
-        generate_kwargs = {
-            "repetition_penalty": repetition_penalty,
-            "no_repeat_ngram_size": no_repeat_ngram_size,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": do_sample,
-        }
-        if do_sample:
-            generate_kwargs["temperature"] = temperature
-            generate_kwargs["top_p"] = top_p
-            generate_kwargs["top_k"] = top_k
-
-        if isinstance(image, str):
-            image = load_resource(image)
-
-        image_obj = Image.open(BytesIO(image))
-        image_tensor = process_images([image_obj], self.image_processor, self.model.config)
-        image_tensor = image_tensor[0].unsqueeze(0)
-        image_tensor = image_tensor.to(device=self.model.device, dtype=self.model.dtype)
-        image_sizes = [[*image_obj.size]]
-
-        encoded_inputs = self.tokenizer(prompt, return_tensors="pt")
-        input_ids = encoded_inputs.input_ids.to(device=self.model.device)
-        attention_mask = encoded_inputs.attention_mask.to(device=self.model.device)
-
-        with torch.inference_mode():
-            output_ids = self.model.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                images=image_tensor,
-                image_sizes=image_sizes,
-                use_cache=True,
-                **generate_kwargs,
-                **kwargs,
-            )
-
-        # Remove the last token if it is the eos_token_id
-        if len(output_ids[0]) > 0 and output_ids[0, -1] == self.eos_token_id:
-            output_ids = output_ids[:, :-1]
-
-        output = self.tokenizer.batch_decode(
-            output_ids,
-            skip_special_tokens=False,
-        )[0].strip()
-
-        return output
-
-    def batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,  # not supported by hf
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        **kwargs,
-    ) -> List[str]:
-        if not isinstance(prompts, list):
-            prompts = [prompts] * len(images)
-
-        assert len(prompts) == len(images), "Length of prompts and images must match."
-
-        outputs = []
-        for prompt, image in tqdm(zip(prompts, images), total=len(images), desc="Predict"):
-            output = self.predict(
-                image,
-                prompt,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                repetition_penalty=repetition_penalty,
-                presence_penalty=presence_penalty,
-                no_repeat_ngram_size=no_repeat_ngram_size,
-                max_new_tokens=max_new_tokens,
-                **kwargs,
-            )
-            outputs.append(output)
-        return outputs
-
-    def stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> Iterable[str]:
-        raise NotImplementedError("Streaming is not supported yet.")
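For reference, the deleted transformers path was driven roughly as follows. This is a minimal sketch reconstructed from the 2.2.1 signatures above; the checkpoint path and image filename are placeholders, not names shipped with the package.

# Sketch against the 2.2.1 API deleted above; paths are hypothetical placeholders.
from mineru.backend.vlm.hf_predictor import HuggingfacePredictor  # removed in 2.5.0

predictor = HuggingfacePredictor(
    model_path="/models/mineru2",   # placeholder local checkpoint
    device="cuda",
    load_in_4bit=True,              # optional nf4 quantization branch in __init__
)

with open("page_0.png", "rb") as f:  # placeholder page image
    page_bytes = f.read()

# predict() accepts raw bytes, or a str path/URL routed through load_resource().
print(predictor.predict(page_bytes, prompt=""))

Note that batch_predict() simply looped predict() over the inputs one at a time, so this backend offered no true batched inference, and stream_predict() was never implemented.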
mineru/backend/vlm/predictor.py
DELETED

@@ -1,111 +0,0 @@
-# Copyright (c) Opendatalab. All rights reserved.
-
-import time
-
-from loguru import logger
-
-from .base_predictor import (
-    DEFAULT_MAX_NEW_TOKENS,
-    DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    DEFAULT_PRESENCE_PENALTY,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_K,
-    DEFAULT_TOP_P,
-    BasePredictor,
-)
-from .sglang_client_predictor import SglangClientPredictor
-
-hf_loaded = False
-try:
-    from .hf_predictor import HuggingfacePredictor
-
-    hf_loaded = True
-except ImportError as e:
-    logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
-
-engine_loaded = False
-try:
-    from sglang.srt.server_args import ServerArgs
-
-    from .sglang_engine_predictor import SglangEnginePredictor
-
-    engine_loaded = True
-except Exception as e:
-    logger.warning("sglang is not installed. If you are not using sglang, you can ignore this warning.")
-
-
-def get_predictor(
-    backend: str = "sglang-client",
-    model_path: str | None = None,
-    server_url: str | None = None,
-    temperature: float = DEFAULT_TEMPERATURE,
-    top_p: float = DEFAULT_TOP_P,
-    top_k: int = DEFAULT_TOP_K,
-    repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-    presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-    no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-    http_timeout: int = 600,
-    **kwargs,
-) -> BasePredictor:
-    start_time = time.time()
-
-    if backend == "transformers":
-        if not model_path:
-            raise ValueError("model_path must be provided for transformers backend.")
-        if not hf_loaded:
-            raise ImportError(
-                "transformers is not installed, so huggingface backend cannot be used. "
-                "If you need to use huggingface backend, please install transformers first."
-            )
-        predictor = HuggingfacePredictor(
-            model_path=model_path,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-            **kwargs,
-        )
-    elif backend == "sglang-engine":
-        if not model_path:
-            raise ValueError("model_path must be provided for sglang-engine backend.")
-        if not engine_loaded:
-            raise ImportError(
-                "sglang is not installed, so sglang-engine backend cannot be used. "
-                "If you need to use sglang-engine backend for inference, "
-                "please install sglang[all]==0.4.8 or a newer version."
-            )
-        predictor = SglangEnginePredictor(
-            server_args=ServerArgs(model_path, **kwargs),
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-    elif backend == "sglang-client":
-        if not server_url:
-            raise ValueError("server_url must be provided for sglang-client backend.")
-        predictor = SglangClientPredictor(
-            server_url=server_url,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-            http_timeout=http_timeout,
-        )
-    else:
-        raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
-
-    elapsed = round(time.time() - start_time, 2)
-    logger.info(f"get_predictor cost: {elapsed}s")
-    return predictor
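The deleted factory dispatched on a backend string; below is a minimal sketch of the three 2.2.1 call patterns, with a placeholder server URL and model path.

# Sketch of the three backends accepted by the removed get_predictor();
# the URL and model path are placeholders, not defaults shipped by mineru.
from mineru.backend.vlm.predictor import get_predictor  # removed in 2.5.0

# 1) HTTP client against an already-running sglang server (the default backend):
client = get_predictor(backend="sglang-client", server_url="http://127.0.0.1:30000")

# 2) In-process transformers inference (required the optional hf dependencies):
# local = get_predictor(backend="transformers", model_path="/models/mineru2")

# 3) In-process sglang engine (required sglang[all]==0.4.8 or newer):
# engine = get_predictor(backend="sglang-engine", model_path="/models/mineru2")

All three paths returned a BasePredictor; any other backend string raised ValueError. With the sglang modules gone in 2.5.0, the newly added mineru/model/vlm_vllm_model/server.py and mineru/cli/vlm_vllm_server.py listed in the summary above presumably take over the serving role.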