xinference 1.6.0.post1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/client/restful/restful_client.py +1 -1
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +9 -8
- xinference/core/model.py +13 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/worker.py +0 -10
- xinference/model/audio/model_spec.json +53 -1
- xinference/model/audio/model_spec_modelscope.json +57 -1
- xinference/model/embedding/core.py +19 -11
- xinference/model/image/model_spec.json +10 -1
- xinference/model/image/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +455 -0
- xinference/model/llm/llm_family.json +185 -397
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +199 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +4 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/vllm/core.py +4 -0
- xinference/model/rerank/core.py +7 -1
- xinference/model/rerank/utils.py +17 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
- xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -1
- xinference/web/ui/src/locales/zh.json +3 -1
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/METADATA +6 -4
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/multimodal/intern_vl.py
@@ -0,0 +1,412 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import math
+from concurrent.futures import ThreadPoolExecutor
+from threading import Thread
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import torch
+
+from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_transformer
+@register_non_default_model("InternVL3")
+class InternVLChatModel(PytorchMultiModalModel):
+    IMAGENET_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_STD = (0.229, 0.224, 0.225)
+
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "internvl3" in family.lower():
+            return True
+        return False
+
+    def decide_device(self):
+        from transformers import AutoConfig
+
+        device_map = {}
+        world_size = torch.cuda.device_count()
+        # single gpu
+        if world_size == 1:
+            self._device = device_map
+            return
+        config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
+        num_layers = config.llm_config.num_hidden_layers
+
+        # Since the first GPU will be used for ViT, treat it as half a GPU.
+        num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+        num_layers_per_gpu = [num_layers_per_gpu] * world_size
+        num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+        layer_cnt = 0
+        for i, num_layer in enumerate(num_layers_per_gpu):
+            for j in range(num_layer):
+                device_map[f"language_model.model.layers.{layer_cnt}"] = i
+                layer_cnt += 1
+        device_map["vision_model"] = 0
+        device_map["mlp1"] = 0
+        device_map["language_model.model.tok_embeddings"] = 0
+        device_map["language_model.model.embed_tokens"] = 0
+        device_map["language_model.output"] = 0
+        device_map["language_model.model.norm"] = 0
+        device_map["language_model.lm_head"] = 0
+        device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
+        self._device = device_map
+
+    def load_processor(self):
+        from transformers import AutoTokenizer
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True, use_fast=False
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+
+        kwargs: Dict[str, Any] = {
+            "torch_dtype": torch.bfloat16,
+            "low_cpu_mem_usage": True,
+            "trust_remote_code": True,
+        }
+        if self._device:
+            kwargs["device_map"] = self._device
+        kwargs = self.apply_bnb_quantization(kwargs)
+
+        self._model = AutoModel.from_pretrained(self.model_path, **kwargs).eval()
+
+        if not self._device and "none" in self.quantization.lower():
+            self._model.cuda()
+
+    def _build_transform(self, input_size=448):
+        import torchvision.transforms as T
+        from torchvision.transforms.functional import InterpolationMode
+
+        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize(
+                    (input_size, input_size), interpolation=InterpolationMode.BICUBIC
+                ),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
+        return transform
+
+    # video multi-round conversation
+    @staticmethod
+    def _get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+        import numpy as np
+
+        if bound:
+            start, end = bound[0], bound[1]
+        else:
+            start, end = -100000, 100000
+        start_idx = max(first_idx, round(start * fps))
+        end_idx = min(round(end * fps), max_frame)
+        seg_size = float(end_idx - start_idx) / num_segments
+        frame_indices = np.array(
+            [
+                int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+                for idx in range(num_segments)
+            ]
+        )
+        return frame_indices
+
+    def _find_closest_aspect_ratio(
+        self, aspect_ratio, target_ratios, width, height, image_size
+    ):
+        best_ratio_diff = float("inf")
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def _dynamic_preprocess(
+        self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+    ):
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        # calculate the existing image aspect ratio
+        target_ratios = set(
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        # find the closest aspect ratio to the target
+        target_aspect_ratio = self._find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
+
+        # calculate the target width and height
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        processed_images = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size,
+            )
+            # split the image
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def _load_video(
+        self, video_path, bound=None, input_size=448, max_num=1, num_segments=32
+    ):
+        from decord import VideoReader, cpu
+        from PIL import Image
+
+        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+        max_frame = len(vr) - 1
+        fps = float(vr.get_avg_fps())
+
+        pixel_values_list, num_patches_list = [], []
+        transform = self._build_transform(input_size=input_size)
+        frame_indices = self._get_index(
+            bound, fps, max_frame, first_idx=0, num_segments=num_segments
+        )
+        for frame_index in frame_indices:
+            img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+            img = self._dynamic_preprocess(
+                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
+            pixel_values = [transform(tile) for tile in img]
+            pixel_values = torch.stack(pixel_values)
+            pixel_values = pixel_values.to(torch.bfloat16).cuda()
+            num_patches_list.append(pixel_values.shape[0])
+            pixel_values_list.append(pixel_values)
+        pixel_values = torch.cat(pixel_values_list)
+        return pixel_values, num_patches_list
+
+    def _message_content_to_intern(self, content, image_cnt):
+        if not isinstance(content, str):
+            texts = []
+            image_urls = []
+            video_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_decode_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            videos = []
+            for vid_url in video_urls:
+                videos.append(self._load_video(vid_url, num_segments=8, max_num=1))
+            prefix = ""
+            for i, _ in enumerate(images):
+                prefix += f"Image-{image_cnt + i + 1}: <image>\n\n"
+
+            if len(videos) > 0:
+                prefix = "".join(
+                    [f"Frame{i + 1}: <image>\n" for i in range(len(videos[0][1]))]
+                )
+
+            text = prefix + " ".join(texts)
+            return text, images, videos
+        return content, [], []
+
+    def _get_prompt_and_chat_history(
+        self,
+        prompt: Union[str, List[Dict]],
+        chat_history: Optional[List[Dict]] = None,
+    ):
+        # Convert openai history to intern vl history
+        images = []
+        videos = []
+        history = []
+        image_cnt = 0
+        for h1, h2 in zip(*[iter(chat_history or [])] * 2):
+            content1, img, vid = self._message_content_to_intern(
+                h1["content"], image_cnt
+            )
+            content2, _, _ = self._message_content_to_intern(h2["content"], image_cnt)
+            history.append([content1, content2])
+            images.extend(img)
+            image_cnt += len(img)
+            videos.extend(vid)
+
+        question, img, vid = self._message_content_to_intern(prompt, image_cnt)
+        images.extend(img)
+        videos.extend(vid)
+        return question, history, images, videos
+
+    def _load_image(self, image_file, input_size=448, max_num=12):
+        image = image_file.convert("RGB")
+        transform = self._build_transform(input_size=input_size)
+        images = self._dynamic_preprocess(
+            image, image_size=input_size, use_thumbnail=True, max_num=max_num
+        )
+        pixel_values = [transform(image) for image in images]
+        pixel_values = torch.stack(pixel_values)
+        return pixel_values
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        from .....thirdparty.internvl.conversation import get_conv_template
+
+        prompt, _, chat_history = parse_messages(messages)
+        content, history, images, videos = self._get_prompt_and_chat_history(
+            prompt, chat_history
+        )
+        num_patches_list = []
+        if len(images) == 1:
+            content = content.replace("Image-1: <image>\n\n", "<image>\n")
+            history = [
+                [item[0].replace("Image-1: <image>\n\n", "<image>\n"), item[1]]
+                for item in history
+            ]
+            pixel_values = (
+                self._load_image(images[-1], max_num=12).to(torch.bfloat16).cuda()
+            )
+            num_patches_list = (
+                [pixel_values.shape[0]] if pixel_values is not None else []
+            )
+        elif len(images) > 1:
+            pixel_values = [
+                self._load_image(img, max_num=12).to(torch.bfloat16).cuda()
+                for img in images
+            ]
+            num_patches_list = [values.size(0) for values in pixel_values]
+            pixel_values = torch.cat(pixel_values, dim=0)
+        else:
+            pixel_values = None
+
+        if len(videos) > 0:
+            pixel_values = videos[0][0]
+            num_patches_list = videos[0][1]
+
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+
+        IMG_START_TOKEN = "<img>"
+        IMG_END_TOKEN = "</img>"
+        IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+
+        img_context_token_id = self._tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self._model.img_context_token_id = img_context_token_id
+
+        template = get_conv_template(self._model.template)
+        template.system_message = self._model.system_message
+        eos_token_id = self._tokenizer.convert_tokens_to_ids(template.sep)
+
+        history = [] if history is None else history
+        for old_question, old_answer in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], content)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        for num_patches in num_patches_list:
+            image_tokens = (
+                IMG_START_TOKEN
+                + IMG_CONTEXT_TOKEN * self._model.num_image_token * num_patches
+                + IMG_END_TOKEN
+            )
+            query = query.replace("<image>", image_tokens, 1)
+
+        model_inputs = self._tokenizer(query, return_tensors="pt")
+        input_ids = model_inputs["input_ids"].cuda()
+        attention_mask = model_inputs["attention_mask"].cuda()
+
+        return {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "eos_token_id": eos_token_id,
+        }
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return {
+            "max_new_tokens": generate_config.get("max_tokens", 1024),
+            "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
+        }
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        from transformers import TextIteratorStreamer
+
+        # Initialize the streamer
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
+        )
+
+        configs = self.build_generate_kwargs(generate_config)
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        generate_kwargs = {**inputs, **configs, "streamer": streamer}
+        thread = Thread(
+            target=self._model.generate,
+            kwargs=generate_kwargs,
+        )
+        thread.start()
+        return streamer, len(inputs["input_ids"][0])
+
+    def check_conditions(self, new_text: str) -> Tuple[str, bool]:
+        if new_text == self._model.conv_template.sep:
+            return "", True
+        return new_text, False
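The new intern_vl.py above registers "InternVL3" on the shared PytorchMultiModalModel base and accepts OpenAI-style messages whose content parts use the "text", "image_url" and "video_url" types it parses in _message_content_to_intern. The following is a minimal client-side sketch of exercising that model; it assumes a locally running xinference server on port 9997 and the transformers engine, and exact client method signatures may differ between releases.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
# Launch the model registered by @register_non_default_model("InternVL3") above.
model_uid = client.launch_model(
    model_name="InternVL3",
    model_type="LLM",
    model_engine="transformers",  # engine choice is an assumption in this sketch
)
model = client.get_model(model_uid)
# Content parts mirror what _message_content_to_intern expects: text plus image_url.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])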
xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py}
@@ -1,4 +1,4 @@
-# Copyright 2022-
+# Copyright 2022-2025 XProbe Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,42 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple

 import torch
 from PIL import Image

-from
-from
-
-
-
-
-
-from
-from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import (
-    _decode_image,
-    generate_chat_completion,
-    generate_completion_chunk,
-    parse_messages,
-)
-from .core import PytorchChatModel, PytorchGenerateConfig
-from .utils import cache_clean
+from .....core.model import register_batching_multimodal_models
+from .....core.scheduler import InferenceRequest
+from .....model.utils import select_device
+from .....types import PytorchModelConfig
+from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel

 logger = logging.getLogger(__name__)


-
-
-
-
-        self._tokenizer = None
-        self._model = None
-        self._processor = None
-
+@register_batching_multimodal_models("MiniCPM-V-2.6")
+@register_transformer
+@register_non_default_model("MiniCPM-V-2.6")
+class Glm4VModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
@@ -66,15 +52,7 @@ class MiniCPMV26Model(PytorchChatModel):
         pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
         return pytorch_model_config

-    def
-        from transformers import AutoModel
-
-        return AutoModel
-
-    def load(self):
-        from transformers import AutoModel, AutoProcessor, AutoTokenizer
-        from transformers.generation import GenerationConfig
-
+    def decide_device(self):
         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
         self._device = (
@@ -83,15 +61,25 @@ class MiniCPMV26Model(PytorchChatModel):
             else self._device
         )

-
-
-                "Error: running int4 model with bitsandbytes on Mac is not supported right now."
-            )
-            exit()
+    def load_processor(self):
+        from transformers import AutoProcessor, AutoTokenizer

-
-
-
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+        from transformers.generation import GenerationConfig

         if "int4" in self.model_path:
             model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
@@ -104,27 +92,13 @@ class MiniCPMV26Model(PytorchChatModel):
                 device_map=self._device,
                 **kwargs,
             )
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path, trust_remote_code=True
-        )
         self._model = model.eval()
-        self._tokenizer = tokenizer
-
         # Specify hyperparameters for generation
         self._model.generation_config = GenerationConfig.from_pretrained(
             self.model_path,
             trust_remote_code=True,
         )
-        min_pixels = self._pytorch_model_config.get("min_pixels")
-        max_pixels = self._pytorch_model_config.get("max_pixels")
-        self._processor = AutoProcessor.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
         self._device = self._model.device
-        self._save_tensorizer()

     def _message_content_to_chat(self, content):
         MAX_NUM_FRAMES = 64
@@ -220,58 +194,37 @@ class MiniCPMV26Model(PytorchChatModel):
         msgs.append({"role": "user", "content": images_chat + [content]})
         return msgs, video_existed

-
-    def chat(
+    def build_inputs_from_messages(
         self,
         messages: List[Dict],
-        generate_config:
-    )
-        stream = generate_config.get("stream", False) if generate_config else False
+        generate_config: Dict,
+    ):
         msgs, video_existed = self._convert_to_specific_style(messages)
-
         # Set decode params for video
         params = {}
         if video_existed:
             params = {"use_image_id": False, "max_slice_nums": 1}
+        return dict(msgs=msgs, image=None, **params)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        completion_id = str(uuid.uuid1())
-        for new_text in chat:
-            yield generate_completion_chunk(
-                chunk_text=new_text,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=-1,
-                completion_tokens=-1,
-                total_tokens=-1,
-            )
-        yield generate_completion_chunk(
-            chunk_text=None,
-            finish_reason="stop",
-            chunk_id=completion_id,
-            model_uid=self.model_uid,
-            prompt_tokens=-1,
-            completion_tokens=-1,
-            total_tokens=-1,
-            has_choice=True,
-            has_content=False,
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(**generate_config)
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        config = self.build_generate_kwargs(generate_config)
+        chat_iter = self._model.chat(
+            **inputs, **config, tokenizer=self._tokenizer, sampling=True
         )

+        return chat_iter, -1
+
     def prepare_sanitize_generate_config(self, req: InferenceRequest):
         """
         Refer to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py
@@ -376,7 +329,7 @@ class MiniCPMV26Model(PytorchChatModel):
         because the specific inference process is performed by `self._model.llm`,
         not `self._model` itself
         """
-        from
+        from ..utils import batch_inference_one_step

         self.prepare_batch_inference(req_list)
         batch_inference_one_step(