xinference 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +462 -3
- xinference/client/restful/async_restful_client.py +158 -5
- xinference/client/restful/restful_client.py +131 -0
- xinference/core/supervisor.py +12 -0
- xinference/model/audio/model_spec.json +20 -20
- xinference/model/image/model_spec.json +159 -159
- xinference/model/llm/__init__.py +2 -2
- xinference/model/llm/llm_family.json +843 -180
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +20 -6
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +129 -36
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/utils.py +23 -0
- xinference/model/llm/utils.py +37 -24
- xinference/model/llm/vllm/core.py +128 -69
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.45e78536.js} +3 -3
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/METADATA +7 -5
- {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/RECORD +36 -35
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/multimodal/minicpmv45.py ADDED
@@ -0,0 +1,340 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import torch
+from PIL import Image
+
+from .....core.model import register_batching_multimodal_models
+from .....model.utils import select_device
+from .....types import PytorchModelConfig
+from ....scheduler.request import InferenceRequest
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_batching_multimodal_models("MiniCPM-V-4.5")
+@register_transformer
+@register_non_default_model("MiniCPM-V-4.5")
+class MiniCPMV45Model(PytorchMultiModalModel):
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "MiniCPM-V-4.5".lower() in family.lower():
+            return True
+        return False
+
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        # Configure pixel parameters for MiniCPM-V-4.5
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
+    def decide_device(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+        self._device = (
+            "auto"
+            if self._device == "cuda" and self.quantization is None
+            else self._device
+        )
+
+    def load_processor(self):
+        from transformers import AutoProcessor, AutoTokenizer
+
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+        from transformers.generation import GenerationConfig
+
+        if "int4" in self.model_path:
+            model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
+        else:
+            kwargs = self.apply_bnb_quantization()
+            model = AutoModel.from_pretrained(
+                self.model_path,
+                trust_remote_code=True,
+                torch_dtype=torch.float16,
+                device_map=self._device,
+                **kwargs,
+            )
+        self._model = model.eval()
+        # Specify hyperparameters for generation
+        self._model.generation_config = GenerationConfig.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+        )
+        self._device = self._model.device
+
+    def _message_content_to_chat(self, content):
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
+            return frames
+
+        def _load_video(_url):
+            frames = None
+            if _url.startswith("data:"):
+                raise RuntimeError("Only video url format is supported")
+            else:
+                frames = encode_video(_url)
+            return frames
+
+        if not isinstance(content, str):
+            texts = []
+            image_urls = []
+            video_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_decode_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
+            text = " ".join(texts)
+            return text, images, frames
+        return content, [], []
+
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            video_existed = True
+            images_chat = video_frames
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        for h in chat_history or []:
+            images_history = []
+            role = h["role"]
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
+            if images_tmp != []:
+                images_history = images_tmp
+            if len(video_frames_h) > 0:
+                video_existed = True
+                images_history = video_frames_h
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        # Set decode params for video
+        params = {}
+        if video_existed:
+            params = {"use_image_id": False, "max_slice_nums": 1}
+        return dict(msgs=msgs, image=None, **params)
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(**generate_config)
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        config = self.build_generate_kwargs(generate_config)
+        chat_iter = self._model.chat(
+            **inputs, **config, tokenizer=self._tokenizer, sampling=True
+        )
+
+        return chat_iter, -1
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to MiniCPM-V-4.5 documentation for generation parameters
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Handle input IDs and images for MiniCPM-V-4.5
+        Based on MiniCPM-V-2.6 implementation with adaptations for 4.5
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from ..utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
+        )
+        self.handle_batch_inference_results(req_list)
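The new module above registers MiniCPM-V-4.5 as a Transformers multimodal model. As a quick orientation, the sketch below shows how such a model could be exercised through the RESTful client once a server is running; the endpoint URL, the launch_model arguments, and the messages-style chat signature are assumptions based on the public 1.x client and should be checked against your installed version.

# Sketch only: exercising the new MiniCPM-V-4.5 model via the RESTful client.
# Endpoint, launch arguments and response shape are assumptions to verify.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")          # assumed local endpoint
model_uid = client.launch_model(
    model_name="MiniCPM-V-4.5",                   # name registered by the decorators above
    model_engine="transformers",                  # routes to this PytorchMultiModalModel
    model_format="pytorch",
)
model = client.get_model(model_uid)

# The content list below ("text" plus "image_url", or a single "video_url") is
# the shape that _message_content_to_chat() consumes.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this picture."},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/cat.png"},
                },
            ],
        }
    ],
    generate_config={"max_tokens": 256, "temperature": 0.7},
)
print(response["choices"][0]["message"]["content"])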
xinference/model/llm/transformers/utils.py CHANGED
@@ -281,11 +281,34 @@ def _batch_inference_one_step_internal(
             r.append_new_token(token)
 
         if decode_reqs:
+            # Ensure all decode requests have the same kv_cache reference
+            # This prevents batch size mismatches during merging
             decode_kv = decode_reqs[0].kv_cache
+
+            # Verify that all decode requests share the same kv_cache
+            for req in decode_reqs[1:]:
+                if req.kv_cache is not decode_kv:
+                    logger.warning(
+                        "Inconsistent kv_cache references detected in decode requests. "
+                        "This may indicate a batching synchronization issue."
+                    )
+                    # Use the first decode_kv as the reference to maintain consistency
+                    req.kv_cache = decode_kv
+
             # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
             merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
+            # Update sequence length information after KV cache merge
+            _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
+                merged_kv_cache, xinf_model_obj
+            )
             for r in valid_req_list:
                 r.kv_cache = merged_kv_cache
+                # Update attention mask sequence length to match merged KV cache
+                if "attention_mask_seq_len" in r.extra_kwargs:
+                    # Ensure the attention mask length doesn't exceed the merged sequence length
+                    r.extra_kwargs["attention_mask_seq_len"] = min(
+                        r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
+                    )
             empty_cache()
         else:
             for r in valid_req_list:
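The hunk above clamps each request's `attention_mask_seq_len` to the sequence length of the merged KV cache. A standalone sketch follows (not the project's `merge_kv_cache`; the single-tensor shape and left-padding policy are simplifying assumptions) of why merging prefill and decode caches changes the length a mask may refer to.

# Illustration only: pad two simplified KV tensors on the sequence axis, merge
# them on the batch axis, and clamp a pre-merge attention-mask length the way
# the hunk above does. Shapes and the left-padding policy are assumptions.
import torch
import torch.nn.functional as F

def merge_kv(decode_k: torch.Tensor, prefill_k: torch.Tensor) -> torch.Tensor:
    # tensors are (batch, heads, seq, head_dim); pad the shorter seq on the left
    seq = max(decode_k.shape[2], prefill_k.shape[2])
    decode_k = F.pad(decode_k, (0, 0, seq - decode_k.shape[2], 0))
    prefill_k = F.pad(prefill_k, (0, 0, seq - prefill_k.shape[2], 0))
    return torch.cat([decode_k, prefill_k], dim=0)  # merge along batch

decode_k = torch.zeros(2, 8, 40, 64)   # 2 running requests, 40 cached positions
prefill_k = torch.zeros(1, 8, 25, 64)  # 1 freshly prefilled request, 25 positions
merged = merge_kv(decode_k, prefill_k)
merged_seq_len = merged.shape[2]       # 40

# A mask length recorded before the merge must not exceed the merged length
# minus the position reserved for the next generated token.
attention_mask_seq_len = min(40, merged_seq_len - 1)
print(merged.shape, attention_mask_seq_len)  # torch.Size([3, 8, 40, 64]) 39

The real merge handles per-layer (key, value) tuples and model-specific padding; the point here is only that the mask bookkeeping must follow the merged shape.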
xinference/model/llm/utils.py CHANGED
@@ -75,6 +75,8 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-VL-Thinking",
     "Qwen3-Next-Instruct",
     "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
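Adding the Qwen3-Omni entries to `QWEN_TOOL_CALL_FAMILY` is what enables Qwen-style `<tool_call>` prompting and parsing for those models. For orientation, a hedged example of an OpenAI-style tools request follows; the `get_weather` function is hypothetical, `model` stands for a chat handle obtained as in the earlier client sketch, and the chat signature should be verified against your version.

# Sketch only: an OpenAI-style tools request. `model` is a chat handle obtained
# as in the earlier example; the get_weather function is hypothetical.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

completion = model.chat(
    messages=[{"role": "user", "content": "What's the weather in Berlin?"}],
    tools=tools,
)
message = completion["choices"][0]["message"]
if message.get("tool_calls"):            # finish_reason == "tool_calls"
    call = message["tool_calls"][0]["function"]
    print(call["name"], call["arguments"])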
@@ -100,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -143,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
             messages = self.convert_messages_with_content_list_to_str_conversion(
                 messages
             )
@@ -186,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
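The reworded TypeError above fires when `chat_template_kwargs` is neither a dict nor a JSON-parsable string. Both accepted shapes are illustrated below; the `enable_thinking` key mirrors the hybrid-reasoning branch that follows, but treat the exact generate_config plumbing as an assumption to verify for your engine.

# Both forms below should pass the check above: a dict, or a JSON string that
# parses to one. Anything else raises the TypeError. Keys are illustrative.
generate_config_dict = {
    "max_tokens": 512,
    "chat_template_kwargs": {"enable_thinking": False},    # dict form
}
generate_config_str = {
    "max_tokens": 512,
    "chat_template_kwargs": '{"enable_thinking": false}',  # JSON string form
}

Either form ends up normalized into the keyword arguments handed to the chat template.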
@@ -853,11 +853,11 @@
             "tool_calls": tool_calls,
         }
 
-
-
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
             usage = None
+        else:
+            usage = c.get("usage")
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -882,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-
-        c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-
-            reasoning_content,
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
         failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
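The reworked chunk handling above extracts reasoning content first and only then looks for tool calls in the remaining text. A regex-based illustration of that order of operations follows; it is not the project's reasoning or tool parser, and the tag formats shown are assumptions.

# Illustration only (regex-based, not the project's parsers): strip the
# reasoning block first, then parse <tool_call> JSON from what remains.
import json
import re

raw = (
    "<think>The user wants weather data, call the tool.</think>\n"
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Berlin"}}</tool_call>'
)

reasoning_match = re.search(r"<think>(.*?)</think>", raw, re.DOTALL)
reasoning_content = reasoning_match.group(1).strip() if reasoning_match else None
processed = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()

tool_calls = [
    json.loads(m)
    for m in re.findall(r"<tool_call>(.*?)</tool_call>", processed, re.DOTALL)
]
print(reasoning_content)      # The user wants weather data, call the tool.
print(tool_calls[0]["name"])  # get_weather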
@@ -913,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if
-                    failed_contents.append(
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
             m["reasoning_content"] = reasoning_content
 
-
-
-
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,