xinference 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/supervisor.py +29 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +20 -0
- xinference/model/embedding/sentence_transformers/core.py +4 -4
- xinference/model/embedding/vllm/core.py +7 -1
- xinference/model/image/model_spec.json +2 -3
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +1 -0
- xinference/model/llm/llm_family.json +40 -20
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +1 -1
- xinference/model/llm/utils.py +127 -45
- xinference/model/llm/vllm/core.py +2 -61
- xinference/types.py +105 -2
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/METADATA +7 -3
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/RECORD +34 -26
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/tool_parsers/qwen_tool_parser.py
ADDED
@@ -0,0 +1,320 @@
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+from . import register_tool_parser
+from .abstract_tool_parser import ToolParser
+
+logger = logging.getLogger(__name__)
+
+
+@register_tool_parser("qwen")
+class QwenToolParser(ToolParser):
+    """
+    Tool parser implementation for Qwen model.
+
+    This parser handles the specific format used by Qwen for tool calls,
+    which uses XML-like tags for both thinking blocks and tool calls.
+
+    """
+
+    def __init__(self):
+        """
+        Initialize the Qwen tool parser.
+
+        Sets up the XML-like tokens and regex patterns used for parsing
+        Qwen model outputs containing thinking blocks and tool calls.
+        """
+        super().__init__()
+
+        # Sentinel tokens for streaming mode
+        self.think_start_token: str = "<think>"
+        self.think_end_token: str = "</think>"
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+
+        # Regex patterns for parsing different content types
+        self.think_regex = re.compile("<think>(.*?)</think>", re.DOTALL)
+        self.content_regex = r"(<(think|tool_call)>.*?</\2>)"
+        self.tool_call_complete_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>", re.DOTALL
+        )
+        self.tool_call_regex = re.compile(
+            r"<tool_call>.*?</tool_call>|<tool_call>.*?$", re.DOTALL
+        )
+
+    def _parse_json_function_call(
+        self,
+        function_call_str: str,
+    ) -> str:
+        """
+        Parse JSON function call from string.
+
+        Extracts the JSON content from tool_call XML tags.
+
+        Args:
+            function_call_str (str): The function call string to parse.
+
+        Returns:
+            str: Extracted JSON string or original string if no match found.
+        """
+        function_calls = self.tool_call_complete_regex.findall(function_call_str)
+        if len(function_calls) == 0:
+            return function_call_str
+        return function_calls[-1]
+
+    def _parse_json_function_call_stream(
+        self,
+        function_call_str: str,
+    ) -> Optional[str]:
+        """
+        Parse JSON function call from streaming string.
+
+        Extracts the JSON content from tool_call XML tags in streaming context.
+
+        Args:
+            function_call_str (str): The function call string to parse.
+
+        Returns:
+            Optional[str]: Extracted JSON string or None if no complete match found.
+        """
+        function_calls = self.tool_call_complete_regex.findall(function_call_str)
+        if len(function_calls) == 0:
+            return None
+        return function_calls[-1]
+
+    def is_contain_think_end_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the think end token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if think end token is present.
+        """
+        return self.think_end_token in model_output
+
+    def is_contain_think(self, model_output: str) -> bool:
+        """
+        Check if the model output contains complete thinking blocks.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if complete thinking blocks are present.
+        """
+        return self.think_regex.search(model_output) is not None
+
+    def is_contain_tool_call(self, model_output: str) -> bool:
+        """
+        Check if the model output contains complete tool calls.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if complete tool calls are present.
+        """
+        return self.tool_call_complete_regex.search(model_output) is not None
+
+    def is_contain_tool_call_start_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the tool call start token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if tool call start token is present.
+        """
+        return self.tool_call_start_token in model_output
+
+    def is_contain_tool_call_end_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the tool call end token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if tool call end token is present.
+        """
+        return self.tool_call_end_token in model_output
+
+    def _get_function_calls(self, model_output: str) -> List[str]:
+        """
+        Extract all function calls and content blocks from model output.
+
+        Parses the model output to separate thinking blocks, tool calls,
+        and regular content into individual components.
+
+        Args:
+            model_output (str): The complete model output to parse.
+
+        Returns:
+            List[str]: List of content blocks (text, thinking blocks, tool calls).
+        """
+        functions_calls = []
+        last_end = 0
+        for m in re.finditer(self.content_regex, model_output, re.DOTALL):
+            # Add any text before the current match
+            if m.start() > last_end:
+                functions_calls.append(model_output[last_end : m.start()])
+            # Add the matched content (think or tool_call block)
+            functions_calls.append(m.group(0))
+            last_end = m.end()
+        # Add any remaining text after the last match
+        if last_end < len(model_output):
+            functions_calls.append(model_output[last_end:])
+        return functions_calls
+
+    def _get_function_calls_streaming(self, model_output: str) -> List[str]:
+        """
+        Extract function calls from streaming model output.
+
+        Finds both complete and incomplete tool calls in streaming context.
+
+        Args:
+            model_output (str): The streaming model output to parse.
+
+        Returns:
+            List[str]: List of tool call blocks (complete or incomplete).
+        """
+        matched_ranges = self.tool_call_regex.findall(model_output)
+        return matched_ranges
+
+    def extract_tool_calls(
+        self, model_output: str
+    ) -> List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """
+        Extract tool calls from complete model output.
+
+        Parses the model output to find tool calls and thinking blocks,
+        extracting function names and arguments from JSON content within
+        tool_call XML tags.
+
+        Args:
+            model_output (str): The complete output string from the model.
+
+        Returns:
+            List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+                A list of tuples where each tuple contains:
+                - content (str or None): Raw content if parsing failed, None if successful
+                - function_name (str or None): Name of the function to call
+                - arguments (dict or None): Function arguments
+
+        Example:
+            >>> parser = QwenToolParser()
+            >>> output = '<tool_call>\n{"name": "get_weather", "arguments": {"location": "Beijing"}}\n</tool_call>'
+            >>> result = parser.extract_tool_calls(output)
+            >>> print(result)
+            [(None, 'get_weather', {'location': 'Beijing'})]
+        """
+        # If no tool call tokens, return original output as content
+        if self.tool_call_start_token not in model_output:
+            return [(model_output, None, None)]
+
+        try:
+            function_calls = self._get_function_calls(model_output)
+            if len(function_calls) == 0:
+                return [(model_output, None, None)]
+
+            results: List[
+                Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]
+            ] = []
+            for function_call in function_calls:
+                try:
+                    parsed_json = self._parse_json_function_call(function_call)
+                    res = json.loads(parsed_json, strict=False)
+                    results.append((None, res["name"], res["arguments"]))
+                except Exception as e:
+                    logger.error(
+                        "Can't parse single qwen tool call output: %s. Error: %s",
+                        function_call,
+                        e,
+                    )
+                    results.append((function_call, None, None))
+            return results
+
+        except Exception as e:
+            logger.error(
+                "Can't parse qwen tool call output: %s. Error: %s",
+                model_output,
+                e,
+            )
+            return [(model_output, None, None)]
+
+    def _has_unclosed_tool_call(self, text: str) -> bool:
+        """
+        Check if the text has unclosed tool_call tags.
+
+        Counts the number of opening and closing tool_call tags to determine
+        if there are any unclosed tool calls in the text.
+
+        Args:
+            text (str): The text to check for unclosed tags.
+
+        Returns:
+            bool: True if there are unclosed tool_call tags.
+        """
+        if not text:
+            return True
+        start_count = text.count(self.tool_call_start_token)
+        end_count = text.count(self.tool_call_end_token)
+        return start_count > end_count
+
+    def extract_tool_calls_streaming(
+        self, previous_text: List[str], current_text: str, delta_text: str
+    ) -> Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """
+        Extract tool calls from streaming output.
+
+        Processes streaming model output to detect and extract tool calls
+        as they are being generated. Handles incomplete tool calls and
+        determines when a complete tool call is available.
+
+        Args:
+            previous_text (List[str]): Previous text chunks from the stream.
+            current_text (str): Current accumulated text.
+            delta_text (str): New text delta in this chunk.
+
+        Returns:
+            Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+                A tuple containing:
+                - content (str or None): Text content or None for tool calls
+                - function_name (str or None): Name of the function to call
+                - arguments (dict or None): Function arguments
+                Returns None if no complete tool call is ready.
+
+        Note:
+            This method is designed to work with Qwen's streaming output format
+            and handles partial tool calls during generation.
+        """
+        try:
+            # Check if current output contains tool_call start token
+            if self.is_contain_tool_call_start_token(current_text):
+                function_calls = self._get_function_calls_streaming(current_text)
+                # If the last function call contains thinking, it's not a tool call
+                if self.is_contain_think(function_calls[-1]):
+                    return None
+                # If the previous round's tool_call tags are closed, this is a new tool call
+                if not self._has_unclosed_tool_call(previous_text[-1]):
+                    return None
+                # Parse and return
+                function_call = self._parse_json_function_call_stream(
+                    function_calls[-1]
+                )
+                if function_call is None:
+                    return None
+                res = json.loads(function_call, strict=False)
+                return None, res["name"], res["arguments"]
+            else:
+                # Return delta text as regular content
+                return (delta_text, None, None)

        except Exception as e:
            logger.error("Error in Qwen streaming tool call extraction: %s", e)
            raise
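The parser keys entirely off the <tool_call>...</tool_call> envelope and the JSON payload inside it. Below is a minimal standalone sketch of that extraction step, using the same pattern as tool_call_complete_regex above on a hand-written sample output (no xinference imports; the sample string is illustrative):

    import json
    import re

    # Same pattern as QwenToolParser.tool_call_complete_regex above.
    tool_call_complete_regex = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)

    sample = (
        "Let me look that up.\n"
        "<tool_call>\n"
        '{"name": "get_weather", "arguments": {"location": "Beijing"}}\n'
        "</tool_call>"
    )

    for payload in tool_call_complete_regex.findall(sample):
        call = json.loads(payload, strict=False)
        print(call["name"], call["arguments"])
    # prints: get_weather {'location': 'Beijing'}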
xinference/model/llm/transformers/core.py
CHANGED
@@ -332,6 +332,7 @@ class PytorchModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         logger.debug("Loading Transformers model with kwargs: %s", kwargs)

@@ -983,7 +984,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
                     self.model_family,
                     self.model_uid,
                     req.completion[0],
-                    self.reasoning_parser,
                 )
             else:
                 req.completion[0] = self._to_chat_completion(
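Both the Transformers and vLLM backends now call self.prepare_parse_tool_calls() right after the reasoning-content setup, so the tool parser is resolved once at load time. The registry itself lives in the new tool_parsers/__init__.py, which is not shown here; the following is only a plausible sketch of the decorator pattern it implies. The _TOOL_PARSERS dict and get_tool_parser helper are assumptions; only the register_tool_parser name appears in the diff:

    from typing import Callable, Dict, Type

    # Hypothetical registry; only the decorator name is confirmed by the diff.
    _TOOL_PARSERS: Dict[str, Type] = {}

    def register_tool_parser(name: str) -> Callable[[Type], Type]:
        def wrapper(cls: Type) -> Type:
            _TOOL_PARSERS[name] = cls  # e.g. "qwen" -> QwenToolParser
            return cls
        return wrapper

    def get_tool_parser(name: str):
        # Hypothetical lookup, as something like prepare_parse_tool_calls() might do.
        return _TOOL_PARSERS[name]()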
xinference/model/llm/utils.py
CHANGED
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser

 logger = logging.getLogger(__name__)

@@ -95,6 +96,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]


 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
@@ -590,16 +598,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-
-
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
             except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results

     @classmethod
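The new validation matters because json.loads happily accepts payloads that are valid JSON but not valid tool calls: a bare list, a string, or a dict without "name"/"arguments". A quick illustration of the three paths the rewritten block distinguishes:

    import json

    for content in (
        '{"name": "get_weather", "arguments": {"location": "Beijing"}}',  # ok
        '["not", "a", "tool", "call"]',   # valid JSON, but not a dict
        '{"name": "get_weather"}',        # a dict, but missing "arguments"
    ):
        res = json.loads(content, strict=False)
        if isinstance(res, dict) and "name" in res and "arguments" in res:
            print("tool call:", res["name"])
        else:
            print("fallback to plain content:", content)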
@@ -757,47 +790,60 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result

-    @classmethod
     def _post_process_completion_chunk(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-        reasoning_parser: Optional[ReasoningParser] = None,
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+            if tool_result is None and not finish_reason:
+                return None
         tool_calls = []
         failed_contents = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)

-
+        finish_reason = "tool_calls" if tool_calls else finish_reason

-
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None

         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }

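_post_process_completion_chunk is now an instance method that threads stream state through the mutable previous_texts list: each call appends the new delta to previous_texts[-1], so the parser always sees the full accumulated text. A toy driver showing that accumulation pattern in isolation (the chunk dicts are hand-made stand-ins for real completion chunks):

    previous_texts = [""]
    chunks = [
        {"choices": [{"delta": {"content": "<tool_call>"}, "finish_reason": None}]},
        {"choices": [{"delta": {"content": '{"name": "f", "arguments": {}}'}, "finish_reason": None}]},
        {"choices": [{"delta": {"content": "</tool_call>"}, "finish_reason": "stop"}]},
    ]
    for c in chunks:
        delta_text = c["choices"][0]["delta"]["content"]
        current_text = previous_texts[-1] + delta_text
        previous_texts[-1] = current_text  # same in-place update as in the diff
    print(previous_texts[-1])
    # <tool_call>{"name": "f", "arguments": {}}</tool_call>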
@@ -826,29 +872,32 @@ class ChatModelMixin:
             "usage": usage,
         }

-    @classmethod
     def _post_process_completion(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if reasoning_parser:
-            c = reasoning_parser.prepare_reasoning_content(c)
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content

-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
@@ -868,14 +917,9 @@ class ChatModelMixin:

         content = "".join(failed_contents) if failed_contents else None

-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
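Each (content, func, args) tuple from the parser is folded into an OpenAI-style tool_calls entry, with failed parses collected back into plain content. A condensed, self-contained sketch of that fold, mirroring the loop and message assembly above (the tool_result list is a made-up sample):

    import json
    import uuid

    tool_result = [
        (None, "get_weather", {"location": "Beijing"}),  # parsed tool call
        ("stray text", None, None),                      # failed parse
    ]
    tool_calls, failed_contents = [], []
    for content, func, args in tool_result:
        if func:
            tool_calls.append({
                "id": f"call_{uuid.uuid4()}",
                "type": "function",
                "function": {"name": func, "arguments": json.dumps(args, ensure_ascii=False)},
            })
        else:
            failed_contents.append(content)
    message = {
        "role": "assistant",
        "content": "".join(failed_contents) if failed_contents else "",
        "tool_calls": tool_calls,
    }
    print(message)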
@@ -943,6 +987,44 @@ class ChatModelMixin:

         return transformed_messages

+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+

 def get_model_version(
     model_name: str,
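_async_to_tool_completion_chunks now lives on ChatModelMixin, so every backend shares one streaming tool-call path: reasoning deltas are yielded through untouched, everything else goes to _post_process_completion_chunk, and None results (incomplete tool calls) are silently skipped. A minimal async sketch of that yield-or-skip pattern, with toy stand-ins for the stream and the post-processor:

    import asyncio
    from typing import AsyncGenerator, Optional

    async def deltas() -> AsyncGenerator[str, None]:
        # Stand-in for a real CompletionChunk stream.
        for piece in ("<tool_call>", '{"name": "f", "arguments": {}}', "</tool_call>"):
            yield piece

    def post_process(text: str) -> Optional[str]:
        # Returns None until the tool call is complete, like the real post-processor.
        return text if text.endswith("</tool_call>") else None

    async def stream() -> AsyncGenerator[str, None]:
        acc = ""
        async for delta in deltas():
            acc += delta
            processed = post_process(acc)
            if processed:  # None (incomplete tool call) is skipped, not yielded
                yield processed

    async def main():
        async for out in stream():
            print(out)

    asyncio.run(main())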
xinference/model/llm/vllm/core.py
CHANGED
@@ -393,6 +393,7 @@ class VLLMModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -773,7 +774,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()

         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
         guided_json_object = None
         guided_json = None

@@ -784,8 +784,6 @@ class VLLMModel(LLM):
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"

         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -833,10 +831,6 @@ class VLLMModel(LLM):
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )

         return sanitized

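Dropping guided_decoding_backend (and its "outlines" default) leaves the backend choice to vLLM itself; the sanitizer still maps response_format onto vLLM's guided-decoding options. A compressed sketch of the remaining mapping, following the visible code: VLLMGenerateConfig is modeled as a plain dict here, and the json_object branch is inferred from context rather than shown in the hunks above:

    from typing import Any, Dict, Optional

    def sanitize(generate_config: Dict[str, Any]) -> Dict[str, Any]:
        sanitized: Dict[str, Any] = {}
        response_format = generate_config.pop("response_format", None)
        guided_json_object: Optional[bool] = None
        guided_json: Optional[Dict[str, Any]] = None
        if response_format:
            if response_format.get("type") == "json_object":
                guided_json_object = True  # inferred branch, not shown in the diff
            elif response_format.get("type") == "json_schema":
                json_schema = response_format.get("json_schema")
                assert json_schema is not None
                guided_json = json_schema.get("json_schema")
        sanitized.setdefault("guided_json", generate_config.get("guided_json", guided_json))
        sanitized.setdefault(
            "guided_json_object", generate_config.get("guided_json_object", guided_json_object)
        )
        return sanitized

    print(sanitize({"response_format": {"type": "json_object"}}))
    # {'guided_json': None, 'guided_json_object': True}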
@@ -1291,59 +1285,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):

         return processed_messages

-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-
     @vllm_check
     async def async_chat(
         self,
@@ -1408,7 +1349,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)