xinference 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference was flagged as possibly problematic by the registry's scanner.

Files changed (34)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/supervisor.py +29 -1
  5. xinference/model/audio/core.py +5 -0
  6. xinference/model/audio/kokoro.py +1 -1
  7. xinference/model/audio/kokoro_zh.py +124 -0
  8. xinference/model/audio/model_spec.json +20 -0
  9. xinference/model/embedding/sentence_transformers/core.py +4 -4
  10. xinference/model/embedding/vllm/core.py +7 -1
  11. xinference/model/image/model_spec.json +2 -3
  12. xinference/model/llm/core.py +10 -0
  13. xinference/model/llm/llama_cpp/core.py +1 -0
  14. xinference/model/llm/llm_family.json +40 -20
  15. xinference/model/llm/llm_family.py +1 -0
  16. xinference/model/llm/mlx/core.py +52 -33
  17. xinference/model/llm/sglang/core.py +2 -44
  18. xinference/model/llm/tool_parsers/__init__.py +58 -0
  19. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  20. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  21. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  22. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  23. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  24. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  25. xinference/model/llm/transformers/core.py +1 -1
  26. xinference/model/llm/utils.py +127 -45
  27. xinference/model/llm/vllm/core.py +2 -61
  28. xinference/types.py +105 -2
  29. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/METADATA +7 -3
  30. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/RECORD +34 -26
  31. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  32. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  33. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  34. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+from . import register_tool_parser
+from .abstract_tool_parser import ToolParser
+
+logger = logging.getLogger(__name__)
+
+
+@register_tool_parser("qwen")
+class QwenToolParser(ToolParser):
+    """
+    Tool parser implementation for Qwen model.
+
+    This parser handles the specific format used by Qwen for tool calls,
+    which uses XML-like tags for both thinking blocks and tool calls.
+
+    """
+
+    def __init__(self):
+        """
+        Initialize the Qwen tool parser.
+
+        Sets up the XML-like tokens and regex patterns used for parsing
+        Qwen model outputs containing thinking blocks and tool calls.
+        """
+        super().__init__()
+
+        # Sentinel tokens for streaming mode
+        self.think_start_token: str = "<think>"
+        self.think_end_token: str = "</think>"
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+
+        # Regex patterns for parsing different content types
+        self.think_regex = re.compile("<think>(.*?)</think>", re.DOTALL)
+        self.content_regex = r"(<(think|tool_call)>.*?</\2>)"
+        self.tool_call_complete_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>", re.DOTALL
+        )
+        self.tool_call_regex = re.compile(
+            r"<tool_call>.*?</tool_call>|<tool_call>.*?$", re.DOTALL
+        )
+
+    def _parse_json_function_call(
+        self,
+        function_call_str: str,
+    ) -> str:
+        """
+        Parse JSON function call from string.
+
+        Extracts the JSON content from tool_call XML tags.
+
+        Args:
+            function_call_str (str): The function call string to parse.
+
+        Returns:
+            str: Extracted JSON string or original string if no match found.
+        """
+        function_calls = self.tool_call_complete_regex.findall(function_call_str)
+        if len(function_calls) == 0:
+            return function_call_str
+        return function_calls[-1]
+
+    def _parse_json_function_call_stream(
+        self,
+        function_call_str: str,
+    ) -> Optional[str]:
+        """
+        Parse JSON function call from streaming string.
+
+        Extracts the JSON content from tool_call XML tags in streaming context.
+
+        Args:
+            function_call_str (str): The function call string to parse.
+
+        Returns:
+            Optional[str]: Extracted JSON string or None if no complete match found.
+        """
+        function_calls = self.tool_call_complete_regex.findall(function_call_str)
+        if len(function_calls) == 0:
+            return None
+        return function_calls[-1]
+
+    def is_contain_think_end_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the think end token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if think end token is present.
+        """
+        return self.think_end_token in model_output
+
+    def is_contain_think(self, model_output: str) -> bool:
+        """
+        Check if the model output contains complete thinking blocks.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if complete thinking blocks are present.
+        """
+        return self.think_regex.search(model_output) is not None
+
+    def is_contain_tool_call(self, model_output: str) -> bool:
+        """
+        Check if the model output contains complete tool calls.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if complete tool calls are present.
+        """
+        return self.tool_call_complete_regex.search(model_output) is not None
+
+    def is_contain_tool_call_start_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the tool call start token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if tool call start token is present.
+        """
+        return self.tool_call_start_token in model_output
+
+    def is_contain_tool_call_end_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the tool call end token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if tool call end token is present.
+        """
+        return self.tool_call_end_token in model_output
+
+    def _get_function_calls(self, model_output: str) -> List[str]:
+        """
+        Extract all function calls and content blocks from model output.
+
+        Parses the model output to separate thinking blocks, tool calls,
+        and regular content into individual components.
+
+        Args:
+            model_output (str): The complete model output to parse.
+
+        Returns:
+            List[str]: List of content blocks (text, thinking blocks, tool calls).
+        """
+        functions_calls = []
+        last_end = 0
+        for m in re.finditer(self.content_regex, model_output, re.DOTALL):
+            # Add any text before the current match
+            if m.start() > last_end:
+                functions_calls.append(model_output[last_end : m.start()])
+            # Add the matched content (think or tool_call block)
+            functions_calls.append(m.group(0))
+            last_end = m.end()
+        # Add any remaining text after the last match
+        if last_end < len(model_output):
+            functions_calls.append(model_output[last_end:])
+        return functions_calls
+
+    def _get_function_calls_streaming(self, model_output: str) -> List[str]:
+        """
+        Extract function calls from streaming model output.
+
+        Finds both complete and incomplete tool calls in streaming context.
+
+        Args:
+            model_output (str): The streaming model output to parse.
+
+        Returns:
+            List[str]: List of tool call blocks (complete or incomplete).
+        """
+        matched_ranges = self.tool_call_regex.findall(model_output)
+        return matched_ranges
+
+    def extract_tool_calls(
+        self, model_output: str
+    ) -> List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """
+        Extract tool calls from complete model output.
+
+        Parses the model output to find tool calls and thinking blocks,
+        extracting function names and arguments from JSON content within
+        tool_call XML tags.
+
+        Args:
+            model_output (str): The complete output string from the model.
+
+        Returns:
+            List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+                A list of tuples where each tuple contains:
+                - content (str or None): Raw content if parsing failed, None if successful
+                - function_name (str or None): Name of the function to call
+                - arguments (dict or None): Function arguments
+
+        Example:
+            >>> parser = QwenToolParser()
+            >>> output = '<tool_call>\n{"name": "get_weather", "arguments": {"location": "Beijing"}}\n</tool_call>'
+            >>> result = parser.extract_tool_calls(output)
+            >>> print(result)
+            [(None, 'get_weather', {'location': 'Beijing'})]
+        """
+        # If no tool call tokens, return original output as content
+        if self.tool_call_start_token not in model_output:
+            return [(model_output, None, None)]
+
+        try:
+            function_calls = self._get_function_calls(model_output)
+            if len(function_calls) == 0:
+                return [(model_output, None, None)]
+
+            results: List[
+                Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]
+            ] = []
+            for function_call in function_calls:
+                try:
+                    parsed_json = self._parse_json_function_call(function_call)
+                    res = json.loads(parsed_json, strict=False)
+                    results.append((None, res["name"], res["arguments"]))
+                except Exception as e:
+                    logger.error(
+                        "Can't parse single qwen tool call output: %s. Error: %s",
+                        function_call,
+                        e,
+                    )
+                    results.append((function_call, None, None))
+            return results
+
+        except Exception as e:
+            logger.error(
+                "Can't parse qwen tool call output: %s. Error: %s",
+                model_output,
+                e,
+            )
+            return [(model_output, None, None)]
+
+    def _has_unclosed_tool_call(self, text: str) -> bool:
+        """
+        Check if the text has unclosed tool_call tags.
+
+        Counts the number of opening and closing tool_call tags to determine
+        if there are any unclosed tool calls in the text.
+
+        Args:
+            text (str): The text to check for unclosed tags.
+
+        Returns:
+            bool: True if there are unclosed tool_call tags.
+        """
+        if not text:
+            return True
+        start_count = text.count(self.tool_call_start_token)
+        end_count = text.count(self.tool_call_end_token)
+        return start_count > end_count
+
+    def extract_tool_calls_streaming(
+        self, previous_text: List[str], current_text: str, delta_text: str
+    ) -> Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """
+        Extract tool calls from streaming output.
+
+        Processes streaming model output to detect and extract tool calls
+        as they are being generated. Handles incomplete tool calls and
+        determines when a complete tool call is available.
+
+        Args:
+            previous_text (List[str]): Previous text chunks from the stream.
+            current_text (str): Current accumulated text.
+            delta_text (str): New text delta in this chunk.
+
+        Returns:
+            Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+                A tuple containing:
+                - content (str or None): Text content or None for tool calls
+                - function_name (str or None): Name of the function to call
+                - arguments (dict or None): Function arguments
+                Returns None if no complete tool call is ready.
+
+        Note:
+            This method is designed to work with Qwen's streaming output format
+            and handles partial tool calls during generation.
+        """
+        try:
+            # Check if current output contains tool_call start token
+            if self.is_contain_tool_call_start_token(current_text):
+                function_calls = self._get_function_calls_streaming(current_text)
+                # If the last function call contains thinking, it's not a tool call
+                if self.is_contain_think(function_calls[-1]):
+                    return None
+                # If the previous round's tool_call tags are closed, this is a new tool call
+                if not self._has_unclosed_tool_call(previous_text[-1]):
+                    return None
+                # Parse and return
+                function_call = self._parse_json_function_call_stream(
+                    function_calls[-1]
+                )
+                if function_call is None:
+                    return None
+                res = json.loads(function_call, strict=False)
+                return None, res["name"], res["arguments"]
+            else:
+                # Return delta text as regular content
+                return (delta_text, None, None)
+
+        except Exception as e:
+            logger.error("Error in Qwen streaming tool call extraction: %s", e)
+            raise
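
The `Example` docstring above shows the intended call pattern. Below is a slightly fuller, self-contained sketch of the non-streaming API on a mixed Qwen output; it assumes the import path matches the file added in this release (`xinference/model/llm/tool_parsers/qwen_tool_parser.py`) and relies only on the `extract_tool_calls` behavior visible in this hunk:

```python
from xinference.model.llm.tool_parsers.qwen_tool_parser import QwenToolParser

parser = QwenToolParser()

# Mixed output: free text, a <think> block, and one <tool_call> block.
output = (
    "Let me check that for you."
    "<think>The user wants the weather in Beijing.</think>"
    '<tool_call>\n{"name": "get_weather", "arguments": {"location": "Beijing"}}\n</tool_call>'
)

for content, func, args in parser.extract_tool_calls(output):
    if func:
        print("tool call:", func, args)  # -> tool call: get_weather {'location': 'Beijing'}
    else:
        # Non-JSON segments (plain text and <think> blocks) come back as
        # content tuples; the parser also logs a parse error for each one.
        print("content:", content)
```

Note that `extract_tool_calls` returns `[(model_output, None, None)]` unchanged when no `<tool_call>` token is present, so callers can handle tool and non-tool outputs uniformly.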
@@ -332,6 +332,7 @@ class PytorchModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         logger.debug("Loading Transformers model with kwargs: %s", kwargs)

@@ -983,7 +984,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
                 self.model_family,
                 self.model_uid,
                 req.completion[0],
-                self.reasoning_parser,
             )
         else:
             req.completion[0] = self._to_chat_completion(
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser

 logger = logging.getLogger(__name__)

@@ -95,6 +96,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]


 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
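
The `@register_tool_parser("qwen")` decorator used in the new parser file comes from the new `tool_parsers/__init__.py` (item 18 in the file list), which is not shown in this view. A minimal registry of that decorator-plus-lookup shape might look like the following sketch; the `_TOOL_PARSERS` dict and `get_tool_parser` helper are hypothetical names, not the package's confirmed API:

```python
from typing import Callable, Dict, Type

# Hypothetical module-level registry: family name -> parser class.
_TOOL_PARSERS: Dict[str, Type] = {}

def register_tool_parser(name: str) -> Callable[[type], type]:
    """Class decorator that records a parser class under a model-family name."""
    def wrapper(cls: type) -> type:
        _TOOL_PARSERS[name] = cls
        return cls
    return wrapper

def get_tool_parser(name: str):
    """Hypothetical helper: instantiate the parser registered for `name`."""
    return _TOOL_PARSERS[name]()
```

A registry like this would let the `prepare_parse_tool_calls()` hooks wired up in the surrounding hunks resolve a parser by model family at load time, leaving `self.tool_parser = None` until then.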
@@ -590,16 +598,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-                results.append((None, res["name"], res["arguments"]))
-            except Exception as e:
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
+            except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results

     @classmethod
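
The rewritten block above splits what used to be a single `except Exception` into three explicit outcomes: a well-formed dict becomes a tool call, JSON of the wrong shape is kept as plain content with a warning, and invalid JSON is kept as content with an error. A standalone illustration of which branch each input takes (inputs invented for the example):

```python
import json

candidates = [
    '{"name": "get_weather", "arguments": {"location": "Beijing"}}',  # -> tool call
    '[1, 2, 3]',                # parses, but not a dict       -> kept as content
    '{"name": "get_weather"}',  # dict missing "arguments"     -> kept as content
    '{broken json',             # invalid JSON                 -> kept as content
]

for content in candidates:
    try:
        res = json.loads(content, strict=False)
        if isinstance(res, dict) and "name" in res and "arguments" in res:
            print("tool call:", res["name"], res["arguments"])
        else:
            print("wrong shape, kept as content:", content)
    except json.JSONDecodeError:
        print("invalid JSON, kept as content:", content)
```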
@@ -757,47 +790,60 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result

-    @classmethod
     def _post_process_completion_chunk(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-        reasoning_parser: Optional[ReasoningParser] = None,
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+            if tool_result is None and not finish_reason:
+                return None
         tool_calls = []
         failed_contents = []
-        for content, func, args in tool_result:
-            if func:
-                tool_calls.append(
-                    {
-                        "index": 0,
-                        "id": f"call_{_id}",
-                        "type": "function",
-                        "function": {
-                            "name": func,
-                            "arguments": json.dumps(args, ensure_ascii=False),
-                        },
-                    }
-                )
-            else:
-                failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)

-        content = "".join(failed_contents) if failed_contents else None
+        finish_reason = "tool_calls" if tool_calls else finish_reason

-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None

         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
@@ -826,29 +872,32 @@ class ChatModelMixin:
             "usage": usage,
         }

-    @classmethod
     def _post_process_completion(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if reasoning_parser:
-            c = reasoning_parser.prepare_reasoning_content(c)
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content

-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
@@ -868,14 +917,9 @@

         content = "".join(failed_contents) if failed_contents else None

-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
@@ -943,6 +987,44 @@

         return transformed_messages

+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+

 def get_model_version(
     model_name: str,
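
This generator replaces the vLLM-specific copy removed further down: instead of buffering between explicit start/end sentinel chunks, it hands every delta to the tool parser together with a mutable `previous_texts` accumulator and lets the parser decide when a call is complete. A minimal sketch of that accumulation protocol using the Qwen parser (synthetic deltas; import path assumed from the file list):

```python
from xinference.model.llm.tool_parsers.qwen_tool_parser import QwenToolParser

parser = QwenToolParser()
previous_texts = [""]  # mutable accumulator, as threaded through _post_process_completion_chunk

deltas = [
    '<tool_call>\n{"name": "get_weather", ',
    '"arguments": {"location": "Beijing"}}',
    "\n</tool_call>",
]
for delta in deltas:
    current = previous_texts[-1] + delta
    result = parser.extract_tool_calls_streaming(previous_texts, current, delta)
    previous_texts[-1] = current  # update the accumulator after each parse attempt
    if result and result[1]:
        # Only the chunk that closes </tool_call> yields the parsed call.
        print("complete tool call:", result[1], result[2])  # get_weather {'location': 'Beijing'}
```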
@@ -393,6 +393,7 @@ class VLLMModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -773,7 +774,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()

         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
         guided_json_object = None
         guided_json = None

@@ -784,8 +784,6 @@
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"

         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -833,10 +831,6 @@
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )

         return sanitized

@@ -1291,59 +1285,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):

         return processed_messages

-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-

     @vllm_check
     async def async_chat(
         self,
@@ -1408,7 +1349,7 @@
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)