xinference 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as potentially problematic.
Files changed (34)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/supervisor.py +29 -1
  5. xinference/model/audio/core.py +5 -0
  6. xinference/model/audio/kokoro.py +1 -1
  7. xinference/model/audio/kokoro_zh.py +124 -0
  8. xinference/model/audio/model_spec.json +20 -0
  9. xinference/model/embedding/sentence_transformers/core.py +4 -4
  10. xinference/model/embedding/vllm/core.py +7 -1
  11. xinference/model/image/model_spec.json +2 -3
  12. xinference/model/llm/core.py +10 -0
  13. xinference/model/llm/llama_cpp/core.py +1 -0
  14. xinference/model/llm/llm_family.json +40 -20
  15. xinference/model/llm/llm_family.py +1 -0
  16. xinference/model/llm/mlx/core.py +52 -33
  17. xinference/model/llm/sglang/core.py +2 -44
  18. xinference/model/llm/tool_parsers/__init__.py +58 -0
  19. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  20. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  21. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  22. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  23. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  24. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  25. xinference/model/llm/transformers/core.py +1 -1
  26. xinference/model/llm/utils.py +127 -45
  27. xinference/model/llm/vllm/core.py +2 -61
  28. xinference/types.py +105 -2
  29. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/METADATA +7 -3
  30. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/RECORD +34 -26
  31. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  32. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  33. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  34. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
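
The hunks reproduced below cover three themes: a new Chinese Kokoro text-to-speech model (xinference/model/audio/kokoro_zh.py plus its model_spec.json entry), a pluggable tool-call parser framework (a new tool_parsers package, a tool_parser field on LLM families, and per-family assignments in llm_family.json), and MLX memory cleanup around generation, with SGLang's bespoke streaming tool-call handling removed in favor of the shared parsers.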
xinference/model/audio/kokoro_zh.py (new file)
@@ -0,0 +1,124 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV2
+
+logger = logging.getLogger(__name__)
+
+REPO_ID = "hexgrad/Kokoro-82M-v1.1-zh"
+
+
+class KokoroZHModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV2",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self.model_family = model_spec
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+        self._en_pipeline = None
+
+    def _en_callable(self, text):
+        """
+        Fixing the issue of English words being skipped in the Chinese model.
+        From https://hf-mirror.com/hexgrad/Kokoro-82M-v1.1-zh/blob/main/samples/make_zh.py
+        """
+        if text == "Kokoro":
+            return "kˈOkəɹO"
+        elif text == "Sol":
+            return "sˈOl"
+        return next(self._en_pipeline(text)).phonemes
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        import os
+
+        from kokoro import KModel, KPipeline
+
+        self._en_pipeline = KPipeline(lang_code="a", repo_id=REPO_ID, model=False)
+
+        config_path = os.path.join(self._model_path, "config.json")
+        model_path = os.path.join(self._model_path, "kokoro-v1_1-zh.pth")
+        lang_code = self._kwargs.get("lang_code", "z")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
+
+        self._model = KPipeline(
+            lang_code=lang_code,
+            model=KModel(config=config_path, model=model_path).to(self._device),
+            repo_id=REPO_ID,
+            en_callable=self._en_callable,
+            device=self._device,
+        )
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import soundfile
+
+        if stream:
+            raise Exception("Kokoro does not support stream mode.")
+        assert self._model is not None
+        if not voice:
+            voice = "zf_001"
+            logger.info("Auto select speaker: %s", voice)
+        elif voice.endswith(".pt"):
+            logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
+        logger.info("Speech kwargs: %s", kwargs)
+        generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
+        results = list(generator)
+        audio = np.concatenate([r[2] for r in results])
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out,
+                "w",
+                24000,
+                1,
+                format=response_format.upper(),
+            ) as f:
+                f.write(audio)
+            return out.getvalue()
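
The wrapper follows the same shape as the existing Kokoro model: load() builds a KPipeline on the detected device (routing embedded English words through a secondary English pipeline), and speech() concatenates the generated segments into one audio buffer. Once the model_spec.json entry added below is registered, the model can be driven like any other built-in audio model; a hedged sketch against a running xinference endpoint (the endpoint URL, voice, and output path are illustrative, not taken from the diff):

```python
# Sketch: launch the newly registered Chinese Kokoro model through the client API.
# Assumes a local xinference supervisor is already running at this endpoint.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="Kokoro-82M-v1.1-zh",
    model_type="audio",
)
tts = client.get_model(model_uid)
audio = tts.speech("你好，世界", voice="zf_001", response_format="wav")
with open("hello.wav", "wb") as f:
    f.write(audio)
```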
xinference/model/audio/model_spec.json
@@ -862,6 +862,26 @@
         "model_revision": "master"
       }
     }
+  },
+  {
+    "version": 2,
+    "model_name": "Kokoro-82M-v1.1-zh",
+    "model_family": "Kokoro-zh",
+    "model_ability": [
+      "text2audio",
+      "text2audio_zero_shot"
+    ],
+    "multilingual": false,
+    "model_src": {
+      "huggingface": {
+        "model_id": "hexgrad/Kokoro-82M-v1.1-zh",
+        "model_revision": "01e7505bd6a7a2ac4975463114c3a7650a9f7218"
+      },
+      "modelscope": {
+        "model_id": "AI-ModelScope/Kokoro-82M-v1.1-zh",
+        "model_revision": "master"
+      }
+    }
   },
   {
     "version": 2,
xinference/model/embedding/sentence_transformers/core.py
@@ -265,10 +265,10 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
                 "clip" in self.model_family.model_name.lower()
                 or "jina-embeddings-v4" in self.model_family.model_name.lower()
             ):
-                if "input_ids" in features and hasattr(
-                    features["input_ids"], "numel"
-                ):
-                    all_token_nums += features["input_ids"].numel()
+                # support input_ids and text_input_ids
+                for key in ["input_ids", "text_input_ids"]:
+                    if key in features and hasattr(features[key], "numel"):
+                        all_token_nums += features[key].numel()
                 if "pixel_values" in features and hasattr(
                     features["pixel_values"], "numel"
                 ):
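
This generalizes token accounting: processors that emit `text_input_ids` rather than `input_ids` (as some multimodal embedding models do) were previously not counted at all. A standalone illustration of the new counting pattern; the feature dict below is fabricated for the example:

```python
import torch

# Fabricated feature dict: a text batch exposed as "text_input_ids" plus image tensors.
features = {
    "text_input_ids": torch.ones(2, 16, dtype=torch.long),
    "pixel_values": torch.zeros(2, 3, 224, 224),
}

all_token_nums = 0
for key in ["input_ids", "text_input_ids"]:
    if key in features and hasattr(features[key], "numel"):
        all_token_nums += features[key].numel()
print(all_token_nums)  # 32 text tokens counted; pixel_values are handled separately
```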
xinference/model/embedding/vllm/core.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import importlib.util
+import json
 import logging
 from typing import List, Union

@@ -54,13 +55,18 @@ class VLLMEmbeddingModel(EmbeddingModel):
                 self._kwargs["hf_overrides"].update(
                     is_matryoshka=True,
                 )
+            elif isinstance(self._kwargs["hf_overrides"], str):
+                self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+                self._kwargs["hf_overrides"].update(
+                    is_matryoshka=True,
+                )

         self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()

     @staticmethod
     def _get_detailed_instruct(task_description: str, query: str) -> str:
-        return f"Instruct: {task_description}\nQuery:{query}"
+        return f"Instruct: {task_description}\nQuery:{query}"  # noqa: E231

     @cache_clean
     def create_embedding(
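
The extra `elif` handles callers that pass `hf_overrides` as a raw JSON string (for example, forwarded unchanged from a launch request) rather than a dict. A small standalone sketch of the normalization; `normalize_hf_overrides` is a hypothetical helper, not part of the diff:

```python
import json

def normalize_hf_overrides(kwargs: dict) -> dict:
    """Accept hf_overrides as a dict or a JSON string, then force is_matryoshka."""
    overrides = kwargs.get("hf_overrides", {})
    if isinstance(overrides, str):
        overrides = json.loads(overrides)
    overrides.update(is_matryoshka=True)
    kwargs["hf_overrides"] = overrides
    return kwargs

# e.g. a value that arrived through the REST API as a plain string:
print(normalize_hf_overrides({"hf_overrides": '{"max_model_len": 8192}'}))
# {'hf_overrides': {'max_model_len': 8192, 'is_matryoshka': True}}
```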
xinference/model/image/model_spec.json
@@ -824,13 +824,12 @@
         "deepspeed==0.12.3",
         "peft==0.4.0",
         "tiktoken==0.6.0",
-        "bitsandbytes==0.41.0",
-        "scikit-learn==1.2.2",
         "sentencepiece==0.1.99",
         "einops==0.6.1",
         "einops-exts==0.0.4",
         "timm==0.6.13",
-        "numpy==1.26.4"
+        "#system_numpy#",
+        "#system_torch#"
       ]
     },
     "model_src": {
xinference/model/llm/core.py
@@ -27,6 +27,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from .reasoning_parser import ReasoningParser
+from .tool_parsers import TOOL_PARSERS

 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV2, LLMSpecV1
@@ -59,6 +60,7 @@ class LLM(abc.ABC):
         self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
+        self.tool_parser = None
         if args:
             raise ValueError(f"Unrecognized positional arguments: {args}")
         if kwargs:
@@ -171,6 +173,14 @@ class LLM(abc.ABC):
             enable_thinking=enable_thinking,
         )

+    def prepare_parse_tool_calls(self):
+        if self.model_family.tool_parser is None:
+            return
+        if self.model_family.tool_parser not in TOOL_PARSERS:
+            return
+        tool_parser = TOOL_PARSERS[self.model_family.tool_parser]
+        self.tool_parser = tool_parser()
+

 # Context variable for passing per-request chat context (e.g., chat_template_kwargs).
 # This variable should be set at the beginning of each chat or stream_chat call.
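
prepare_parse_tool_calls() mirrors the existing reasoning-parser wiring: the family's `tool_parser` string is looked up in the `TOOL_PARSERS` registry exported by the new tool_parsers package, and a parser instance is attached to the model. A hedged sketch of how a name-keyed registry like this typically works; the decorator and toy parser below are illustrative, not the shipped implementations:

```python
from typing import Callable, Dict

TOOL_PARSERS: Dict[str, type] = {}  # illustrative registry keyed by the tool_parser name

def register_tool_parser(name: str) -> Callable[[type], type]:
    def wrapper(cls: type) -> type:
        TOOL_PARSERS[name] = cls
        return cls
    return wrapper

@register_tool_parser("qwen")
class QwenToolParser:
    """Toy stand-in; the real parser extracts tool-call blocks from model output."""
    def extract_tool_calls(self, model_output: str):
        return []  # placeholder behavior

# What prepare_parse_tool_calls effectively does for a family with tool_parser="qwen":
parser_cls = TOOL_PARSERS.get("qwen")
tool_parser = parser_cls() if parser_cls else None
```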
xinference/model/llm/llama_cpp/core.py
@@ -122,6 +122,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
xinference/model/llm/llm_family.json
@@ -1008,7 +1008,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -1070,7 +1071,8 @@
       "<|end_of_text|>",
       "<|eot_id|>",
       "<|eom_id|>"
-    ]
+    ],
+    "tool_parser": "llama3"
   },
   {
     "version": 2,
@@ -1133,7 +1135,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -1946,7 +1949,8 @@
       "<|im_end|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -2209,7 +2213,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -5772,7 +5777,8 @@
       "<|end▁of▁sentence|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser": "deepseek_r1"
   },
   {
     "version": 2,
@@ -6620,7 +6626,8 @@
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ]
+    ],
+    "tool_parser": "deepseek_v3"
   },
   {
     "version": 2,
@@ -7920,7 +7927,8 @@
       "<|endoftext|>",
       "<|user|>",
       "<|observation|>"
-    ]
+    ],
+    "tool_parser":"glm4"
   },
   {
     "version": 2,
@@ -8027,7 +8035,8 @@
       "<|endoftext|>",
       "<|user|>",
       "<|observation|>"
-    ]
+    ],
+    "tool_parser":"glm4"
   },
   {
     "version": 2,
@@ -9189,7 +9198,8 @@
       "<|end_of_text|>",
       "<|eot_id|>",
       "<|eom_id|>"
-    ]
+    ],
+    "tool_parser": "llama3"
   },
   {
     "version": 2,
@@ -11918,7 +11928,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -11981,7 +11992,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -12705,7 +12717,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -12826,7 +12839,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -14008,7 +14022,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -15518,7 +15533,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -17428,7 +17444,8 @@
        "mlx-lm>=0.24.0 ; sys_platform=='darwin'",
        "#system_numpy#"
      ]
-    }
+    },
+    "tool_parser": "qwen"
   },
   {
     "version": 2,
@@ -18043,7 +18060,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -18655,7 +18673,8 @@
       "<|im_end|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -19438,7 +19457,8 @@
     "stop": [
       "<|endoftext|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
xinference/model/llm/llm_family.py
@@ -154,6 +154,7 @@ class LLMFamilyV2(BaseModel, ModelInstanceInfoMixin):
     reasoning_end_tag: Optional[str]
     cache_config: Optional[dict]
     virtualenv: Optional[VirtualEnvSettings]
+    tool_parser: Optional[str]

     class Config:
         extra = "allow"
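
The new optional field is what the llm_family.json assignments above populate, and the same key can be set when registering a custom model family to opt a fine-tune into one of the shipped parsers. An illustrative fragment (the model name and abilities are made up; only the fields relevant to tool parsing are shown):

```python
import json

custom_family = {
    "version": 2,
    "model_name": "my-qwen-finetune",      # hypothetical custom model
    "model_ability": ["chat", "tools"],
    "stop": ["<|endoftext|>", "<|im_start|>", "<|im_end|>"],
    "reasoning_start_tag": "<think>",
    "reasoning_end_tag": "</think>",
    "tool_parser": "qwen",                 # qwen, llama3, glm4, deepseek_r1 or deepseek_v3
}
print(json.dumps(custom_family, indent=2, ensure_ascii=False))
```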
xinference/model/llm/mlx/core.py
@@ -148,6 +148,16 @@ class MLXModel(LLM):
         # to call aynsc method with asyncio.run_coroutine_threadsafe
         self._loop = loop  # type: ignore

+    def _cleanup_memory(self):
+        import gc
+
+        import mlx.core as mx
+
+        # mandatory recycling
+        gc.collect()
+        # clear the MLX cache
+        mx.clear_cache()
+
     @property
     def driver_info(self) -> Optional[dict]:
         return self._driver_info
@@ -333,6 +343,7 @@ class MLXModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         kwargs = {}
         kwargs["revision"] = self._model_config.get(
@@ -458,14 +469,18 @@ class MLXModel(LLM):
             repetition_penalty=kwargs.pop("repetition_penalty"),
             repetition_context_size=kwargs.pop("repetition_context_size"),
         )
-        yield from stream_generate(
-            self._model,
-            self._tokenizer,
-            prompt_token_ids,
-            sampler=sampler,
-            logits_processors=logits_processors,
-            **kwargs,
-        )
+        try:
+            yield from stream_generate(
+                self._model,
+                self._tokenizer,
+                prompt_token_ids,
+                sampler=sampler,
+                logits_processors=logits_processors,
+                **kwargs,
+            )
+        finally:
+            # after completing the inference, clear the memory.
+            self._cleanup_memory()

     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
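
Wrapping the generator in try/finally guarantees the cleanup runs even when the caller abandons the stream early (client disconnect, exception in the consumer), since closing a generator raises GeneratorExit at the suspended yield. A reduced sketch of the pattern; the token source below is a stand-in for stream_generate, not the MLX backend itself:

```python
import gc

def generate_with_cleanup(tokens):
    """Always release caches when the stream ends, however it ends."""
    try:
        for t in tokens:            # stand-in for mlx_lm's stream_generate(...)
            yield t
    finally:
        gc.collect()                # mandatory recycling
        try:
            import mlx.core as mx
            mx.clear_cache()        # release MLX's cached buffers
        except ImportError:
            pass                    # mlx not installed on this platform

gen = generate_with_cleanup(range(10))
print(next(gen))                    # 0
gen.close()                         # GeneratorExit -> finally -> cleanup still runs
```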
@@ -755,7 +770,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         assert not isinstance(c, Iterator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)

@@ -831,18 +846,32 @@ class MLXVisionModel(MLXModel, ChatModelMixin):

         detokenizer.reset()
         tic = time.perf_counter()
-        for n, (token, logprobs) in enumerate(
-            generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
-        ):
-            if n == 0:
-                prompt_time = time.perf_counter() - tic
-                prompt_tps = len(input_ids) / prompt_time
-                tic = time.perf_counter()
-            if token == tokenizer.eos_token_id:
-                break
-            detokenizer.add_token(token)
+        try:
+            for n, (token, logprobs) in enumerate(
+                generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
+            ):
+                if n == 0:
+                    prompt_time = time.perf_counter() - tic
+                    prompt_tps = len(input_ids) / prompt_time
+                    tic = time.perf_counter()
+                if token == tokenizer.eos_token_id:
+                    break
+                detokenizer.add_token(token)
+
+                # Yield the last segment if streaming
+                yield GenerationResponse(
+                    text=detokenizer.last_segment,
+                    token=token,
+                    logprobs=logprobs,
+                    from_draft=False,
+                    prompt_tokens=len(input_ids),
+                    prompt_tps=prompt_tps,
+                    generation_tokens=n + 1,
+                    generation_tps=(n + 1) / (time.perf_counter() - tic),
+                    peak_memory=mx.metal.get_peak_memory() / 1e9,
+                )

-            # Yield the last segment if streaming
+            detokenizer.finalize()
             yield GenerationResponse(
                 text=detokenizer.last_segment,
                 token=token,
@@ -854,19 +883,9 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
                 generation_tps=(n + 1) / (time.perf_counter() - tic),
                 peak_memory=mx.metal.get_peak_memory() / 1e9,
             )
-
-        detokenizer.finalize()
-        yield GenerationResponse(
-            text=detokenizer.last_segment,
-            token=token,
-            logprobs=logprobs,
-            from_draft=False,
-            prompt_tokens=len(input_ids),
-            prompt_tps=prompt_tps,
-            generation_tokens=n + 1,
-            generation_tps=(n + 1) / (time.perf_counter() - tic),
-            peak_memory=mx.metal.get_peak_memory() / 1e9,
-        )
+        finally:
+            # after completing the inference, clear the memory
+            self._cleanup_memory()

     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
xinference/model/llm/sglang/core.py
@@ -175,6 +175,7 @@ class SGLANGModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -646,49 +647,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def is_tool_call_chunk_end(chunk):
         return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])

-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        if self.reasoning_parser:
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-
     async def async_chat(
         self,
         messages: List[Dict],
@@ -731,7 +689,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)

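
With the SGLang-specific streaming tool-call path deleted, tool-call extraction for SGLang (like llama.cpp and MLX above) goes through the shared per-family tool parsers, so a tools-enabled chat looks the same from the client regardless of backend. A hedged usage sketch; the endpoint, model uid, and tool schema are illustrative, not taken from the diff:

```python
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("qwen2.5-instruct")   # uid of an already-launched chat model

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = model.chat(
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
)
# If the family's tool_parser recognizes a call, it surfaces as tool_calls:
print(response["choices"][0]["message"].get("tool_calls"))
```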