xinference 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (80)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/client/restful/restful_client.py +9 -1
  5. xinference/core/model.py +19 -0
  6. xinference/core/resource.py +7 -1
  7. xinference/core/scheduler.py +4 -7
  8. xinference/core/status_guard.py +1 -0
  9. xinference/core/supervisor.py +228 -19
  10. xinference/core/utils.py +1 -29
  11. xinference/core/worker.py +28 -2
  12. xinference/deploy/cmdline.py +33 -3
  13. xinference/deploy/local.py +2 -1
  14. xinference/deploy/test/test_cmdline.py +32 -0
  15. xinference/device_utils.py +43 -1
  16. xinference/model/audio/core.py +5 -0
  17. xinference/model/audio/kokoro.py +122 -0
  18. xinference/model/audio/model_spec.json +8 -0
  19. xinference/model/audio/model_spec_modelscope.json +9 -0
  20. xinference/model/image/stable_diffusion/core.py +15 -6
  21. xinference/model/llm/llama_cpp/core.py +21 -14
  22. xinference/model/llm/llm_family.json +866 -46
  23. xinference/model/llm/llm_family.py +7 -2
  24. xinference/model/llm/llm_family_modelscope.json +873 -16
  25. xinference/model/llm/mlx/core.py +11 -3
  26. xinference/model/llm/reasoning_parsers/__init__.py +13 -0
  27. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
  28. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
  29. xinference/model/llm/sglang/core.py +99 -11
  30. xinference/model/llm/transformers/core.py +9 -1
  31. xinference/model/llm/transformers/intern_vl.py +23 -14
  32. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  33. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  34. xinference/model/llm/transformers/utils.py +22 -11
  35. xinference/model/llm/utils.py +164 -20
  36. xinference/model/llm/vllm/core.py +36 -4
  37. xinference/model/llm/vllm/xavier/executor.py +2 -2
  38. xinference/model/llm/vllm/xavier/scheduler.py +3 -3
  39. xinference/thirdparty/internvl/conversation.py +26 -17
  40. xinference/types.py +2 -0
  41. xinference/web/ui/build/asset-manifest.json +6 -6
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/css/main.f8177338.css +2 -0
  44. xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
  45. xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
  46. xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
  59. xinference/web/ui/src/locales/en.json +14 -1
  60. xinference/web/ui/src/locales/zh.json +14 -1
  61. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/METADATA +18 -17
  62. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/RECORD +67 -60
  63. xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
  64. xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
  65. xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
  66. xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
  76. /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
  77. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
  78. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
  79. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
  80. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
@@ -31,7 +31,12 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin, generate_completion_chunk
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -424,8 +429,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
@@ -0,0 +1,13 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,98 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Type, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice, CompletionChunk
+
+
+class ReasoningParser(ABC):
+    """Abstract base class for reasoning content parsers."""
+
+    def __init__(
+        self,
+        reasoning_start_tag: str = "<think>",
+        reasoning_end_tag: str = "</think>",
+    ):
+        """Initialize the reasoning parser.
+
+        Args:
+            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
+            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
+        """
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
+
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: Union[str, CompletionChunk],
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from model output in a streaming fashion.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        pass
+
+    @abstractmethod
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        pass
+
+
+class ReasoningParserManager:
+    """Manager class for reasoning parsers."""
+
+    _parsers: Dict[str, Type[ReasoningParser]] = {}
+
+    @classmethod
+    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
+        """Register a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+            parser_cls (Type[ReasoningParser]): The parser class to register.
+        """
+        cls._parsers[model_name] = parser_cls
+
+    @classmethod
+    def register_module(cls, model_name: str):
+        """Decorator for registering a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Callable: The decorator function.
+        """
+
+        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
+            cls.register(model_name, parser_cls)
+            return parser_cls
+
+        return _register
+
+    @classmethod
+    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
+        """Get the registered parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
+        """
+        return cls._parsers.get(model_name)
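
The new abs_reasoning_parsers.py above defines a small registry: parsers subclass ReasoningParser and register themselves per model name through ReasoningParserManager. A minimal usage sketch follows (not part of the diff; the model name "my-model" and the pass-through parser are invented, and an installed xinference 1.3.0 is assumed):

# Illustrative sketch only, not from the package: register and look up a parser.
from xinference.model.llm.reasoning_parsers.abs_reasoning_parsers import (
    ReasoningParser,
    ReasoningParserManager,
)


@ReasoningParserManager.register_module("my-model")
class PassthroughParser(ReasoningParser):
    def extract_reasoning_content_streaming(self, previous_text, current_text, delta):
        # No reasoning extraction: hand the delta back unchanged.
        return delta

    def extract_reasoning_content(self, model_output):
        # Treat the whole output as the answer, with no reasoning content.
        return None, model_output


parser_cls = ReasoningParserManager.get_parser("my-model")  # -> PassthroughParser
parser = parser_cls()  # defaults to "<think>" / "</think>" tags
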
@@ -0,0 +1,140 @@
+import re
+from typing import Optional, Tuple, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module("deepseek-v3")
+@ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
+@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-R1 model."""
+
+    def __init__(
+        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+    ):
+        super().__init__(reasoning_start_tag, reasoning_end_tag)
+        self.reasoning_regex = re.compile(
+            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
+        )
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> Optional[ChatCompletionChunkDelta]:
+        """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
+
+        Args:
+            previous_text (str): The previous accumulated text content.
+            current_text (Union[str, ChatCompletionChunk]): The current text chunk or completion chunk.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        if delta is None:
+            return delta
+
+        delta_text = delta["content"]
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.reasoning_start_tag in previous_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # <think> in previous, </think> in previous,
+                # <think> in previous, </think> in previous,
+                # reasoning content ends
+                return delta
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        elif self.reasoning_start_tag in delta_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_idx = delta_text.find(self.reasoning_start_tag)
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) : end_idx
+                ]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.reasoning_end_tag in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # </think> in previous, thinking content ends
+                return delta
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from DeepSeek-R1 model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        if not isinstance(model_output, str):
+            model_output = model_output["text"]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.reasoning_end_tag not in model_output:
+            return model_output, None
+        else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.reasoning_start_tag not in model_output:
+                model_output = f"{self.reasoning_start_tag}{model_output}"
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            end_index = len(
+                f"{self.reasoning_start_tag}{reasoning_content}{self.reasoning_end_tag}"
+            )
+            final_output = model_output[end_index:]
+
+            if len(final_output) == 0:
+                return reasoning_content, None
+            return reasoning_content, final_output
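
For reference, a rough sketch (not part of the diff) of how the non-streaming path of this parser behaves; the sample strings are invented, and DeepSeek-R1's omission of the opening <think> tag is handled as in the code above:

# Illustrative only; sample strings are made up.
parser = DeepSeekR1ReasoningParser()

# Reasoning followed by the final answer: everything before </think> is reasoning.
reasoning, answer = parser.extract_reasoning_content(
    "Compare the decimals digit by digit.</think>9.9 is larger than 9.11"
)
assert reasoning == "Compare the decimals digit by digit."
assert answer == "9.9 is larger than 9.11"

# No </think> at all: the whole output is treated as reasoning, answer is None.
reasoning, answer = parser.extract_reasoning_content("Still thinking about it")
assert reasoning == "Still thinking about it"
assert answer is None
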
@@ -14,10 +14,14 @@
 
 import json
 import logging
+import sys
+import threading
 import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
+from xoscar.utils import get_next_port
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -40,6 +44,10 @@ class SGLANGModelConfig(TypedDict, total=False):
     mem_fraction_static: float
     log_level: str
     attention_reduce_in_fp32: bool  # For gemma
+    # distributed
+    nnodes: Optional[int]
+    node_rank: Optional[int]
+    dist_init_addr: Optional[str]
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
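
A hypothetical example (not part of the diff) of how the new distributed fields of SGLANGModelConfig might be filled in for a two-node deployment; all values are invented:

# Hypothetical two-node setup; the address and port are made up.
from xinference.model.llm.sglang.core import SGLANGModelConfig

shard0_config: SGLANGModelConfig = {
    "mem_fraction_static": 0.81,
    "nnodes": 2,  # total number of shards/workers
    "node_rank": 0,  # rank of this shard
    "dist_init_addr": "192.168.1.10:25000",  # rendezvous address shared by all shards
}
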
@@ -91,6 +99,10 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+    "deepseek-v3",
+    "deepseek-r1",
 ]
 
 
@@ -107,6 +119,16 @@ class SGLANGModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self._address = model_config.pop("address", None)  # type: ignore
+        self._n_worker = model_config.pop("n_worker", 1)  # type: ignore
+        self._shard = model_config.pop("shard", 0)  # type: ignore
+        self._driver_info = model_config.pop("driver_info", None)  # type: ignore
+        self._loading_thread = None
+        self._loading_error = None
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
 
     def load(self):
         try:
@@ -128,18 +150,84 @@ class SGLANGModel(LLM):
         else:
             self._model_config.setdefault("attention_reduce_in_fp32", False)
 
-        logger.info(
-            f"Loading {self.model_uid} with following model config: {self._model_config}"
-        )
+        # gen port for sgl Runtime,
+        # this is useful for sglang service on a same machine.
+        # sglang typically find a port between [port, 40000]
+        # we need to ensure the generated port < 40000
+        sgl_port = None
+        for _ in range(10):
+            sgl_port = get_next_port()
+            if sgl_port >= 40000:
+                sgl_port = None
+            else:
+                break
+        if sgl_port is None:
+            raise ValueError("Failed to find a port for sglang")
+
+        if self._n_worker > 1:
+            # distributed inference
+            self._model_config["nnodes"] = self._n_worker
+            self._model_config["node_rank"] = self._shard
+            # model across multiple workers
+            if self._shard == 0:
+                # distributed, need to init driver_info
+                assert self._driver_info is None
+                # This must run inside Xoscar pool
+                dist_init_addr = f"{self._address.split(':', 1)[0]}:{get_next_port()}"
+                self._driver_info = {"dist_init_addr": dist_init_addr}
+                self._model_config["dist_init_addr"] = dist_init_addr
+            else:
+                assert self._driver_info is not None
+                self._model_config["dist_init_addr"] = self._driver_info[
+                    "dist_init_addr"
+                ]
 
-        self._engine = sgl.Runtime(
-            model_path=self.model_path,
-            tokenizer_path=self.model_path,
-            **self._model_config,
-        )
+            logger.info(
+                f"Loading {self.model_uid}, shard({self._shard} of {self._n_worker}) with following model config: {self._model_config}"
+            )
+
+            def _load():
+                try:
+                    self._engine = sgl.Runtime(
+                        model_path=self.model_path,
+                        tokenizer_path=self.model_path,
+                        port=sgl_port,
+                        **self._model_config,
+                    )
+                except:
+                    logger.exception("Creating sglang Runtime failed")
+                    self._loading_error = sys.exc_info()
+
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+            if self._shard == 0:
+                # wait for 3 seconds to ensure torch distributed inited first
+                self._loading_thread.join(3)
+        else:
+            logger.info(
+                f"Loading {self.model_uid} with following model config: {self._model_config}"
+            )
+
+            self._engine = sgl.Runtime(
+                model_path=self.model_path,
+                tokenizer_path=self.model_path,
+                port=sgl_port,
+                **self._model_config,
+            )
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            if self._shard == 0:
+                # for the shard 0, we wait it to complete
+                # the sglang will serve forever for the other shards,
+                # so we only check if any error happens.
+                self._loading_thread.join()
+            if self._loading_error:
+                _, err, tb = self._loading_error
+                raise err.with_traceback(tb)
 
     def stop(self):
-        logger.info("Stopping SGLang engine")
+        logger.info("Stopping SGLang engine, sglang pid: %s", self._engine.pid)
         self._engine.shutdown()
 
     def _sanitize_model_config(
@@ -151,7 +239,7 @@ class SGLANGModel(LLM):
         cuda_count = self._get_cuda_count()
         model_config.setdefault("tokenizer_mode", "auto")
         model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tp_size", cuda_count)
+        model_config.setdefault("tp_size", cuda_count * self._n_worker)
         # See https://github.com/sgl-project/sglang/blob/00023d622a6d484e67ef4a0e444f708b8fc861c8/python/sglang/srt/server_args.py#L100-L109
         mem_fraction_static = model_config.get("mem_fraction_static")
         if mem_fraction_static is None:
@@ -159,7 +247,7 @@ class SGLANGModel(LLM):
             if tp_size >= 16:
                 model_config["mem_fraction_static"] = 0.79
             elif tp_size >= 8:
-                model_config["mem_fraction_static"] = 0.83
+                model_config["mem_fraction_static"] = 0.81
             elif tp_size >= 4:
                 model_config["mem_fraction_static"] = 0.85
             elif tp_size >= 2:
@@ -39,7 +39,12 @@ from ....types import (
 from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import LLAMA3_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    LLAMA3_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+)
 from .utils import get_context_length, get_max_src_len, pad_prefill_tokens
 
 logger = logging.getLogger(__name__)
@@ -62,6 +67,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "MiniCPM-V-2.6",
     "glm-4v",
     "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
     "qwen2-audio",
     "qwen2-audio-instruct",
     "deepseek-v2",
@@ -681,6 +687,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             or model_family in LLAMA3_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
+        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,
@@ -265,15 +265,24 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
+        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
         num_layers = {
-            "1B": 24,
-            "2B": 24,
-            "4B": 32,
-            "8B": 32,
-            "26B": 48,
-            "40B": 60,
-            "76B": 80,
-        }[model_size]
+            "internvl2-1B": 24,
+            "internvl2-2B": 24,
+            "internvl2-4B": 32,
+            "internvl2-8B": 32,
+            "internvl2-26B": 48,
+            "internvl2-40B": 60,
+            "internvl2-76B": 80,
+            "internvl2.5-1B": 24,
+            "internvl2.5-2B": 24,
+            "internvl2.5-4B": 36,
+            "internvl2.5-8B": 32,
+            "internvl2.5-26B": 48,
+            "internvl2.5-38B": 64,
+            "internvl2.5-78B": 80,
+        }[model_name]
+
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
         num_layers_per_gpu = [num_layers_per_gpu] * world_size
@@ -322,9 +331,7 @@ class InternVLChatModel(PytorchChatModel):
             self._model.cuda()
 
         self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            use_fast=False,
+            self.model_path, trust_remote_code=True, use_fast=False
         )
 
     @cache_clean
@@ -339,11 +346,12 @@
         IMG_END_TOKEN = "</img>"
         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
 
+        generate_config = generate_config if isinstance(generate_config, dict) else {}
+
         generation_config = {
-            "max_new_tokens": generate_config.get("max_tokens", 1024)
-            if generate_config
-            else 1024,
+            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
             "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
         }
 
         stream = (
@@ -458,6 +466,7 @@ class InternVLChatModel(PytorchChatModel):
         streamer = TextIteratorStreamer(
             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
         )
+
         # Define the generation configuration
         generate_kwargs["streamer"] = streamer
         # Start the model chat in a separate thread
@@ -55,9 +55,9 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
-        self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
+        self._device = device
 
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
@@ -105,6 +105,8 @@ class Qwen2AudioChatModel(PytorchChatModel):
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
            ).eval()
         else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()