xinference 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -7
- xinference/client/handlers.py +3 -0
- xinference/client/restful/restful_client.py +9 -1
- xinference/core/model.py +19 -0
- xinference/core/resource.py +7 -1
- xinference/core/scheduler.py +4 -7
- xinference/core/status_guard.py +1 -0
- xinference/core/supervisor.py +228 -19
- xinference/core/utils.py +1 -29
- xinference/core/worker.py +28 -2
- xinference/deploy/cmdline.py +33 -3
- xinference/deploy/local.py +2 -1
- xinference/deploy/test/test_cmdline.py +32 -0
- xinference/device_utils.py +43 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/kokoro.py +122 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/image/stable_diffusion/core.py +15 -6
- xinference/model/llm/llama_cpp/core.py +21 -14
- xinference/model/llm/llm_family.json +866 -46
- xinference/model/llm/llm_family.py +7 -2
- xinference/model/llm/llm_family_modelscope.json +873 -16
- xinference/model/llm/mlx/core.py +11 -3
- xinference/model/llm/reasoning_parsers/__init__.py +13 -0
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
- xinference/model/llm/sglang/core.py +99 -11
- xinference/model/llm/transformers/core.py +9 -1
- xinference/model/llm/transformers/intern_vl.py +23 -14
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -3
- xinference/model/llm/transformers/utils.py +22 -11
- xinference/model/llm/utils.py +164 -20
- xinference/model/llm/vllm/core.py +36 -4
- xinference/model/llm/vllm/xavier/executor.py +2 -2
- xinference/model/llm/vllm/xavier/scheduler.py +3 -3
- xinference/thirdparty/internvl/conversation.py +26 -17
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.f8177338.css +2 -0
- xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
- xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
- xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
- xinference/web/ui/src/locales/en.json +14 -1
- xinference/web/ui/src/locales/zh.json +14 -1
- {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/METADATA +18 -17
- {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/RECORD +67 -60
- xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
- xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
- xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
- /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
- {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
- {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
- {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/core.py
CHANGED
@@ -31,7 +31,12 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -424,8 +429,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools
-
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs

xinference/model/llm/reasoning_parsers/__init__.py
ADDED
@@ -0,0 +1,13 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py
ADDED
@@ -0,0 +1,98 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Type, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice, CompletionChunk
+
+
+class ReasoningParser(ABC):
+    """Abstract base class for reasoning content parsers."""
+
+    def __init__(
+        self,
+        reasoning_start_tag: str = "<think>",
+        reasoning_end_tag: str = "</think>",
+    ):
+        """Initialize the reasoning parser.
+
+        Args:
+            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
+            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
+        """
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
+
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: Union[str, CompletionChunk],
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from model output in a streaming fashion.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        pass
+
+    @abstractmethod
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        pass
+
+
+class ReasoningParserManager:
+    """Manager class for reasoning parsers."""
+
+    _parsers: Dict[str, Type[ReasoningParser]] = {}
+
+    @classmethod
+    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
+        """Register a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+            parser_cls (Type[ReasoningParser]): The parser class to register.
+        """
+        cls._parsers[model_name] = parser_cls
+
+    @classmethod
+    def register_module(cls, model_name: str):
+        """Decorator for registering a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Callable: The decorator function.
+        """
+
+        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
+            cls.register(model_name, parser_cls)
+            return parser_cls
+
+        return _register
+
+    @classmethod
+    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
+        """Get the registered parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
+        """
+        return cls._parsers.get(model_name)
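
Note (not part of the diff): a minimal usage sketch of the registry above — a parser class is registered under a model family name with the decorator and later looked up by that name. The `MyParser` class and the "my-model" family name are purely illustrative.

    from xinference.model.llm.reasoning_parsers.abs_reasoning_parsers import (
        ReasoningParser,
        ReasoningParserManager,
    )

    @ReasoningParserManager.register_module("my-model")  # illustrative family name
    class MyParser(ReasoningParser):
        def extract_reasoning_content_streaming(self, previous_text, current_text, delta):
            # Pass streaming deltas through unchanged in this sketch.
            return delta

        def extract_reasoning_content(self, model_output):
            # Report no reasoning content in this sketch.
            return None, model_output

    parser_cls = ReasoningParserManager.get_parser("my-model")  # -> MyParser
    parser = parser_cls()  # defaults to <think> / </think> tags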

xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py
ADDED
@@ -0,0 +1,140 @@
+import re
+from typing import Optional, Tuple, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module("deepseek-v3")
+@ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
+@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-R1 model."""
+
+    def __init__(
+        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+    ):
+        super().__init__(reasoning_start_tag, reasoning_end_tag)
+        self.reasoning_regex = re.compile(
+            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
+        )
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> Optional[ChatCompletionChunkDelta]:
+        """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
+
+        Args:
+            previous_text (str): The previous accumulated text content.
+            current_text (Union[str, ChatCompletionChunk]): The current text chunk or completion chunk.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        if delta is None:
+            return delta
+
+        delta_text = delta["content"]
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.reasoning_start_tag in previous_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # <think> in previous, </think> in previous,
+                # <think> in previous, </think> in previous,
+                # reasoning content ends
+                return delta
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        elif self.reasoning_start_tag in delta_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_idx = delta_text.find(self.reasoning_start_tag)
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) : end_idx
+                ]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.reasoning_end_tag in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # </think> in previous, thinking content ends
+                return delta
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from DeepSeek-R1 model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        if not isinstance(model_output, str):
+            model_output = model_output["text"]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.reasoning_end_tag not in model_output:
+            return model_output, None
+        else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.reasoning_start_tag not in model_output:
+                model_output = f"{self.reasoning_start_tag}{model_output}"
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            end_index = len(
+                f"{self.reasoning_start_tag}{reasoning_content}{self.reasoning_end_tag}"
+            )
+            final_output = model_output[end_index:]
+
+            if len(final_output) == 0:
+                return reasoning_content, None
+            return reasoning_content, final_output
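
Note (not part of the diff): the non-streaming path above treats everything before `</think>` as reasoning, prepending a `<think>` tag when the model omitted it. A small illustration with made-up output:

    from xinference.model.llm.reasoning_parsers.deepseek_r1_reasoning_parser import (
        DeepSeekR1ReasoningParser,
    )

    parser = DeepSeekR1ReasoningParser()

    # R1-style output that omits the leading <think> tag.
    text = "Compare the numbers digit by digit.</think>9.11 < 9.9"
    reasoning, answer = parser.extract_reasoning_content(text)
    # reasoning == "Compare the numbers digit by digit."
    # answer == "9.11 < 9.9"

    # No </think> at all: the whole output is reported as reasoning content.
    reasoning, answer = parser.extract_reasoning_content("still thinking")
    # reasoning == "still thinking", answer is None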

xinference/model/llm/sglang/core.py
CHANGED
@@ -14,10 +14,14 @@
 
 import json
 import logging
+import sys
+import threading
 import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
+from xoscar.utils import get_next_port
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -40,6 +44,10 @@ class SGLANGModelConfig(TypedDict, total=False):
     mem_fraction_static: float
     log_level: str
     attention_reduce_in_fp32: bool # For gemma
+    # distributed
+    nnodes: Optional[int]
+    node_rank: Optional[int]
+    dist_init_addr: Optional[str]
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -91,6 +99,10 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+    "deepseek-v3",
+    "deepseek-r1",
 ]
 
 
@@ -107,6 +119,16 @@ class SGLANGModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self._address = model_config.pop("address", None) # type: ignore
+        self._n_worker = model_config.pop("n_worker", 1) # type: ignore
+        self._shard = model_config.pop("shard", 0) # type: ignore
+        self._driver_info = model_config.pop("driver_info", None) # type: ignore
+        self._loading_thread = None
+        self._loading_error = None
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
 
     def load(self):
         try:
@@ -128,18 +150,84 @@
         else:
             self._model_config.setdefault("attention_reduce_in_fp32", False)
 
-
-
-
+        # gen port for sgl Runtime,
+        # this is useful for sglang service on a same machine.
+        # sglang typically find a port between [port, 40000]
+        # we need to ensure the generated port < 40000
+        sgl_port = None
+        for _ in range(10):
+            sgl_port = get_next_port()
+            if sgl_port >= 40000:
+                sgl_port = None
+            else:
+                break
+        if sgl_port is None:
+            raise ValueError("Failed to find a port for sglang")
+
+        if self._n_worker > 1:
+            # distributed inference
+            self._model_config["nnodes"] = self._n_worker
+            self._model_config["node_rank"] = self._shard
+            # model across multiple workers
+            if self._shard == 0:
+                # distributed, need to init driver_info
+                assert self._driver_info is None
+                # This must run inside Xoscar pool
+                dist_init_addr = f"{self._address.split(':', 1)[0]}:{get_next_port()}"
+                self._driver_info = {"dist_init_addr": dist_init_addr}
+                self._model_config["dist_init_addr"] = dist_init_addr
+            else:
+                assert self._driver_info is not None
+                self._model_config["dist_init_addr"] = self._driver_info[
+                    "dist_init_addr"
+                ]
 
-
-
-
-
-
+            logger.info(
+                f"Loading {self.model_uid}, shard({self._shard} of {self._n_worker}) with following model config: {self._model_config}"
+            )
+
+            def _load():
+                try:
+                    self._engine = sgl.Runtime(
+                        model_path=self.model_path,
+                        tokenizer_path=self.model_path,
+                        port=sgl_port,
+                        **self._model_config,
+                    )
+                except:
+                    logger.exception("Creating sglang Runtime failed")
+                    self._loading_error = sys.exc_info()
+
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+            if self._shard == 0:
+                # wait for 3 seconds to ensure torch distributed inited first
+                self._loading_thread.join(3)
+        else:
+            logger.info(
+                f"Loading {self.model_uid} with following model config: {self._model_config}"
+            )
+
+            self._engine = sgl.Runtime(
+                model_path=self.model_path,
+                tokenizer_path=self.model_path,
+                port=sgl_port,
+                **self._model_config,
+            )
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            if self._shard == 0:
+                # for the shard 0, we wait it to complete
+                # the sglang will serve forever for the other shards,
+                # so we only check if any error happens.
+                self._loading_thread.join()
+            if self._loading_error:
+                _, err, tb = self._loading_error
+                raise err.with_traceback(tb)
 
     def stop(self):
-        logger.info("Stopping SGLang engine")
+        logger.info("Stopping SGLang engine, sglang pid: %s", self._engine.pid)
         self._engine.shutdown()
 
     def _sanitize_model_config(
@@ -151,7 +239,7 @@ class SGLANGModel(LLM):
         cuda_count = self._get_cuda_count()
         model_config.setdefault("tokenizer_mode", "auto")
         model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tp_size", cuda_count)
+        model_config.setdefault("tp_size", cuda_count * self._n_worker)
         # See https://github.com/sgl-project/sglang/blob/00023d622a6d484e67ef4a0e444f708b8fc861c8/python/sglang/srt/server_args.py#L100-L109
         mem_fraction_static = model_config.get("mem_fraction_static")
         if mem_fraction_static is None:
@@ -159,7 +247,7 @@ class SGLANGModel(LLM):
             if tp_size >= 16:
                 model_config["mem_fraction_static"] = 0.79
             elif tp_size >= 8:
-                model_config["mem_fraction_static"] = 0.
+                model_config["mem_fraction_static"] = 0.81
             elif tp_size >= 4:
                 model_config["mem_fraction_static"] = 0.85
             elif tp_size >= 2:
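
Note (not part of the diff): to make the distributed branch above easier to follow, here is a rough sketch of the per-shard configuration for a hypothetical 2-worker deployment with 8 GPUs per worker (addresses and port are made up). Shard 0 derives `dist_init_addr` from its own worker address plus a free port and publishes it via the `driver_info` property; shard 1 copies it from there.

    # Hypothetical effective settings after _sanitize_model_config() and load():
    shard0 = {
        "nnodes": 2,            # n_worker
        "node_rank": 0,         # shard index
        "dist_init_addr": "10.0.0.1:37001",  # shard 0 host + get_next_port()
        "tp_size": 16,          # cuda_count (8) * n_worker (2)
    }
    shard1 = {
        "nnodes": 2,
        "node_rank": 1,
        "dist_init_addr": "10.0.0.1:37001",  # taken from shard 0's driver_info
        "tp_size": 16,
    }
    # Each shard starts sgl.Runtime in a background thread; wait_for_load()
    # joins the thread on shard 0 and re-raises any error captured during loading.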

xinference/model/llm/transformers/core.py
CHANGED
@@ -39,7 +39,12 @@ from ....types import (
 from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    LLAMA3_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+)
 from .utils import get_context_length, get_max_src_len, pad_prefill_tokens
 
 logger = logging.getLogger(__name__)
@@ -62,6 +67,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "MiniCPM-V-2.6",
     "glm-4v",
     "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
     "qwen2-audio",
     "qwen2-audio-instruct",
     "deepseek-v2",
@@ -681,6 +687,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             or model_family in LLAMA3_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
+        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,

xinference/model/llm/transformers/intern_vl.py
CHANGED
@@ -265,15 +265,24 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
+        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
         num_layers = {
-            "1B": 24,
-            "2B": 24,
-            "4B": 32,
-            "8B": 32,
-            "26B": 48,
-            "40B": 60,
-            "76B": 80,
-
+            "internvl2-1B": 24,
+            "internvl2-2B": 24,
+            "internvl2-4B": 32,
+            "internvl2-8B": 32,
+            "internvl2-26B": 48,
+            "internvl2-40B": 60,
+            "internvl2-76B": 80,
+            "internvl2.5-1B": 24,
+            "internvl2.5-2B": 24,
+            "internvl2.5-4B": 36,
+            "internvl2.5-8B": 32,
+            "internvl2.5-26B": 48,
+            "internvl2.5-38B": 64,
+            "internvl2.5-78B": 80,
+        }[model_name]
+
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
         num_layers_per_gpu = [num_layers_per_gpu] * world_size
@@ -322,9 +331,7 @@ class InternVLChatModel(PytorchChatModel):
         self._model.cuda()
 
         self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            use_fast=False,
+            self.model_path, trust_remote_code=True, use_fast=False
         )
 
     @cache_clean
@@ -339,11 +346,12 @@ class InternVLChatModel(PytorchChatModel):
         IMG_END_TOKEN = "</img>"
         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
 
+        generate_config = generate_config if isinstance(generate_config, dict) else {}
+
         generation_config = {
-            "max_new_tokens": generate_config.get("max_tokens", 1024)
-            if generate_config
-            else 1024,
+            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
             "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
         }
 
         stream = (
@@ -458,6 +466,7 @@ class InternVLChatModel(PytorchChatModel):
         streamer = TextIteratorStreamer(
             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
         )
+
         # Define the generation configuration
         generate_kwargs["streamer"] = streamer
         # Start the model chat in a separate thread
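
Note (not part of the diff): a worked example of the layer-splitting arithmetic above, using the hypothetical case of internvl2.5-38B (64 layers per the new table) on 4 GPUs:

    import math

    num_layers = 64   # "internvl2.5-38B" in the table above
    world_size = 4

    # The first GPU also hosts the ViT, so it counts as half a GPU.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))  # ceil(64 / 3.5) = 19
    layout = [per_gpu] * world_size                        # [19, 19, 19, 19]
    # Code below this hunk (not shown in the diff) then trims the first GPU's
    # share when building the final device_map.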

xinference/model/llm/transformers/qwen2_audio.py
CHANGED
@@ -55,9 +55,9 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
-        self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
+        self._device = device
 
         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
@@ -105,6 +105,8 @@ class Qwen2AudioChatModel(PytorchChatModel):
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False

xinference/model/llm/transformers/qwen2_vl.py
CHANGED
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model =
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@ class Qwen2VLChatModel(PytorchChatModel):
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model =
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
             ).eval()
         else:
-            self._model =
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()
 
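
Note (not part of the diff): the guarded import above exists because `Qwen2_5_VLForConditionalGeneration` only ships with recent transformers releases. A condensed, illustrative version of the same selection pattern:

    from transformers import Qwen2VLForConditionalGeneration

    try:
        # Only present in transformers releases that include Qwen2.5-VL support.
        from transformers import Qwen2_5_VLForConditionalGeneration
    except ImportError:
        Qwen2_5_VLForConditionalGeneration = None

    def pick_model_cls(llm_family: str):
        """Pick the conditional-generation class for the given model family."""
        cls = (
            Qwen2_5_VLForConditionalGeneration
            if "qwen2.5" in llm_family
            else Qwen2VLForConditionalGeneration
        )
        if cls is None:
            raise ImportError("`transformers` version is too old, please upgrade it")
        return cls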