xinference 1.2.2__py3-none-any.whl → 1.3.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/client/restful/restful_client.py +9 -1
- xinference/core/model.py +19 -0
- xinference/core/resource.py +7 -1
- xinference/core/status_guard.py +1 -0
- xinference/core/supervisor.py +228 -19
- xinference/core/utils.py +1 -29
- xinference/core/worker.py +28 -2
- xinference/deploy/cmdline.py +33 -3
- xinference/deploy/test/test_cmdline.py +32 -0
- xinference/device_utils.py +43 -1
- xinference/model/audio/kokoro.py +19 -36
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/image/stable_diffusion/core.py +15 -6
- xinference/model/llm/llm_family.json +521 -6
- xinference/model/llm/llm_family.py +3 -1
- xinference/model/llm/llm_family_modelscope.json +559 -6
- xinference/model/llm/reasoning_parsers/__init__.py +13 -0
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
- xinference/model/llm/sglang/core.py +99 -11
- xinference/model/llm/transformers/intern_vl.py +23 -14
- xinference/model/llm/utils.py +55 -18
- xinference/model/llm/vllm/core.py +23 -2
- xinference/model/llm/vllm/xavier/executor.py +2 -2
- xinference/model/llm/vllm/xavier/scheduler.py +3 -3
- xinference/thirdparty/internvl/conversation.py +26 -17
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.f8177338.css +2 -0
- xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
- xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
- xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
- xinference/web/ui/src/locales/en.json +14 -1
- xinference/web/ui/src/locales/zh.json +14 -1
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/METADATA +11 -11
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/RECORD +55 -49
- xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
- xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
- xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
- /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/LICENSE +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/WHEEL +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py
ADDED
@@ -0,0 +1,98 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Type, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+
+
+class ReasoningParser(ABC):
+    """Abstract base class for reasoning content parsers."""
+
+    def __init__(
+        self,
+        reasoning_start_tag: str = "<think>",
+        reasoning_end_tag: str = "</think>",
+    ):
+        """Initialize the reasoning parser.
+
+        Args:
+            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
+            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
+        """
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
+
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from model output in a streaming fashion.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        pass
+
+    @abstractmethod
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        pass
+
+
+class ReasoningParserManager:
+    """Manager class for reasoning parsers."""
+
+    _parsers: Dict[str, Type[ReasoningParser]] = {}
+
+    @classmethod
+    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
+        """Register a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+            parser_cls (Type[ReasoningParser]): The parser class to register.
+        """
+        cls._parsers[model_name] = parser_cls
+
+    @classmethod
+    def register_module(cls, model_name: str):
+        """Decorator for registering a reasoning parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Callable: The decorator function.
+        """
+
+        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
+            cls.register(model_name, parser_cls)
+            return parser_cls
+
+        return _register
+
+    @classmethod
+    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
+        """Get the registered parser for a specific model.
+
+        Args:
+            model_name (str): The name of the model.
+
+        Returns:
+            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
+        """
+        return cls._parsers.get(model_name)
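For context, a minimal sketch of how the registry added above is meant to be used. It is not part of the diff; the model name "my-reasoning-model" and the TagReasoningParser class are made-up examples, while the import path follows the new file.

# Sketch only: register a toy parser and look it up again (assumes xinference >= 1.3.0 installed).
from typing import Union

from xinference.model.llm.reasoning_parsers.abs_reasoning_parsers import (
    ReasoningParser,
    ReasoningParserManager,
)


@ReasoningParserManager.register_module("my-reasoning-model")  # hypothetical model name
class TagReasoningParser(ReasoningParser):
    """Toy parser: everything before the end tag counts as reasoning."""

    def extract_reasoning_content_streaming(self, previous_text, current_text, delta):
        # Pass the streaming delta through unchanged in this toy example.
        return delta

    def extract_reasoning_content(self, model_output: Union[str, dict]):
        text = model_output if isinstance(model_output, str) else model_output["text"]
        if self.reasoning_end_tag not in text:
            return None, text
        reasoning, _, rest = text.partition(self.reasoning_end_tag)
        return reasoning.replace(self.reasoning_start_tag, ""), rest


parser_cls = ReasoningParserManager.get_parser("my-reasoning-model")
parser = parser_cls("<think>", "</think>")
print(parser.extract_reasoning_content("<think>plan steps</think>final answer"))
# -> ('plan steps', 'final answer')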
xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py
ADDED
@@ -0,0 +1,140 @@
+import re
+from typing import Optional, Tuple, Union
+
+from ....types import ChatCompletionChunkDelta, CompletionChoice
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module("deepseek-v3")
+@ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
+@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-R1 model."""
+
+    def __init__(
+        self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+    ):
+        super().__init__(reasoning_start_tag, reasoning_end_tag)
+        self.reasoning_regex = re.compile(
+            rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
+        )
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta: ChatCompletionChunkDelta,
+    ) -> ChatCompletionChunkDelta:
+        """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.
+
+        Args:
+            previous_text (str): The previous accumulated text content.
+            current_text (Union[str, ChatCompletionChunk]): The current text chunk or completion chunk.
+
+        Yields:
+            str: Extracted reasoning content chunks.
+        """
+        if delta is None:
+            return delta
+
+        delta_text = delta["content"]
+
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.reasoning_start_tag in previous_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # <think> in previous, </think> in previous,
+                # <think> in previous, </think> in previous,
+                # reasoning content ends
+                return delta
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        elif self.reasoning_start_tag in delta_text:
+            if self.reasoning_end_tag in delta_text:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_idx = delta_text.find(self.reasoning_start_tag)
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[
+                    start_idx + len(self.reasoning_start_tag) : end_idx
+                ]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.reasoning_end_tag in delta_text:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_idx = delta_text.find(self.reasoning_end_tag)
+                reasoning_content = delta_text[:end_idx]
+                content = delta_text[end_idx + len(self.reasoning_end_tag) :]
+                delta["reasoning_content"] = reasoning_content
+                if content is not None:
+                    delta["content"] = content
+                return delta
+            elif self.reasoning_end_tag in previous_text:
+                # </think> in previous, thinking content ends
+                return delta
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                delta["reasoning_content"] = delta_text
+                delta["content"] = ""
+                return delta
+
+    def extract_reasoning_content(
+        self, model_output: Union[str, CompletionChoice]
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Extract reasoning content from DeepSeek-R1 model output.
+
+        Args:
+            content (str): The model output content to parse.
+
+        Returns:
+            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
+        """
+        if not isinstance(model_output, str):
+            model_output = model_output["text"]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.reasoning_end_tag not in model_output:
+            return model_output, ""
+        else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.reasoning_start_tag not in model_output:
+                model_output = f"{self.reasoning_start_tag}{model_output}"
+            # Use a regex to find the reasoning content
+            reasoning_content = self.reasoning_regex.findall(model_output)[0]
+
+            end_index = len(
+                f"{self.reasoning_start_tag}{reasoning_content}{self.reasoning_end_tag}"
+            )
+            final_output = model_output[end_index:]
+
+            if len(final_output) == 0:
+                return reasoning_content, ""
+            return reasoning_content, final_output
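A quick sanity check of the non-streaming path, outside the diff itself: it assumes xinference 1.3.0.post1 is importable and simply feeds plain strings to the parser added above.

# Sketch only: the R1 parser splits chain-of-thought from the final answer.
from xinference.model.llm.reasoning_parsers.deepseek_r1_reasoning_parser import (
    DeepSeekR1ReasoningParser,
)

parser = DeepSeekR1ReasoningParser()

# R1 may omit the leading <think>; the parser prepends it before matching.
reasoning, answer = parser.extract_reasoning_content(
    "First I compare the two decimals...</think>9.11 is smaller than 9.9."
)
print(reasoning)  # "First I compare the two decimals..."
print(answer)     # "9.11 is smaller than 9.9."

# Without any </think>, the whole output is treated as reasoning content.
print(parser.extract_reasoning_content("still thinking"))  # ("still thinking", "")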
xinference/model/llm/sglang/core.py
CHANGED
@@ -14,10 +14,14 @@
 
 import json
 import logging
+import sys
+import threading
 import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
+from xoscar.utils import get_next_port
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -40,6 +44,10 @@ class SGLANGModelConfig(TypedDict, total=False):
     mem_fraction_static: float
     log_level: str
     attention_reduce_in_fp32: bool  # For gemma
+    # distributed
+    nnodes: Optional[int]
+    node_rank: Optional[int]
+    dist_init_addr: Optional[str]
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -91,6 +99,10 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+    "deepseek-v3",
+    "deepseek-r1",
 ]
 
 
@@ -107,6 +119,16 @@ class SGLANGModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self._address = model_config.pop("address", None)  # type: ignore
+        self._n_worker = model_config.pop("n_worker", 1)  # type: ignore
+        self._shard = model_config.pop("shard", 0)  # type: ignore
+        self._driver_info = model_config.pop("driver_info", None)  # type: ignore
+        self._loading_thread = None
+        self._loading_error = None
+
+    @property
+    def driver_info(self) -> Optional[dict]:
+        return self._driver_info
 
     def load(self):
         try:
@@ -128,18 +150,84 @@
         else:
             self._model_config.setdefault("attention_reduce_in_fp32", False)
 
-
-
-
+        # gen port for sgl Runtime,
+        # this is useful for sglang service on a same machine.
+        # sglang typically find a port between [port, 40000]
+        # we need to ensure the generated port < 40000
+        sgl_port = None
+        for _ in range(10):
+            sgl_port = get_next_port()
+            if sgl_port >= 40000:
+                sgl_port = None
+            else:
+                break
+        if sgl_port is None:
+            raise ValueError("Failed to find a port for sglang")
+
+        if self._n_worker > 1:
+            # distributed inference
+            self._model_config["nnodes"] = self._n_worker
+            self._model_config["node_rank"] = self._shard
+            # model across multiple workers
+            if self._shard == 0:
+                # distributed, need to init driver_info
+                assert self._driver_info is None
+                # This must run inside Xoscar pool
+                dist_init_addr = f"{self._address.split(':', 1)[0]}:{get_next_port()}"
+                self._driver_info = {"dist_init_addr": dist_init_addr}
+                self._model_config["dist_init_addr"] = dist_init_addr
+            else:
+                assert self._driver_info is not None
+                self._model_config["dist_init_addr"] = self._driver_info[
+                    "dist_init_addr"
+                ]
 
-
-
-
-
-
+            logger.info(
+                f"Loading {self.model_uid}, shard({self._shard} of {self._n_worker}) with following model config: {self._model_config}"
+            )
+
+            def _load():
+                try:
+                    self._engine = sgl.Runtime(
+                        model_path=self.model_path,
+                        tokenizer_path=self.model_path,
+                        port=sgl_port,
+                        **self._model_config,
+                    )
+                except:
+                    logger.exception("Creating sglang Runtime failed")
+                    self._loading_error = sys.exc_info()
+
+            self._loading_thread = threading.Thread(target=_load)
+            self._loading_thread.start()
+            if self._shard == 0:
+                # wait for 3 seconds to ensure torch distributed inited first
+                self._loading_thread.join(3)
+        else:
+            logger.info(
+                f"Loading {self.model_uid} with following model config: {self._model_config}"
+            )
+
+            self._engine = sgl.Runtime(
+                model_path=self.model_path,
+                tokenizer_path=self.model_path,
+                port=sgl_port,
+                **self._model_config,
+            )
+
+    def wait_for_load(self):
+        if self._loading_thread:
+            if self._shard == 0:
+                # for the shard 0, we wait it to complete
+                # the sglang will serve forever for the other shards,
+                # so we only check if any error happens.
+                self._loading_thread.join()
+                if self._loading_error:
+                    _, err, tb = self._loading_error
+                    raise err.with_traceback(tb)
 
     def stop(self):
-        logger.info("Stopping SGLang engine")
+        logger.info("Stopping SGLang engine, sglang pid: %s", self._engine.pid)
         self._engine.shutdown()
 
     def _sanitize_model_config(
@@ -151,7 +239,7 @@ class SGLANGModel(LLM):
         cuda_count = self._get_cuda_count()
         model_config.setdefault("tokenizer_mode", "auto")
         model_config.setdefault("trust_remote_code", True)
-        model_config.setdefault("tp_size", cuda_count)
+        model_config.setdefault("tp_size", cuda_count * self._n_worker)
         # See https://github.com/sgl-project/sglang/blob/00023d622a6d484e67ef4a0e444f708b8fc861c8/python/sglang/srt/server_args.py#L100-L109
         mem_fraction_static = model_config.get("mem_fraction_static")
         if mem_fraction_static is None:
@@ -159,7 +247,7 @@ class SGLANGModel(LLM):
         if tp_size >= 16:
             model_config["mem_fraction_static"] = 0.79
         elif tp_size >= 8:
-            model_config["mem_fraction_static"] = 0.
+            model_config["mem_fraction_static"] = 0.81
         elif tp_size >= 4:
             model_config["mem_fraction_static"] = 0.85
         elif tp_size >= 2:
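As a side note on the port handling above, here is the same retry loop pulled out into a standalone helper. This is a sketch, not code from the release; pick_sglang_port is a hypothetical name, while xoscar.utils.get_next_port is the helper the hunk actually imports, and the 40000 ceiling mirrors SGLang's own port search range.

# Sketch only: find a free port below 40000, retrying a few times like SGLANGModel.load().
from xoscar.utils import get_next_port


def pick_sglang_port(max_tries: int = 10) -> int:
    for _ in range(max_tries):
        port = get_next_port()
        if port < 40000:
            return port
    raise ValueError("Failed to find a port for sglang")


print(pick_sglang_port())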
xinference/model/llm/transformers/intern_vl.py
CHANGED
@@ -265,15 +265,24 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
+        model_name = f"{self.model_family.model_name.lower()}-{model_size}"
         num_layers = {
-            "1B": 24,
-            "2B": 24,
-            "4B": 32,
-            "8B": 32,
-            "26B": 48,
-            "40B": 60,
-            "76B": 80,
-
+            "internvl2-1B": 24,
+            "internvl2-2B": 24,
+            "internvl2-4B": 32,
+            "internvl2-8B": 32,
+            "internvl2-26B": 48,
+            "internvl2-40B": 60,
+            "internvl2-76B": 80,
+            "internvl2.5-1B": 24,
+            "internvl2.5-2B": 24,
+            "internvl2.5-4B": 36,
+            "internvl2.5-8B": 32,
+            "internvl2.5-26B": 48,
+            "internvl2.5-38B": 64,
+            "internvl2.5-78B": 80,
+        }[model_name]
+
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
         num_layers_per_gpu = [num_layers_per_gpu] * world_size
@@ -322,9 +331,7 @@ class InternVLChatModel(PytorchChatModel):
             self._model.cuda()
 
         self._tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True,
-            use_fast=False,
+            self.model_path, trust_remote_code=True, use_fast=False
         )
 
     @cache_clean
@@ -339,11 +346,12 @@ class InternVLChatModel(PytorchChatModel):
         IMG_END_TOKEN = "</img>"
         IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
 
+        generate_config = generate_config if isinstance(generate_config, dict) else {}
+
         generation_config = {
-            "max_new_tokens": generate_config.get("max_tokens", 1024)
-            if generate_config
-            else 1024,
+            "max_new_tokens": (generate_config.get("max_tokens", 1024)),
             "do_sample": False,
+            "temperature": generate_config.get("temperature", None),
         }
 
         stream = (
@@ -458,6 +466,7 @@ class InternVLChatModel(PytorchChatModel):
         streamer = TextIteratorStreamer(
             self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
         )
+
        # Define the generation configuration
         generate_kwargs["streamer"] = streamer
         # Start the model chat in a separate thread
xinference/model/llm/utils.py
CHANGED
@@ -54,6 +54,7 @@ from .llm_family import (
     _get_cache_dir,
     get_cache_status,
 )
+from .reasoning_parsers.abs_reasoning_parsers import ReasoningParser
 
 logger = logging.getLogger(__name__)
 
@@ -321,6 +322,7 @@ class ChatModelMixin:
     def _to_chat_completion_chunks(
         cls,
         chunks: Iterator[CompletionChunk],
+        reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
         for i, chunk in enumerate(chunks):
             if i == 0:
@@ -365,37 +367,72 @@
     async def _async_to_chat_completion_chunks(
         cls,
         chunks: AsyncGenerator[CompletionChunk, None],
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
+        previous_text = ""
+        current_text = ""
         async for chunk in chunks:
             if i == 0:
-
-
-
-
-                yield cls._get_final_chat_completion_chunk(chunk)
+                chat_chunk = cls._get_first_chat_completion_chunk(chunk)
+            elif not chunk.get("choices"):
+                # usage
+                chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
-
+                chat_chunk = cls._to_chat_completion_chunk(chunk)
+            if reasoning_parser is not None:
+                choices = chat_chunk.get("choices")
+                if choices is None:
+                    continue
+                for choice in choices:
+                    delta = choice.get("delta")
+                    if not delta:
+                        continue
+                    current_text = previous_text + delta.get("content", "")
+                    choice[
+                        "delta"
+                    ] = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_text,
+                        current_text=current_text,
+                        delta=delta,
+                    )
+                    previous_text = current_text
+            yield chat_chunk
             i += 1
 
     @staticmethod
-    def _to_chat_completion(
+    def _to_chat_completion(
+        completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
+    ) -> ChatCompletion:
+        choices = []
+        for i, choice in enumerate(completion["choices"]):
+            content = choice["text"]
+            reasoning_content = None
+
+            if reasoning_parser is not None:
+                reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                    choice
+                )
+
+            message = {"role": "assistant", "content": content}
+
+            # add only reasoning_content is None
+            if reasoning_content is not None:
+                message["reasoning_content"] = reasoning_content
+
+            choices.append(
+                {
+                    "index": i,
+                    "message": message,
+                    "finish_reason": choice["finish_reason"],
+                }
+            )
         return {
             "id": "chat" + completion["id"],
             "object": "chat.completion",
             "created": completion["created"],
             "model": completion["model"],
-            "choices": [
-                {
-                    "index": i,
-                    "message": {
-                        "role": "assistant",
-                        "content": choice["text"],
-                    },
-                    "finish_reason": choice["finish_reason"],
-                }
-                for i, choice in enumerate(completion["choices"])
-            ],
+            "choices": choices,  # type: ignore
             "usage": completion["usage"],
         }
 
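What the reworked streaming conversion looks like from a client's point of view: a sketch only, assuming an OpenAI-compatible Xinference endpoint on the default port and a model launched with reasoning-content parsing enabled; the base URL and model uid below are example values.

# Sketch only: reasoning_content shows up on streaming deltas alongside content.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
stream = client.chat.completions.create(
    model="deepseek-r1-distill-qwen",  # example uid of a launched model
    messages=[{"role": "user", "content": "Which is larger, 9.9 or 9.11?"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # While the model is "thinking", content stays empty and reasoning_content carries text.
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        print(reasoning, end="", flush=True)
    elif delta.content:
        print(delta.content, end="", flush=True)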
xinference/model/llm/vllm/core.py
CHANGED
@@ -43,6 +43,8 @@ from ....types import (
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
+from ..reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
+from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -72,6 +74,7 @@ class VLLMModelConfig(TypedDict, total=False):
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
     scheduling_policy: Optional[str]
+    reasoning_content: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -176,6 +179,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.1":
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2-chat-0628")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-v3")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
@@ -190,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -206,6 +212,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen-2.5-instruct-1m")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -234,6 +243,7 @@ class VLLMModel(LLM):
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
         self._xavier_config = None
+        self.reasoning_parser = None
 
     def set_xavier_config(self, value: Optional[Dict]):
         self._xavier_config = value  # type: ignore
@@ -262,6 +272,16 @@
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
+        reasoning_content = self._model_config.pop("reasoning_content")
+
+        # Initialize reasoning parser if model has reasoning ability
+        if "reasoning" in self.model_family.model_ability and reasoning_content:
+            module_name = self.model_family.model_family or self.model_family.model_name
+            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
+            self.reasoning_parser = self.reasoning_parser(
+                self.model_family.reasoning_start_tag,
+                self.model_family.reasoning_end_tag,
+            )
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -368,6 +388,7 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
+        model_config.setdefault("reasoning_content", False)
         # Add scheduling policy if vLLM version is 0.6.3 or higher
         if vllm.__version__ >= "0.6.3":
            model_config.setdefault("scheduling_policy", "fcfs")
@@ -835,7 +856,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert isinstance(agen, AsyncGenerator)
             if tools:
                 return self._async_to_tool_completion_chunks(agen)
-            return self._async_to_chat_completion_chunks(agen)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id
@@ -843,7 +864,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert not isinstance(c, AsyncGenerator)
             if tools:
                 return self._tool_calls_completion(self.model_family, self.model_uid, c)
-            return self._to_chat_completion(c)
+            return self._to_chat_completion(c, self.reasoning_parser)
 
 
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
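Finally, how the new reasoning_content switch in VLLMModelConfig might be exercised end to end through the Python client. This is a sketch under the assumption that extra launch_model kwargs are forwarded into the vLLM model config; the model name, size, and uid handling are example values.

# Sketch only: launch a reasoning model with reasoning_content enabled and read both fields back.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",
    model_engine="vllm",
    model_size_in_billions=7,
    reasoning_content=True,  # the new VLLMModelConfig option; defaults to False in this release
)
model = client.get_model(uid)
result = model.chat(messages=[{"role": "user", "content": "Why is the sky blue?"}])
message = result["choices"][0]["message"]
print(message.get("reasoning_content"))  # chain-of-thought extracted from <think>...</think>
print(message["content"])                # final answer with the think block stripped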
|