xinference 1.7.0.post1__py3-none-any.whl → 1.7.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +3 -4
- xinference/client/__init__.py +2 -0
- xinference/client/common.py +49 -2
- xinference/client/handlers.py +18 -0
- xinference/client/restful/async_restful_client.py +1760 -0
- xinference/client/restful/restful_client.py +74 -78
- xinference/core/media_interface.py +3 -1
- xinference/core/model.py +5 -4
- xinference/core/supervisor.py +10 -5
- xinference/core/worker.py +15 -14
- xinference/deploy/local.py +51 -9
- xinference/deploy/worker.py +5 -3
- xinference/device_utils.py +22 -3
- xinference/model/audio/fish_speech.py +23 -34
- xinference/model/audio/model_spec.json +4 -2
- xinference/model/audio/model_spec_modelscope.json +4 -2
- xinference/model/audio/utils.py +2 -2
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +8 -8
- xinference/model/embedding/custom.py +6 -1
- xinference/model/embedding/embed_family.py +0 -41
- xinference/model/embedding/model_spec.json +10 -1
- xinference/model/embedding/model_spec_modelscope.json +10 -1
- xinference/model/embedding/sentence_transformers/core.py +30 -15
- xinference/model/flexible/core.py +1 -1
- xinference/model/flexible/launchers/__init__.py +2 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -1
- xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
- xinference/model/flexible/launchers/transformers_launcher.py +5 -5
- xinference/model/flexible/launchers/yolo_launcher.py +62 -0
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/core.py +18 -1
- xinference/model/llm/llama_cpp/core.py +1 -1
- xinference/model/llm/llm_family.json +41 -1
- xinference/model/llm/llm_family.py +6 -0
- xinference/model/llm/llm_family_modelscope.json +43 -1
- xinference/model/llm/mlx/core.py +271 -18
- xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
- xinference/model/llm/mlx/distributed_models/core.py +164 -0
- xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
- xinference/model/llm/reasoning_parser.py +12 -6
- xinference/model/llm/sglang/core.py +8 -4
- xinference/model/llm/transformers/chatglm.py +4 -1
- xinference/model/llm/transformers/core.py +4 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
- xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
- xinference/model/llm/utils.py +36 -17
- xinference/model/llm/vllm/core.py +142 -34
- xinference/model/llm/vllm/distributed_executor.py +96 -21
- xinference/model/llm/vllm/xavier/transfer.py +2 -2
- xinference/model/rerank/core.py +16 -9
- xinference/model/rerank/model_spec.json +3 -3
- xinference/model/rerank/model_spec_modelscope.json +3 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -0
- xinference/web/ui/src/locales/ja.json +3 -0
- xinference/web/ui/src/locales/ko.json +3 -0
- xinference/web/ui/src/locales/zh.json +3 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/METADATA +4 -3
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/RECORD +77 -67
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/WHEEL +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/distributed_models/qwen2.py
ADDED
@@ -0,0 +1,82 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen2 import Model as _Model
+from mlx_lm.models.qwen2 import ModelArgs
+from mlx_lm.models.qwen2 import Qwen2Model as _Qwen2Model
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2Model(_Qwen2Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen2Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            h = input_embeddings
+        else:
+            h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen2Model(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)

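The new distributed MLX models rely on DistributedModelMixin (added in distributed_models/core.py, not reproduced here) to expose self.rank, self.world_size, self.start_idx and self.num_layers, so each pipeline stage only runs its own slice of the decoder layers. The helper below is a minimal sketch of how such a per-rank shard could be computed; it illustrates the partitioning idea only and is an assumption, not the mixin's actual code.

# Illustrative sketch only: how a pipeline stage's (start_idx, num_layers)
# slice could be derived from rank and world_size. Function name and the
# even-split policy are assumptions for illustration.
def layer_shard(total_layers: int, rank: int, world_size: int) -> tuple[int, int]:
    """Return (start_idx, num_layers) for one pipeline stage."""
    base, remainder = divmod(total_layers, world_size)
    num_layers = base + (1 if rank < remainder else 0)
    start_idx = rank * base + min(rank, remainder)
    return start_idx, num_layers

# e.g. 28 Qwen2 layers over 3 stages -> (0, 10), (10, 9), (19, 9)
print([layer_shard(28, r, 3) for r in range(3)])
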
xinference/model/llm/mlx/distributed_models/qwen3.py
ADDED
@@ -0,0 +1,82 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen3 import Model as _Model
+from mlx_lm.models.qwen3 import ModelArgs
+from mlx_lm.models.qwen3 import Qwen3Model as _Qwen3Model
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3Model(_Qwen3Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen3Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            h = input_embeddings
+        else:
+            h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen3Model(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)

xinference/model/llm/mlx/distributed_models/qwen3_moe.py
ADDED
@@ -0,0 +1,76 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen3_moe import Model as _Model
+from mlx_lm.models.qwen3_moe import ModelArgs
+from mlx_lm.models.qwen3_moe import Qwen3MoeModel as _Qwen3MoeModel
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3MoeModel(_Qwen3MoeModel, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen3MoeModel.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        mask: mx.array = None,
+        cache=None,
+    ):
+        h = self.embed_tokens(inputs)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen3MoeModel(args)
+        self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)

xinference/model/llm/reasoning_parser.py
CHANGED
@@ -222,6 +222,12 @@ class ReasoningParser:
             ],
         )
 
+    def is_enable_thinking(self):
+        from .core import chat_context_var
+
+        context = chat_context_var.get({})
+        return context.get("enable_thinking", self.enable_thinking)
+
     async def prepare_reasoning_content_streaming(
         self, chunks: AsyncGenerator[CompletionChunk, None]
     ):
@@ -237,7 +243,7 @@ class ReasoningParser:
 
         # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
         # yield chunks as is
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
             async for chunk in chunks:
                 yield chunk
             return
@@ -266,7 +272,7 @@ class ReasoningParser:
                 continue
             assert isinstance(delta, dict)
             text = delta.get("content")
-            if text is None:
+            if not text:
                 continue
             # If the first chunk doesn't contain the reasoning_start_tag
             if self.reasoning_start_tag not in text:
@@ -277,7 +283,7 @@ class ReasoningParser:
             else:
                 # For standard completion chunks
                 text = choices[0].get("text")
-                if text is None:
+                if not text:
                     continue
                 # If the first chunk doesn't contain the reasoning_start_tag
                 if self.reasoning_start_tag not in text:
@@ -304,7 +310,7 @@ class ReasoningParser:
         """
         # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
        # yield chunks as is
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
             for chunk in chunks:
                 yield chunk
             return
@@ -365,7 +371,7 @@ class ReasoningParser:
            completion: The completion object containing model output,
                which can be either a chat completion or a standard completion.
         """
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            return completion
 
        if completion.get("object") == "chat.completion" and completion.get("choices"):
@@ -399,7 +405,7 @@ class ReasoningParser:
                or an empty list if no modification is needed
        """
        chunks: List[ChatCompletionChunk] = []
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            return chunks
 
        choices = chunk.get("choices")

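The new is_enable_thinking() reads the per-request chat_template_kwargs that the chat entry points now store in chat_context_var (imported from the LLM core module elsewhere in this diff). The following is a minimal, self-contained sketch of that ContextVar pattern, using a stand-in parser class rather than the real ReasoningParser:

# Minimal sketch of the chat_context_var pattern: the chat entry point stores
# per-request chat_template_kwargs in a ContextVar; the parser reads it back,
# falling back to its own default. ParserSketch is a stand-in for illustration.
from contextvars import ContextVar
from typing import Any, Dict

chat_context_var: ContextVar[Dict[str, Any]] = ContextVar("chat_context_var")


class ParserSketch:
    def __init__(self, enable_thinking: bool = True):
        self.enable_thinking = enable_thinking

    def is_enable_thinking(self) -> bool:
        context = chat_context_var.get({})
        return context.get("enable_thinking", self.enable_thinking)


parser = ParserSketch(enable_thinking=True)
print(parser.is_enable_thinking())  # True: no per-request override yet
chat_context_var.set({"enable_thinking": False})
print(parser.is_enable_thinking())  # False: the request-level override wins
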
xinference/model/llm/sglang/core.py
CHANGED
@@ -33,6 +33,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..core import chat_context_var
 from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin, generate_completion_chunk
 
@@ -582,16 +583,17 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
         )
-
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
@@ -656,14 +658,16 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         chat_template: str = (
             self.model_family.chat_template if self.model_family.chat_template else ""
         )
-
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
+
         images, video_inputs = process_vision_info(messages)
         if video_inputs:
             raise ValueError("Not support video input now.")

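On the client side, these changes mean a per-request chat_template_kwargs passed through generate_config now reaches both the chat template and the reasoning parser. A hedged usage sketch, assuming the standard xinference RESTful client API; the endpoint, model uid and message content are placeholders:

# Hypothetical usage: disable thinking for one request on a hybrid model such
# as qwen3. Endpoint and model uid below are placeholders, not real values.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("my-qwen3-model")  # placeholder model uid

completion = model.chat(
    messages=[{"role": "user", "content": "Summarize the Apache-2.0 license."}],
    generate_config={
        # Forwarded to the chat template and, via chat_context_var,
        # to ReasoningParser.is_enable_thinking().
        "chat_template_kwargs": {"enable_thinking": False},
        "max_tokens": 256,
    },
)
print(completion["choices"][0]["message"]["content"])
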
xinference/model/llm/transformers/chatglm.py
CHANGED
@@ -22,6 +22,7 @@ import torch
 
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
+from ..core import chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
@@ -464,12 +465,14 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tools = list(tools) if tools is not None else None
         tool_choice = r.generate_config.get("tool_choice", "none")
 
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 r.generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         r.prompt = self._process_messages(
             r.prompt, tools=tools, tool_choice=tool_choice
         )

xinference/model/llm/transformers/core.py
CHANGED
@@ -37,7 +37,7 @@ from ....types import (
     PytorchModelConfig,
 )
 from ...utils import select_device
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -725,12 +725,14 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if (
             tools
             and model_family in QWEN_TOOL_CALL_FAMILY

xinference/model/llm/transformers/multimodal/cogagent.py
CHANGED
@@ -20,6 +20,7 @@ from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union
 import torch
 
 from .....model.utils import select_device
+from ...core import chat_context_var
 from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
@@ -33,8 +34,8 @@ logger = logging.getLogger(__name__)
 class CogAgentChatModel(PytorchMultiModalModel):
     def __init__(self, *args, **kws):
         super().__init__(*args, **kws)
-        self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"
-        self._format: Optional[
+        self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"  # type: ignore
+        self._format: Optional[  # type: ignore
             Literal[
                 "(Answer in Action-Operation-Sensitive format.)",
                 "(Answer in Status-Plan-Action-Operation format.)",
@@ -187,9 +188,14 @@ class CogAgentChatModel(PytorchMultiModalModel):
             "return_tensors": "pt",
             "return_dict": True,
         }
-        full_context_kwargs.update(
-            self._get_chat_template_kwargs_from_generate_config(
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs.update(chat_template_kwargs)
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(
             [{"role": "user", "image": image, "content": query}],

xinference/model/llm/transformers/multimodal/intern_vl.py
CHANGED
@@ -83,7 +83,7 @@ class InternVLChatModel(PytorchMultiModalModel):
     def load_multimodal_model(self):
         from transformers import AutoModel
 
-        kwargs: Dict[str, Any] = {
+        kwargs: Dict[str, Any] = {  # type: ignore
             "torch_dtype": torch.bfloat16,
             "low_cpu_mem_usage": True,
             "trust_remote_code": True,

xinference/model/llm/utils.py
CHANGED
@@ -167,13 +167,7 @@ class ChatModelMixin:
         generate_config: Optional[Union[dict, Any]],
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> Optional[dict]:
-        if reasoning_parser and not reasoning_parser.enable_thinking:
-            # hybrid model like qwen3,
-            # disabled thinking
-            return {"enable_thinking": False}
-        if not generate_config:
-            return None
-        if "chat_template_kwargs" in generate_config:
+        if generate_config and "chat_template_kwargs" in generate_config:
             kwargs = generate_config["chat_template_kwargs"]
             if isinstance(kwargs, str):
                 try:
@@ -190,6 +184,10 @@ class ChatModelMixin:
                         f"`chat_template_kwargs` but be a JSON parsable str "
                         f"or dict, got: {kwargs}"
                     )
+        elif reasoning_parser and not reasoning_parser.enable_thinking:
+            # hybrid model like qwen3,
+            # disabled thinking
+            return {"enable_thinking": False}
         return None
 
     @staticmethod
@@ -220,7 +218,7 @@ class ChatModelMixin:
         _messages = [x for x in messages]  # copy for not modifying the origin messages
         _messages.append({"role": "assistant", "content": ""})
 
-        if
+        if "internvl" in model_family.lower():
             system_prompt = (
                 messages[0]["content"] if messages[0]["role"] == "system" else ""
             )
@@ -558,14 +556,24 @@ class ChatModelMixin:
     @classmethod
     def _handle_qwen_tool_result(cls, text: str) -> List[Tuple]:
         text: str = text.strip()  # type: ignore
-
+
+        def split_into_blocks(text: str) -> list[str]:
+            # Match blocks starting with <think> or <tool_call> and ending with </think> or </tool_call>
+            pattern = r"(<(think|tool_call)>.*?</\2>)"
+            blocks = re.findall(pattern, text, re.DOTALL)
+            return [match[0] for match in blocks]
+
+        contents = split_into_blocks(text)
         results: List[Tuple] = []
         for content in contents:
             content = content.strip()
             if content:
-
-                if
-                    content = content[
+                pos1 = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
+                if pos1 != -1:
+                    content = content[pos1 + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
+                if pos2 != -1:
+                    content = content[:pos2]
                 content = content.strip()
                 try:
                     res = json.loads(content)
@@ -580,8 +588,12 @@ class ChatModelMixin:
         return results
 
     @classmethod
-    def _eval_qwen_chat_arguments(cls, c) -> List[Tuple]:
+    def _eval_qwen_chat_arguments(
+        cls, c, tool_call_text: Optional[str] = None
+    ) -> List[Tuple]:
         text = c["choices"][0]["text"]
+        if tool_call_text:
+            text = tool_call_text
         return cls._handle_qwen_tool_result(text)
 
     @classmethod
@@ -662,12 +674,14 @@ class ChatModelMixin:
         return results
 
     @classmethod
-    def _eval_tool_arguments(cls, model_family, c):
+    def _eval_tool_arguments(
+        cls, model_family, c, tool_call_text: Optional[str] = None
+    ):
         family = model_family.model_family or model_family.model_name
         if family in GLM4_TOOL_CALL_FAMILY:
             result = cls._eval_glm_chat_arguments(c)
         elif family in QWEN_TOOL_CALL_FAMILY:
-            result = cls._eval_qwen_chat_arguments(c)
+            result = cls._eval_qwen_chat_arguments(c, tool_call_text)
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
         elif family in DEEPSEEK_TOOL_CALL_FAMILY:
@@ -687,15 +701,17 @@ class ChatModelMixin:
         c,
         chunk_id=None,
         reasoning_parser: Optional[ReasoningParser] = None,
+        tool_call_text: Optional[str] = None,
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c)
+        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
         tool_calls = []
         failed_contents = []
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
+                        "index": 0,
                         "id": f"call_{_id}",
                         "type": "function",
                         "function": {
@@ -782,9 +798,12 @@ class ChatModelMixin:
                     }
                 )
             else:
-                failed_contents.append(content)
+                if content:
+                    failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
+        content = ". ".join(failed_contents) if failed_contents else None
+
         # fix: qwen tool_call content field return null
         family = model_family.model_family or model_family.model_name
         if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None: