xinference 1.8.1rc1__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +5 -0
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +3 -4
- xinference/deploy/local.py +5 -0
- xinference/deploy/worker.py +6 -0
- xinference/model/core.py +3 -0
- xinference/model/embedding/sentence_transformers/core.py +3 -4
- xinference/model/embedding/vllm/core.py +4 -3
- xinference/model/image/model_spec.json +69 -0
- xinference/model/image/stable_diffusion/core.py +22 -0
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llm_family.json +293 -8
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +15 -7
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +38 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +84 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +106 -0
- xinference/model/utils.py +109 -0
- xinference/types.py +2 -0
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/{main.b969199a.js → main.4918643a.js} +3 -3
- xinference/ui/web/ui/build/static/js/{main.b969199a.js.map → main.4918643a.js.map} +1 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/METADATA +6 -1
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/RECORD +58 -50
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.4918643a.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/core.py
CHANGED

@@ -286,12 +286,18 @@ class PytorchModel(LLM):
 
         kwargs = {}
 
-
-
-
-
+        torch_dtype = self._pytorch_model_config.get("torch_dtype")
+        if torch_dtype is not None:
+            if isinstance(torch_dtype, str) and torch_dtype != "auto":
+                torch_dtype = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = torch_dtype
         else:
-
+            dtype = get_device_preferred_dtype(self._device)
+
+            if dtype is not None:
+                kwargs["torch_dtype"] = dtype
+            else:
+                raise ValueError(f"Device {self._device} is not supported in temporary")
 
         kwargs["revision"] = self._pytorch_model_config.get(
             "revision", self.model_spec.model_revision

@@ -327,6 +333,8 @@ class PytorchModel(LLM):
             reasoning_content, enable_thinking=enable_thinking
         )
 
+        logger.debug("Loading Transformers model with kwargs: %s", kwargs)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:

@@ -488,7 +496,7 @@ class PytorchModel(LLM):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:

@@ -878,7 +886,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in NON_DEFAULT_MODEL_LIST:
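
Note: the first hunk above lets callers pin the dtype the Transformers engine uses when loading weights, instead of always falling back to the device-preferred dtype. A minimal sketch of how that might be exercised from the Python client, assuming extra keyword arguments passed to launch_model are forwarded into the engine's pytorch_model_config (server URL and model name are placeholders):

# Sketch only: assumes a running Xinference endpoint and that extra launch
# kwargs reach pytorch_model_config for the Transformers engine.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="qwen3",             # placeholder model name
    model_engine="transformers",
    torch_dtype="bfloat16",         # honored by the new torch_dtype branch above;
                                    # "auto" defers to Transformers' own choice
)
print(uid)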
xinference/model/llm/transformers/gemma3.py
CHANGED

@@ -28,7 +28,7 @@ class Gemma3TextChatModel(PytorchChatModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-1b-it".lower() in llm_family.lower():
xinference/model/llm/transformers/gpt_oss.py
ADDED

@@ -0,0 +1,91 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import logging
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    PytorchGenerateConfig,
+    PytorchModelConfig,
+)
+from ..harmony import async_stream_harmony_chat_completion
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from .core import PytorchChatModel, register_non_default_model
+
+logger = logging.getLogger(__name__)
+
+
+@register_transformer
+@register_non_default_model("gpt-oss")
+class GPTOSSPytorchChatModel(PytorchChatModel):
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        config = super()._sanitize_model_config(pytorch_model_config)
+        config.setdefault("torch_dtype", "auto")
+        return config  # type:ignore
+
+    @classmethod
+    def match_json(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if "gpt" not in model_family and "oss" not in model_family:
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    async def chat(  # type:ignore
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        gen = super().chat(messages, generate_config=generate_config)
+
+        if inspect.iscoroutine(gen):
+            gen = await gen
+
+        if inspect.isasyncgen(gen):
+            # Streaming
+            async def stream_parser():
+                full_text = ""
+                full_reasoning = ""
+
+                async for parsed_chunk in async_stream_harmony_chat_completion(gen):
+                    choices = parsed_chunk.get("choices")
+                    if choices and len(choices) > 0:
+                        delta = choices[0].get("delta", {})
+                        if delta.get("content"):
+                            full_text += delta["content"]
+                        if delta.get("reasoning_content"):
+                            full_reasoning += delta["reasoning_content"]
+                    yield parsed_chunk
+
+                logger.debug(
+                    "Chat finished, content: %r, reasoning: %r",
+                    full_text,
+                    full_reasoning,
+                )
+
+            return stream_parser()
+
+        else:
+            # Non-streaming sync - handle single result
+            async for parsed_completion in async_stream_harmony_chat_completion(gen):  # type: ignore
+                return parsed_completion
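
Note: the new gpt_oss.py above routes generation through the Harmony parser so that streamed chunks expose reasoning_content separately from content. A hedged client-side sketch of consuming that stream (model uid and prompt are placeholders; the chunk fields mirror the delta shape used above):

# Sketch only: assumes a gpt-oss model is already launched on a local endpoint.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("my-gpt-oss")  # placeholder model uid

for chunk in model.chat(
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    generate_config={"stream": True},
):
    delta = chunk["choices"][0].get("delta", {})
    if delta.get("reasoning_content"):      # the model's analysis/thinking channel
        print(delta["reasoning_content"], end="")
    if delta.get("content"):                # the final answer channel
        print(delta["content"], end="")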
xinference/model/llm/transformers/multimodal/core.py
CHANGED

@@ -21,9 +21,9 @@ from .....types import (
     CompletionChunk,
     PytorchGenerateConfig,
 )
+from ....utils import cache_clean
 from ...utils import generate_chat_completion, generate_completion_chunk
 from ..core import PytorchChatModel
-from ..utils import cache_clean
 
 
 class PytorchMultiModalModel(PytorchChatModel):
xinference/model/llm/transformers/multimodal/gemma3.py
CHANGED

@@ -31,7 +31,7 @@ class Gemma3ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "gemma-3-it".lower() in llm_family.lower():
xinference/model/llm/transformers/multimodal/glm4_1v.py
CHANGED

@@ -28,14 +28,14 @@ logger = logging.getLogger(__name__)
 
 
 @register_transformer
-@register_non_default_model("glm-4.1v-thinking")
+@register_non_default_model("glm-4.1v-thinking", "glm-4.5v")
 class Glm4_1VModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
-        if "glm-4.1v" in family.lower():
+        if "glm-4.1v" in family.lower() or "glm-4.5v" in family.lower():
             return True
         return False
 
xinference/model/llm/transformers/multimodal/ovis2.py
CHANGED

@@ -37,7 +37,7 @@ class Ovis2ChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "ovis2".lower() in llm_family.lower():
xinference/model/llm/transformers/multimodal/qwen-omni.py
CHANGED

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
-import importlib.util
 import io
 import logging
 import time

@@ -20,13 +19,13 @@ import uuid
 from threading import Thread
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
-from .....model.utils import select_device
 from .....types import (
     ChatCompletion,
     ChatCompletionAudio,
     ChatCompletionChoice,
     CompletionUsage,
 )
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import PytorchGenerateConfig, register_non_default_model
 from .core import PytorchMultiModalModel

@@ -46,7 +45,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2.5-omni".lower() in llm_family.lower():

@@ -71,12 +70,12 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
 
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if self._device == "cuda" else self._device
-
-
-
-            if not flash_attn_installed
-            else {"attn_implementation": "flash_attention_2"}
+        kwargs = {}
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
         )
+        if enable_flash_attn:
+            kwargs["attn_implementation"] = "flash_attention_2"
         kwargs = self.apply_bnb_quantization(kwargs)
         logger.debug("Loading model with extra kwargs: %s", kwargs)
 
xinference/model/llm/transformers/multimodal/qwen2_vl.py
CHANGED

@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import importlib.util
 import logging
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from .....core.model import register_batching_multimodal_models
 from .....device_utils import is_npu_available
-from .....model.utils import select_device
 from .....types import PytorchModelConfig
 from ....scheduler.request import InferenceRequest
+from ....utils import is_flash_attn_available, select_device
 from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -48,7 +47,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():

@@ -87,7 +86,6 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             Qwen2_5_VLForConditionalGeneration = None
 
         kwargs = self.apply_bnb_quantization()
-        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
         llm_family = self.model_family.model_family or self.model_family.model_name
         model_cls = (
             Qwen2_5_VLForConditionalGeneration

@@ -97,12 +95,17 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
-
+
+        enable_flash_attn = self._pytorch_model_config.get(
+            "enable_flash_attn", is_flash_attn_available()
+        )
+
+        if enable_flash_attn:
             self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
-                device_map=device,
                 attn_implementation="flash_attention_2",
+                device_map=device,
                 trust_remote_code=True,
                 **kwargs,
             ).eval()
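
Note: both multimodal loaders above replace the import-based flash_attn probe with an enable_flash_attn switch read from pytorch_model_config, defaulting to is_flash_attn_available(). A hedged sketch of turning it off explicitly at launch time, assuming extra launch kwargs are forwarded into pytorch_model_config:

# Sketch only: forces the default attention implementation even when the
# flash_attn package is importable; names are placeholders.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
client.launch_model(
    model_name="qwen2.5-omni",
    model_engine="transformers",
    enable_flash_attn=False,
)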
xinference/model/llm/transformers/utils.py
CHANGED

@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import functools
+
 import logging
 import os
 import time

@@ -495,34 +494,3 @@ def batch_inference_one_step(
         for r in req_list:
             r.stopped = True
             r.error_msg = str(e)
-
-
-def cache_clean(fn):
-    @functools.wraps(fn)
-    async def _async_wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = await fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    @functools.wraps(fn)
-    def _wrapper(self, *args, **kwargs):
-        import gc
-
-        from ....device_utils import empty_cache
-
-        result = fn(self, *args, **kwargs)
-
-        gc.collect()
-        empty_cache()
-        return result
-
-    if asyncio.iscoroutinefunction(fn):
-        return _async_wrapper
-    else:
-        return _wrapper
xinference/model/llm/utils.py
CHANGED
@@ -67,6 +67,9 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen3",
     "HuatuoGPT-o1-Qwen2.5",
     "DianJin-R1",
+    "Qwen3-Thinking",
+    "Qwen3-Instruct",
+    "Qwen3-Coder",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [

@@ -79,9 +82,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = [
-    "deepseek-v3",
-]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"]
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY

@@ -167,8 +168,7 @@ class ChatModelMixin:
                 return json.loads(kwargs)
             except json.JSONDecodeError:
                 raise TypeError(
-                    f"`chat_template_kwargs` should be json parsable, "
-                    f"got: {kwargs}"
+                    f"`chat_template_kwargs` should be json parsable, got: {kwargs}"
                 )
         elif isinstance(kwargs, dict):
             return kwargs

@@ -254,7 +254,7 @@ class ChatModelMixin:
                 ret += role + "\n" + text + intra_message_sep + "\n"
             else:
                 placeholders = "\n".join(
-                    f"Image-{i+1}: <image>\n"
+                    f"Image-{i + 1}: <image>\n"
                     for i in range(
                         len(images) - len(image_futures), len(images)
                     )

@@ -463,6 +463,7 @@ class ChatModelMixin:
             chat_context_var.set(ctx)
 
         previous_texts = [""]
+        full_text = ""
         # Process chunks
         if reasoning_parser:
             set_context()

@@ -474,10 +475,14 @@ class ChatModelMixin:
                     # usage
                     chat_chunk = cls._get_final_chat_completion_chunk(chunk)
                 else:
+                    if choices[0].get("text"):
+                        full_text += choices[0]["text"]  # type: ignore
+
                     chat_chunk = cls._to_chat_completion_chunk(
                         chunk, reasoning_parser, previous_texts
                     )
                 yield chat_chunk
+        logger.debug("Chat finished, output: %s", full_text)
 
     @staticmethod
     def _to_chat_completion(

@@ -683,6 +688,52 @@ class ChatModelMixin:
 
         return results
 
+    @classmethod
+    def _eval_deepseek_r1_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 (0528) chat template format.
+        Returns:
+            List of (None, function_name, arguments_dict)
+            or (raw_content, None, None) if parsing fails.
+        """
+        text = c["choices"][0]["text"]
+        pattern = (
+            r"<\|tool▁call▁begin|>function<\|tool▁sep|>([^\n]+)\n"
+            r"```json\n(.*?)\n```<\|tool▁call▁end|>"
+        )
+
+        matches = re.findall(pattern, text, re.DOTALL)
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()
+        results = []
+
+        for func_name, raw_json in matches:
+            func_and_args = None
+            try:
+                func_and_args = json.loads(raw_json)
+                arguments_hashable = frozenset(func_and_args.items())
+                tool_call_tuple = (
+                    None,
+                    func_name,
+                    func_and_args,
+                )
+            except Exception:
+                tool_call_tuple = (raw_json, None, None)
+                arguments_hashable = None
+
+            dedup_key = (
+                (func_name, arguments_hashable)
+                if func_and_args is not None
+                else raw_json
+            )
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
     @classmethod
     def _eval_tool_arguments(
         cls, model_family, c, tool_call_text: Optional[str] = None

@@ -695,7 +746,10 @@ class ChatModelMixin:
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
         elif family in DEEPSEEK_TOOL_CALL_FAMILY:
-
+            if family == "deepseek-r1-0528":
+                result = cls._eval_deepseek_r1_arguments(c)
+            else:
+                result = cls._eval_deepseek_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
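
Note: the _eval_deepseek_r1_arguments helper above deduplicates repeated tool calls by freezing each parsed argument dict into a hashable key. The same idea in isolation, as a standalone illustration (the sample data is invented):

# Standalone illustration of the dedup step: identical (name, arguments) pairs
# collapse to a single tool call because dict items are frozen into a set key.
import json

raw_calls = [
    ("get_weather", '{"city": "Paris"}'),
    ("get_weather", '{"city": "Paris"}'),   # exact duplicate, dropped
    ("get_weather", '{"city": "Tokyo"}'),
]

seen, results = set(), []
for name, raw_json in raw_calls:
    args = json.loads(raw_json)
    key = (name, frozenset(args.items()))
    if key not in seen:
        seen.add(key)
        results.append((None, name, args))

print(len(results))  # 2 unique tool calls survive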
xinference/model/llm/vllm/core.py
CHANGED

@@ -89,6 +89,7 @@ class VLLMModelConfig(TypedDict, total=False):
     mm_processor_kwargs: NotRequired[dict[str, Any]]
     min_pixels: NotRequired[int]
     max_pixels: NotRequired[int]
+    enable_expert_parallel: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):

@@ -273,8 +274,12 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
 
-if VLLM_INSTALLED and VLLM_VERSION
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
 
 
 class VLLMModel(LLM):

@@ -557,7 +562,9 @@ class VLLMModel(LLM):
             raise err.with_traceback(tb)
 
         # set context length after engine inited
-
+        # if shard > 0, the engine will be inited in another process
+        if self._engine:
+            self._set_context_length()
 
     def _set_context_length(self):
         from vllm import envs

@@ -839,7 +846,7 @@ class VLLMModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):

@@ -1187,7 +1194,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in [
+        if llm_spec.model_format not in [
+            "pytorch",
+            "gptq",
+            "awq",
+            "fp8",
+            "bnb",
+            "ggufv2",
+        ]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):

@@ -1284,6 +1298,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
+        full_text = ""
         if self.reasoning_parser:
             set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)

@@ -1299,6 +1314,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
+                full_text += chunk["choices"][0]["text"]
                 if self.is_tool_call_chunk_start(chunk):
                     tool_call = True
                 if tool_call:

@@ -1320,6 +1336,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                         chunk, self.reasoning_parser, previous_texts
                     )
             i += 1
+        logger.debug("Chat finished, output: %s", full_text)
 
     @vllm_check
     async def async_chat(

@@ -1348,13 +1365,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         ):
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
 
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
 
+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
         if stream:
             agen = await self.async_generate(
                 full_prompt, generate_config, tools, request_id=request_id

@@ -1386,7 +1416,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):