xinference-1.9.1-py3-none-any.whl → xinference-1.10.0-py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/supervisor.py +29 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +20 -0
- xinference/model/embedding/sentence_transformers/core.py +4 -4
- xinference/model/embedding/vllm/core.py +7 -1
- xinference/model/image/model_spec.json +2 -3
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +1 -0
- xinference/model/llm/llm_family.json +40 -20
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +1 -1
- xinference/model/llm/utils.py +127 -45
- xinference/model/llm/vllm/core.py +2 -61
- xinference/types.py +105 -2
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/METADATA +7 -3
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/RECORD +34 -26
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/model/audio/kokoro_zh.py
ADDED
@@ -0,0 +1,124 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV2
+
+logger = logging.getLogger(__name__)
+
+REPO_ID = "hexgrad/Kokoro-82M-v1.1-zh"
+
+
+class KokoroZHModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV2",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self.model_family = model_spec
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+        self._en_pipeline = None
+
+    def _en_callable(self, text):
+        """
+        Fixing the issue of English words being skipped in the Chinese model.
+        from https://hf-mirror.com/hexgrad/Kokoro-82M-v1.1-zh/blob/main/samples/make_zh.py
+        """
+        if text == "Kokoro":
+            return "kˈOkəɹO"
+        elif text == "Sol":
+            return "sˈOl"
+        return next(self._en_pipeline(text)).phonemes
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        import os
+
+        from kokoro import KModel, KPipeline
+
+        self._en_pipeline = KPipeline(lang_code="a", repo_id=REPO_ID, model=False)
+
+        config_path = os.path.join(self._model_path, "config.json")
+        model_path = os.path.join(self._model_path, "kokoro-v1_1-zh.pth")
+        lang_code = self._kwargs.get("lang_code", "z")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
+
+        self._model = KPipeline(
+            lang_code=lang_code,
+            model=KModel(config=config_path, model=model_path).to(self._device),
+            repo_id=REPO_ID,
+            en_callable=self._en_callable,
+            device=self._device,
+        )
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import soundfile
+
+        if stream:
+            raise Exception("Kokoro does not support stream mode.")
+        assert self._model is not None
+        if not voice:
+            voice = "zf_001"
+            logger.info("Auto select speaker: %s", voice)
+        elif voice.endswith(".pt"):
+            logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
+        logger.info("Speech kwargs: %s", kwargs)
+        generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
+        results = list(generator)
+        audio = np.concatenate([r[2] for r in results])
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out,
+                "w",
+                24000,
+                1,
+                format=response_format.upper(),
+            ) as f:
+                f.write(audio)
+            return out.getvalue()
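For orientation, here is a minimal sketch of driving the new class directly, outside Xinference's own launcher. The snapshot path and the `spec` object are placeholders; the constructor and method signatures come from the file above, which assumes the `kokoro` package and a local snapshot containing config.json and kokoro-v1_1-zh.pth.

# Sketch only: the path and spec object are hypothetical placeholders.
from xinference.model.audio.kokoro_zh import KokoroZHModel

model = KokoroZHModel(
    model_uid="kokoro-zh-0",
    model_path="/models/Kokoro-82M-v1.1-zh",  # local snapshot (placeholder path)
    model_spec=spec,  # an AudioModelFamilyV2 instance from the audio registry
    lang_code="z",
)
model.load()
audio_bytes = model.speech("你好，世界。Hello, Kokoro!", voice="zf_001")
with open("hello.mp3", "wb") as f:
    f.write(audio_bytes)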
xinference/model/audio/model_spec.json
CHANGED
@@ -862,6 +862,26 @@
         "model_revision": "master"
       }
     }
+  },
+  {
+    "version": 2,
+    "model_name": "Kokoro-82M-v1.1-zh",
+    "model_family": "Kokoro-zh",
+    "model_ability": [
+      "text2audio",
+      "text2audio_zero_shot"
+    ],
+    "multilingual": false,
+    "model_src": {
+      "huggingface": {
+        "model_id": "hexgrad/Kokoro-82M-v1.1-zh",
+        "model_revision": "01e7505bd6a7a2ac4975463114c3a7650a9f7218"
+      },
+      "modelscope": {
+        "model_id": "AI-ModelScope/Kokoro-82M-v1.1-zh",
+        "model_revision": "master"
+      }
+    }
   },
   {
     "version": 2,
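The registry entry above is what lets the model be launched by name through the usual client flow. A sketch, assuming the standard xinference client API and a placeholder endpoint:

# Sketch: launching the newly registered Kokoro-82M-v1.1-zh by name.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # placeholder endpoint
uid = client.launch_model(model_name="Kokoro-82M-v1.1-zh", model_type="audio")
model = client.get_model(uid)
audio = model.speech("你好，Kokoro。", voice="zf_001", response_format="mp3")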
xinference/model/embedding/sentence_transformers/core.py
CHANGED
@@ -265,10 +265,10 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
             "clip" in self.model_family.model_name.lower()
             or "jina-embeddings-v4" in self.model_family.model_name.lower()
         ):
-
-
-
-
+            # support input_ids and text_input_ids
+            for key in ["input_ids", "text_input_ids"]:
+                if key in features and hasattr(features[key], "numel"):
+                    all_token_nums += features[key].numel()
             if "pixel_values" in features and hasattr(
                 features["pixel_values"], "numel"
             ):
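The loop above generalizes token counting from a single `input_ids` key to either key the tokenizer may emit. A standalone sketch of the same logic; the feature dict here is fabricated for illustration:

import torch

# Fabricated stand-in for the tokenizer's output features.
features = {"text_input_ids": torch.ones(2, 16, dtype=torch.long)}

all_token_nums = 0
for key in ["input_ids", "text_input_ids"]:
    if key in features and hasattr(features[key], "numel"):
        all_token_nums += features[key].numel()
print(all_token_nums)  # 32: a 2 x 16 batch counts 32 tokens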
xinference/model/embedding/vllm/core.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import importlib.util
+import json
 import logging
 from typing import List, Union
 
@@ -54,13 +55,18 @@ class VLLMEmbeddingModel(EmbeddingModel):
             self._kwargs["hf_overrides"].update(
                 is_matryoshka=True,
             )
+        elif isinstance(self._kwargs["hf_overrides"], str):
+            self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+            self._kwargs["hf_overrides"].update(
+                is_matryoshka=True,
+            )
 
         self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
 
     @staticmethod
     def _get_detailed_instruct(task_description: str, query: str) -> str:
-        return f"Instruct: {task_description}\nQuery:{query}"
+        return f"Instruct: {task_description}\nQuery:{query}"  # noqa: E231
 
     @cache_clean
     def create_embedding(
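The new `elif` covers the case where `hf_overrides` arrives as a JSON string, as it does when passed through a command line or an HTTP payload, rather than as a dict. The same logic in isolation (the override value is a fabricated example):

import json

kwargs = {"hf_overrides": '{"max_model_len": 512}'}  # raw JSON string, example value
if isinstance(kwargs["hf_overrides"], str):
    kwargs["hf_overrides"] = json.loads(kwargs["hf_overrides"])
    kwargs["hf_overrides"].update(is_matryoshka=True)
print(kwargs["hf_overrides"])  # {'max_model_len': 512, 'is_matryoshka': True}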
xinference/model/image/model_spec.json
CHANGED
@@ -824,13 +824,12 @@
         "deepspeed==0.12.3",
         "peft==0.4.0",
         "tiktoken==0.6.0",
-        "bitsandbytes==0.41.0",
-        "scikit-learn==1.2.2",
         "sentencepiece==0.1.99",
         "einops==0.6.1",
         "einops-exts==0.0.4",
         "timm==0.6.13",
-        "
+        "#system_numpy#",
+        "#system_torch#"
       ]
     },
     "model_src": {
xinference/model/llm/core.py
CHANGED
@@ -27,6 +27,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from .reasoning_parser import ReasoningParser
+from .tool_parsers import TOOL_PARSERS
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV2, LLMSpecV1
@@ -59,6 +60,7 @@ class LLM(abc.ABC):
         self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
+        self.tool_parser = None
         if args:
             raise ValueError(f"Unrecognized positional arguments: {args}")
         if kwargs:
@@ -171,6 +173,14 @@ class LLM(abc.ABC):
             enable_thinking=enable_thinking,
         )
 
+    def prepare_parse_tool_calls(self):
+        if self.model_family.tool_parser is None:
+            return
+        if self.model_family.tool_parser not in TOOL_PARSERS:
+            return
+        tool_parser = TOOL_PARSERS[self.model_family.tool_parser]
+        self.tool_parser = tool_parser()
+
 
 # Context variable for passing per-request chat context (e.g., chat_template_kwargs).
 # This variable should be set at the beginning of each chat or stream_chat call.
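`tool_parsers/__init__.py` itself is not shown in this view, but the lookup above implies a plain name-to-class registry. A sketch of that shape: the keys are confirmed by the `tool_parser` values appearing later in this diff, while the class names are assumptions based on the new file names.

# Hypothetical shape of TOOL_PARSERS; only the keys are confirmed by this diff.
TOOL_PARSERS = {
    "qwen": QwenToolParser,
    "llama3": Llama3ToolParser,
    "glm4": Glm4ToolParser,
    "deepseek_r1": DeepSeekR1ToolParser,
    "deepseek_v3": DeepSeekV3ToolParser,
}

# prepare_parse_tool_calls() then instantiates the parser, if any, by name:
parser_cls = TOOL_PARSERS.get("qwen")
tool_parser = parser_cls() if parser_cls else None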
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -122,6 +122,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()
 
         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
xinference/model/llm/llm_family.json
CHANGED
@@ -1008,7 +1008,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -1070,7 +1071,8 @@
       "<|end_of_text|>",
       "<|eot_id|>",
       "<|eom_id|>"
-    ]
+    ],
+    "tool_parser": "llama3"
   },
   {
     "version": 2,
@@ -1133,7 +1135,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -1946,7 +1949,8 @@
       "<|im_end|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -2209,7 +2213,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -5772,7 +5777,8 @@
       "<|end▁of▁sentence|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser": "deepseek_r1"
   },
   {
     "version": 2,
@@ -6620,7 +6626,8 @@
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ]
+    ],
+    "tool_parser": "deepseek_v3"
   },
   {
     "version": 2,
@@ -7920,7 +7927,8 @@
       "<|endoftext|>",
       "<|user|>",
       "<|observation|>"
-    ]
+    ],
+    "tool_parser":"glm4"
   },
   {
     "version": 2,
@@ -8027,7 +8035,8 @@
       "<|endoftext|>",
       "<|user|>",
       "<|observation|>"
-    ]
+    ],
+    "tool_parser":"glm4"
   },
   {
     "version": 2,
@@ -9189,7 +9198,8 @@
       "<|end_of_text|>",
       "<|eot_id|>",
       "<|eom_id|>"
-    ]
+    ],
+    "tool_parser": "llama3"
   },
   {
     "version": 2,
@@ -11918,7 +11928,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -11981,7 +11992,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -12705,7 +12717,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -12826,7 +12839,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -14008,7 +14022,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -15518,7 +15533,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -17428,7 +17444,8 @@
         "mlx-lm>=0.24.0 ; sys_platform=='darwin'",
         "#system_numpy#"
       ]
-    }
+    },
+    "tool_parser": "qwen"
   },
   {
     "version": 2,
@@ -18043,7 +18060,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -18655,7 +18673,8 @@
       "<|im_end|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -19438,7 +19457,8 @@
     "stop": [
       "<|endoftext|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
|
xinference/model/llm/mlx/core.py
CHANGED
|
@@ -148,6 +148,16 @@ class MLXModel(LLM):
|
|
|
148
148
|
# to call aynsc method with asyncio.run_coroutine_threadsafe
|
|
149
149
|
self._loop = loop # type: ignore
|
|
150
150
|
|
|
151
|
+
def _cleanup_memory(self):
|
|
152
|
+
import gc
|
|
153
|
+
|
|
154
|
+
import mlx.core as mx
|
|
155
|
+
|
|
156
|
+
# mandatory recycling
|
|
157
|
+
gc.collect()
|
|
158
|
+
# clear the MLX cache
|
|
159
|
+
mx.clear_cache()
|
|
160
|
+
|
|
151
161
|
@property
|
|
152
162
|
def driver_info(self) -> Optional[dict]:
|
|
153
163
|
return self._driver_info
|
|
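A quick way to see what `_cleanup_memory` buys, using the same `mx.metal` metrics namespace the surrounding code already uses for `get_peak_memory`; exact availability of these functions varies by MLX version, so treat this as a sketch:

import gc

import mlx.core as mx

before = mx.metal.get_cache_memory()  # bytes held in MLX's buffer cache (assumption: metal metrics available)
gc.collect()       # drop unreachable Python references first
mx.clear_cache()   # then release MLX's cached buffers
after = mx.metal.get_cache_memory()
print(f"cache: {before} -> {after} bytes")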
@@ -333,6 +343,7 @@ class MLXModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()
 
         kwargs = {}
         kwargs["revision"] = self._model_config.get(
@@ -458,14 +469,18 @@ class MLXModel(LLM):
             repetition_penalty=kwargs.pop("repetition_penalty"),
             repetition_context_size=kwargs.pop("repetition_context_size"),
         )
-
-
-
-
-
-
-
-
+        try:
+            yield from stream_generate(
+                self._model,
+                self._tokenizer,
+                prompt_token_ids,
+                sampler=sampler,
+                logits_processors=logits_processors,
+                **kwargs,
+            )
+        finally:
+            # after completing the inference, clear the memory.
+            self._cleanup_memory()
 
     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
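The try/finally wrapper matters for streaming callers: if the consumer abandons the stream mid-generation, Python raises GeneratorExit inside the generator, and the finally block still runs, so `_cleanup_memory()` is invoked whether the stream is exhausted, errors out, or is dropped. A minimal illustration of that generator semantics:

def stream():
    try:
        yield from range(3)
    finally:
        print("cleanup")  # runs on exhaustion, on an exception, and on close()

g = stream()
next(g)
g.close()  # abandoning the stream still prints "cleanup"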
@@ -755,7 +770,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         assert not isinstance(c, Iterator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)
@@ -831,18 +846,32 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
 
         detokenizer.reset()
         tic = time.perf_counter()
-
-
-
-
-
-
-
-
-
-
+        try:
+            for n, (token, logprobs) in enumerate(
+                generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
+            ):
+                if n == 0:
+                    prompt_time = time.perf_counter() - tic
+                    prompt_tps = len(input_ids) / prompt_time
+                    tic = time.perf_counter()
+                if token == tokenizer.eos_token_id:
+                    break
+                detokenizer.add_token(token)
+
+                # Yield the last segment if streaming
+                yield GenerationResponse(
+                    text=detokenizer.last_segment,
+                    token=token,
+                    logprobs=logprobs,
+                    from_draft=False,
+                    prompt_tokens=len(input_ids),
+                    prompt_tps=prompt_tps,
+                    generation_tokens=n + 1,
+                    generation_tps=(n + 1) / (time.perf_counter() - tic),
+                    peak_memory=mx.metal.get_peak_memory() / 1e9,
+                )
 
-
+            detokenizer.finalize()
         yield GenerationResponse(
             text=detokenizer.last_segment,
             token=token,
@@ -854,19 +883,9 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
             generation_tps=(n + 1) / (time.perf_counter() - tic),
             peak_memory=mx.metal.get_peak_memory() / 1e9,
         )
-
-
-
-            text=detokenizer.last_segment,
-            token=token,
-            logprobs=logprobs,
-            from_draft=False,
-            prompt_tokens=len(input_ids),
-            prompt_tps=prompt_tps,
-            generation_tokens=n + 1,
-            generation_tps=(n + 1) / (time.perf_counter() - tic),
-            peak_memory=mx.metal.get_peak_memory() / 1e9,
-        )
+        finally:
+            # after completing the inference, clear the memory
+            self._cleanup_memory()
 
     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
xinference/model/llm/sglang/core.py
CHANGED
@@ -175,6 +175,7 @@ class SGLANGModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()
 
         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -646,49 +647,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def is_tool_call_chunk_end(chunk):
         return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])
 
-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        if self.reasoning_parser:
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-
     async def async_chat(
         self,
         messages: List[Dict],
@@ -731,7 +689,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)