xinference 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff shows the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +39 -24
- xinference/model/llm/__init__.py +3 -0
- xinference/model/llm/core.py +2 -5
- xinference/model/llm/llama_cpp/core.py +52 -16
- xinference/model/llm/llm_family.json +364 -21
- xinference/model/llm/llm_family_modelscope.json +258 -23
- xinference/model/llm/mlx/core.py +15 -11
- xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +19 -14
- xinference/model/llm/sglang/core.py +2 -0
- xinference/model/llm/transformers/core.py +3 -2
- xinference/model/llm/transformers/gemma3.py +185 -0
- xinference/model/llm/transformers/intern_vl.py +0 -2
- xinference/model/llm/utils.py +78 -32
- xinference/model/llm/vllm/core.py +10 -3
- xinference/types.py +2 -2
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
- xinference/web/ui/build/static/js/main.3cea968e.js +3 -0
- xinference/web/ui/build/static/js/main.3cea968e.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -2
- xinference/web/ui/src/locales/zh.json +1 -1
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/METADATA +3 -3
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/RECORD +35 -36
- xinference/model/llm/reasoning_parsers/__init__.py +0 -13
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
- xinference/web/ui/build/static/css/main.f8177338.css +0 -2
- xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.3cea968e.js.LICENSE.txt} +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/LICENSE +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/WHEEL +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-03-
+ "date": "2025-03-21T14:33:52+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "1.
+ "full-revisionid": "ac88d425e3d5fc12166e22c4032286327871f5f2",
+ "version": "1.4.0"
 }
 ''' # END VERSION_JSON
 
xinference/core/chat_interface.py
CHANGED
@@ -113,6 +113,7 @@ class GradioInterface:
             max_tokens: int,
             temperature: float,
             lora_name: str,
+            stream: bool,
         ) -> Generator:
             from ..client import RESTfulClient
 
@@ -123,29 +124,40 @@
             messages = to_chat(flatten(history))
             messages.append(dict(role="user", content=message))
 
-            … (removed lines 126-148 not captured in this extraction)
+            if stream:
+                response_content = ""
+                for chunk in model.chat(
+                    messages,
+                    generate_config={
+                        "max_tokens": int(max_tokens),
+                        "temperature": temperature,
+                        "stream": True,
+                        "lora_name": lora_name,
+                    },
+                ):
+                    assert isinstance(chunk, dict)
+                    delta = chunk["choices"][0]["delta"]
+                    if "content" not in delta or delta["content"] is None:
+                        continue
+                    else:
+                        # some model like deepseek-r1-distill-qwen
+                        # will generate <think>...</think> ...
+                        # in gradio, no output will be rendered,
+                        # thus escape html tags in advance
+                        response_content += html.escape(delta["content"])
+                        yield response_content
+
+                yield response_content
+            else:
+                result = model.chat(
+                    messages,
+                    generate_config={
+                        "max_tokens": int(max_tokens),
+                        "temperature": temperature,
+                        "lora_name": lora_name,
+                    },
+                )
+                yield html.escape(result["choices"][0]["message"]["content"])  # type: ignore
 
         return gr.ChatInterface(
             fn=generate_wrapper,
@@ -153,7 +165,9 @@
                 gr.Slider(
                     minimum=1,
                     maximum=self.context_length,
-                    value=512
+                    value=512
+                    if "reasoning" not in self.model_ability
+                    else self.context_length // 2,
                     step=1,
                     label="Max Tokens",
                 ),
@@ -161,6 +175,7 @@
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
                 gr.Text(label="LoRA Name"),
+                gr.Checkbox(label="Stream", value=True),
             ],
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
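For context, the new stream checkbox only toggles the "stream" flag in generate_config, and the two branches above mirror how a caller consumes the RESTful client directly. A minimal sketch of both paths, assuming a locally running server at http://127.0.0.1:9997 and a hypothetical model UID "my-model":

```python
import html

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
model = client.get_model("my-model")  # hypothetical model UID

messages = [{"role": "user", "content": "Hello!"}]

# Non-streaming: a single ChatCompletion dict comes back.
result = model.chat(
    messages,
    generate_config={"max_tokens": 512, "temperature": 1.0},
)
print(html.escape(result["choices"][0]["message"]["content"]))

# Streaming: chunks arrive incrementally; a delta may omit "content".
text = ""
for chunk in model.chat(
    messages,
    generate_config={"max_tokens": 512, "temperature": 1.0, "stream": True},
):
    delta = chunk["choices"][0]["delta"]
    if delta.get("content"):
        text += html.escape(delta["content"])
print(text)
```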
xinference/model/llm/__init__.py
CHANGED
@@ -143,6 +143,7 @@ def _install():
         DeepSeekV2PytorchModel,
     )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
     from .transformers.glm4v import Glm4VModel
     from .transformers.glm_edge_v import GlmEdgeVModel
     from .transformers.intern_vl import InternVLChatModel
@@ -198,6 +199,8 @@ def _install():
             OptPytorchModel,
             GlmEdgeVModel,
             CogAgentChatModel,
+            Gemma3TextChatModel,
+            Gemma3ChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore
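Once registered, the new Gemma 3 classes are launched like any other built-in model. A rough sketch using the RESTful client; the endpoint and the model name "gemma-3-it" are assumptions here, and the authoritative names are the entries added to llm_family.json in this release:

```python
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint

# "gemma-3-it" is an assumed model name; check llm_family.json for the exact entry.
model_uid = client.launch_model(
    model_name="gemma-3-it",
    model_type="LLM",
    model_engine="transformers",
)
model = client.get_model(model_uid)
print(model.chat([{"role": "user", "content": "Hi"}]))
```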
xinference/model/llm/core.py
CHANGED
@@ -25,8 +25,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from ..core import ModelDescription
-from .
-from .reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
+from .reasoning_parser import ReasoningParser
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV1, LLMSpecV1
@@ -123,9 +122,7 @@ class LLM(abc.ABC):
     def prepare_parse_reasoning_content(self, reasoning_content):
         # Initialize reasoning parser if model has reasoning ability
         if "reasoning" in self.model_family.model_ability and reasoning_content:
-
-            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-            self.reasoning_parser = self.reasoning_parser(
+            self.reasoning_parser = ReasoningParser(
                 self.model_family.reasoning_start_tag,
                 self.model_family.reasoning_end_tag,
             )
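The change above drops the parser-manager indirection and constructs a single ReasoningParser from the family's reasoning_start_tag and reasoning_end_tag. As a rough illustration of the idea, not the actual implementation in reasoning_parser.py, tag-based extraction of reasoning content can look like this:

```python
import re

# Illustrative only: a tag-based splitter in the spirit of ReasoningParser;
# the real class in xinference/model/llm/reasoning_parser.py may differ.
def split_reasoning(text, start_tag="<think>", end_tag="</think>"):
    pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
    match = re.search(pattern, text, flags=re.DOTALL)
    if match is None:
        return None, text
    reasoning = match.group(1).strip()
    # Everything outside the tags is the user-visible answer.
    content = (text[: match.start()] + text[match.end():]).strip()
    return reasoning, content

reasoning, answer = split_reasoning("<think>2 + 2 = 4</think>The answer is 4.")
assert reasoning == "2 + 2 = 4"
assert answer == "The answer is 4."
```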
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -39,11 +39,16 @@ logger = logging.getLogger(__name__)
 USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))
 
 
-class
+class _Done:
     pass
 
 
-class
+class _Error:
+    def __init__(self, msg):
+        self.msg = msg
+
+
+class XllamaCppModel(LLM, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
@@ -83,6 +88,7 @@ class XllamaCppModel(LLM):
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux():
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        llamacpp_model_config.setdefault("reasoning_content", False)
 
         return llamacpp_model_config
 
@@ -131,6 +137,9 @@ class XllamaCppModel(LLM):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
             model_path = os.path.realpath(self.model_path)
@@ -196,7 +205,14 @@ class XllamaCppModel(LLM):
        )
        prompt_json = orjson.dumps(data)
 
-        def
+        def _error_callback(err):
+            try:
+                msg = orjson.loads(err)
+                q.put(_Error(msg))
+            except Exception as e:
+                q.put(_Error(str(e)))
+
+        def _ok_callback(ok):
             try:
                 res = orjson.loads(ok)
                 res["model"] = self.model_uid
@@ -205,10 +221,10 @@ class XllamaCppModel(LLM):
                 logger.exception("handle_completions callback failed: %s", e)
 
            try:
-                self._llm.handle_completions(prompt_json,
+                self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
            except Exception as ex:
                logger.exception("handle_completions failed: %s", ex)
-            q.put(
+            q.put(_Done)
 
        assert self._executor
        self._executor.submit(_handle_completion)
@@ -216,12 +232,17 @@ class XllamaCppModel(LLM):
        if stream:
 
            def _to_iterator():
-                while (r := q.get()) is not
+                while (r := q.get()) is not _Done:
+                    if type(r) is _Error:
+                        raise Exception("Got error in generate stream: %s", r.msg)
                    yield r
 
            return _to_iterator()
        else:
-
+            r = q.get()
+            if type(r) is _Error:
+                raise Exception("Got error in generate: %s", r.msg)
+            return r
 
    def chat(
        self,
@@ -249,7 +270,14 @@ class XllamaCppModel(LLM):
        )
        prompt_json = orjson.dumps(data)
 
-        def
+        def _error_callback(err):
+            try:
+                msg = orjson.loads(err)
+                q.put(_Error(msg))
+            except Exception as e:
+                q.put(_Error(str(e)))
+
+        def _ok_callback(ok):
            try:
                res = orjson.loads(ok)
                res["model"] = self.model_uid
@@ -259,11 +287,11 @@ class XllamaCppModel(LLM):
 
            try:
                self._llm.handle_chat_completions(
-                    prompt_json,
+                    prompt_json, _error_callback, _ok_callback
                )
            except Exception as ex:
                logger.exception("handle_chat_completions failed: %s", ex)
-            q.put(
+            q.put(_Done)
 
        assert self._executor
        self._executor.submit(_handle_chat_completion)
@@ -271,12 +299,19 @@ class XllamaCppModel(LLM):
        if stream:
 
            def _to_iterator():
-                while (r := q.get()) is not
+                while (r := q.get()) is not _Done:
+                    if type(r) is _Error:
+                        raise Exception("Got error in chat stream: %s", r.msg)
                    yield r
 
-            return
+            return self._to_chat_completion_chunks(
+                _to_iterator(), self.reasoning_parser
+            )
        else:
-
+            r = q.get()
+            if type(r) is _Error:
+                raise Exception("Got error in chat: %s", r.msg)
+            return self._to_chat_completion(r, self.reasoning_parser)
 
 
 class LlamaCppModel(LLM):
@@ -527,10 +562,11 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
         if tools:
-            if
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
                 full_context_kwargs["tools"] = tools
-            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
-                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs