xinference 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (45)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +39 -24
  3. xinference/model/llm/__init__.py +3 -0
  4. xinference/model/llm/core.py +2 -5
  5. xinference/model/llm/llama_cpp/core.py +52 -16
  6. xinference/model/llm/llm_family.json +364 -21
  7. xinference/model/llm/llm_family_modelscope.json +258 -23
  8. xinference/model/llm/mlx/core.py +15 -11
  9. xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +19 -14
  10. xinference/model/llm/sglang/core.py +2 -0
  11. xinference/model/llm/transformers/core.py +3 -2
  12. xinference/model/llm/transformers/gemma3.py +185 -0
  13. xinference/model/llm/transformers/intern_vl.py +0 -2
  14. xinference/model/llm/utils.py +78 -32
  15. xinference/model/llm/vllm/core.py +10 -3
  16. xinference/types.py +2 -2
  17. xinference/web/ui/build/asset-manifest.json +6 -6
  18. xinference/web/ui/build/index.html +1 -1
  19. xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
  20. xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
  21. xinference/web/ui/build/static/js/main.3cea968e.js +3 -0
  22. xinference/web/ui/build/static/js/main.3cea968e.js.map +1 -0
  23. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
  27. xinference/web/ui/src/locales/en.json +2 -2
  28. xinference/web/ui/src/locales/zh.json +1 -1
  29. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/METADATA +3 -3
  30. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/RECORD +35 -36
  31. xinference/model/llm/reasoning_parsers/__init__.py +0 -13
  32. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
  33. xinference/web/ui/build/static/css/main.f8177338.css +0 -2
  34. xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
  35. xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
  36. xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
  38. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
  39. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
  40. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
  41. /xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.3cea968e.js.LICENSE.txt} +0 -0
  42. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/LICENSE +0 -0
  43. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/WHEEL +0 -0
  44. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/entry_points.txt +0 -0
  45. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-03-09T12:06:50+0800",
+ "date": "2025-03-21T14:33:52+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "5d6ec937ce2aca2511e9e0debc4c2ab06ca41f09",
- "version": "1.3.1"
+ "full-revisionid": "ac88d425e3d5fc12166e22c4032286327871f5f2",
+ "version": "1.4.0"
 }
 ''' # END VERSION_JSON
 
xinference/core/chat_interface.py CHANGED
@@ -113,6 +113,7 @@ class GradioInterface:
             max_tokens: int,
             temperature: float,
             lora_name: str,
+            stream: bool,
         ) -> Generator:
             from ..client import RESTfulClient
 
@@ -123,29 +124,40 @@ class GradioInterface:
             messages = to_chat(flatten(history))
             messages.append(dict(role="user", content=message))
 
-            response_content = ""
-            for chunk in model.chat(
-                messages,
-                generate_config={
-                    "max_tokens": int(max_tokens),
-                    "temperature": temperature,
-                    "stream": True,
-                    "lora_name": lora_name,
-                },
-            ):
-                assert isinstance(chunk, dict)
-                delta = chunk["choices"][0]["delta"]
-                if "content" not in delta:
-                    continue
-                else:
-                    # some model like deepseek-r1-distill-qwen
-                    # will generate <think>...</think> ...
-                    # in gradio, no output will be rendered,
-                    # thus escape html tags in advance
-                    response_content += html.escape(delta["content"])
-                    yield response_content
-
-            yield response_content
+            if stream:
+                response_content = ""
+                for chunk in model.chat(
+                    messages,
+                    generate_config={
+                        "max_tokens": int(max_tokens),
+                        "temperature": temperature,
+                        "stream": True,
+                        "lora_name": lora_name,
+                    },
+                ):
+                    assert isinstance(chunk, dict)
+                    delta = chunk["choices"][0]["delta"]
+                    if "content" not in delta or delta["content"] is None:
+                        continue
+                    else:
+                        # some model like deepseek-r1-distill-qwen
+                        # will generate <think>...</think> ...
+                        # in gradio, no output will be rendered,
+                        # thus escape html tags in advance
+                        response_content += html.escape(delta["content"])
+                        yield response_content
+
+                yield response_content
+            else:
+                result = model.chat(
+                    messages,
+                    generate_config={
+                        "max_tokens": int(max_tokens),
+                        "temperature": temperature,
+                        "lora_name": lora_name,
+                    },
+                )
+                yield html.escape(result["choices"][0]["message"]["content"])  # type: ignore
 
         return gr.ChatInterface(
             fn=generate_wrapper,
@@ -153,7 +165,9 @@ class GradioInterface:
                 gr.Slider(
                     minimum=1,
                     maximum=self.context_length,
-                    value=512,
+                    value=512
+                    if "reasoning" not in self.model_ability
+                    else self.context_length // 2,
                     step=1,
                     label="Max Tokens",
                 ),
@@ -161,6 +175,7 @@ class GradioInterface:
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
                 gr.Text(label="LoRA Name"),
+                gr.Checkbox(label="Stream", value=True),
            ],
            title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
            css="""
xinference/model/llm/__init__.py CHANGED
@@ -143,6 +143,7 @@ def _install():
         DeepSeekV2PytorchModel,
     )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
     from .transformers.glm4v import Glm4VModel
     from .transformers.glm_edge_v import GlmEdgeVModel
     from .transformers.intern_vl import InternVLChatModel
@@ -198,6 +199,8 @@ def _install():
             OptPytorchModel,
             GlmEdgeVModel,
             CogAgentChatModel,
+            Gemma3TextChatModel,
+            Gemma3ChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore
xinference/model/llm/core.py CHANGED
@@ -25,8 +25,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from ..core import ModelDescription
-from .reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
-from .reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
+from .reasoning_parser import ReasoningParser
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV1, LLMSpecV1
@@ -123,9 +122,7 @@ class LLM(abc.ABC):
     def prepare_parse_reasoning_content(self, reasoning_content):
         # Initialize reasoning parser if model has reasoning ability
         if "reasoning" in self.model_family.model_ability and reasoning_content:
-            module_name = self.model_family.model_family or self.model_family.model_name
-            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-            self.reasoning_parser = self.reasoning_parser(
+            self.reasoning_parser = ReasoningParser(
                 self.model_family.reasoning_start_tag,
                 self.model_family.reasoning_end_tag,
             )
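
The parser registry is replaced by a single ReasoningParser built directly from the family's reasoning_start_tag / reasoning_end_tag. The shipped implementation lives in xinference/model/llm/reasoning_parser.py, which is not included in this excerpt; a minimal tag-splitting sketch of the idea (class internals and method name assumed, for illustration only) could look like:

    # Illustrative sketch only; not the shipped reasoning_parser.py.
    class ReasoningParser:
        def __init__(self, reasoning_start_tag: str, reasoning_end_tag: str):
            self.start_tag = reasoning_start_tag  # e.g. "<think>"
            self.end_tag = reasoning_end_tag      # e.g. "</think>"

        def split(self, text: str):
            # Assumed helper: separate hidden reasoning from the visible answer.
            if self.start_tag in text and self.end_tag in text:
                head, _, rest = text.partition(self.start_tag)
                reasoning, _, tail = rest.partition(self.end_tag)
                return reasoning.strip(), (head + tail).strip()
            return None, text
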
xinference/model/llm/llama_cpp/core.py CHANGED
@@ -39,11 +39,16 @@ logger = logging.getLogger(__name__)
 USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))
 
 
-class _Sentinel:
+class _Done:
     pass
 
 
-class XllamaCppModel(LLM):
+class _Error:
+    def __init__(self, msg):
+        self.msg = msg
+
+
+class XllamaCppModel(LLM, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
@@ -83,6 +88,7 @@ class XllamaCppModel(LLM):
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux():
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        llamacpp_model_config.setdefault("reasoning_content", False)
 
         return llamacpp_model_config
 
@@ -131,6 +137,9 @@ class XllamaCppModel(LLM):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
             model_path = os.path.realpath(self.model_path)
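
With the two hunks above, reasoning-content parsing for the xllamacpp backend is opt-in through a `reasoning_content` entry in the llama.cpp model config, defaulting to False. A hedged launch sketch, assuming extra launch kwargs are forwarded into `llamacpp_model_config` as usual (endpoint, model name, and format are placeholders):

    # Hedged usage sketch only; the kwarg-forwarding path is assumed, not shown in this diff.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")   # placeholder endpoint
    uid = client.launch_model(
        model_name="deepseek-r1-distill-qwen",          # placeholder reasoning-capable model
        model_engine="llama.cpp",
        model_format="ggufv2",
        reasoning_content=True,                         # new toggle introduced above
    )
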
@@ -196,7 +205,14 @@ class XllamaCppModel(LLM):
             )
             prompt_json = orjson.dumps(data)
 
-            def _res_callback(ok):
+            def _error_callback(err):
+                try:
+                    msg = orjson.loads(err)
+                    q.put(_Error(msg))
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            def _ok_callback(ok):
                 try:
                     res = orjson.loads(ok)
                     res["model"] = self.model_uid
@@ -205,10 +221,10 @@ class XllamaCppModel(LLM):
                     logger.exception("handle_completions callback failed: %s", e)
 
             try:
-                self._llm.handle_completions(prompt_json, _res_callback, _res_callback)
+                self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
             except Exception as ex:
                 logger.exception("handle_completions failed: %s", ex)
-            q.put(_Sentinel)
+            q.put(_Done)
 
         assert self._executor
         self._executor.submit(_handle_completion)
@@ -216,12 +232,17 @@ class XllamaCppModel(LLM):
         if stream:
 
             def _to_iterator():
-                while (r := q.get()) is not _Sentinel:
+                while (r := q.get()) is not _Done:
+                    if type(r) is _Error:
+                        raise Exception("Got error in generate stream: %s", r.msg)
                     yield r
 
             return _to_iterator()
         else:
-            return q.get()
+            r = q.get()
+            if type(r) is _Error:
+                raise Exception("Got error in generate: %s", r.msg)
+            return r
 
     def chat(
         self,
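
Both generate and chat now share this queue protocol: the worker pushes parsed result dicts, wraps any failure in an _Error object, and always finishes with the _Done class object as the sentinel. A standalone, simplified sketch of the pattern (names mirror the diff; stand-in payloads, not a drop-in for the module above):

    import queue
    from concurrent.futures import ThreadPoolExecutor

    class _Done:             # class object itself marks end of the result stream
        pass

    class _Error:            # carries an error message through the same queue
        def __init__(self, msg):
            self.msg = msg

    q: queue.Queue = queue.Queue()

    def _producer():
        try:
            for i in range(3):
                q.put({"index": i})   # stand-in for each parsed completion chunk
        except Exception as e:
            q.put(_Error(str(e)))
        q.put(_Done)                  # always terminate with the sentinel

    def _to_iterator():
        while (r := q.get()) is not _Done:
            if type(r) is _Error:
                raise Exception("Got error in generate stream: %s", r.msg)
            yield r

    executor = ThreadPoolExecutor(max_workers=1)
    executor.submit(_producer)
    print(list(_to_iterator()))  # -> [{'index': 0}, {'index': 1}, {'index': 2}]
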
@@ -249,7 +270,14 @@ class XllamaCppModel(LLM):
             )
             prompt_json = orjson.dumps(data)
 
-            def _res_callback(ok):
+            def _error_callback(err):
+                try:
+                    msg = orjson.loads(err)
+                    q.put(_Error(msg))
+                except Exception as e:
+                    q.put(_Error(str(e)))
+
+            def _ok_callback(ok):
                 try:
                     res = orjson.loads(ok)
                     res["model"] = self.model_uid
@@ -259,11 +287,11 @@ class XllamaCppModel(LLM):
 
             try:
                 self._llm.handle_chat_completions(
-                    prompt_json, _res_callback, _res_callback
+                    prompt_json, _error_callback, _ok_callback
                 )
             except Exception as ex:
                 logger.exception("handle_chat_completions failed: %s", ex)
-            q.put(_Sentinel)
+            q.put(_Done)
 
         assert self._executor
         self._executor.submit(_handle_chat_completion)
@@ -271,12 +299,19 @@ class XllamaCppModel(LLM):
         if stream:
 
             def _to_iterator():
-                while (r := q.get()) is not _Sentinel:
+                while (r := q.get()) is not _Done:
+                    if type(r) is _Error:
+                        raise Exception("Got error in chat stream: %s", r.msg)
                     yield r
 
-            return _to_iterator()
+            return self._to_chat_completion_chunks(
+                _to_iterator(), self.reasoning_parser
+            )
         else:
-            return q.get()
+            r = q.get()
+            if type(r) is _Error:
+                raise Exception("Got error in chat: %s", r.msg)
+            return self._to_chat_completion(r, self.reasoning_parser)
 
 
 class LlamaCppModel(LLM):
@@ -527,10 +562,11 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
         if tools:
-            if model_family in QWEN_TOOL_CALL_FAMILY:
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
                 full_context_kwargs["tools"] = tools
-            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
-                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
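
The last hunk routes DeepSeek tool definitions through the chat template the same way Qwen's are, instead of rewriting the message list. A hedged usage sketch, reusing a chat model handle obtained as in the earlier RESTfulClient example and assuming tool definitions ride in generate_config, as the server-side pop above suggests (tool schema and prompt are placeholders):

    # Hedged usage sketch only; tool schema is a placeholder.
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    completion = model.chat(
        [{"role": "user", "content": "What's the weather in Paris?"}],
        generate_config={"tools": tools, "max_tokens": 256},
    )
    print(completion["choices"][0]["message"])
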