xinference 1.3.0.post2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +1 -0
- xinference/conftest.py +7 -0
- xinference/core/model.py +3 -1
- xinference/core/scheduler.py +3 -0
- xinference/core/worker.py +1 -1
- xinference/model/embedding/core.py +12 -5
- xinference/model/llm/__init__.py +2 -1
- xinference/model/llm/core.py +13 -0
- xinference/model/llm/llama_cpp/core.py +260 -3
- xinference/model/llm/llm_family.json +306 -17
- xinference/model/llm/llm_family_modelscope.json +347 -28
- xinference/model/llm/mlx/core.py +15 -4
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +1 -1
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +4 -5
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/transformers/chatglm.py +4 -4
- xinference/model/llm/transformers/core.py +22 -5
- xinference/model/llm/transformers/intern_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +1 -1
- xinference/model/llm/utils.py +103 -67
- xinference/model/llm/vllm/core.py +29 -42
- xinference/types.py +4 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
- xinference/web/ui/src/locales/en.json +9 -1
- xinference/web/ui/src/locales/zh.json +9 -1
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/METADATA +7 -3
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/RECORD +43 -42
- xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
- xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
- /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/LICENSE +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/WHEEL +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/sglang/core.py
CHANGED

@@ -48,6 +48,7 @@ class SGLANGModelConfig(TypedDict, total=False):
     nnodes: Optional[int]
     node_rank: Optional[int]
     dist_init_addr: Optional[str]
+    reasoning_content: bool
 
 
 class SGLANGGenerateConfig(TypedDict, total=False):
@@ -99,6 +100,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
     "QwQ-32B-Preview",
+    "QwQ-32B",
     "deepseek-r1-distill-qwen",
     "deepseek-r1-distill-llama",
     "deepseek-v3",
@@ -143,6 +145,8 @@ class SGLANGModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
         self._model_config = self._sanitize_model_config(self._model_config)
+        reasoning_content = self._model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
 
         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -255,6 +259,7 @@ class SGLANGModel(LLM):
         else:
             model_config["mem_fraction_static"] = 0.88
         model_config.setdefault("log_level", "info")
+        model_config.setdefault("reasoning_content", False)
 
         return model_config
 
@@ -547,8 +552,8 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if stream:
             agen = await self.async_generate(full_prompt, generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
-            return self._async_to_chat_completion_chunks(agen)
+            return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
         else:
             c = await self.async_generate(full_prompt, generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
-            return self._to_chat_completion(c)
+            return self._to_chat_completion(c, self.reasoning_parser)
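
The reasoning_content flag added above is popped during load and handed to prepare_parse_reasoning_content(), so SGLang-served chat models can report the model's chain of thought separately from the final answer. A minimal sketch of enabling it from the client side, assuming (as for the other config keys) that extra launch kwargs are forwarded into SGLANGModelConfig; the endpoint and model below are placeholders, not taken from this diff:

    # Illustrative only: enable reasoning_content for an SGLang-served reasoning model.
    from xinference.client import Client

    client = Client("http://localhost:9997")  # placeholder endpoint
    model_uid = client.launch_model(
        model_name="deepseek-r1-distill-qwen",
        model_engine="sglang",
        reasoning_content=True,  # new key; _sanitize_model_config defaults it to False
    )
    model = client.get_model(model_uid)
    # With the flag on, chat responses are expected to carry a separate
    # "reasoning_content" field alongside "content".
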

xinference/model/llm/transformers/chatglm.py
CHANGED

@@ -383,7 +383,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 function_call = self._process_response_non_streaming(
                     response, tools, use_tool=True
                 )
-                return self.
+                return self._post_process_completion(
                     self.model_family, self.model_uid, function_call
                 )
             else:
@@ -397,7 +397,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             prompt_tokens = len(inputs["input_ids"][0])
             for chunk_text in self._stream_chat(inputs, tools, **kwargs):
                 if tools and isinstance(chunk_text, dict):
-                    yield self.
+                    yield self._post_process_completion_chunk(
                         self.model_family, self.model_uid, chunk_text
                     )
                     return
@@ -484,7 +484,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 function_call = self._process_response_non_streaming(
                     response, req.tools, use_tool=True
                 )
-                req.completion[0] = self.
+                req.completion[0] = self._post_process_completion(
                     self.model_family, self.model_uid, function_call
                 )
                 req.completion[0]["usage"] = usage
@@ -516,7 +516,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                         c for c in req.completion if not isinstance(c, str)
                     ][0]["id"]
                     results.append(
-                        self.
+                        self._post_process_completion_chunk(
                             self.model_family,
                             self.model_uid,
                             new_response,

xinference/model/llm/transformers/core.py
CHANGED

@@ -61,6 +61,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "deepseek-vl-chat",
     "internvl-chat",
     "internvl2",
+    "Internvl2.5",
+    "Internvl2.5-MPO",
     "cogvlm2",
     "cogvlm2-video-llama3-chat",
     "MiniCPM-Llama3-V-2_5",
@@ -112,6 +114,7 @@ class PytorchModel(LLM):
         pytorch_model_config.setdefault("trust_remote_code", True)
         pytorch_model_config.setdefault("max_num_seqs", 16)
         pytorch_model_config.setdefault("enable_tensorizer", False)
+        pytorch_model_config.setdefault("reasoning_content", False)
         return pytorch_model_config
 
     def _sanitize_generate_config(
@@ -324,6 +327,9 @@ class PytorchModel(LLM):
             kwargs.update({"device_map": "auto"})
             is_device_map_auto = True
 
+        reasoning_content = self._pytorch_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if self._check_tensorizer_integrity():
             self._model, self._tokenizer = self._load_tensorizer(**kwargs)
         else:
@@ -714,23 +720,34 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
     def handle_chat_result_non_streaming(self, req: InferenceRequest):
         if req.tools:
-            req.completion[0] = self.
-                self.model_family,
+            req.completion[0] = self._post_process_completion(
+                self.model_family,
+                self.model_uid,
+                req.completion[0],
+                self.reasoning_parser,
             )
         else:
-            req.completion[0] = self._to_chat_completion(
+            req.completion[0] = self._to_chat_completion(
+                req.completion[0], self.reasoning_parser
+            )
 
     def handle_chat_result_streaming(self, req: InferenceRequest):
         results = []
         for i, c in enumerate(req.completion):
             if c == "<bos_stream>":
                 results.append(
-                    self._get_first_chat_completion_chunk(
+                    self._get_first_chat_completion_chunk(
+                        req.completion[i + 1], self.reasoning_parser
+                    )
                 )
             elif c == "<eos_stream>":
                 break
             else:
-                results.append(
+                results.append(
+                    self._to_chat_completion_chunk(
+                        c, self.reasoning_parser, req.previous_texts
+                    )
+                )
 
         if req.stopped and req.include_usage:
             results.append(self._get_final_chat_completion_chunk(req.completion[-1]))
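
Both the Transformers backend above and the vLLM backend further down now delegate parser setup to self.prepare_parse_reasoning_content(), defined in xinference/model/llm/core.py (its +13 lines are not part of this excerpt). Judging from the inline wiring removed from vllm/core.py below, the helper plausibly builds a reasoning parser from the model family's reasoning tags when the flag is on; the sketch below is an assumption for illustration, not the shipped implementation:

    # Illustrative sketch only -- the real helper lives in xinference/model/llm/core.py.
    class ReasoningParserStub:
        """Stand-in for xinference's ReasoningParser; tags come from the model family."""

        def __init__(self, reasoning_start_tag: str, reasoning_end_tag: str) -> None:
            self.reasoning_start_tag = reasoning_start_tag
            self.reasoning_end_tag = reasoning_end_tag


    def prepare_parse_reasoning_content_sketch(model, reasoning_content: bool) -> None:
        # When the flag is off, downstream helpers receive reasoning_parser=None
        # and keep their previous behaviour.
        model.reasoning_parser = None
        if not reasoning_content:
            return
        family = model.model_family
        # Mirrors the wiring removed from vllm/core.py in this same diff: build a
        # parser from the family's reasoning_start_tag / reasoning_end_tag.
        model.reasoning_parser = ReasoningParserStub(
            family.reasoning_start_tag, family.reasoning_end_tag
        )
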

xinference/model/llm/transformers/intern_vl.py
CHANGED

@@ -265,7 +265,8 @@ class InternVLChatModel(PytorchChatModel):
         if world_size == 1:
             return None
         model_size = f"{self.model_spec.model_size_in_billions}B"
-        model_name =
+        model_name = self.model_family.model_name.lower().replace("-mpo", "")
+        model_name = f"{model_name}-{model_size}"
         num_layers = {
             "internvl2-1B": 24,
             "internvl2-2B": 24,
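
The new second line normalizes the family name before looking up the layer table, so the InternVL2.5-MPO entries resolve to the same num_layers keys as the plain InternVL2.5 models. For example:

    # How the key used against num_layers is derived (mirrors the two added lines).
    model_size = "8B"
    for family_name in ("internvl2", "InternVL2.5", "InternVL2.5-MPO"):
        model_name = family_name.lower().replace("-mpo", "")
        print(f"{model_name}-{model_size}")
    # internvl2-8B
    # internvl2.5-8B
    # internvl2.5-8B  <- the MPO variant maps onto the same entry
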

xinference/model/llm/transformers/utils.py
CHANGED

@@ -132,7 +132,7 @@ def _pad_seqs_inplace(seqs: List[List[int]], reqs: List[InferenceRequest], pad:
 
 def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
     max_new_tokens = int(
-        r.sanitized_generate_config.get("max_tokens"
+        r.sanitized_generate_config.get("max_tokens") or max_tokens_field.default
    )
     return context_len - max_new_tokens - 8
 
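
The one-line fix matters when a request carries max_tokens explicitly set to None: dict.get("max_tokens") or max_tokens_field.default falls back for both a missing and a None value, so int() never receives None. A small illustration (256 is a placeholder here, not the actual max_tokens_field.default):

    default_max_tokens = 256  # placeholder for max_tokens_field.default

    for cfg in ({}, {"max_tokens": None}, {"max_tokens": 1024}):
        max_new_tokens = int(cfg.get("max_tokens") or default_max_tokens)
        print(cfg, "->", max_new_tokens)
    # {} -> 256
    # {'max_tokens': None} -> 256
    # {'max_tokens': 1024} -> 1024
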

xinference/model/llm/utils.py
CHANGED

@@ -41,6 +41,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionChunkDelta,
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
@@ -243,62 +244,73 @@ class ChatModelMixin:
             raise ValueError(f"Invalid model family: {model_family}")
 
     @classmethod
-    def _to_chat_completion_chunk(
-
-
-
-
-
-
-
+    def _to_chat_completion_chunk(
+        cls,
+        chunk: CompletionChunk,
+        reasoning_parser: Optional[ReasoningParser] = None,
+        previous_texts: Optional[List[str]] = None,
+    ) -> ChatCompletionChunk:
+        choices_list = []
+        for i, choice in enumerate(chunk["choices"]):
+            delta = ChatCompletionChunkDelta()
+            if "text" in choice and choice["finish_reason"] is None:
+                if reasoning_parser is None:
+                    delta["content"] = choice["text"]
+                else:
+                    assert previous_texts is not None
+                    current_text = previous_texts[-1] + choice["text"]
+                    delta = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_texts[-1],
+                        current_text=current_text,
+                        delta_text=choice["text"],
+                    )
+                    previous_texts[-1] = current_text
+            if "tool_calls" in choice:
+                delta["tool_calls"] = choice["tool_calls"]
+            choices_list.append(
+                {
+                    "index": i,
+                    "delta": delta,
+                    "finish_reason": choice["finish_reason"],
+                }
+            )
         chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],
             "object": "chat.completion.chunk",
-            "choices":
-                {
-                    "index": i,
-                    "delta": {
-                        **(
-                            {"content": choice["text"]}
-                            if ("text" in choice and choice["finish_reason"] is None)
-                            else {}
-                        ),
-                        **(
-                            {"tool_calls": choice["tool_calls"]}
-                            if "tool_calls" in choice
-                            else {}
-                        ),
-                    },
-                    "finish_reason": choice["finish_reason"],
-                }
-                for i, choice in enumerate(chunk["choices"])
-            ],
+            "choices": choices_list,
         }
         return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
     def _get_first_chat_completion_chunk(
-        cls,
+        cls,
+        chunk: CompletionChunk,
+        reasoning_parser: Optional[ReasoningParser] = None,
     ) -> ChatCompletionChunk:
+        choices_list = []
+        for i, choice in enumerate(chunk["choices"]):
+            delta = {
+                "role": "assistant",
+            }
+            if reasoning_parser is None:
+                delta["content"] = ""
+            else:
+                delta["reasoning_content"] = ""
+            choices_list.append(
+                {
+                    "index": i,
+                    "delta": delta,
+                    "finish_reason": None,
+                }
+            )
         chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],
             "object": "chat.completion.chunk",
-            "choices":
-                {
-                    "index": i,
-                    "delta": {
-                        "role": "assistant",
-                        "content": "",
-                    },
-                    "finish_reason": None,
-                }
-                for i, choice in enumerate(chunk["choices"])
-            ],
+            "choices": choices_list,
         }
         return cast(ChatCompletionChunk, chat_chunk)
 
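
In the rewritten _to_chat_completion_chunk, deltas are now built one choice at a time; when a reasoning parser is configured, the accumulated stream is kept in previous_texts and the parser splits each new piece of text into reasoning_content versus content. The toy parser below only illustrates that extract_reasoning_content_streaming contract for <think>...</think> style output; it is a simplified stand-in, not the DeepSeek-R1 parser shipped in xinference/model/llm/reasoning_parsers/:

    from typing import Dict, List


    class ToyThinkParser:
        """Toy stand-in illustrating the streaming reasoning-parser contract."""

        start_tag, end_tag = "<think>", "</think>"

        def extract_reasoning_content_streaming(
            self, previous_text: str, current_text: str, delta_text: str
        ) -> Dict[str, str]:
            # current_text is accepted only for signature parity with the real parsers.
            if self.end_tag in previous_text:
                # The think block closed in an earlier delta: everything is answer text.
                return {"content": delta_text}
            if self.end_tag in delta_text:
                # The closing tag arrives in this delta: split it into both fields.
                reasoning, answer = delta_text.split(self.end_tag, 1)
                return {
                    "reasoning_content": reasoning.replace(self.start_tag, ""),
                    "content": answer,
                }
            # Still inside the think block.
            return {"reasoning_content": delta_text.replace(self.start_tag, "")}


    parser = ToyThinkParser()
    previous_texts: List[str] = [""]
    for piece in ["<think>add 2", " and 2</think>", "4"]:
        current_text = previous_texts[-1] + piece
        delta = parser.extract_reasoning_content_streaming(
            previous_text=previous_texts[-1], current_text=current_text, delta_text=piece
        )
        previous_texts[-1] = current_text
        print(delta)
    # {'reasoning_content': 'add 2'}
    # {'reasoning_content': ' and 2', 'content': ''}
    # {'content': '4'}
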
@@ -324,15 +336,18 @@ class ChatModelMixin:
         chunks: Iterator[CompletionChunk],
         reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
+        previous_texts = [""]
         for i, chunk in enumerate(chunks):
             if i == 0:
-                yield cls._get_first_chat_completion_chunk(chunk)
+                yield cls._get_first_chat_completion_chunk(chunk, reasoning_parse)
             # usage
             choices = chunk.get("choices")
             if not choices:
                 yield cls._get_final_chat_completion_chunk(chunk)
             else:
-                yield cls._to_chat_completion_chunk(
+                yield cls._to_chat_completion_chunk(
+                    chunk, reasoning_parse, previous_texts
+                )
 
     @classmethod
     def _tools_to_messages_for_deepseek(
@@ -370,33 +385,19 @@ class ChatModelMixin:
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
-
-        current_text = ""
+        previous_texts = [""]
         async for chunk in chunks:
             if i == 0:
-                chat_chunk = cls._get_first_chat_completion_chunk(
+                chat_chunk = cls._get_first_chat_completion_chunk(
+                    chunk, reasoning_parser
+                )
             elif not chunk.get("choices"):
                 # usage
                 chat_chunk = cls._get_final_chat_completion_chunk(chunk)
             else:
-                chat_chunk = cls._to_chat_completion_chunk(
-
-
-                if choices is None:
-                    continue
-                for choice in choices:
-                    delta = choice.get("delta")
-                    if not delta:
-                        continue
-                    current_text = previous_text + delta.get("content", "")
-                    choice[
-                        "delta"
-                    ] = reasoning_parser.extract_reasoning_content_streaming(
-                        previous_text=previous_text,
-                        current_text=current_text,
-                        delta=delta,
-                    )
-                    previous_text = current_text
+                chat_chunk = cls._to_chat_completion_chunk(
+                    chunk, reasoning_parser, previous_texts
+                )
             yield chat_chunk
             i += 1
 
@@ -565,7 +566,14 @@ class ChatModelMixin:
         return result
 
     @classmethod
-    def
+    def _post_process_completion_chunk(
+        cls,
+        model_family,
+        model_uid,
+        c,
+        chunk_id=None,
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
         tool_calls = []
@@ -585,11 +593,22 @@ class ChatModelMixin:
             else:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
+
+        reasoning_content = None
+        content = ". ".join(failed_contents) if failed_contents else None
+        if reasoning_parser is not None:
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                content
+            )
         d = {
             "role": "assistant",
-            "content":
+            "content": content,
             "tool_calls": tool_calls,
         }
+        # add only reasoning_content is None
+        if reasoning_content is not None:
+            d["reasoning_content"] = reasoning_content
+
         try:
             usage = c.get("usage")
             assert "prompt_tokens" in usage
@@ -616,7 +635,13 @@ class ChatModelMixin:
         }
 
     @classmethod
-    def
+    def _post_process_completion(
+        cls,
+        model_family,
+        model_uid,
+        c,
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ):
         _id = str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
 
@@ -637,11 +662,22 @@ class ChatModelMixin:
             else:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
+
+        reasoning_content = None
+        content = ". ".join(failed_contents) if failed_contents else None
+        if reasoning_parser is not None:
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
+                content
+            )
         m = {
             "role": "assistant",
-            "content":
+            "content": content,
             "tool_calls": tool_calls,
         }
+        # add only reasoning_content is None
+        if reasoning_content is not None:
+            m["reasoning_content"] = reasoning_content
+
         try:
             usage = c.get("usage")
             assert "prompt_tokens" in usage
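
In the non-streaming tool-call helpers above, the joined failure content is now routed through reasoning_parser.extract_reasoning_content(), which returns a (reasoning_content, content) pair, and reasoning_content is attached to the assistant message only when it is not None. A toy illustration of that return contract (again a simplified stand-in for <think> tags, not the shipped parser):

    from typing import Optional, Tuple


    def extract_reasoning_content_toy(
        text: Optional[str],
    ) -> Tuple[Optional[str], Optional[str]]:
        """Toy version of ReasoningParser.extract_reasoning_content for <think> tags."""
        if not text or "</think>" not in text:
            return None, text
        reasoning, answer = text.split("</think>", 1)
        return reasoning.replace("<think>", "").strip(), answer.strip() or None


    reasoning_content, content = extract_reasoning_content_toy(
        "<think>The user asked for 2 + 2.</think>The answer is 4."
    )
    message = {"role": "assistant", "content": content, "tool_calls": []}
    if reasoning_content is not None:  # mirrors the check added in the diff
        message["reasoning_content"] = reasoning_content
    print(message)
    # {'role': 'assistant', 'content': 'The answer is 4.', 'tool_calls': [],
    #  'reasoning_content': 'The user asked for 2 + 2.'}
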

xinference/model/llm/vllm/core.py
CHANGED

@@ -43,8 +43,6 @@ from ....types import (
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
-from ..reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
-from ..reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -160,6 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
+    VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
 
@@ -196,6 +195,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -211,9 +211,10 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
-    VLLM_SUPPORTED_CHAT_MODELS.append("
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
 
 
 class VLLMModel(LLM):
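
Besides the reasoning-content plumbing, the hunks above register QwQ-32B, InternVL2.5-MPO, moonlight-16b-a3b-instruct, and qwen2.5-instruct-1m in the version-gated vLLM support lists. Whether an entry is present at runtime depends on vLLM being importable and on vllm.__version__, so a quick check looks like:

    # The list contents depend on the installed vllm version (see the guards above).
    from xinference.model.llm.vllm.core import (
        VLLM_SUPPORTED_CHAT_MODELS,
        VLLM_SUPPORTED_VISION_MODEL_LIST,
    )

    print("QwQ-32B" in VLLM_SUPPORTED_CHAT_MODELS)                # vllm >= 0.3.0
    print("InternVL2.5-MPO" in VLLM_SUPPORTED_VISION_MODEL_LIST)  # vllm >= 0.6.1
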
@@ -243,7 +244,6 @@ class VLLMModel(LLM):
         self.lora_modules = peft_model
         self.lora_requests: List[LoRARequest] = []
         self._xavier_config = None
-        self.reasoning_parser = None
 
     def set_xavier_config(self, value: Optional[Dict]):
         self._xavier_config = value  # type: ignore
@@ -274,14 +274,8 @@ class VLLMModel(LLM):
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
 
-
-
-        module_name = self.model_family.model_family or self.model_family.model_name
-        self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-        self.reasoning_parser = self.reasoning_parser(
-            self.model_family.reasoning_start_tag,
-            self.model_family.reasoning_end_tag,
-        )
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -581,6 +575,8 @@ class VLLMModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
         sanitized_generate_config = self._sanitize_generate_config(generate_config)
+        if self.reasoning_parser:
+            sanitized_generate_config.pop("stop")
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
@@ -812,18 +808,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         i = 0
         async for chunk in chunks:
             if i == 0:
-                yield self._get_first_chat_completion_chunk(
+                yield self._get_first_chat_completion_chunk(
+                    chunk, self.reasoning_parser
+                )
             # usage
             choices = chunk.get("choices")
             if not choices:
                 yield self._get_final_chat_completion_chunk(chunk)
             else:
                 if self.is_tool_call_chunk(chunk):
-                    yield self.
-                        self.model_family,
+                    yield self._post_process_completion_chunk(
+                        self.model_family,
+                        self.model_uid,
+                        chunk,
+                        reasoning_parser=self.reasoning_parser,
                     )
                 else:
-                    yield self._to_chat_completion_chunk(chunk)
+                    yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
             i += 1
 
     @vllm_check
@@ -863,7 +864,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         )
         assert not isinstance(c, AsyncGenerator)
         if tools:
-            return self.
+            return self._post_process_completion(
+                self.model_family, self.model_uid, c, self.reasoning_parser
+            )
         return self._to_chat_completion(c, self.reasoning_parser)
 
 
@@ -905,31 +908,15 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     def _sanitize_model_config(
         self, model_config: Optional[VLLMModelConfig]
     ) -> VLLMModelConfig:
-
-
-
-
-
-
-
-
-
-        model_config.setdefault("swap_space", 4)
-        model_config.setdefault("gpu_memory_utilization", 0.90)
-        model_config.setdefault("max_num_seqs", 256)
-        model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len", None)
-        model_config["limit_mm_per_prompt"] = (
-            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
-            if model_config.get("limit_mm_per_prompt")
-            else {
-                "image": 2,  # default 2 images all chat
-            }
-        )
-        # Add scheduling policy if vLLM version is 0.6.3 or higher
-        if vllm.__version__ >= "0.6.3":
-            model_config.setdefault("scheduling_policy", "fcfs")
-
+        model_config = super()._sanitize_model_config(model_config)
+        if vllm.__version__ >= "0.5.5":
+            model_config["limit_mm_per_prompt"] = (
+                json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+                if model_config.get("limit_mm_per_prompt")
+                else {
+                    "image": 2,  # default 2 images all chat
+                }
+            )
         return model_config
 
     def _sanitize_chat_config(
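
VLLMVisionModel._sanitize_model_config now reuses super()._sanitize_model_config for the common defaults and only layers the vision-specific handling on top: limit_mm_per_prompt arrives as a JSON string and, when absent, defaults to two images per prompt. Roughly:

    import json
    from typing import Optional


    def resolve_limit_mm_per_prompt(raw: Optional[str]) -> dict:
        # Mirrors the parsing above: decode a JSON string such as '{"image": 4}',
        # otherwise fall back to the default of 2 images per prompt.
        return json.loads(raw) if raw else {"image": 2}


    print(resolve_limit_mm_per_prompt(None))            # {'image': 2}
    print(resolve_limit_mm_per_prompt('{"image": 4}'))  # {'image': 4}
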

xinference/types.py
CHANGED

@@ -78,6 +78,7 @@ class EmbeddingData(TypedDict):
 class Embedding(TypedDict):
     object: Literal["list"]
     model: str
+    model_replica: str
     data: List[EmbeddingData]
     usage: EmbeddingUsage
 
@@ -276,6 +277,7 @@ class LlamaCppModelConfig(TypedDict, total=False):
     use_mmap: bool
     use_mlock: bool
     n_threads: Optional[int]
+    n_parallel: Optional[int]
     n_batch: int
     last_n_tokens_size: int
     lora_base: Optional[str]
@@ -284,6 +286,7 @@ class LlamaCppModelConfig(TypedDict, total=False):
     n_gqa: Optional[int]  # (TEMPORARY) must be 8 for llama2 70b
     rms_norm_eps: Optional[float]  # (TEMPORARY)
     verbose: bool
+    reasoning_content: bool
 
 
 class PytorchGenerateConfig(TypedDict, total=False):
@@ -330,6 +333,7 @@ class PytorchModelConfig(TypedDict, total=False):
     trust_remote_code: bool
     max_num_seqs: int
     enable_tensorizer: Optional[bool]
+    reasoning_content: bool
 
 
 def get_pydantic_model_from_method(
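
The TypedDict updates surface the new options to type checkers: reasoning_content on LlamaCppModelConfig and PytorchModelConfig, n_parallel on LlamaCppModelConfig, and model_replica on Embedding. Because both config TypedDicts are declared with total=False, the new keys remain optional, e.g.:

    # Sketch of partially-filled (total=False) config dicts using the new keys.
    from xinference.types import LlamaCppModelConfig, PytorchModelConfig

    llama_cfg: LlamaCppModelConfig = {
        "n_parallel": 4,            # new in 1.3.1
        "reasoning_content": True,  # new in 1.3.1
    }
    pytorch_cfg: PytorchModelConfig = {
        "max_num_seqs": 16,
        "reasoning_content": False,  # new in 1.3.1; backends default it to False
    }
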

xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.f8177338.css",
-    "main.js": "./static/js/main.
+    "main.js": "./static/js/main.55b70cb7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.f8177338.css.map": "./static/css/main.f8177338.css.map",
-    "main.
+    "main.55b70cb7.js.map": "./static/js/main.55b70cb7.js.map"
   },
   "entrypoints": [
     "static/css/main.f8177338.css",
-    "static/js/main.
+    "static/js/main.55b70cb7.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.55b70cb7.js"></script><link href="./static/css/main.f8177338.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>