xinference 1.3.1__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +39 -24
- xinference/model/llm/core.py +2 -5
- xinference/model/llm/llama_cpp/core.py +9 -3
- xinference/model/llm/llm_family.json +93 -9
- xinference/model/llm/llm_family_modelscope.json +10 -10
- xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +5 -8
- xinference/model/llm/utils.py +41 -3
- xinference/model/llm/vllm/core.py +2 -0
- {xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/METADATA +3 -3
- {xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/RECORD +15 -17
- xinference/model/llm/reasoning_parsers/__init__.py +0 -13
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
- {xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE +0 -0
- {xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL +0 -0
- {xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED

@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2025-03-
+ "date": "2025-03-11T12:00:36+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "1.3.1"
+ "full-revisionid": "2ef99fbb5450a76a6ba07a909f58b8c2e4c22a28",
+ "version": "1.3.1.post1"
 }
 '''  # END VERSION_JSON

xinference/core/chat_interface.py
CHANGED

@@ -113,6 +113,7 @@ class GradioInterface:
             max_tokens: int,
             temperature: float,
             lora_name: str,
+            stream: bool,
         ) -> Generator:
             from ..client import RESTfulClient

@@ -123,29 +124,40 @@ class GradioInterface:
             messages = to_chat(flatten(history))
             messages.append(dict(role="user", content=message))

-            … (23 removed lines, content elided in the source diff)
+            if stream:
+                response_content = ""
+                for chunk in model.chat(
+                    messages,
+                    generate_config={
+                        "max_tokens": int(max_tokens),
+                        "temperature": temperature,
+                        "stream": True,
+                        "lora_name": lora_name,
+                    },
+                ):
+                    assert isinstance(chunk, dict)
+                    delta = chunk["choices"][0]["delta"]
+                    if "content" not in delta:
+                        continue
+                    else:
+                        # some models like deepseek-r1-distill-qwen
+                        # will generate <think>...</think> ...;
+                        # in gradio no output would be rendered,
+                        # thus escape html tags in advance
+                        response_content += html.escape(delta["content"])
+                        yield response_content
+
+                yield response_content
+            else:
+                result = model.chat(
+                    messages,
+                    generate_config={
+                        "max_tokens": int(max_tokens),
+                        "temperature": temperature,
+                        "lora_name": lora_name,
+                    },
+                )
+                yield html.escape(result["choices"][0]["message"]["content"])  # type: ignore

         return gr.ChatInterface(
             fn=generate_wrapper,

@@ -153,7 +165,9 @@ class GradioInterface:
                 gr.Slider(
                     minimum=1,
                     maximum=self.context_length,
-                    value=512
+                    value=512
+                    if "reasoning" not in self.model_ability
+                    else self.context_length // 2,
                     step=1,
                     label="Max Tokens",
                 ),

@@ -161,6 +175,7 @@ class GradioInterface:
                     minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                 ),
                 gr.Text(label="LoRA Name"),
+                gr.Checkbox(label="Stream", value=True),
             ],
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
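Reviewer note: the html.escape call in the streaming branch above is load-bearing. Gradio renders assistant messages as Markdown/HTML, so a raw <think> tag is parsed as an unknown HTML element and the reasoning text silently disappears. A minimal, self-contained sketch of the same escaping pattern (plain Python, independent of xinference):

import html

# Chunks as a reasoning model might stream them; passed unescaped to a
# Gradio chat component, the <think> tag would swallow the text.
chunks = ["<think>", "compare both options first", "</think>", " Use option B."]

response_content = ""
for piece in chunks:
    # html.escape turns "<think>" into "&lt;think&gt;", which renders as text
    response_content += html.escape(piece)
    print(response_content)  # the real code does `yield response_content` here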
xinference/model/llm/core.py
CHANGED

@@ -25,8 +25,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from ..core import ModelDescription
-from .
-from .reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
+from .reasoning_parser import ReasoningParser

 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV1, LLMSpecV1

@@ -123,9 +122,7 @@ class LLM(abc.ABC):
     def prepare_parse_reasoning_content(self, reasoning_content):
         # Initialize reasoning parser if model has reasoning ability
         if "reasoning" in self.model_family.model_ability and reasoning_content:
-
-            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
-            self.reasoning_parser = self.reasoning_parser(
+            self.reasoning_parser = ReasoningParser(
                 self.model_family.reasoning_start_tag,
                 self.model_family.reasoning_end_tag,
             )
xinference/model/llm/llama_cpp/core.py
CHANGED

@@ -43,7 +43,7 @@ class _Sentinel:
     pass


-class XllamaCppModel(LLM):
+class XllamaCppModel(LLM, ChatModelMixin):
    def __init__(
        self,
        model_uid: str,

@@ -83,6 +83,7 @@ class XllamaCppModel(LLM):
            llamacpp_model_config.setdefault("n_gpu_layers", -1)
        elif self._is_linux():
            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+       llamacpp_model_config.setdefault("reasoning_content", False)

        return llamacpp_model_config

@@ -131,6 +132,9 @@ class XllamaCppModel(LLM):

            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

+       reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+       self.prepare_parse_reasoning_content(reasoning_content)
+
        if os.path.isfile(self.model_path):
            # mostly passed from --model_path
            model_path = os.path.realpath(self.model_path)

@@ -274,9 +278,11 @@ class XllamaCppModel(LLM):
                while (r := q.get()) is not _Sentinel:
                    yield r

-           return
+           return self._to_chat_completion_chunks(
+               _to_iterator(), self.reasoning_parser
+           )
        else:
-           return q.get()
+           return self._to_chat_completion(q.get(), self.reasoning_parser)


class LlamaCppModel(LLM):
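Note on the reasoning_content flag introduced above: it follows a default-then-pop pattern, getting a default during config sanitization and being popped out again before the dict reaches the llama.cpp backend, presumably so the backend never sees a key it does not recognize. A stand-alone sketch of that pattern (the sanitize function here is a stand-in, not the real method):

def sanitize(llamacpp_model_config: dict) -> dict:
    # mirrors the defaults set in the diff above
    llamacpp_model_config.setdefault("n_gpu_layers", -1)
    llamacpp_model_config.setdefault("reasoning_content", False)
    return llamacpp_model_config

config = sanitize({"n_ctx": 4096})
# consume the flag so only backend-known keys remain in the config
reasoning_content = config.pop("reasoning_content")
assert "reasoning_content" not in config
print(config, reasoning_content)  # {'n_ctx': 4096, 'n_gpu_layers': -1} False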
xinference/model/llm/llm_family.json
CHANGED

@@ -9449,7 +9449,7 @@
     },
     {
         "version": 1,
-        "context_length":
+        "context_length": 131072,
         "model_name": "QwQ-32B",
         "model_lang": [
             "en",

@@ -9496,15 +9496,99 @@
         "model_size_in_billions": 32,
         "quantizations": [
             "fp16",
-            … (8 entries elided in the source)
+            "q2_k",
+            "q3_k_m",
+            "q4_0",
+            "q4_k_m",
+            "q5_0",
+            "q5_k_m",
+            "q6_k",
+            "q8_0"
         ],
+        "quantization_parts": {
+            "fp16": [
+                "00001-of-000017",
+                "00002-of-000017",
+                "00003-of-000017",
+                "00004-of-000017",
+                "00005-of-000017",
+                "00006-of-000017",
+                "00007-of-000017",
+                "00008-of-000017",
+                "00009-of-000017",
+                "00010-of-000017",
+                "00011-of-000017",
+                "00012-of-000017",
+                "00013-of-000017",
+                "00014-of-000017",
+                "00015-of-000017",
+                "00016-of-000017",
+                "00017-of-000017"
+            ],
+            "q2_k": [
+                "00001-of-00004",
+                "00002-of-00004",
+                "00003-of-00004",
+                "00004-of-00004"
+            ],
+            "q3_k_m": [
+                "00001-of-00005",
+                "00002-of-00005",
+                "00003-of-00005",
+                "00004-of-00005",
+                "00005-of-00005"
+            ],
+            "q4_0": [
+                "00001-of-00005",
+                "00002-of-00005",
+                "00003-of-00005",
+                "00004-of-00005",
+                "00005-of-00005"
+            ],
+            "q4_k_m": [
+                "00001-of-00005",
+                "00002-of-00005",
+                "00003-of-00005",
+                "00004-of-00005",
+                "00005-of-00005"
+            ],
+            "q5_0": [
+                "00001-of-00006",
+                "00002-of-00006",
+                "00003-of-00006",
+                "00004-of-00006",
+                "00005-of-00006",
+                "00006-of-00006"
+            ],
+            "q5_k_m": [
+                "00001-of-00006",
+                "00002-of-00006",
+                "00003-of-00006",
+                "00004-of-00006",
+                "00005-of-00006",
+                "00006-of-00006"
+            ],
+            "q6_k": [
+                "00001-of-00007",
+                "00002-of-00007",
+                "00003-of-00007",
+                "00004-of-00007",
+                "00005-of-00007",
+                "00006-of-00007",
+                "00007-of-00007"
+            ],
+            "q8_0": [
+                "00001-of-00009",
+                "00002-of-00009",
+                "00003-of-00009",
+                "00004-of-00009",
+                "00005-of-00009",
+                "00006-of-00009",
+                "00007-of-00009",
+                "00008-of-00009",
+                "00009-of-00009"
+            ]
+        },
         "model_id": "Qwen/QwQ-32B-GGUF",
         "model_file_name_template": "qwq-32b-{quantization}.gguf"
     }
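The new quantization_parts mapping records how the multi-file GGUF quantizations are sharded. A sketch of how a downloader might combine model_file_name_template with a parts list to produce concrete shard names; the "-{part}" suffix convention used here is an assumption for illustration, since this diff only defines the parts themselves:

# Hypothetical shard-name resolver (the real loader may use a separate
# split-file template instead of the "-{part}" suffix assumed here).
template = "qwq-32b-{quantization}.gguf"
quantization_parts = {
    "q4_k_m": [
        "00001-of-00005", "00002-of-00005", "00003-of-00005",
        "00004-of-00005", "00005-of-00005",
    ],
}

def shard_names(quantization: str) -> list:
    base = template.format(quantization=quantization)
    parts = quantization_parts.get(quantization)
    if not parts:
        return [base]  # single-file quantization: no parts listed
    stem, ext = base.rsplit(".", 1)
    return [f"{stem}-{part}.{ext}" for part in parts]

print(shard_names("q4_k_m"))
# ['qwq-32b-q4_k_m-00001-of-00005.gguf', ..., 'qwq-32b-q4_k_m-00005-of-00005.gguf']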
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -7217,7 +7217,7 @@
         ],
         "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
         "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf",
-
+        "model_hub": "modelscope"
     }
 ],
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",

@@ -7234,7 +7234,7 @@
     },
     {
         "version": 1,
-        "context_length":
+        "context_length": 131072,
         "model_name": "QwQ-32B",
         "model_lang": [
             "en",

@@ -7284,14 +7284,14 @@
         "model_size_in_billions": 32,
         "quantizations": [
             "fp16",
-            … (8 entries elided in the source)
+            "q2_k",
+            "q3_k_m",
+            "q4_0",
+            "q4_k_m",
+            "q5_0",
+            "q5_k_m",
+            "q6_k",
+            "q8_0"
         ],
         "model_id": "Qwen/QwQ-32B-GGUF",
         "model_file_name_template": "qwq-32b-{quantization}.gguf",
xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py}
RENAMED

@@ -1,20 +1,17 @@
 import re
 from typing import Optional, Tuple, Union

-from
-from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from ...types import ChatCompletionChunkDelta, CompletionChoice


-
-
-@ReasoningParserManager.register_module("deepseek-r1-distill-llama")
-class DeepSeekR1ReasoningParser(ReasoningParser):
-    """Reasoning parser for DeepSeek-R1 model."""
+class ReasoningParser:
+    """Reasoning parser for reasoning models."""

     def __init__(
         self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
     ):
-
+        self.reasoning_start_tag = reasoning_start_tag
+        self.reasoning_end_tag = reasoning_end_tag
         self.reasoning_regex = re.compile(
             rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
         )
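The regex kept by the renamed class captures everything between the start and end tags. A sketch of the non-streaming extraction it implies, assuming extract_reasoning_content returns a (reasoning_content, content) pair as declared in the abstract base class deleted further below; the method body itself is not shown in this diff:

import re

start, end = "<think>", "</think>"
reasoning_regex = re.compile(rf"{start}(.*?){end}", re.DOTALL)

def extract_reasoning_content(model_output: str):
    # returns (reasoning_content, content); either side may be None
    match = reasoning_regex.search(model_output)
    if match is None:
        return None, model_output
    reasoning = match.group(1)
    content = model_output[match.end():].lstrip()
    return reasoning, content or None

print(extract_reasoning_content("<think>compare A and B</think>\nUse B."))
# ('compare A and B', 'Use B.')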
xinference/model/llm/utils.py
CHANGED

@@ -55,7 +55,7 @@ from .llm_family import (
     _get_cache_dir,
     get_cache_status,
 )
-from .
+from .reasoning_parser import ReasoningParser

 logger = logging.getLogger(__name__)

@@ -250,8 +250,30 @@ class ChatModelMixin:
         reasoning_parser: Optional[ReasoningParser] = None,
         previous_texts: Optional[List[str]] = None,
     ) -> ChatCompletionChunk:
+        choices = chunk.get("choices")
+        if (
+            chunk.get("object") == "chat.completion.chunk"
+            and choices
+            and "delta" in choices[0]
+        ):
+            if reasoning_parser is not None:
+                # process parsing reasoning content
+                assert previous_texts is not None
+                delta = choices[0]["delta"]  # type: ignore
+                if text := delta.get("content"):
+                    current_text = previous_texts[-1] + text
+                    delta = reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_texts[-1],
+                        current_text=current_text,
+                        delta_text=text,
+                    )
+                    previous_texts[-1] = current_text
+                    choices[0]["delta"] = delta  # type: ignore
+            # Already a ChatCompletionChunk, we don't need to convert chunk.
+            return cast(ChatCompletionChunk, chunk)
+
         choices_list = []
-        for i, choice in enumerate(
+        for i, choice in enumerate(choices):  # type: ignore
             delta = ChatCompletionChunkDelta()
             if "text" in choice and choice["finish_reason"] is None:
                 if reasoning_parser is None:

@@ -345,9 +367,10 @@ class ChatModelMixin:
             if not choices:
                 yield cls._get_final_chat_completion_chunk(chunk)
             else:
-
+                r = cls._to_chat_completion_chunk(
                     chunk, reasoning_parse, previous_texts
                 )
+                yield r

     @classmethod
     def _tools_to_messages_for_deepseek(

@@ -405,6 +428,21 @@ class ChatModelMixin:
     def _to_chat_completion(
         completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
     ) -> ChatCompletion:
+        if completion.get("object") == "chat.completion" and completion.get("choices"):
+            # Already a ChatCompletion
+            if reasoning_parser is not None:
+                for choice in completion["choices"]:
+                    message = choice["message"]  # type: ignore
+                    text = message["content"]
+                    (
+                        reasoning_content,
+                        content,
+                    ) = reasoning_parser.extract_reasoning_content(text)
+                    message["content"] = content
+                    if reasoning_content is not None:
+                        message["reasoning_content"] = reasoning_content
+            return cast(ChatCompletion, completion)
+
         choices = []
         for i, choice in enumerate(completion["choices"]):
             content = choice["text"]
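The early-return branch added to _to_chat_completion_chunk threads previous_texts through the stream so the parser can tell whether a delta is still inside the think block. A compressed, runnable sketch of that bookkeeping with a simplified stand-in parser (start-tag stripping is omitted for brevity; the real parser lives in reasoning_parser.py):

# Stand-in for ReasoningParser.extract_reasoning_content_streaming: routes
# each delta to "reasoning_content" until "</think>" has been seen in full.
def extract_streaming(previous_text: str, current_text: str, delta_text: str) -> dict:
    end = "</think>"
    if end in previous_text:  # already past the think block
        return {"content": delta_text}
    if end in current_text:  # the end tag completes inside this delta
        after = current_text.split(end, 1)[1]
        return {
            "reasoning_content": delta_text[: len(delta_text) - len(after)],
            "content": after,
        }
    return {"reasoning_content": delta_text}

previous_texts = [""]
for delta_text in ["<think>add the", " numbers</think>", "4"]:
    current_text = previous_texts[-1] + delta_text
    delta = extract_streaming(previous_texts[-1], current_text, delta_text)
    previous_texts[-1] = current_text
    print(delta)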
xinference/model/llm/vllm/core.py
CHANGED

@@ -576,6 +576,8 @@ class VLLMModel(LLM):

         sanitized_generate_config = self._sanitize_generate_config(generate_config)
         if self.reasoning_parser:
+            # For reasoning models, the </think> tag may be split into multiple
+            # tokens if the `stop` param is passed, so we pop it from the config.
             sanitized_generate_config.pop("stop")
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
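The comment added above is the whole rationale: a stop string such as </think> usually arrives split across several streamed tokens, so per-token matching of user-supplied stop strings misbehaves for reasoning models. A toy illustration in plain Python (not vLLM internals):

# The closing tag arrives split across tokens, so comparing individual
# tokens against stop="</think>" never matches; only the joined text does.
tokens = ["<think>", "check", " both", " cases", "</", "think", ">", " Done."]
stop = "</think>"
print(any(tok == stop for tok in tokens))  # False
print(stop in "".join(tokens))             # True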
{xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xinference
-Version: 1.3.1
+Version: 1.3.1.post1
 Summary: Model Serving Made Easy
 Home-page: https://github.com/xorbitsai/inference
 Author: Qin Xuye

@@ -292,6 +292,7 @@ potential of cutting-edge AI models.

 ## 🔥 Hot Topics
 ### Framework Enhancements
+- [Xllamacpp](https://github.com/xorbitsai/xllamacpp): new llama.cpp Python binding, maintained by the Xinference team; supports continuous batching and is more production-ready: [#2997](https://github.com/xorbitsai/inference/pull/2997)
 - Distributed inference: running models across workers: [#2877](https://github.com/xorbitsai/inference/pull/2877)
 - VLLM enhancement: Shared KV cache across multiple replicas: [#2732](https://github.com/xorbitsai/inference/pull/2732)
 - Support Continuous batching for Transformers engine: [#1724](https://github.com/xorbitsai/inference/pull/1724)

@@ -299,8 +300,8 @@ potential of cutting-edge AI models.
 - Support specifying worker and GPU indexes for launching models: [#1195](https://github.com/xorbitsai/inference/pull/1195)
 - Support SGLang backend: [#1161](https://github.com/xorbitsai/inference/pull/1161)
 - Support LoRA for LLM and image models: [#1080](https://github.com/xorbitsai/inference/pull/1080)
-- Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929)
 ### New Models
+- Built-in support for [QwQ-32B](https://qwenlm.github.io/blog/qwq-32b/): [#3005](https://github.com/xorbitsai/inference/pull/3005)
 - Built-in support for [DeepSeek V3 and R1](https://github.com/deepseek-ai/DeepSeek-R1): [#2864](https://github.com/xorbitsai/inference/pull/2864)
 - Built-in support for [InternVL2.5](https://internvl.github.io/blog/2024-12-05-InternVL-2.5/): [#2776](https://github.com/xorbitsai/inference/pull/2776)
 - Built-in support for [DeepSeek-R1-Distill-Llama](https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-distill-models): [#2811](https://github.com/xorbitsai/inference/pull/2811)

@@ -308,7 +309,6 @@ potential of cutting-edge AI models.
 - Built-in support for [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M): [#2790](https://github.com/xorbitsai/inference/pull/2790)
 - Built-in support for [qwen2.5-vl](https://github.com/QwenLM/Qwen2.5-VL): [#2788](https://github.com/xorbitsai/inference/pull/2788)
 - Built-in support for [internlm3-instruct](https://github.com/InternLM/InternLM): [#2789](https://github.com/xorbitsai/inference/pull/2789)
-- Built-in support for [MeloTTS](https://github.com/myshell-ai/MeloTTS): [#2760](https://github.com/xorbitsai/inference/pull/2760)
 ### Integrations
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
 - [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
{xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 xinference/__init__.py,sha256=nmTTrYbIpj964ZF6ojtgOM7E85JBOj1EyQbmYjbj1jw,915
 xinference/_compat.py,sha256=URSJQLXrcsTO9B_4x0wVDPijYQDhuVJmZ95npID560w,4197
-xinference/_version.py,sha256=
+xinference/_version.py,sha256=qMz600g9USAjrV1nTxM3bBcpOiTBWs1VJPEtdMggVGg,503
 xinference/conftest.py,sha256=ZB7li77s4_H4ZEQpDo2PX-b4zrs8-bIpvh59P_CaSoo,9691
 xinference/constants.py,sha256=mEW4HDzjXtDXN61Mt6TtJrJ4ljbB6VUkh97e3oDbNx4,3905
 xinference/device_utils.py,sha256=ELsqvnjvz9wYthTyQFzKSV4mZsaASz6hj_IsfMmfMWc,4447

@@ -21,7 +21,7 @@ xinference/client/restful/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7
 xinference/client/restful/restful_client.py,sha256=DofFF0ZaOmBpCVp9qtAeYDGbvd-KS5u4_GMGp8AbbM4,53994
 xinference/core/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/core/cache_tracker.py,sha256=3ubjYCU5aZToSp2GEuzedECVrg-PR4kThTefrFUkb9g,6971
-xinference/core/chat_interface.py,sha256=
+xinference/core/chat_interface.py,sha256=X5ZC91M_uKIG8NW1xupKUDNoqzUHMpLp4-ijf-YhjbE,21766
 xinference/core/event.py,sha256=42F38H2WOl6aPxp2oxX6WNxHRRxbnvYRmbt4Ar7NP4U,1640
 xinference/core/image_interface.py,sha256=5Iuoiw3g2TvgOYi3gRIAGApve2nNzfMPduRrBHvd1NY,13755
 xinference/core/metrics.py,sha256=ScmTG15Uq3h_ob72ybZSMWdnk8P4sUZFcm60f4ikSXc,2631

@@ -87,23 +87,21 @@ xinference/model/image/stable_diffusion/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17k
 xinference/model/image/stable_diffusion/core.py,sha256=V3BaASwx8q1YERb4jhhaYEDFiwh3BuPAz8pVZTuktAQ,24717
 xinference/model/image/stable_diffusion/mlx.py,sha256=GZsozzGB04NfHAdU9MI6gwWE1t_A-s_Ddn_ic8DlkKQ,7476
 xinference/model/llm/__init__.py,sha256=UJOSz9zr5mAj8Fm09yoZbEe4xBWYnSxUV9aGE50e5dc,14184
-xinference/model/llm/core.py,sha256=
-xinference/model/llm/llm_family.json,sha256=
+xinference/model/llm/core.py,sha256=2AYRKdiJ5L1iKU9CE_C09IbEtE2KrsIy4dqkqg2txes,8626
+xinference/model/llm/llm_family.json,sha256=xszAQbwI5lvkdcxRSoowNddvPfuFF2aT6xnvzeyzo8w,374447
 xinference/model/llm/llm_family.py,sha256=SrgTmEKspAELhVqmMs7Rz6xUk7rmc9V61urvbWAZOVE,39214
 xinference/model/llm/llm_family_csghub.json,sha256=zMKWbihsxQNVB1u5iKJbZUkbOfQ4IPNq1KQ-8IDPQQA,8759
-xinference/model/llm/llm_family_modelscope.json,sha256=
+xinference/model/llm/llm_family_modelscope.json,sha256=2OuPertAGKnryliUofjnqemLrhHW1aaq6-4tPsrbKNI,304592
 xinference/model/llm/llm_family_openmind_hub.json,sha256=jl9pfbe5DztoxgEwKBxDk1Wd7TziTiJ48_Ie_lJdYjA,67872
 xinference/model/llm/memory.py,sha256=GLNmXBI-AtMbuaJfEf50fnhN4rdbOZjLyT6L_Vjqa5g,10206
-xinference/model/llm/
+xinference/model/llm/reasoning_parser.py,sha256=Gqf2WS5olkD2SfJ3wlCAQsmQUZOIc1185h_01pIs7TQ,6067
+xinference/model/llm/utils.py,sha256=QYYGfaPYr3GG7dyowNgs4qyayid-gOtJ_ToXqAMVhSU,32846
 xinference/model/llm/llama_cpp/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/model/llm/llama_cpp/core.py,sha256=
+xinference/model/llm/llama_cpp/core.py,sha256=rUVyaP5tE6xz9jy0m4iZ0ys99vl3sicnWZsEnbaXPfw,21181
 xinference/model/llm/lmdeploy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xinference/model/llm/lmdeploy/core.py,sha256=WvSP3x6t-HBv6hKh1qWZatFAzlcZCyyKqvc3ua8yPTI,19835
 xinference/model/llm/mlx/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/model/llm/mlx/core.py,sha256=l4_MKw5UckM81kaCwgriy0KZU3zPN38p36P3J9USmgA,23568
-xinference/model/llm/reasoning_parsers/__init__.py,sha256=-sjSIQ4K6w-TEzx49kVaWeWC443fnZqODU91GCQ_JNo,581
-xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py,sha256=YtOOVbSl6fLugn3vmzo_AQbbjl6H5kX9DPpP9KP3gnY,3004
-xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py,sha256=HmnAsNcoeUpyUSTNF0j_0Z4Am7OKiGrJnhNj-BhtQf0,6323
 xinference/model/llm/sglang/__init__.py,sha256=-sjSIQ4K6w-TEzx49kVaWeWC443fnZqODU91GCQ_JNo,581
 xinference/model/llm/sglang/core.py,sha256=tMbvQOwQu5uBXBTMK5Vh-FR2Gc-Nbc0HIhp2iy47wCA,20606
 xinference/model/llm/transformers/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581

@@ -130,7 +128,7 @@ xinference/model/llm/transformers/tensorizer_utils.py,sha256=VXSYbPZtCbd8lVvsnjD
 xinference/model/llm/transformers/utils.py,sha256=KETjuVR_RpF--fno0KxT068fD1v4REFhe-0wy_sCwRs,19584
 xinference/model/llm/transformers/yi_vl.py,sha256=iCdRLw-wizbU-qXXc8CT4DhC0Pt-uYg0vFwXEhAZjQg,8961
 xinference/model/llm/vllm/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/model/llm/vllm/core.py,sha256=
+xinference/model/llm/vllm/core.py,sha256=j5sdlrctBnouLJAfBs0Ofa1JbngTlYsDzrs2ManQN0o,38261
 xinference/model/llm/vllm/utils.py,sha256=LKOmwfFRrlSecawxT-uE39tC2RQbf1UIiSH9Uz90X6w,1313
 xinference/model/llm/vllm/xavier/__init__.py,sha256=CyLLkbImZouAk4lePIgKXT4WQoqyauIEwdqea5IOUVU,581
 xinference/model/llm/vllm/xavier/allocator.py,sha256=SJ2eCOxF6CWTBZIP39FRxeK6fxIE8pRshOPnSRc72d4,2691

@@ -15726,9 +15724,9 @@ xinference/web/ui/node_modules/yup/package.json,sha256=xRFSROB9NKxqSWHEVFvSTsPs9
 xinference/web/ui/node_modules/yup/node_modules/type-fest/package.json,sha256=JTv2zTTVgxQ2H82m1-6qEpdMv08lHjFx4Puf_MsbB_Q,1134
 xinference/web/ui/src/locales/en.json,sha256=5MN-GKLcPOeUAsDbv_MRvD4uf86WsvUC6rhzTAtQevA,8925
 xinference/web/ui/src/locales/zh.json,sha256=27HeH4Qc96KuDJ9cgb4OybpUFG-GuuUiaD0ASsg3lyQ,8666
-xinference-1.3.1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
-xinference-1.3.1.dist-info/METADATA,sha256=
-xinference-1.3.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-xinference-1.3.1.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
-xinference-1.3.1.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
-xinference-1.3.1.dist-info/RECORD,,
+xinference-1.3.1.post1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+xinference-1.3.1.post1.dist-info/METADATA,sha256=aag4egJ0PONQQ2mu6fWiY9KDHMpqqTzSTHAOPO11vb8,24447
+xinference-1.3.1.post1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+xinference-1.3.1.post1.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
+xinference-1.3.1.post1.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
+xinference-1.3.1.post1.dist-info/RECORD,,
xinference/model/llm/reasoning_parsers/__init__.py
DELETED

@@ -1,13 +0,0 @@
-# Copyright 2022-2024 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py
DELETED

@@ -1,98 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Dict, Optional, Tuple, Type, Union
-
-from ....types import ChatCompletionChunkDelta, CompletionChoice
-
-
-class ReasoningParser(ABC):
-    """Abstract base class for reasoning content parsers."""
-
-    def __init__(
-        self,
-        reasoning_start_tag: str = "<think>",
-        reasoning_end_tag: str = "</think>",
-    ):
-        """Initialize the reasoning parser.
-
-        Args:
-            reasoning_start_tag (str, optional): Start tag for reasoning content. Defaults to "<think>".
-            reasoning_end_tag (str, optional): End tag for reasoning content. Defaults to "</think>".
-        """
-        self.reasoning_start_tag = reasoning_start_tag
-        self.reasoning_end_tag = reasoning_end_tag
-
-    @abstractmethod
-    def extract_reasoning_content_streaming(
-        self,
-        previous_text: str,
-        current_text: str,
-        delta_text: str,
-    ) -> ChatCompletionChunkDelta:
-        """Extract reasoning content from model output in a streaming fashion.
-
-        Args:
-            content (str): The model output content to parse.
-
-        Yields:
-            str: Extracted reasoning content chunks.
-        """
-        pass
-
-    @abstractmethod
-    def extract_reasoning_content(
-        self, model_output: Union[str, CompletionChoice]
-    ) -> Tuple[Optional[str], Optional[str]]:
-        """Extract reasoning content from model output.
-
-        Args:
-            content (str): The model output content to parse.
-
-        Returns:
-            Optional[str]: Extracted reasoning content, or None if no reasoning content found.
-        """
-        pass
-
-
-class ReasoningParserManager:
-    """Manager class for reasoning parsers."""
-
-    _parsers: Dict[str, Type[ReasoningParser]] = {}
-
-    @classmethod
-    def register(cls, model_name: str, parser_cls: Type[ReasoningParser]) -> None:
-        """Register a reasoning parser for a specific model.
-
-        Args:
-            model_name (str): The name of the model.
-            parser_cls (Type[ReasoningParser]): The parser class to register.
-        """
-        cls._parsers[model_name] = parser_cls
-
-    @classmethod
-    def register_module(cls, model_name: str):
-        """Decorator for registering a reasoning parser for a specific model.
-
-        Args:
-            model_name (str): The name of the model.
-
-        Returns:
-            Callable: The decorator function.
-        """
-
-        def _register(parser_cls: Type[ReasoningParser]) -> Type[ReasoningParser]:
-            cls.register(model_name, parser_cls)
-            return parser_cls
-
-        return _register
-
-    @classmethod
-    def get_parser(cls, model_name: str) -> Optional[Type[ReasoningParser]]:
-        """Get the registered parser for a specific model.
-
-        Args:
-            model_name (str): The name of the model.
-
-        Returns:
-            Optional[Type[ReasoningParser]]: The registered parser class, or None if not found.
-        """
-        return cls._parsers.get(model_name)
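Net effect of the two files deleted above: the abstract base class and the ReasoningParserManager registry collapse into the single concrete ReasoningParser from the rename earlier in this diff, so callers construct the parser directly instead of looking it up by model name. A minimal replica of both shapes for contrast (stand-in class, not the real modules):

from typing import Dict, Type

class ReasoningParser:  # stand-in with the same constructor shape as the renamed class
    def __init__(self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"):
        self.reasoning_start_tag = reasoning_start_tag
        self.reasoning_end_tag = reasoning_end_tag

# Before: look the parser class up in a registry keyed by model name.
_parsers: Dict[str, Type[ReasoningParser]] = {"deepseek-r1-distill-llama": ReasoningParser}
parser = _parsers["deepseek-r1-distill-llama"]("<think>", "</think>")

# After: construct the one concrete class directly from the model family's tags.
parser = ReasoningParser("<think>", "</think>")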
{xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE
File without changes

{xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL
File without changes

{xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt
File without changes

{xinference-1.3.1.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt
File without changes