xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +8 -4
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +7 -5
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/local.py +5 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/deploy/worker.py +6 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/model_spec.json +44 -20
- xinference/model/core.py +3 -0
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +18 -4
- xinference/model/embedding/vllm/core.py +36 -9
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +178 -1
- xinference/model/image/stable_diffusion/core.py +155 -23
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llama_cpp/core.py +41 -40
- xinference/model/llm/llm_family.json +688 -11
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +20 -18
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +44 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +87 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +156 -0
- xinference/model/utils.py +108 -0
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +2 -0
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
- xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/harmony.py (new file)

@@ -0,0 +1,245 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, Union
+
+if TYPE_CHECKING:
+    from ...types import ChatCompletion, ChatCompletionChunk
+
+
+class HarmonyStreamParser:
+    def __init__(self):
+        # Current channel: either 'analysis', 'final', or None if not started yet
+        self.current_channel = None
+        # Buffer for accumulating text when looking for 'assistantfinal' marker
+        self.buffer = ""
+
+    def feed(self, text):
+        """
+        Feed a chunk of text into the parser and return parsed segments.
+
+        Each segment is a dict:
+            {
+                "channel": "analysis" | "final",
+                "content": <string>
+            }
+
+        The parser detects 'assistantfinal' markers inside reasoning text,
+        splits the reasoning and final content correctly, and switches the channel.
+        """
+        segments = []
+
+        # If we are currently in 'analysis' mode
+        if self.current_channel == "analysis":
+            # Add text to buffer and check for 'assistantfinal' marker
+            self.buffer += text
+            if "assistantfinal" in self.buffer:
+                # Split reasoning and final content
+                before, after = self.buffer.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                # Switch to final channel
+                self.current_channel = "final"
+                self.buffer = ""
+                if after:
+                    segments.append({"channel": "final", "content": after})
+                return segments
+            else:
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    return segments
+                else:
+                    # Emit what we have so far and keep buffer for next time
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+                    return segments
+
+        # If we are currently in 'final' mode
+        if self.current_channel == "final":
+            # Check if this is actually a new message starting with 'analysis'
+            if text.startswith("analysis"):
+                # Reset parser state for new message
+                self.current_channel = None
+                self.buffer = ""
+                # Re-process this text with the new state
+                return self.feed(text)
+            else:
+                segments.append({"channel": "final", "content": text})
+                return segments
+
+        # If no channel has been started yet
+        if text.startswith("analysis"):
+            self.current_channel = "analysis"
+            rest = text[len("analysis") :]
+            if "assistantfinal" in rest:
+                # Split immediately if marker is found in the first chunk
+                before, after = rest.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                self.current_channel = "final"
+                if after:
+                    segments.append({"channel": "final", "content": after})
+            else:
+                # Start buffering for potential 'assistantfinal' marker
+                self.buffer = rest
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    pass
+                else:
+                    # Emit what we have so far
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+        elif text.startswith("assistantfinal"):
+            self.current_channel = "final"
+            rest = text[len("assistantfinal") :]
+            if rest:
+                segments.append({"channel": "final", "content": rest})
+
+        return segments
+
+
+async def async_stream_harmony_chat_completion(
+    chunks: Union[
+        "ChatCompletion",
+        AsyncGenerator["ChatCompletionChunk", None],
+    ],
+) -> AsyncGenerator["ChatCompletion", None]:
+    """
+    Parse Harmony-formatted content from either a full ChatCompletion (non-streaming)
+    or an async stream of ChatCompletionChunk (streaming), using the HarmonyStreamParser defined in this file.
+
+    Yields parsed objects incrementally.
+    """
+
+    # --- Non-streaming: ChatCompletion ---
+    if isinstance(chunks, dict) and chunks.get("object") == "chat.completion":
+        out_data = deepcopy(chunks)
+
+        for choice in out_data["choices"]:
+            parser = HarmonyStreamParser()
+            msg = choice["message"]
+
+            # Backup original content & reasoning
+            original_content = msg.get("content") or ""
+            original_reasoning = msg.get("reasoning_content") or ""
+
+            # Reset fields before parsing
+            msg["content"] = ""
+            msg["reasoning_content"] = ""
+            msg.setdefault("tool_calls", [])
+
+            # Feed original content
+            for seg in parser.feed(original_content):
+                ch, c = seg["channel"], seg["content"]
+                if ch == "final":
+                    msg["content"] += c
+                elif ch == "analysis":
+                    msg["reasoning_content"] += c
+                elif ch == "tool":
+                    msg["tool_calls"].append(c)
+
+            # Feed original reasoning_content
+            for seg in parser.feed(original_reasoning):
+                if seg["channel"] == "analysis":
+                    msg["reasoning_content"] += seg["content"]
+                elif seg["channel"] == "tool":
+                    msg["tool_calls"].append(seg["content"])
+
+            # Clean up reasoning_content: set to None if no reasoning content was parsed
+            if not msg["reasoning_content"] and not original_reasoning:
+                msg["reasoning_content"] = None  # type: ignore
+
+        yield out_data
+
+    else:
+        # Streaming: handle async generator
+        parsers_per_choice = {}
+
+        async for chunk in chunks:  # type: ignore
+            out_chunk = {  # type: ignore
+                "id": chunk["id"],
+                "model": chunk["model"],
+                "object": chunk["object"],
+                "created": chunk["created"],
+                "choices": [],
+            }
+
+            for i, choice in enumerate(chunk["choices"]):
+                delta = choice.get("delta", {})
+                text = delta.get("content") or ""  # type: ignore
+
+                if i not in parsers_per_choice:
+                    parsers_per_choice[i] = HarmonyStreamParser()
+
+                # Feed text to parser and collect current delta only
+                curr_delta: Dict[str, object] = {
+                    "content": "",
+                    "reasoning_content": "",
+                    "tool_calls": [],
+                }
+
+                for seg in parsers_per_choice[i].feed(text):
+                    ch = seg["channel"]
+                    c = seg["content"]
+                    if ch == "final":
+                        curr_delta["content"] += c  # type: ignore
+                    elif ch == "analysis":
+                        curr_delta["reasoning_content"] += c  # type: ignore
+                    elif ch == "tool":
+                        curr_delta["tool_calls"].append(c)  # type: ignore
+
+                if curr_delta["reasoning_content"]:
+                    if not curr_delta["content"]:
+                        curr_delta["content"] = None
+
+                elif curr_delta["content"]:
+                    if not curr_delta["reasoning_content"]:
+                        curr_delta["reasoning_content"] = None
+
+                elif (
+                    choice.get("finish_reason") is not None
+                    and not curr_delta["reasoning_content"]
+                ):
+                    # For the final chunk, if there's no new reasoning content,
+                    # don't include empty reasoning_content to avoid clearing existing state
+                    curr_delta["reasoning_content"] = None
+
+                out_chunk["choices"].append(  # type: ignore
+                    {
+                        "index": i,
+                        "delta": curr_delta,
+                        "finish_reason": choice.get("finish_reason"),
+                    }
+                )
+
+            # Only yield if we have either content or reasoning_content
+            has_content = any(
+                choice["delta"].get("content")  # type: ignore
+                or choice["delta"].get("reasoning_content")  # type: ignore
+                or choice.get("finish_reason") is not None  # type: ignore
+                for choice in out_chunk["choices"]  # type: ignore
+            )
+            if has_content:
+                yield out_chunk  # type: ignore
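For reference, a minimal usage sketch (not part of the diff, assuming harmony.py is importable as xinference.model.llm.harmony): it shows how HarmonyStreamParser.feed splits gpt-oss "Harmony" output into reasoning and final-answer segments even when the assistantfinal marker is split across streamed chunks. The same parser drives async_stream_harmony_chat_completion above, which rewrites each choice's delta into content / reasoning_content fields.

from xinference.model.llm.harmony import HarmonyStreamParser

parser = HarmonyStreamParser()
# Simulated streamed deltas; note the "assistantfinal" marker arrives in two pieces.
chunks = ["analysisWe need to add 2 and 2.", "assistant", "final", "2 + 2 = 4"]
for chunk in chunks:
    for seg in parser.feed(chunk):
        print(seg["channel"], repr(seg["content"]))
# Prints:
#   analysis 'We need to add 2 and 2.'
#   final '2 + 2 = 4'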
xinference/model/llm/llama_cpp/core.py

@@ -19,11 +19,11 @@ import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
-import orjson
+from packaging import version
 
 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
 
@@ -98,10 +98,19 @@ class XllamaCppModel(LLM, ChatModelMixin):
             from xllamacpp import (
                 CommonParams,
                 Server,
+                __version__,
                 estimate_gpu_layers,
                 get_device_info,
                 ggml_backend_dev_type,
             )
+
+            try:
+                if version.parse(__version__) < version.parse("0.2.0"):
+                    raise RuntimeError(
+                        "Please update xllamacpp to >= 0.2.0 by `pip install -U xllamacpp`"
+                    )
+            except version.InvalidVersion:
+                pass  # If the version parse failed, we just skip the version check.
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -160,6 +169,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.mmproj.path = mmproj
         if self.model_family.chat_template:
             params.chat_template = self.model_family.chat_template
+            params.use_jinja = True
         # This is the default value, could be overwritten by _llamacpp_model_config
         params.n_parallel = min(8, os.cpu_count() or 1)
         for k, v in self._llamacpp_model_config.items():
@@ -208,7 +218,8 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 )
                 logger.info("Estimate num gpu layers: %s", estimate)
                 if estimate.tensor_split:
-                    params.tensor_split = estimate.tensor_split
+                    for i in range(len(estimate.tensor_split)):
+                        params.tensor_split[i] = estimate.tensor_split[i]
                 else:
                     params.n_gpu_layers = estimate.layers
             except Exception as e:
@@ -242,28 +253,18 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 {
                     "prompt": prompt,
                     "stream": stream,
+                    "model": self.model_uid,
                 }
             )
-
-
-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-
-
-
-
-
-                except Exception as e:
-                    logger.exception("handle_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-
-            self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
+                self._llm.handle_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_completions failed: %s", ex)
                 q.put(_Error(str(ex)))
@@ -296,6 +297,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()
 
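The new chat_context_var.set(...) call stashes the per-request chat-template kwargs before generation starts, so code that later renders the prompt can read them without every call signature being widened. A rough sketch of that hand-off pattern, assuming chat_context_var is a plain contextvars.ContextVar (the variable and kwarg names below are illustrative, not copied from core.py):

from contextvars import ContextVar
from typing import Any, Dict

# Hypothetical stand-in for the chat_context_var imported from ..core above.
chat_context_var: ContextVar[Dict[str, Any]] = ContextVar("chat_context", default={})

def render_prompt(messages):
    # Downstream rendering code reads the per-request kwargs from the context.
    kwargs = chat_context_var.get()
    return f"messages={messages!r}, chat_template_kwargs={kwargs!r}"

chat_context_var.set({"enable_thinking": True})
print(render_prompt([{"role": "user", "content": "hi"}]))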
@@ -310,30 +320,21 @@ class XllamaCppModel(LLM, ChatModelMixin):
                     "messages": messages,
                     "stream": stream,
                     "tools": tools,
+                    "model": self.model_uid,
                 }
             )
-
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs
 
-
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-
-
-
-
-
-                except Exception as e:
-                    logger.exception("handle_chat_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-
-            self._llm.handle_chat_completions(
-                prompt_json, _error_callback, _ok_callback
-            )
+                self._llm.handle_chat_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_chat_completions failed: %s", ex)
                 q.put(_Error(str(ex)))