xinference 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +10 -4
- xinference/core/model.py +2 -2
- xinference/fields.py +3 -1
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/llm_family.json +132 -3
- xinference/model/llm/llm_family_modelscope.json +139 -3
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +35 -9
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +38 -11
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +42 -14
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +25 -5
- xinference/model/llm/vllm/core.py +82 -3
- xinference/types.py +10 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/METADATA +3 -2
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/RECORD +33 -33
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
- /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/yi_vl.py
CHANGED
@@ -139,6 +139,12 @@ class YiVLChatModel(PytorchChatModel):
             generate_config = {}
 
         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
 
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
@@ -166,11 +172,11 @@ class YiVLChatModel(PytorchChatModel):
         )
 
         images = state.get_images(return_pil=True)
-
-
-
-
-
+        if images:
+            image = images[0]
+            image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
+                "pixel_values"
+            ][0]
 
         stop_str = state.sep
         keywords = [stop_str]
@@ -187,7 +193,9 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._model.device)
+            .to(self._model.device)
+            if images
+            else None,
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
@@ -200,7 +208,7 @@ class YiVLChatModel(PytorchChatModel):
         t.start()
 
         if stream:
-            it = self._generate_stream(streamer, stop_str)
+            it = self._generate_stream(streamer, stop_str, input_ids, include_usage)
             return self._to_chat_completion_chunks(it)
         else:
             c = self._generate(streamer, stop_str)
@@ -229,8 +237,12 @@ class YiVLChatModel(PytorchChatModel):
         )
         return c
 
-    def _generate_stream(
+    def _generate_stream(
+        self, streamer, stop_str, input_ids, include_usage
+    ) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        prompt_tokens = len(input_ids[0])
         for i, new_text in enumerate(streamer):
             if not new_text.endswith(stop_str):
                 completion_choice = CompletionChoice(
@@ -243,10 +255,12 @@ class YiVLChatModel(PytorchChatModel):
                     model=self.model_uid,
                     choices=[completion_choice],
                 )
+                completion_tokens = i
+                total_tokens = prompt_tokens + completion_tokens
                 completion_usage = CompletionUsage(
-                    prompt_tokens
-                    completion_tokens
-                    total_tokens
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
                 )
                 chunk["usage"] = completion_usage
                 yield chunk
@@ -262,9 +276,23 @@ class YiVLChatModel(PytorchChatModel):
            choices=[completion_choice],
        )
        completion_usage = CompletionUsage(
-           prompt_tokens
-           completion_tokens
-           total_tokens
+           prompt_tokens=prompt_tokens,
+           completion_tokens=completion_tokens,
+           total_tokens=total_tokens,
        )
        chunk["usage"] = completion_usage
        yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
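The yi_vl.py hunks above follow the OpenAI streaming convention: `stream_options` is popped from the generate config, and when `include_usage` is set the stream ends with one extra chunk that has empty `choices` and carries only the token-usage counters. A minimal sketch of that pattern, assuming plain dict-shaped configs and chunks as in the diff (the helper names below are illustrative, not xinference APIs):

```python
# Minimal sketch of the stream_options handling added above (illustrative only;
# dict shapes follow the diff, not a documented public API).
from typing import Any, Dict, Iterator, Optional


def pop_include_usage(generate_config: Dict[str, Any]) -> bool:
    # Mirrors the diff: stream_options is popped from the config and
    # include_usage is False unless a dict explicitly provides it.
    stream_options: Optional[dict] = generate_config.pop("stream_options", None)
    return stream_options["include_usage"] if isinstance(stream_options, dict) else False


def stream_with_usage(chunks: Iterator[dict], include_usage: bool) -> Iterator[dict]:
    # Re-yield normal chunks; if include_usage is set, append one final
    # chunk with empty choices that carries only the usage counters.
    usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    for chunk in chunks:
        usage = chunk.get("usage", usage)
        yield chunk
    if include_usage:
        yield {"choices": [], "usage": usage}


if __name__ == "__main__":
    config = {"stream": True, "stream_options": {"include_usage": True}}
    flag = pop_include_usage(config)
    demo = iter(
        [{"choices": [{"text": "hi"}],
          "usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}}]
    )
    for c in stream_with_usage(demo, flag):
        print(c)
```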
xinference/model/llm/sglang/core.py
CHANGED
@@ -53,6 +53,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     stop: Optional[Union[str, List[str]]]
     ignore_eos: bool
     stream: bool
+    stream_options: Optional[Union[dict, None]]
 
 
 try:
@@ -157,6 +158,8 @@ class SGLANGModel(LLM):
         )
         generate_config.setdefault("stop", [])
         generate_config.setdefault("stream", False)
+        stream_options = generate_config.get("stream_options")
+        generate_config.setdefault("stream_options", stream_options)
         generate_config.setdefault("ignore_eos", False)
 
         return generate_config
@@ -192,7 +195,7 @@ class SGLANGModel(LLM):
 
     @staticmethod
     def _convert_state_to_completion_chunk(
-        request_id: str, model: str, output_text: str
+        request_id: str, model: str, output_text: str
     ) -> CompletionChunk:
         choices: List[CompletionChoice] = [
             CompletionChoice(
@@ -209,13 +212,6 @@ class SGLANGModel(LLM):
             model=model,
             choices=choices,
         )
-        prompt_tokens = meta_info["prompt_tokens"]
-        completion_tokens = meta_info["completion_tokens"]
-        chunk["usage"] = CompletionUsage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
         return chunk
 
     @staticmethod
@@ -272,6 +268,9 @@ class SGLANGModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options")
+        if isinstance(stream_options, dict):
+            include_usage = stream_options.pop("include_usage", False)
         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,
@@ -289,11 +288,34 @@ class SGLANGModel(LLM):
         else:
 
             async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
+                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
                 async for out, meta_info in state.text_async_iter(
                     var_name="answer", return_meta_data=True
                 ):
                     chunk = self._convert_state_to_completion_chunk(
-                        request_id, self.model_uid, output_text=out
+                        request_id, self.model_uid, output_text=out
+                    )
+                    prompt_tokens = meta_info["prompt_tokens"]
+                    completion_tokens = meta_info["completion_tokens"]
+                    total_tokens = prompt_tokens + completion_tokens
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    yield chunk
+                if include_usage:
+                    chunk = CompletionChunk(
+                        id=request_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[],
+                    )
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
                     yield chunk
 
xinference/model/llm/utils.py
CHANGED
@@ -482,9 +482,6 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
-        usage = chunk.get("usage")
-        if usage is not None:
-            chat_chunk["usage"] = usage
         return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
@@ -508,6 +505,19 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        return cast(ChatCompletionChunk, chat_chunk)
+
+    @classmethod
+    def _get_final_chat_completion_chunk(
+        cls, chunk: CompletionChunk
+    ) -> ChatCompletionChunk:
+        chat_chunk = {
+            "id": "chat" + chunk["id"],
+            "model": chunk["model"],
+            "created": chunk["created"],
+            "object": "chat.completion.chunk",
+            "choices": [],
+        }
         usage = chunk.get("usage")
         if usage is not None:
             chat_chunk["usage"] = usage
@@ -521,7 +531,12 @@ Begin!"""
         for i, chunk in enumerate(chunks):
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)
 
     @classmethod
     async def _async_to_chat_completion_chunks(
@@ -532,7 +547,12 @@ Begin!"""
         async for chunk in chunks:
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)
             i += 1
 
     @staticmethod
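The new `_get_final_chat_completion_chunk` classmethod gives usage-only completion chunks (those with empty `choices`) a chat-style wrapper. A standalone sketch of the same mapping, with the field layout taken from the diff (the function name below is illustrative):

```python
# Sketch of the chunk routing the hunks above add (shapes taken from the diff;
# the helper name here is illustrative, not part of xinference's public API).
import time


def to_final_chat_chunk(chunk: dict) -> dict:
    # A usage-only completion chunk becomes a final chat.completion.chunk
    # that carries nothing but the usage counters.
    chat_chunk = {
        "id": "chat" + chunk["id"],
        "model": chunk["model"],
        "created": chunk["created"],
        "object": "chat.completion.chunk",
        "choices": [],
    }
    usage = chunk.get("usage")
    if usage is not None:
        chat_chunk["usage"] = usage
    return chat_chunk


if __name__ == "__main__":
    completion_chunk = {
        "id": "abc123",
        "model": "yi-vl-chat",
        "created": int(time.time()),
        "object": "text_completion",
        "choices": [],
        "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
    }
    print(to_final_chat_chunk(completion_chunk))
```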
xinference/model/llm/vllm/core.py
CHANGED
@@ -37,6 +37,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     ToolCallFunction,
     ToolCalls,
 )
@@ -64,16 +65,19 @@ class VLLMModelConfig(TypedDict, total=False):
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
+    lora_name: Optional[str]
     n: int
     best_of: Optional[int]
     presence_penalty: float
     frequency_penalty: float
     temperature: float
     top_p: float
+    top_k: int
     max_tokens: int
     stop_token_ids: Optional[List[int]]
     stop: Optional[Union[str, List[str]]]
     stream: bool  # non-sampling param, should not be passed to the engine.
+    stream_options: Optional[Union[dict, None]]
 
 
 try:
@@ -90,6 +94,7 @@ VLLM_SUPPORTED_MODELS = [
     "internlm-16k",
     "mistral-v0.1",
     "Yi",
+    "Yi-1.5",
     "code-llama",
     "code-llama-python",
 ]
@@ -106,6 +111,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm2-chat",
     "qwen-chat",
     "Yi-chat",
+    "Yi-1.5-chat",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
@@ -143,16 +149,30 @@ class VLLMModel(LLM):
         quantization: str,
         model_path: str,
         model_config: Optional[VLLMModelConfig],
+        peft_model: Optional[List[LoRA]] = None,
     ):
+        try:
+            from vllm.lora.request import LoRARequest
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self.lora_modules = peft_model
+        self.lora_requests: List[LoRARequest] = []
 
     def load(self):
         try:
             import vllm
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
+            from vllm.lora.request import LoRARequest
         except ImportError:
             error_message = "Failed to import module 'vllm'"
             installation_guide = [
@@ -171,11 +191,33 @@ class VLLMModel(LLM):
         multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
+
+        if self.lora_modules is None:
+            self.lora_requests = []
+        else:
+            self.lora_requests = [
+                LoRARequest(
+                    lora_name=lora.lora_name,
+                    lora_int_id=i,
+                    lora_local_path=lora.local_path,
+                )
+                for i, lora in enumerate(self.lora_modules, start=1)
+            ]
+
+        enable_lora = len(self.lora_requests) > 0
+        max_loras = len(self.lora_requests)
+
         logger.info(
             f"Loading {self.model_uid} with following model config: {self._model_config}"
+            f"Enable lora: {enable_lora}. Lora count: {max_loras}."
         )
 
-        engine_args = AsyncEngineArgs(
+        engine_args = AsyncEngineArgs(
+            model=self.model_path,
+            enable_lora=enable_lora,
+            max_loras=max_loras,
+            **self._model_config,
+        )
         self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     def _sanitize_model_config(
@@ -206,6 +248,7 @@ class VLLMModel(LLM):
            generate_config = {}
 
        sanitized = VLLMGenerateConfig()
+        sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
        sanitized.setdefault("n", generate_config.get("n", 1))
        sanitized.setdefault("best_of", generate_config.get("best_of", None))
        sanitized.setdefault(
@@ -216,12 +259,16 @@ class VLLMModel(LLM):
        )
        sanitized.setdefault("temperature", generate_config.get("temperature", 1.0))
        sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
+        sanitized.setdefault("top_k", generate_config.get("top_k", -1))
        sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 1024))
        sanitized.setdefault("stop", generate_config.get("stop", None))
        sanitized.setdefault(
            "stop_token_ids", generate_config.get("stop_token_ids", None)
        )
-        sanitized.setdefault("stream", generate_config.get("stream",
+        sanitized.setdefault("stream", generate_config.get("stream", False))
+        sanitized.setdefault(
+            "stream_options", generate_config.get("stream_options", None)
+        )
 
        return sanitized
 
@@ -338,16 +385,34 @@ class VLLMModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
 
+        lora_model = sanitized_generate_config.pop("lora_name")
+
+        lora_request = None
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         sampling_params = SamplingParams(**sanitized_generate_config)
         request_id = str(uuid.uuid1())
 
         assert self._engine is not None
-        results_generator = self._engine.generate(
+        results_generator = self._engine.generate(
+            prompt, sampling_params, request_id, lora_request=lora_request
+        )
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
             tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
+            prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
@@ -398,6 +463,20 @@ class VLLMModel(LLM):
                     total_tokens=total_tokens,
                 )
                 yield chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk
 
         if stream:
             return stream_results()
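The vLLM hunks above extend the sanitized generate config with `lora_name`, `top_k`, and `stream_options`. A rough sketch of those defaults as a standalone function, mirroring the setdefault calls in the diff (the function name and example values below are illustrative):

```python
# Rough sketch of the extra vLLM generate_config keys introduced above
# (lora_name, top_k, stream_options); defaults mirror the sanitizer in the diff.
from typing import Any, Dict


def sanitize_vllm_generate_config(generate_config: Dict[str, Any]) -> Dict[str, Any]:
    sanitized: Dict[str, Any] = {}
    sanitized["lora_name"] = generate_config.get("lora_name", None)  # which loaded LoRA adapter to route to
    sanitized["temperature"] = generate_config.get("temperature", 1.0)
    sanitized["top_p"] = generate_config.get("top_p", 1.0)
    sanitized["top_k"] = generate_config.get("top_k", -1)  # -1 disables top-k filtering in vLLM
    sanitized["max_tokens"] = generate_config.get("max_tokens", 1024)
    sanitized["stream"] = generate_config.get("stream", False)
    sanitized["stream_options"] = generate_config.get("stream_options", None)
    return sanitized


if __name__ == "__main__":
    print(
        sanitize_vllm_generate_config(
            {
                "lora_name": "my-adapter",  # must match a LoRA registered at load time
                "top_k": 40,
                "stream": True,
                "stream_options": {"include_usage": True},
            }
        )
    )
```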
xinference/types.py
CHANGED
@@ -187,6 +187,8 @@ class ChatglmCppGenerateConfig(TypedDict, total=False):
     top_p: float
     temperature: float
     stream: bool
+    lora_name: Optional[str]
+    stream_options: Optional[Union[dict, None]]
 
 
 class QWenCppModelConfig(TypedDict, total=False):
@@ -231,6 +233,7 @@ class LlamaCppGenerateConfig(TypedDict, total=False):
     repetition_penalty: float
     top_k: int
     stream: bool
+    stream_options: Optional[Union[dict, None]]
     tfs_z: float
     mirostat_mode: int
     mirostat_tau: float
@@ -279,6 +282,8 @@ class PytorchGenerateConfig(TypedDict, total=False):
     stream_interval: int
     model: Optional[str]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+    stream_options: Optional[Union[dict, None]]
 
 
 class PytorchModelConfig(TypedDict, total=False):
@@ -350,10 +355,12 @@ class CreateCompletionTorch(BaseModel):
     stop: Optional[Union[str, List[str]]] = stop_field
     stop_token_ids: Optional[Union[int, List[int]]] = none_field
     stream: bool = stream_field
+    stream_options: Optional[Union[dict, None]] = stream_option_field
     stream_interval: int = stream_interval_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     top_k: int = top_k_field
+    lora_name: Optional[str]
 
 
 CreateCompletionLlamaCpp: BaseModel
@@ -366,6 +373,8 @@ try:
         include_fields={
             "grammar": (Optional[Any], None),
             "max_tokens": (Optional[int], max_tokens_field),
+            "lora_name": (Optional[str], None),
+            "stream_options": (Optional[Union[dict, None]], None),
         },
     )
 except ImportError:
@@ -393,7 +402,7 @@ class _CreateCompletionOpenAIFallback(BaseModel):
     seed: Optional[int] = none_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
-    stream_options: Optional[dict] = stream_option_field
+    stream_options: Optional[Union[dict, None]] = stream_option_field
     suffix: Optional[str] = none_field
     temperature: float = temperature_field
     top_p: float = top_p_field
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.54bca460.css",
-    "main.js": "./static/js/main.8e44da4b.js",
+    "main.js": "./static/js/main.551aa479.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.54bca460.css.map": "./static/css/main.54bca460.css.map",
-    "main.8e44da4b.js.map": "./static/js/main.8e44da4b.js.map"
+    "main.551aa479.js.map": "./static/js/main.551aa479.js.map"
   },
   "entrypoints": [
     "static/css/main.54bca460.css",
-    "static/js/main.8e44da4b.js"
+    "static/js/main.551aa479.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8e44da4b.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.551aa479.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>