xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +1 -1
- xinference/core/model.py +17 -6
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +7 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +438 -7
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +258 -5
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +115 -33
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +94 -12
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +96 -51
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +54 -20
- xinference/model/llm/vllm/core.py +101 -7
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +11 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.551aa479.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/internlm2.py
CHANGED

@@ -108,6 +108,12 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         kwargs["max_length"] = int(max_new_tokens)

         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         if chat_history:
             input_history = [
                 (chat_history[i]["content"], (chat_history[i + 1]["content"]))

@@ -122,9 +128,15 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             def _stream_generator():
                 last_chunk_text_length = 0
                 chunk_id = "chat-" + str(uuid.uuid1())
+                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+                inputs = self._tokenizer([prompt], return_tensors="pt")
+                inputs = inputs.to(self._model.device)
+                prompt_tokens = len(inputs["input_ids"][0])
                 for chunk_text, _ in self._model.stream_chat(
-                    self._tokenizer, prompt,
+                    self._tokenizer, prompt, chat_history, **kwargs
                 ):
+                    completion_tokens = completion_tokens + 1
+                    total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
                     last_chunk_text_length += len(chunk_text)
                     completion_choice = CompletionChoice(

@@ -136,7 +148,26 @@ class Internlm2PytorchChatModel(PytorchChatModel):
                         created=int(time.time()),
                         model=self.model_uid,
                         choices=[completion_choice],
+                        usage=CompletionUsage(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=total_tokens,
+                        ),
+                    )
+                if include_usage:
+                    chunk = CompletionChunk(
+                        id=chunk_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[],
+                    )
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
+                    yield chunk

             return self._to_chat_completion_chunks(_stream_generator())
         else:
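Taken together, the three internlm2 hunks above add OpenAI-style stream_options support: every streamed CompletionChunk now carries a running CompletionUsage, and when the caller passes stream_options={"include_usage": True} one extra chunk with an empty choices list reports the final token totals. The sketch below only illustrates how a consumer of the raw CompletionChunk stream produced by _stream_generator (before it is wrapped into chat chunks) might handle the two cases; the function name and all values are invented for illustration and are not part of the diff.

from typing import Dict, Iterator

def consume_completion_chunks(chunks: Iterator[Dict]) -> None:
    # Drain a stream produced with generate_config={"stream": True,
    # "stream_options": {"include_usage": True}}.
    for chunk in chunks:
        if chunk["choices"]:
            # Regular chunk: choices[0]["text"] holds the incremental text.
            print(chunk["choices"][0]["text"], end="", flush=True)
        else:
            # Usage-only tail chunk, emitted once after the last text chunk.
            usage = chunk["usage"]
            print(
                f"\nprompt={usage['prompt_tokens']} "
                f"completion={usage['completion_tokens']} "
                f"total={usage['total_tokens']}"
            )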
xinference/model/llm/pytorch/qwen_vl.py
CHANGED

@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

@@ -134,22 +132,106 @@ class QwenVLChatModel(PytorchChatModel):
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+
+        stream = generate_config.get("stream", False) if generate_config else False
+        stream_options = (
+            generate_config.pop("stream_options", None) if generate_config else None
+        )
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+        if stream:
+            it = self._generate_stream(prompt, qwen_history, include_usage)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(prompt, qwen_history)
+            return self._to_chat_completion(c)
+
+    def _generate(self, prompt: str, qwen_history: List) -> Completion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-
-            id=
-            object="
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": response},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=response, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, prompt: str, qwen_history: List, include_usage
+    ) -> Iterator[CompletionChunk]:
+        # response, history = model.chat(tokenizer, message, history=history)
+        response_generator = self._model.chat_stream(
+            self._tokenizer, query=prompt, history=qwen_history
+        )
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt, allowed_special="all").input_ids
+        prompt_tokens = len(input_ids)
+        full_response = ""
+        for response in response_generator:
+            inc_content = response[len(full_response) :]
+            full_response = response
+            completion_choice = CompletionChoice(
+                text=inc_content, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_tokens = completion_tokens + 1
+            total_tokens = prompt_tokens + completion_tokens
+            completion_usage = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            completion_chunk["usage"] = completion_usage
+            yield completion_chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        completion_chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        completion_chunk["usage"] = completion_usage
+        yield completion_chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
xinference/model/llm/pytorch/utils.py
CHANGED

@@ -106,6 +106,10 @@ def generate_stream(
     context_len = get_context_length(model.config)
     stream_interval = generate_config.get("stream_interval", 2)
     stream = generate_config.get("stream", False)
+    stream_options = generate_config.pop("stream_options", None)
+    include_usage = (
+        stream_options["include_usage"] if isinstance(stream_options, dict) else False
+    )

     len_prompt = len(prompt)


@@ -333,6 +337,21 @@ def generate_stream(

         yield completion_chunk, completion_usage

+    if include_usage:
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=model_uid,
+            choices=[],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=input_echo_len,
+            completion_tokens=i,
+            total_tokens=(input_echo_len + i),
+        )
+        yield completion_chunk, completion_usage
+
     # clean
     del past_key_values, out
     gc.collect()

@@ -352,7 +371,10 @@ def generate_stream_falcon(
     context_len = get_context_length(model.config)
     stream_interval = generate_config.get("stream_interval", 2)
     stream = generate_config.get("stream", False)
-
+    stream_options = generate_config.pop("stream_options", None)
+    include_usage = (
+        stream_options["include_usage"] if isinstance(stream_options, dict) else False
+    )
     len_prompt = len(prompt)

     temperature = float(generate_config.get("temperature", 1.0))

@@ -488,6 +510,21 @@ def generate_stream_falcon(

         yield completion_chunk, completion_usage

+    if include_usage:
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=model_uid,
+            choices=[],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=input_echo_len,
+            completion_tokens=i,
+            total_tokens=(input_echo_len + i),
+        )
+        yield completion_chunk, completion_usage
+
     # clean
     gc.collect()
     empty_cache()
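Note that generate_stream and generate_stream_falcon yield (chunk, usage) pairs rather than chunks with embedded usage; the include_usage branch above simply appends one more pair whose chunk has no choices, reusing the prompt length (input_echo_len) and the last loop index (i) for the totals. A self-contained toy generator mimicking that contract follows; all names outside the diff and every value are invented for illustration.

import time
import uuid
from typing import Dict, Iterator, Tuple

def toy_generate_stream(include_usage: bool) -> Iterator[Tuple[Dict, Dict]]:
    # Mimics the (completion_chunk, completion_usage) pairs of the real function.
    chunk_id, input_echo_len = str(uuid.uuid1()), 5  # invented prompt length
    i = 0
    for i, text in enumerate(["Hello", ",", " world"], start=1):
        chunk = {
            "id": chunk_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": "demo-model",
            "choices": [{"index": 0, "text": text, "logprobs": None, "finish_reason": None}],
        }
        usage = {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        }
        yield chunk, usage
    if include_usage:
        # Tail pair with an empty choices list, mirroring the hunks above.
        chunk = {
            "id": chunk_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": "demo-model",
            "choices": [],
        }
        yield chunk, {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        }

for chunk, usage in toy_generate_stream(include_usage=True):
    print(len(chunk["choices"]), usage["total_tokens"])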
xinference/model/llm/pytorch/yi_vl.py
CHANGED

@@ -27,9 +27,11 @@ from PIL import Image
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content

-    @staticmethod
-    def _parse_text(text):
-        lines = text.split("\n")
-        lines = [line for line in lines if line != ""]
-        count = 0
-        for i, line in enumerate(lines):
-            if "```" in line:
-                count += 1
-                items = line.split("`")
-                if count % 2 == 1:
-                    lines[i] = f'<pre><code class="language-{items[-1]}">'
-                else:
-                    lines[i] = f"<br></code></pre>"
-            else:
-                if i > 0:
-                    if count % 2 == 1:
-                        line = line.replace("`", r"\`")
-                        line = line.replace("<", "&lt;")
-                        line = line.replace(">", "&gt;")
-                        line = line.replace(" ", "&nbsp;")
-                        line = line.replace("*", "&ast;")
-                        line = line.replace("_", "&lowbar;")
-                        line = line.replace("-", "&#45;")
-                        line = line.replace(".", "&#46;")
-                        line = line.replace("!", "&#33;")
-                        line = line.replace("(", "&#40;")
-                        line = line.replace(")", "&#41;")
-                        line = line.replace("$", "&#36;")
-                    lines[i] = "<br>" + line
-        text = "".join(lines)
-        return text
-
     def chat(
         self,
         prompt: Union[str, List[Dict]],

@@ -164,12 +134,18 @@ class YiVLChatModel(PytorchChatModel):
         from transformers import TextIteratorStreamer

         # TODO(codingl2k1): implement stream mode.
-
-        raise Exception(
-            f"Chat with model {self.model_family.model_name} does not support stream."
-        )
+
         if not generate_config:
             generate_config = {}
+
+        stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
             KeywordsStoppingCriteria,

@@ -196,11 +172,11 @@ class YiVLChatModel(PytorchChatModel):
         )

         images = state.get_images(return_pil=True)
-
-
-
-
-
+        if images:
+            image = images[0]
+            image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
+                "pixel_values"
+            ][0]

         stop_str = state.sep
         keywords = [stop_str]

@@ -217,7 +193,9 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._model.device)
+            .to(self._model.device)
+            if images
+            else None,
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),

@@ -229,25 +207,92 @@ class YiVLChatModel(PytorchChatModel):
         t = Thread(target=self._model.generate, kwargs=generate_kwargs)
         t.start()

+        if stream:
+            it = self._generate_stream(streamer, stop_str, input_ids, include_usage)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             generated_text += new_text
             if generated_text.endswith(stop_str):
                 generated_text = generated_text[: -len(stop_str)]
-
-
-            id=
-            object="
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": r},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, streamer, stop_str, input_ids, include_usage
+    ) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        prompt_tokens = len(input_ids[0])
+        for i, new_text in enumerate(streamer):
+            if not new_text.endswith(stop_str):
+                completion_choice = CompletionChoice(
+                    text=new_text, index=0, logprobs=None, finish_reason=None
+                )
+                chunk = CompletionChunk(
+                    id=completion_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[completion_choice],
+                )
+                completion_tokens = i
+                total_tokens = prompt_tokens + completion_tokens
+                completion_usage = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                chunk["usage"] = completion_usage
+                yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
xinference/model/llm/sglang/core.py
CHANGED

@@ -53,6 +53,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     stop: Optional[Union[str, List[str]]]
     ignore_eos: bool
     stream: bool
+    stream_options: Optional[Union[dict, None]]


 try:

@@ -157,6 +158,8 @@ class SGLANGModel(LLM):
         )
         generate_config.setdefault("stop", [])
         generate_config.setdefault("stream", False)
+        stream_options = generate_config.get("stream_options")
+        generate_config.setdefault("stream_options", stream_options)
         generate_config.setdefault("ignore_eos", False)

         return generate_config

@@ -192,7 +195,7 @@ class SGLANGModel(LLM):

     @staticmethod
     def _convert_state_to_completion_chunk(
-        request_id: str, model: str, output_text: str
+        request_id: str, model: str, output_text: str
     ) -> CompletionChunk:
         choices: List[CompletionChoice] = [
             CompletionChoice(

@@ -209,13 +212,6 @@ class SGLANGModel(LLM):
             model=model,
             choices=choices,
         )
-        prompt_tokens = meta_info["prompt_tokens"]
-        completion_tokens = meta_info["completion_tokens"]
-        chunk["usage"] = CompletionUsage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
         return chunk

     @staticmethod

@@ -272,6 +268,9 @@ class SGLANGModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options")
+        if isinstance(stream_options, dict):
+            include_usage = stream_options.pop("include_usage", False)
         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,

@@ -289,11 +288,34 @@ class SGLANGModel(LLM):
         else:

             async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
+                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
                 async for out, meta_info in state.text_async_iter(
                     var_name="answer", return_meta_data=True
                 ):
                     chunk = self._convert_state_to_completion_chunk(
-                        request_id, self.model_uid, output_text=out
+                        request_id, self.model_uid, output_text=out
+                    )
+                    prompt_tokens = meta_info["prompt_tokens"]
+                    completion_tokens = meta_info["completion_tokens"]
+                    total_tokens = prompt_tokens + completion_tokens
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    yield chunk
+                if include_usage:
+                    chunk = CompletionChunk(
+                        id=request_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[],
+                    )
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
                     yield chunk

xinference/model/llm/utils.py
CHANGED
@@ -228,16 +228,14 @@ Begin!"""
         tools_name_text = []
         for func_info in tools:
             parameters = []
-
-
-
-
-
-
-
-
-                    param["required"] = True
-                parameters.append(param)
+            fp = func_info["function"].get("parameters", {})
+            if fp:
+                required_parameters = fp.get("required", [])
+                for name, p in fp["properties"].items():
+                    param = dict({"name": name}, **p)
+                    if name in required_parameters:
+                        param["required"] = True
+                    parameters.append(param)

             name = func_info["function"]["name"]
             desc = func_info["function"]["description"]
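The rewritten loop above reads each tool's OpenAI-style parameters schema and flattens its properties into the parameters list used for the Qwen tools prompt, skipping tools that declare no parameters at all. A standalone sketch of that flattening with a made-up tool definition (the tool itself is illustrative, not taken from the diff):

# Hypothetical OpenAI-style tool definition, for illustration only.
tool = {
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city"],
        },
    }
}

parameters = []
fp = tool["function"].get("parameters", {})
if fp:  # tools without a schema simply contribute no parameters
    required = fp.get("required", [])
    for name, p in fp["properties"].items():
        param = dict({"name": name}, **p)
        if name in required:
            param["required"] = True
        parameters.append(param)

print(parameters)
# [{'name': 'city', 'type': 'string', 'description': 'City name', 'required': True},
#  {'name': 'unit', 'type': 'string', 'enum': ['celsius', 'fahrenheit']}]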
@@ -447,6 +445,17 @@ Begin!"""
             else:
                 ret += "<AI>" + content.strip()
             return ret
+        elif prompt_style.style_name == "PHI3":
+            ret = f"<|system|>{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+            for message in chat_history:
+                content = message["content"] or ""
+                role = get_role(message["role"])
+                if content:
+                    ret += f"<|{role}|>{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}"
+                else:
+                    ret += f"<|{role}|>{prompt_style.intra_message_sep}"
+            ret += "<|assistant|>\n"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

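The new PHI3 branch renders chat history into Phi-3's <|role|> block format. Below is a standalone rendering of that template with hand-written inputs: the separator strings and the system prompt are assumptions (typical Phi-3 chat formatting), not values read from llm_family.json, and in the real code roles are mapped through prompt_style.roles rather than used verbatim.

intra_sep = "\n"          # assumed intra_message_sep
inter_sep = "<|end|>\n"   # assumed inter_message_sep
system_prompt = "You are a helpful assistant."

history = [
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "Paris."},
    {"role": "user", "content": "And of Italy?"},
]

ret = f"<|system|>{intra_sep}{system_prompt}{inter_sep}"
for message in history:
    content = message["content"] or ""
    role = message["role"]
    if content:
        ret += f"<|{role}|>{intra_sep}{content}{inter_sep}"
    else:
        ret += f"<|{role}|>{intra_sep}"
ret += "<|assistant|>\n"

print(ret)
# <|system|>
# You are a helpful assistant.<|end|>
# <|user|>
# What is the capital of France?<|end|>
# <|assistant|>
# Paris.<|end|>
# <|user|>
# And of Italy?<|end|>
# <|assistant|>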
@@ -473,9 +482,6 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
-        usage = chunk.get("usage")
-        if usage is not None:
-            chat_chunk["usage"] = usage
         return cast(ChatCompletionChunk, chat_chunk)

     @classmethod
@@ -499,6 +505,19 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        return cast(ChatCompletionChunk, chat_chunk)
+
+    @classmethod
+    def _get_final_chat_completion_chunk(
+        cls, chunk: CompletionChunk
+    ) -> ChatCompletionChunk:
+        chat_chunk = {
+            "id": "chat" + chunk["id"],
+            "model": chunk["model"],
+            "created": chunk["created"],
+            "object": "chat.completion.chunk",
+            "choices": [],
+        }
         usage = chunk.get("usage")
         if usage is not None:
             chat_chunk["usage"] = usage
@@ -512,7 +531,12 @@ Begin!"""
         for i, chunk in enumerate(chunks):
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)

     @classmethod
     async def _async_to_chat_completion_chunks(
@@ -523,7 +547,12 @@ Begin!"""
         async for chunk in chunks:
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)
             i += 1

     @staticmethod
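With these two hunks, the chunk-to-chat-chunk converters route on the choices list: a completion chunk with no choices is treated as the usage-only tail and converted by _get_final_chat_completion_chunk, while everything else goes through _to_chat_completion_chunk as before. The shapes involved look roughly like this; every field value is invented for illustration.

# A usage-only completion chunk as emitted by the streaming models above.
final_completion_chunk = {
    "id": "cmpl-1234",
    "object": "text_completion",
    "created": 1714000000,
    "model": "my-model",
    "choices": [],  # empty choices mark the usage-only chunk
    "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
}

# _to_chat_completion_chunks sees the empty "choices" list and emits the
# chat-flavoured equivalent instead of a regular delta chunk:
final_chat_chunk = {
    "id": "chat" + final_completion_chunk["id"],
    "model": final_completion_chunk["model"],
    "created": final_completion_chunk["created"],
    "object": "chat.completion.chunk",
    "choices": [],
    "usage": final_completion_chunk["usage"],
}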
@@ -680,6 +709,15 @@ Begin!"""
         else:
             m = {"role": "assistant", "content": content, "tool_calls": []}
             finish_reason = "stop"
+        try:
+            usage = c.get("usage")
+            assert "prompt_tokens" in usage
+        except Exception:
+            usage = {
+                "prompt_tokens": -1,
+                "completion_tokens": -1,
+                "total_tokens": -1,
+            }
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -692,11 +730,7 @@ Begin!"""
                     "finish_reason": finish_reason,
                 }
             ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
+            "usage": usage,
         }

