xinference 0.10.3__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/event.py +1 -1
- xinference/core/model.py +15 -4
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +5 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +306 -4
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +119 -2
- xinference/model/llm/pytorch/deepseek_vl.py +89 -33
- xinference/model/llm/pytorch/qwen_vl.py +67 -12
- xinference/model/llm/pytorch/yi_vl.py +62 -45
- xinference/model/llm/utils.py +29 -15
- xinference/model/llm/vllm/core.py +19 -4
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/METADATA +11 -11
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/RECORD +78 -57
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/deepseek_vl.py CHANGED

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer

-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()

@@ -149,10 +151,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if
-
-
-
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {

@@ -184,6 +187,7 @@ class DeepSeekVLChatModel(PytorchChatModel):

         deepseek_history.extend(prompt_messages)

+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images

         # load images and prepare for inputs

@@ -192,41 +196,93 @@ class DeepSeekVLChatModel(PytorchChatModel):
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)

-
-
-
-
-
-
-
-
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]

-
-
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )

-
-
-
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
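Editor's note: the new _generate_stream path yields OpenAI-style completion chunks and finishes with an empty chunk whose finish_reason is "stop". A minimal sketch of how a caller might stitch those chunks back into the full text, assuming only the dict shape shown in the diff (nothing xinference-specific):

# Sketch only: consume chunks shaped like the CompletionChunk dicts above,
# i.e. {"choices": [{"text": ..., "finish_reason": None or "stop"}], ...}.
def collect_stream(chunks):
    pieces = []
    for chunk in chunks:
        for choice in chunk["choices"]:
            if choice.get("finish_reason") == "stop":
                return "".join(pieces)
            pieces.append(choice["text"])
    return "".join(pieces)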
xinference/model/llm/pytorch/qwen_vl.py CHANGED

@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

@@ -134,22 +132,79 @@ class QwenVLChatModel(PytorchChatModel):
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(prompt, qwen_history)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(prompt, qwen_history)
+            return self._to_chat_completion(c)
+
+    def _generate(self, prompt: str, qwen_history: List) -> Completion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-
-            id=
-            object="
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": response},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=response, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, prompt: str, qwen_history: List
+    ) -> Iterator[CompletionChunk]:
+        # response, history = model.chat(tokenizer, message, history=history)
+        response_generator = self._model.chat_stream(
+            self._tokenizer, query=prompt, history=qwen_history
+        )
+        full_response = ""
+        for response in response_generator:
+            inc_content = response[len(full_response) :]
+            full_response = response
+            completion_choice = CompletionChoice(
+                text=inc_content, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            completion_chunk["usage"] = completion_usage
+            yield completion_chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        completion_chunk["usage"] = completion_usage
+        yield completion_chunk
xinference/model/llm/pytorch/yi_vl.py CHANGED

@@ -27,9 +27,11 @@ from PIL import Image
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content

-    @staticmethod
-    def _parse_text(text):
-        lines = text.split("\n")
-        lines = [line for line in lines if line != ""]
-        count = 0
-        for i, line in enumerate(lines):
-            if "```" in line:
-                count += 1
-                items = line.split("`")
-                if count % 2 == 1:
-                    lines[i] = f'<pre><code class="language-{items[-1]}">'
-                else:
-                    lines[i] = f"<br></code></pre>"
-            else:
-                if i > 0:
-                    if count % 2 == 1:
-                        line = line.replace("`", r"\`")
-                        line = line.replace("<", "&lt;")
-                        line = line.replace(">", "&gt;")
-                        line = line.replace(" ", "&nbsp;")
-                        line = line.replace("*", "&ast;")
-                        line = line.replace("_", "&lowbar;")
-                        line = line.replace("-", "&#45;")
-                        line = line.replace(".", "&#46;")
-                        line = line.replace("!", "&#33;")
-                        line = line.replace("(", "&#40;")
-                        line = line.replace(")", "&#41;")
-                        line = line.replace("$", "&#36;")
-                    lines[i] = "<br>" + line
-        text = "".join(lines)
-        return text
-
     def chat(
         self,
         prompt: Union[str, List[Dict]],

@@ -164,12 +134,12 @@ class YiVLChatModel(PytorchChatModel):
         from transformers import TextIteratorStreamer

         # TODO(codingl2k1): implement stream mode.
-
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+
         if not generate_config:
             generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
             KeywordsStoppingCriteria,

@@ -229,25 +199,72 @@ class YiVLChatModel(PytorchChatModel):
         t = Thread(target=self._model.generate, kwargs=generate_kwargs)
         t.start()

+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             generated_text += new_text
             if generated_text.endswith(stop_str):
                 generated_text = generated_text[: -len(stop_str)]
-
-
-            id=
-            object="
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-
-                    index=0,
-                    message={"role": "assistant", "content": r},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if not new_text.endswith(stop_str):
+                completion_choice = CompletionChoice(
+                    text=new_text, index=0, logprobs=None, finish_reason=None
+                )
+                chunk = CompletionChunk(
+                    id=completion_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=-1,
+                    completion_tokens=-1,
+                    total_tokens=-1,
+                )
+                chunk["usage"] = completion_usage
+                yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
xinference/model/llm/utils.py CHANGED

@@ -228,16 +228,14 @@ Begin!"""
         tools_name_text = []
         for func_info in tools:
             parameters = []
-
-
-
-
-
-
-
-
-                    param["required"] = True
-                parameters.append(param)
+            fp = func_info["function"].get("parameters", {})
+            if fp:
+                required_parameters = fp.get("required", [])
+                for name, p in fp["properties"].items():
+                    param = dict({"name": name}, **p)
+                    if name in required_parameters:
+                        param["required"] = True
+                    parameters.append(param)

             name = func_info["function"]["name"]
             desc = func_info["function"]["description"]
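Editor's note: a worked example of the rewritten parameter flattening above, with an illustrative OpenAI-style tool spec (the tool itself is made up, not taken from the diff). Each JSON-schema property becomes a {"name": ..., **schema} dict, and names listed under "required" get "required": True stamped on:

# Illustrative input; any OpenAI-style "function" tool spec is handled the same way.
tool = {
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    }
}

parameters = []
fp = tool["function"].get("parameters", {})
if fp:
    required_parameters = fp.get("required", [])
    for name, p in fp["properties"].items():
        param = dict({"name": name}, **p)
        if name in required_parameters:
            param["required"] = True
        parameters.append(param)

# parameters == [{"name": "city", "type": "string", "required": True}]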
@@ -447,6 +445,17 @@ Begin!"""
             else:
                 ret += "<AI>" + content.strip()
             return ret
+        elif prompt_style.style_name == "PHI3":
+            ret = f"<|system|>{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+            for message in chat_history:
+                content = message["content"] or ""
+                role = get_role(message["role"])
+                if content:
+                    ret += f"<|{role}|>{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}"
+                else:
+                    ret += f"<|{role}|>{prompt_style.intra_message_sep}"
+            ret += "<|assistant|>\n"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

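Editor's note: for illustration, the PHI3 branch above can be read as a standalone prompt builder. The separator values ("\n" both ways) and the example messages below are assumptions; the real values come from the model family's prompt_style, which is not shown in this hunk:

# Standalone sketch of the PHI3 prompt assembly added above.
def build_phi3_prompt(system_prompt, chat_history, intra_sep="\n", inter_sep="\n"):
    ret = f"<|system|>{intra_sep}{system_prompt}{inter_sep}"
    for message in chat_history:
        content = message["content"] or ""
        role = message["role"]  # the real code maps this through get_role()
        if content:
            ret += f"<|{role}|>{intra_sep}{content}{inter_sep}"
        else:
            ret += f"<|{role}|>{intra_sep}"
    return ret + "<|assistant|>\n"

print(build_phi3_prompt("You are a helpful assistant.",
                        [{"role": "user", "content": "Hello"}]))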
@@ -680,6 +689,15 @@ Begin!"""
         else:
             m = {"role": "assistant", "content": content, "tool_calls": []}
         finish_reason = "stop"
+        try:
+            usage = c.get("usage")
+            assert "prompt_tokens" in usage
+        except Exception:
+            usage = {
+                "prompt_tokens": -1,
+                "completion_tokens": -1,
+                "total_tokens": -1,
+            }
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,

@@ -692,11 +710,7 @@ Begin!"""
                     "finish_reason": finish_reason,
                 }
             ],
-            "usage":
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
+            "usage": usage,
         }

xinference/model/llm/vllm/core.py CHANGED

@@ -110,6 +110,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
+    "mixtral-8x22B-instruct-v0.1",
     "chatglm3",
     "chatglm3-32k",
     "chatglm3-128k",

@@ -239,10 +240,17 @@ class VLLMModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format
-            # Currently, only 4-bit weight quantization is supported for
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
                 return False

@@ -416,10 +424,17 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format
-            # Currently, only 4-bit weight quantization is supported for
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
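Editor's note: the GPTQ gate added to both match checks can be read as a standalone predicate. A rough restatement follows; the string comparison against "0.3.3" mirrors the diff, and the example argument values are hypothetical:

# Sketch of the GPTQ quantization check added above.
def gptq_quantization_supported(vllm_version: str, quantization: str) -> bool:
    if vllm_version >= "0.3.3":
        # newer vLLM builds accept 3-, 4- and 8-bit GPTQ weights
        return any(q in quantization for q in ("3", "4", "8"))
    # older vLLM builds only handle 4-bit GPTQ
    return "4" in quantization

# e.g. gptq_quantization_supported("0.4.0", "Int8") -> True
#      gptq_quantization_supported("0.3.0", "Int8") -> False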
xinference/thirdparty/omnilmm/chat.py CHANGED

@@ -4,7 +4,6 @@ import json
 import os

 import torch
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer


@@ -20,6 +19,8 @@ DEFAULT_IM_END_TOKEN = "<im_end>"


 def init_omni_lmm(model_path, device_map):
+    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+
     torch.backends.cuda.matmul.allow_tf32 = True
     disable_torch_init()
     model_name = os.path.expanduser(model_path)
xinference/thirdparty/omnilmm/model/omnilmm.py CHANGED

@@ -2,7 +2,6 @@ import gc
 import math
 from typing import List, Optional, Tuple, Union

-import timm
 import torch
 import torch.nn as nn
 from torch import Tensor

@@ -37,6 +36,8 @@ class Identity(torch.nn.Identity):


 def create_vision_module(config):
+    import timm
+
     vision_tower = timm.create_model(
         "eva02_enormous_patch14_clip_224.laion2b_plus",
         pretrained=False,
xinference/types.py CHANGED

@@ -33,6 +33,7 @@ from .fields import (
     stop_field,
     stream_field,
     stream_interval_field,
+    stream_option_field,
     temperature_field,
     top_k_field,
     top_p_field,

@@ -392,6 +393,7 @@ class _CreateCompletionOpenAIFallback(BaseModel):
     seed: Optional[int] = none_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
+    stream_options: Optional[dict] = stream_option_field
     suffix: Optional[str] = none_field
     temperature: float = temperature_field
     top_p: float = top_p_field
xinference/web/ui/build/asset-manifest.json CHANGED

@@ -1,11 +1,14 @@
 {
   "files": {
-    "main.
+    "main.css": "./static/css/main.54bca460.css",
+    "main.js": "./static/js/main.8e44da4b.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.
+    "main.54bca460.css.map": "./static/css/main.54bca460.css.map",
+    "main.8e44da4b.js.map": "./static/js/main.8e44da4b.js.map"
   },
   "entrypoints": [
-    "static/
+    "static/css/main.54bca460.css",
+    "static/js/main.8e44da4b.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8e44da4b.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
xinference/web/ui/build/static/css/main.54bca460.css ADDED

@@ -0,0 +1,2 @@
.formBox{max-height:80vh;max-width:50vw;min-width:50vw;overflow:auto;padding:40px 20px 0 0;position:relative;transition:all .4s ease-in-out}.broaden{max-width:100%;min-width:100%;padding-right:0}.show-json{align-items:center;color:#444;display:flex;position:fixed;right:60px;top:90px}.icon{cursor:pointer;margin-left:20px;position:absolute;right:-40px}.icon:hover{color:#1976d2}.arrow{font-size:24px!important}.jsonBox{min-height:80vh;position:relative;transition:all .4s ease-in-out;width:100%}.hide{overflow:hidden;-webkit-transform:translate(30vw);transform:translate(30vw);width:0}.jsonBox-header{font-weight:700;line-height:40px}.textarea{border:1px solid #ddd;border-radius:5px;color:#444;height:calc(100% - 40px);padding:5px 10px;resize:none;width:100%}.copyIcon{color:#555;cursor:pointer;font-size:16px!important;position:absolute;right:5px;top:13px}.copyIcon:hover{color:#1976d2}.addBtn{margin-left:20px!important}.item{background-color:#eee;border-radius:10px;margin:10px 50px 0;overflow:hidden;padding:20px;position:relative}.item:hover .deleteBtn{-webkit-transform:translateX(-50px);transform:translateX(-50px)}.deleteBtn{background-color:#1976d2;border-radius:25px;height:50px;line-height:70px;position:absolute;right:20px;text-align:center;top:calc(50% - 25px);-webkit-transform:translateX(80px);transform:translateX(80px);transition:all .3s ease-in-out;width:50px}.deleteBtn:hover{box-shadow:0 0 10px #aaa;cursor:pointer}.deleteIcon{color:#fff;font-size:28px!important}
/*# sourceMappingURL=main.54bca460.css.map*/

xinference/web/ui/build/static/css/main.54bca460.css.map ADDED

@@ -0,0 +1 @@
{"version":3,"file":"static/css/main.54bca460.css","mappings":"AAAA,SAIE,eAAgB,CAFhB,cAAe,CACf,cAAe,CAEf,aAAc,CACd,qBAAsB,CALtB,iBAAkB,CAMlB,8BACF,CAEA,SACE,cAAe,CACf,cAAe,CACf,eACF,CAEA,WAEE,kBAAmB,CAInB,UAAW,CALX,YAAa,CAEb,cAAe,CAEf,UAAW,CADX,QAGF,CAEA,MAGE,cAAe,CACf,gBAAiB,CAHjB,iBAAkB,CAClB,WAGF,CAEA,YACE,aACF,CAEA,OACE,wBACF,CAEA,SAEE,eAAgB,CADhB,iBAAkB,CAGlB,8BAAgC,CADhC,UAEF,CAEA,MAGE,eAAgB,CADhB,iCAA6B,CAA7B,yBAA6B,CAD7B,OAGF,CAEA,gBAEE,eAAgB,CADhB,gBAEF,CAEA,UAIE,qBAAsB,CACtB,iBAAkB,CAElB,UAAW,CALX,wBAAyB,CACzB,gBAAiB,CAGjB,WAAY,CALZ,UAOF,CAEA,UAME,UAAW,CALX,cAAe,CAIf,wBAA0B,CAH1B,iBAAkB,CAElB,SAAU,CADV,QAIF,CAEA,gBACE,aACF,CAEA,QACE,0BACF,CAEA,MAEE,qBAAsB,CAGtB,kBAAmB,CAFnB,kBAAmB,CAGnB,eAAgB,CAFhB,YAAa,CAHb,iBAMF,CAEA,uBACE,mCAA4B,CAA5B,2BACF,CAEA,WAUE,wBAAyB,CADzB,kBAAmB,CAJnB,WAAY,CAGZ,gBAAiB,CAPjB,iBAAkB,CAClB,UAAW,CAKX,iBAAkB,CAJlB,oBAAqB,CAGrB,kCAA2B,CAA3B,0BAA2B,CAK3B,8BAAgC,CAPhC,UAQF,CAEA,iBAEE,wBAAyB,CADzB,cAEF,CAEA,YAEE,UAAW,CADX,wBAEF","sources":["scenes/register_model/styles/registerModelStyle.css"],"sourcesContent":[".formBox {\n position: relative;\n max-width: 50vw;\n min-width: 50vw;\n max-height: 80vh;\n overflow: auto;\n padding: 40px 20px 0 0;\n transition: all 0.4s ease-in-out;\n}\n\n.broaden {\n max-width: 100%;\n min-width: 100%;\n padding-right: 0;\n}\n\n.show-json {\n display: flex;\n align-items: center;\n position: fixed;\n top: 90px;\n right: 60px;\n color: #444;\n}\n\n.icon {\n position: absolute;\n right: -40px;\n cursor: pointer;\n margin-left: 20px;\n}\n\n.icon:hover {\n color: #1976d2;\n}\n\n.arrow {\n font-size: 24px !important;\n}\n\n.jsonBox {\n position: relative;\n min-height: 80vh;\n width: 100%;\n transition: all 0.4s ease-in-out;\n}\n\n.hide {\n width: 0;\n transform: translate(30vw, 0);\n overflow: hidden;\n}\n\n.jsonBox-header {\n line-height: 40px;\n font-weight: 700;\n}\n\n.textarea {\n width: 100%;\n height: calc(100% - 40px);\n padding: 5px 10px;\n border: 1px solid #ddd;\n border-radius: 5px;\n resize: none;\n color: #444;\n}\n\n.copyIcon {\n cursor: pointer;\n position: absolute;\n top: 13px;\n right: 5px;\n font-size: 16px !important;\n color: #555;\n}\n\n.copyIcon:hover {\n color: #1976d2;\n}\n\n.addBtn {\n margin-left: 20px !important;\n}\n\n.item {\n position: relative;\n background-color: #eee;\n margin: 10px 50px 0;\n padding: 20px;\n border-radius: 10px;\n overflow: hidden;\n}\n\n.item:hover .deleteBtn {\n transform: translateX(-50px);\n}\n\n.deleteBtn {\n position: absolute;\n right: 20px;\n top: calc(50% - 25px);\n width: 50px;\n height: 50px;\n transform: translateX(80px);\n text-align: center;\n line-height: 70px;\n border-radius: 25px;\n background-color: #1976d2;\n transition: all 0.3s ease-in-out;\n}\n\n.deleteBtn:hover {\n cursor: pointer;\n box-shadow: 0 0 10px #aaa;\n}\n\n.deleteIcon {\n font-size: 28px !important;\n color: #fff;\n}\n"],"names":[],"sourceRoot":""}