xinference 0.10.3__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/event.py +1 -1
  8. xinference/core/model.py +15 -4
  9. xinference/core/status_guard.py +1 -1
  10. xinference/core/supervisor.py +58 -72
  11. xinference/core/worker.py +68 -101
  12. xinference/deploy/cmdline.py +166 -1
  13. xinference/deploy/test/test_cmdline.py +2 -0
  14. xinference/deploy/utils.py +1 -1
  15. xinference/device_utils.py +29 -3
  16. xinference/fields.py +5 -1
  17. xinference/model/audio/whisper.py +88 -12
  18. xinference/model/core.py +2 -2
  19. xinference/model/image/__init__.py +29 -0
  20. xinference/model/image/core.py +6 -0
  21. xinference/model/image/custom.py +109 -0
  22. xinference/model/llm/__init__.py +92 -32
  23. xinference/model/llm/core.py +57 -102
  24. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  25. xinference/model/llm/llm_family.json +306 -4
  26. xinference/model/llm/llm_family.py +45 -41
  27. xinference/model/llm/llm_family_modelscope.json +119 -2
  28. xinference/model/llm/pytorch/deepseek_vl.py +89 -33
  29. xinference/model/llm/pytorch/qwen_vl.py +67 -12
  30. xinference/model/llm/pytorch/yi_vl.py +62 -45
  31. xinference/model/llm/utils.py +29 -15
  32. xinference/model/llm/vllm/core.py +19 -4
  33. xinference/thirdparty/omnilmm/chat.py +2 -1
  34. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  35. xinference/types.py +2 -0
  36. xinference/web/ui/build/asset-manifest.json +6 -3
  37. xinference/web/ui/build/index.html +1 -1
  38. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  39. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  40. xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
  41. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
  42. xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
  58. xinference/web/ui/node_modules/.package-lock.json +33 -0
  59. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  60. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  61. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  62. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  63. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  64. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  65. xinference/web/ui/node_modules/delegate/package.json +31 -0
  66. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  67. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  68. xinference/web/ui/node_modules/select/bower.json +13 -0
  69. xinference/web/ui/node_modules/select/package.json +29 -0
  70. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  71. xinference/web/ui/package-lock.json +34 -0
  72. xinference/web/ui/package.json +1 -0
  73. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/METADATA +11 -11
  74. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/RECORD +78 -57
  75. xinference/client/oscar/__init__.py +0 -13
  76. xinference/client/oscar/actor_client.py +0 -611
  77. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  78. xinference/model/llm/pytorch/spec_model.py +0 -186
  79. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  80. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  86. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
  87. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
  88. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
  89. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/deepseek_vl.py CHANGED
@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16
 
         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer
 
-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()
@@ -149,10 +151,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {
@@ -184,6 +187,7 @@
 
         deepseek_history.extend(prompt_messages)
 
+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
 
         # load images and prepare for inputs
@@ -192,41 +196,93 @@
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)
 
-        # run image encoder to get the image embeddings
-        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-        # run the model to get the response
-        outputs = self._model.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=prepare_inputs.attention_mask,
-            pad_token_id=self._tokenizer.eos_token_id,
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]
 
-        answer = self._tokenizer.decode(
-            outputs[0].cpu().tolist(), skip_special_tokens=True
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )
 
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
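
Note: with the hunks above, DeepSeek VL no longer rejects stream=True; the third-party generate() streamer is wrapped into completion chunks and converted to chat-completion chunks. A minimal client-side sketch of how the new streaming path might be consumed, assuming the 0.11.0 RESTful client API; the endpoint, model uid and image URL below are placeholders, not part of this diff:

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder endpoint
model = client.get_model("deepseek-vl-chat")  # placeholder model uid

prompt = [
    {"type": "text", "text": "Describe this image."},
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},  # placeholder URL
]

# generate_config={"stream": True} now yields chat-completion chunks instead of raising.
for chunk in model.chat(prompt=prompt, generate_config={"stream": True}):
    delta = chunk["choices"][0].get("delta", {})
    print(delta.get("content", ""), end="", flush=True)
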
xinference/model/llm/pytorch/qwen_vl.py CHANGED
@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []
@@ -134,22 +132,79 @@
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(prompt, qwen_history)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(prompt, qwen_history)
+            return self._to_chat_completion(c)
+
+    def _generate(self, prompt: str, qwen_history: List) -> Completion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": response},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=response, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, prompt: str, qwen_history: List
+    ) -> Iterator[CompletionChunk]:
+        # response, history = model.chat(tokenizer, message, history=history)
+        response_generator = self._model.chat_stream(
+            self._tokenizer, query=prompt, history=qwen_history
+        )
+        full_response = ""
+        for response in response_generator:
+            inc_content = response[len(full_response) :]
+            full_response = response
+            completion_choice = CompletionChoice(
+                text=inc_content, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            completion_chunk["usage"] = completion_usage
+            yield completion_chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        completion_chunk["usage"] = completion_usage
+        yield completion_chunk
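
Note: Qwen-VL's chat_stream yields the full response accumulated so far, so _generate_stream above emits only the suffix that is new on each iteration. The slicing pattern in isolation, purely illustrative and with no model involved:

def to_deltas(cumulative_responses):
    # Each yielded item is the text added since the previous iteration.
    emitted = ""
    for response in cumulative_responses:
        yield response[len(emitted):]
        emitted = response

assert list(to_deltas(["He", "Hello", "Hello wor", "Hello world"])) == ["He", "llo", " wor", "ld"]
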
xinference/model/llm/pytorch/yi_vl.py CHANGED
@@ -27,9 +27,11 @@ from PIL import Image
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
                 raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content
 
-    @staticmethod
-    def _parse_text(text):
-        lines = text.split("\n")
-        lines = [line for line in lines if line != ""]
-        count = 0
-        for i, line in enumerate(lines):
-            if "```" in line:
-                count += 1
-                items = line.split("`")
-                if count % 2 == 1:
-                    lines[i] = f'<pre><code class="language-{items[-1]}">'
-                else:
-                    lines[i] = f"<br></code></pre>"
-            else:
-                if i > 0:
-                    if count % 2 == 1:
-                        line = line.replace("`", r"\`")
-                        line = line.replace("<", "&lt;")
-                        line = line.replace(">", "&gt;")
-                        line = line.replace(" ", "&nbsp;")
-                        line = line.replace("*", "&ast;")
-                        line = line.replace("_", "&lowbar;")
-                        line = line.replace("-", "&#45;")
-                        line = line.replace(".", "&#46;")
-                        line = line.replace("!", "&#33;")
-                        line = line.replace("(", "&#40;")
-                        line = line.replace(")", "&#41;")
-                        line = line.replace("$", "&#36;")
-                    lines[i] = "<br>" + line
-        text = "".join(lines)
-        return text
-
     def chat(
         self,
         prompt: Union[str, List[Dict]],
@@ -164,12 +134,12 @@
         from transformers import TextIteratorStreamer
 
         # TODO(codingl2k1): implement stream mode.
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+
         if not generate_config:
             generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
             KeywordsStoppingCriteria,
@@ -229,25 +199,72 @@
         t = Thread(target=self._model.generate, kwargs=generate_kwargs)
         t.start()
 
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             generated_text += new_text
         if generated_text.endswith(stop_str):
             generated_text = generated_text[: -len(stop_str)]
-        r = self._parse_text(generated_text)
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": r},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
            ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if not new_text.endswith(stop_str):
+                completion_choice = CompletionChoice(
+                    text=new_text, index=0, logprobs=None, finish_reason=None
+                )
+                chunk = CompletionChunk(
+                    id=completion_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=-1,
+                    completion_tokens=-1,
+                    total_tokens=-1,
+                )
+                chunk["usage"] = completion_usage
+                yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
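
Note: Yi VL keeps its existing TextIteratorStreamer setup and now routes it into _generate or _generate_stream instead of refusing stream mode. The underlying transformers pattern it relies on, reduced to a self-contained sketch; gpt2 here is only a small stand-in model, not Yi VL:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in model for illustration
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() runs in a background thread and feeds the streamer piece by piece,
# while the main thread iterates the streamer to collect the decoded text.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=20),
)
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()
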
xinference/model/llm/utils.py CHANGED
@@ -228,16 +228,14 @@ Begin!"""
                 tools_name_text = []
                 for func_info in tools:
                     parameters = []
-                    required_parameters = func_info["function"]["parameters"].get(
-                        "required", []
-                    )
-                    for name, p in func_info["function"]["parameters"][
-                        "properties"
-                    ].items():
-                        param = dict({"name": name}, **p)
-                        if name in required_parameters:
-                            param["required"] = True
-                        parameters.append(param)
+                    fp = func_info["function"].get("parameters", {})
+                    if fp:
+                        required_parameters = fp.get("required", [])
+                        for name, p in fp["properties"].items():
+                            param = dict({"name": name}, **p)
+                            if name in required_parameters:
+                                param["required"] = True
+                            parameters.append(param)
 
                     name = func_info["function"]["name"]
                     desc = func_info["function"]["description"]
@@ -447,6 +445,17 @@ Begin!"""
                 else:
                     ret += "<AI>" + content.strip()
             return ret
+        elif prompt_style.style_name == "PHI3":
+            ret = f"<|system|>{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+            for message in chat_history:
+                content = message["content"] or ""
+                role = get_role(message["role"])
+                if content:
+                    ret += f"<|{role}|>{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}"
+                else:
+                    ret += f"<|{role}|>{prompt_style.intra_message_sep}"
+            ret += "<|assistant|>\n"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
@@ -680,6 +689,15 @@ Begin!"""
         else:
             m = {"role": "assistant", "content": content, "tool_calls": []}
             finish_reason = "stop"
+        try:
+            usage = c.get("usage")
+            assert "prompt_tokens" in usage
+        except Exception:
+            usage = {
+                "prompt_tokens": -1,
+                "completion_tokens": -1,
+                "total_tokens": -1,
+            }
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -692,11 +710,7 @@
                     "finish_reason": finish_reason,
                 }
             ],
-            "usage": {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            },
+            "usage": usage,
         }
 
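Note: the new PHI3 branch builds the prompt from <|role|> tags and the style's separators. A hedged sketch of the resulting string, assuming intra_message_sep is "\n" and inter_message_sep is "<|end|>\n"; the real separator values live in the PHI3 entries of llm_family.json and are not part of this hunk:

INTRA_SEP = "\n"         # assumed prompt_style.intra_message_sep
INTER_SEP = "<|end|>\n"  # assumed prompt_style.inter_message_sep
SYSTEM_PROMPT = "You are a helpful assistant."

def phi3_prompt(chat_history):
    # Mirrors the PHI3 branch above: system header, then one block per message,
    # ending with the assistant tag the model is expected to continue from.
    ret = f"<|system|>{INTRA_SEP}{SYSTEM_PROMPT}{INTER_SEP}"
    for message in chat_history:
        content = message["content"] or ""
        role = message["role"]
        if content:
            ret += f"<|{role}|>{INTRA_SEP}{content}{INTER_SEP}"
        else:
            ret += f"<|{role}|>{INTRA_SEP}"
    return ret + "<|assistant|>\n"

print(phi3_prompt([{"role": "user", "content": "What is 2 + 2?"}]))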
 
xinference/model/llm/vllm/core.py CHANGED
@@ -110,6 +110,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
+    "mixtral-8x22B-instruct-v0.1",
     "chatglm3",
     "chatglm3-32k",
     "chatglm3-128k",
@@ -239,10 +240,17 @@ class VLLMModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format in ["gptq", "awq"]:
-            # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
                 return False
@@ -416,10 +424,17 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format in ["gptq", "awq"]:
-            # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
xinference/thirdparty/omnilmm/chat.py CHANGED
@@ -4,7 +4,6 @@ import json
 import os
 
 import torch
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
 
@@ -20,6 +19,8 @@ DEFAULT_IM_END_TOKEN = "<im_end>"
 
 
 def init_omni_lmm(model_path, device_map):
+    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+
     torch.backends.cuda.matmul.allow_tf32 = True
     disable_torch_init()
     model_name = os.path.expanduser(model_path)
xinference/thirdparty/omnilmm/model/omnilmm.py CHANGED
@@ -2,7 +2,6 @@ import gc
 import math
 from typing import List, Optional, Tuple, Union
 
-import timm
 import torch
 import torch.nn as nn
 from torch import Tensor
@@ -37,6 +36,8 @@ class Identity(torch.nn.Identity):
 
 
 def create_vision_module(config):
+    import timm
+
     vision_tower = timm.create_model(
         "eva02_enormous_patch14_clip_224.laion2b_plus",
         pretrained=False,
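
Note: both omnilmm hunks make the same change: accelerate and timm become function-local imports, so importing the omnilmm modules no longer hard-requires those optional dependencies. The pattern in miniature (model name below is only an example):

def create_vision_module_lazy():
    # The dependency is resolved only when the function is actually called,
    # so importing this module succeeds even if timm is not installed.
    import timm

    return timm.create_model("resnet18", pretrained=False)
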
xinference/types.py CHANGED
@@ -33,6 +33,7 @@ from .fields import (
     stop_field,
     stream_field,
     stream_interval_field,
+    stream_option_field,
     temperature_field,
     top_k_field,
     top_p_field,
@@ -392,6 +393,7 @@ class _CreateCompletionOpenAIFallback(BaseModel):
     seed: Optional[int] = none_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
+    stream_options: Optional[dict] = stream_option_field
     suffix: Optional[str] = none_field
     temperature: float = temperature_field
     top_p: float = top_p_field
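
Note: _CreateCompletionOpenAIFallback gains a stream_options field so OpenAI-style clients that send it pass validation. A hedged request sketch against the OpenAI-compatible completions endpoint; the endpoint and model uid are placeholders, and whether usage is actually reported for streams is not established by this hunk:

import requests

resp = requests.post(
    "http://localhost:9997/v1/completions",  # placeholder endpoint
    json={
        "model": "my-llm",  # placeholder model uid
        "prompt": "Write a haiku about autumn.",
        "stream": True,
        "stream_options": {"include_usage": True},  # OpenAI-style option, assumed use case
    },
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))
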
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,11 +1,14 @@
 {
   "files": {
-    "main.js": "./static/js/main.26fdbfbe.js",
+    "main.css": "./static/css/main.54bca460.css",
+    "main.js": "./static/js/main.8e44da4b.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.26fdbfbe.js.map": "./static/js/main.26fdbfbe.js.map"
+    "main.54bca460.css.map": "./static/css/main.54bca460.css.map",
+    "main.8e44da4b.js.map": "./static/js/main.8e44da4b.js.map"
   },
   "entrypoints": [
-    "static/js/main.26fdbfbe.js"
+    "static/css/main.54bca460.css",
+    "static/js/main.8e44da4b.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.26fdbfbe.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8e44da4b.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
xinference/web/ui/build/static/css/main.54bca460.css ADDED
@@ -0,0 +1,2 @@
+ .formBox{max-height:80vh;max-width:50vw;min-width:50vw;overflow:auto;padding:40px 20px 0 0;position:relative;transition:all .4s ease-in-out}.broaden{max-width:100%;min-width:100%;padding-right:0}.show-json{align-items:center;color:#444;display:flex;position:fixed;right:60px;top:90px}.icon{cursor:pointer;margin-left:20px;position:absolute;right:-40px}.icon:hover{color:#1976d2}.arrow{font-size:24px!important}.jsonBox{min-height:80vh;position:relative;transition:all .4s ease-in-out;width:100%}.hide{overflow:hidden;-webkit-transform:translate(30vw);transform:translate(30vw);width:0}.jsonBox-header{font-weight:700;line-height:40px}.textarea{border:1px solid #ddd;border-radius:5px;color:#444;height:calc(100% - 40px);padding:5px 10px;resize:none;width:100%}.copyIcon{color:#555;cursor:pointer;font-size:16px!important;position:absolute;right:5px;top:13px}.copyIcon:hover{color:#1976d2}.addBtn{margin-left:20px!important}.item{background-color:#eee;border-radius:10px;margin:10px 50px 0;overflow:hidden;padding:20px;position:relative}.item:hover .deleteBtn{-webkit-transform:translateX(-50px);transform:translateX(-50px)}.deleteBtn{background-color:#1976d2;border-radius:25px;height:50px;line-height:70px;position:absolute;right:20px;text-align:center;top:calc(50% - 25px);-webkit-transform:translateX(80px);transform:translateX(80px);transition:all .3s ease-in-out;width:50px}.deleteBtn:hover{box-shadow:0 0 10px #aaa;cursor:pointer}.deleteIcon{color:#fff;font-size:28px!important}
+ /*# sourceMappingURL=main.54bca460.css.map*/
xinference/web/ui/build/static/css/main.54bca460.css.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"static/css/main.54bca460.css","mappings":"AAAA,SAIE,eAAgB,CAFhB,cAAe,CACf,cAAe,CAEf,aAAc,CACd,qBAAsB,CALtB,iBAAkB,CAMlB,8BACF,CAEA,SACE,cAAe,CACf,cAAe,CACf,eACF,CAEA,WAEE,kBAAmB,CAInB,UAAW,CALX,YAAa,CAEb,cAAe,CAEf,UAAW,CADX,QAGF,CAEA,MAGE,cAAe,CACf,gBAAiB,CAHjB,iBAAkB,CAClB,WAGF,CAEA,YACE,aACF,CAEA,OACE,wBACF,CAEA,SAEE,eAAgB,CADhB,iBAAkB,CAGlB,8BAAgC,CADhC,UAEF,CAEA,MAGE,eAAgB,CADhB,iCAA6B,CAA7B,yBAA6B,CAD7B,OAGF,CAEA,gBAEE,eAAgB,CADhB,gBAEF,CAEA,UAIE,qBAAsB,CACtB,iBAAkB,CAElB,UAAW,CALX,wBAAyB,CACzB,gBAAiB,CAGjB,WAAY,CALZ,UAOF,CAEA,UAME,UAAW,CALX,cAAe,CAIf,wBAA0B,CAH1B,iBAAkB,CAElB,SAAU,CADV,QAIF,CAEA,gBACE,aACF,CAEA,QACE,0BACF,CAEA,MAEE,qBAAsB,CAGtB,kBAAmB,CAFnB,kBAAmB,CAGnB,eAAgB,CAFhB,YAAa,CAHb,iBAMF,CAEA,uBACE,mCAA4B,CAA5B,2BACF,CAEA,WAUE,wBAAyB,CADzB,kBAAmB,CAJnB,WAAY,CAGZ,gBAAiB,CAPjB,iBAAkB,CAClB,UAAW,CAKX,iBAAkB,CAJlB,oBAAqB,CAGrB,kCAA2B,CAA3B,0BAA2B,CAK3B,8BAAgC,CAPhC,UAQF,CAEA,iBAEE,wBAAyB,CADzB,cAEF,CAEA,YAEE,UAAW,CADX,wBAEF","sources":["scenes/register_model/styles/registerModelStyle.css"],"sourcesContent":[".formBox {\n position: relative;\n max-width: 50vw;\n min-width: 50vw;\n max-height: 80vh;\n overflow: auto;\n padding: 40px 20px 0 0;\n transition: all 0.4s ease-in-out;\n}\n\n.broaden {\n max-width: 100%;\n min-width: 100%;\n padding-right: 0;\n}\n\n.show-json {\n display: flex;\n align-items: center;\n position: fixed;\n top: 90px;\n right: 60px;\n color: #444;\n}\n\n.icon {\n position: absolute;\n right: -40px;\n cursor: pointer;\n margin-left: 20px;\n}\n\n.icon:hover {\n color: #1976d2;\n}\n\n.arrow {\n font-size: 24px !important;\n}\n\n.jsonBox {\n position: relative;\n min-height: 80vh;\n width: 100%;\n transition: all 0.4s ease-in-out;\n}\n\n.hide {\n width: 0;\n transform: translate(30vw, 0);\n overflow: hidden;\n}\n\n.jsonBox-header {\n line-height: 40px;\n font-weight: 700;\n}\n\n.textarea {\n width: 100%;\n height: calc(100% - 40px);\n padding: 5px 10px;\n border: 1px solid #ddd;\n border-radius: 5px;\n resize: none;\n color: #444;\n}\n\n.copyIcon {\n cursor: pointer;\n position: absolute;\n top: 13px;\n right: 5px;\n font-size: 16px !important;\n color: #555;\n}\n\n.copyIcon:hover {\n color: #1976d2;\n}\n\n.addBtn {\n margin-left: 20px !important;\n}\n\n.item {\n position: relative;\n background-color: #eee;\n margin: 10px 50px 0;\n padding: 20px;\n border-radius: 10px;\n overflow: hidden;\n}\n\n.item:hover .deleteBtn {\n transform: translateX(-50px);\n}\n\n.deleteBtn {\n position: absolute;\n right: 20px;\n top: calc(50% - 25px);\n width: 50px;\n height: 50px;\n transform: translateX(80px);\n text-align: center;\n line-height: 70px;\n border-radius: 25px;\n background-color: #1976d2;\n transition: all 0.3s ease-in-out;\n}\n\n.deleteBtn:hover {\n cursor: pointer;\n box-shadow: 0 0 10px #aaa;\n}\n\n.deleteIcon {\n font-size: 28px !important;\n color: #fff;\n}\n"],"names":[],"sourceRoot":""}