xinference 0.10.2.post1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/event.py +1 -1
  8. xinference/core/model.py +15 -4
  9. xinference/core/status_guard.py +1 -1
  10. xinference/core/supervisor.py +58 -72
  11. xinference/core/worker.py +73 -102
  12. xinference/deploy/cmdline.py +175 -6
  13. xinference/deploy/test/test_cmdline.py +2 -0
  14. xinference/deploy/utils.py +1 -1
  15. xinference/device_utils.py +29 -3
  16. xinference/fields.py +5 -1
  17. xinference/model/audio/model_spec.json +8 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/embedding/core.py +13 -0
  21. xinference/model/image/__init__.py +29 -0
  22. xinference/model/image/core.py +6 -0
  23. xinference/model/image/custom.py +109 -0
  24. xinference/model/llm/__init__.py +92 -32
  25. xinference/model/llm/core.py +57 -102
  26. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  27. xinference/model/llm/llm_family.json +446 -2
  28. xinference/model/llm/llm_family.py +45 -41
  29. xinference/model/llm/llm_family_modelscope.json +208 -1
  30. xinference/model/llm/pytorch/deepseek_vl.py +89 -33
  31. xinference/model/llm/pytorch/qwen_vl.py +67 -12
  32. xinference/model/llm/pytorch/yi_vl.py +62 -45
  33. xinference/model/llm/utils.py +45 -15
  34. xinference/model/llm/vllm/core.py +21 -4
  35. xinference/model/rerank/core.py +48 -20
  36. xinference/thirdparty/omnilmm/chat.py +2 -1
  37. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  38. xinference/types.py +2 -0
  39. xinference/web/ui/build/asset-manifest.json +6 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  42. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  43. xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
  44. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
  45. xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
  61. xinference/web/ui/node_modules/.package-lock.json +33 -0
  62. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  63. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  64. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  65. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  66. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  67. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  68. xinference/web/ui/node_modules/delegate/package.json +31 -0
  69. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  70. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  71. xinference/web/ui/node_modules/select/bower.json +13 -0
  72. xinference/web/ui/node_modules/select/package.json +29 -0
  73. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  74. xinference/web/ui/package-lock.json +34 -0
  75. xinference/web/ui/package.json +1 -0
  76. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/METADATA +14 -13
  77. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/RECORD +81 -60
  78. xinference/client/oscar/__init__.py +0 -13
  79. xinference/client/oscar/actor_client.py +0 -611
  80. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  81. xinference/model/llm/pytorch/spec_model.py +0 -186
  82. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  83. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  89. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
  90. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
  91. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
  92. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -84,6 +84,96 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-70B",
+        "model_hub": "modelscope"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-8B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-70B-Instruct",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,

@@ -323,7 +413,7 @@
         ],
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/chatglm3-6b",
-        "model_revision": "v1.0.0"
+        "model_revision": "v1.0.2"
       }
     ],
     "prompt_style": {

@@ -1847,6 +1937,17 @@
         "model_id": "qwen/Qwen1.5-72B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",

@@ -1916,6 +2017,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",

@@ -1979,6 +2089,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",

@@ -3205,5 +3324,93 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
   }
 ]
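The entries above register new ModelScope families (llama-3, llama-3-instruct, Qwen1.5-110B-Chat in pytorch/GPTQ/AWQ formats, and the two Phi-3 mini models) in the built-in catalog. As a rough illustration of how such an entry is consumed, the sketch below launches the new llama-3-instruct family through the Python RESTful client; the endpoint address and the chosen spec are assumptions, and 0.11.0 may additionally require an engine selection argument that this diff does not show.

from xinference.client import RESTfulClient

# Assumed local supervisor endpoint; adjust to the actual deployment.
client = RESTfulClient("http://127.0.0.1:9997")

# Launch the llama-3-instruct family added above; the fields must match one of
# the model_specs entries (pytorch format, 8B size, 4-bit quantization here).
model_uid = client.launch_model(
    model_name="llama-3-instruct",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="4-bit",
)

model = client.get_model(model_uid)
print(model.chat("Say hello.")["choices"][0]["message"]["content"])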
xinference/model/llm/pytorch/deepseek_vl.py

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16
 
         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer
 
-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()

@@ -149,10 +151,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {

@@ -184,6 +187,7 @@
 
         deepseek_history.extend(prompt_messages)
 
+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
 
         # load images and prepare for inputs

@@ -192,41 +196,93 @@
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)
 
-        # run image encoder to get the image embeddings
-        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-        # run the model to get the response
-        outputs = self._model.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=prepare_inputs.attention_mask,
-            pad_token_id=self._tokenizer.eos_token_id,
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]
 
-        answer = self._tokenizer.decode(
-            outputs[0].cpu().tolist(), skip_special_tokens=True
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )
 
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
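With this change, DeepSeek-VL generation is routed through the third-party streamer and exposed both as a one-shot Completion and as an iterator of CompletionChunk objects, which the base class wraps into chat responses. A minimal consumption sketch from the client side follows; the endpoint and model uid are placeholders, not values taken from this diff.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
model = client.get_model("deepseek-vl-chat")     # placeholder model uid

# Before this patch, stream=True raised "does not support stream";
# now the server emits OpenAI-style chat.completion.chunk deltas.
for chunk in model.chat(
    prompt="Describe the attached image.",
    generate_config={"stream": True},
):
    delta = chunk["choices"][0].get("delta", {})
    print(delta.get("content", ""), end="", flush=True)
print()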
xinference/model/llm/pytorch/qwen_vl.py

@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

@@ -134,22 +132,79 @@
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(prompt, qwen_history)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(prompt, qwen_history)
+            return self._to_chat_completion(c)
+
+    def _generate(self, prompt: str, qwen_history: List) -> Completion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": response},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=response, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
            ),
         )
+        return c
+
+    def _generate_stream(
+        self, prompt: str, qwen_history: List
+    ) -> Iterator[CompletionChunk]:
+        # response, history = model.chat(tokenizer, message, history=history)
+        response_generator = self._model.chat_stream(
+            self._tokenizer, query=prompt, history=qwen_history
+        )
+        full_response = ""
+        for response in response_generator:
+            inc_content = response[len(full_response) :]
+            full_response = response
+            completion_choice = CompletionChoice(
+                text=inc_content, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            completion_chunk["usage"] = completion_usage
+            yield completion_chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        completion_chunk["usage"] = completion_usage
+        yield completion_chunk
xinference/model/llm/pytorch/yi_vl.py

@@ -27,9 +27,11 @@ from PIL import Image
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content
 
-    @staticmethod
-    def _parse_text(text):
-        lines = text.split("\n")
-        lines = [line for line in lines if line != ""]
-        count = 0
-        for i, line in enumerate(lines):
-            if "```" in line:
-                count += 1
-                items = line.split("`")
-                if count % 2 == 1:
-                    lines[i] = f'<pre><code class="language-{items[-1]}">'
-                else:
-                    lines[i] = f"<br></code></pre>"
-            else:
-                if i > 0:
-                    if count % 2 == 1:
-                        line = line.replace("`", r"\`")
-                        line = line.replace("<", "&lt;")
-                        line = line.replace(">", "&gt;")
-                        line = line.replace(" ", "&nbsp;")
-                        line = line.replace("*", "&ast;")
-                        line = line.replace("_", "&lowbar;")
-                        line = line.replace("-", "&#45;")
-                        line = line.replace(".", "&#46;")
-                        line = line.replace("!", "&#33;")
-                        line = line.replace("(", "&#40;")
-                        line = line.replace(")", "&#41;")
-                        line = line.replace("$", "&#36;")
-                    lines[i] = "<br>" + line
-        text = "".join(lines)
-        return text
-
     def chat(
         self,
         prompt: Union[str, List[Dict]],

@@ -164,12 +134,12 @@
         from transformers import TextIteratorStreamer
 
         # TODO(codingl2k1): implement stream mode.
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+
         if not generate_config:
             generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
             KeywordsStoppingCriteria,

@@ -229,25 +199,72 @@
         t = Thread(target=self._model.generate, kwargs=generate_kwargs)
         t.start()
 
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             generated_text += new_text
             if generated_text.endswith(stop_str):
                 generated_text = generated_text[: -len(stop_str)]
-        r = self._parse_text(generated_text)
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": r},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
            ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if not new_text.endswith(stop_str):
+                completion_choice = CompletionChoice(
+                    text=new_text, index=0, logprobs=None, finish_reason=None
+                )
+                chunk = CompletionChunk(
+                    id=completion_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=-1,
+                    completion_tokens=-1,
+                    total_tokens=-1,
+                )
+                chunk["usage"] = completion_usage
+                yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
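All three vision-language models touched here (deepseek_vl, qwen_vl, yi_vl) now follow the same pattern: build plain text Completion / CompletionChunk objects and let shared helpers such as _to_chat_completion and _to_chat_completion_chunks (in xinference/model/llm/utils.py, also changed in this release) rewrap them as chat results. The snippet below is only a rough sketch of what that non-streaming conversion amounts to, not the actual helper.

def to_chat_completion_sketch(completion: dict) -> dict:
    # Illustrative only: promote a text completion to an OpenAI-style chat
    # completion, which is roughly the job _to_chat_completion performs.
    choice = completion["choices"][0]
    return {
        "id": "chat" + completion["id"],
        "object": "chat.completion",
        "created": completion["created"],
        "model": completion["model"],
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": choice["text"]},
                "finish_reason": choice["finish_reason"],
            }
        ],
        "usage": completion["usage"],
    }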