xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.


Files changed (149)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +5 -39
  4. xinference/client/restful/restful_client.py +3 -24
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/model.py +82 -31
  11. xinference/core/scheduler.py +37 -37
  12. xinference/core/status_guard.py +1 -1
  13. xinference/core/supervisor.py +11 -10
  14. xinference/core/utils.py +80 -22
  15. xinference/core/worker.py +17 -16
  16. xinference/deploy/cmdline.py +8 -16
  17. xinference/deploy/local.py +1 -1
  18. xinference/deploy/supervisor.py +1 -1
  19. xinference/deploy/utils.py +1 -1
  20. xinference/deploy/worker.py +1 -1
  21. xinference/model/audio/cosyvoice.py +86 -41
  22. xinference/model/embedding/core.py +52 -31
  23. xinference/model/image/stable_diffusion/core.py +18 -1
  24. xinference/model/llm/__init__.py +21 -11
  25. xinference/model/llm/llama_cpp/core.py +16 -33
  26. xinference/model/llm/llm_family.json +619 -1297
  27. xinference/model/llm/llm_family.py +31 -52
  28. xinference/model/llm/llm_family_csghub.json +18 -35
  29. xinference/model/llm/llm_family_modelscope.json +573 -1119
  30. xinference/model/llm/lmdeploy/core.py +56 -88
  31. xinference/model/llm/mlx/core.py +46 -69
  32. xinference/model/llm/sglang/core.py +33 -18
  33. xinference/model/llm/transformers/chatglm.py +167 -305
  34. xinference/model/llm/transformers/cogvlm2.py +36 -63
  35. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  36. xinference/model/llm/transformers/core.py +49 -50
  37. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  38. xinference/model/llm/transformers/glm4v.py +55 -111
  39. xinference/model/llm/transformers/intern_vl.py +39 -70
  40. xinference/model/llm/transformers/internlm2.py +32 -54
  41. xinference/model/llm/transformers/minicpmv25.py +22 -55
  42. xinference/model/llm/transformers/minicpmv26.py +158 -68
  43. xinference/model/llm/transformers/omnilmm.py +5 -28
  44. xinference/model/llm/transformers/qwen2_vl.py +208 -0
  45. xinference/model/llm/transformers/qwen_vl.py +34 -86
  46. xinference/model/llm/transformers/utils.py +32 -38
  47. xinference/model/llm/transformers/yi_vl.py +32 -72
  48. xinference/model/llm/utils.py +195 -489
  49. xinference/model/llm/vllm/core.py +153 -100
  50. xinference/model/rerank/core.py +41 -8
  51. xinference/model/rerank/model_spec.json +7 -0
  52. xinference/model/rerank/model_spec_modelscope.json +7 -1
  53. xinference/model/utils.py +1 -31
  54. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  55. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  56. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  57. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  58. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  59. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  60. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  61. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  62. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  63. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  64. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  65. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  66. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  67. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  69. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  70. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  71. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  72. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  73. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  74. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  75. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  76. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  77. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  78. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  79. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  80. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  81. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
  82. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  83. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  84. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  85. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  88. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  89. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  90. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  91. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  92. xinference/thirdparty/matcha/VERSION +1 -0
  93. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  94. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  95. xinference/thirdparty/omnilmm/LICENSE +201 -0
  96. xinference/thirdparty/whisper/__init__.py +156 -0
  97. xinference/thirdparty/whisper/__main__.py +3 -0
  98. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  99. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  100. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  101. xinference/thirdparty/whisper/audio.py +157 -0
  102. xinference/thirdparty/whisper/decoding.py +826 -0
  103. xinference/thirdparty/whisper/model.py +314 -0
  104. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  105. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  106. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  107. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  108. xinference/thirdparty/whisper/timing.py +386 -0
  109. xinference/thirdparty/whisper/tokenizer.py +395 -0
  110. xinference/thirdparty/whisper/transcribe.py +605 -0
  111. xinference/thirdparty/whisper/triton_ops.py +109 -0
  112. xinference/thirdparty/whisper/utils.py +316 -0
  113. xinference/thirdparty/whisper/version.py +1 -0
  114. xinference/types.py +7 -49
  115. xinference/web/ui/build/asset-manifest.json +6 -6
  116. xinference/web/ui/build/index.html +1 -1
  117. xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
  118. xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
  119. xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
  120. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
  121. xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
  122. xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
  123. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  124. xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
  125. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  126. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  127. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  129. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  130. xinference/web/ui/node_modules/.package-lock.json +37 -0
  131. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  132. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  133. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  134. xinference/web/ui/package-lock.json +38 -0
  135. xinference/web/ui/package.json +1 -0
  136. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
  137. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
  138. xinference/model/llm/transformers/llama_2.py +0 -108
  139. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  140. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  141. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  142. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  144. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  145. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  146. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
  147. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
  148. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
  149. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/deepseek_vl.py

@@ -15,7 +15,6 @@ import base64
 import logging
 import os.path
 import tempfile
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
@@ -25,16 +24,9 @@ import requests
 import torch
 
 from ....model.utils import select_device
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -147,9 +139,7 @@ class DeepSeekVLChatModel(PytorchChatModel):
 
     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if not generate_config:
@@ -162,44 +152,40 @@ class DeepSeekVLChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
-        prompt, images = self._message_content_to_deepseek(prompt)
-        prompt_messages: List[Dict[str, Any]] = [
-            {
-                "role": "User",
-                "content": prompt,
-            },
-            {"role": "Assistant", "content": ""},
-        ]
-        if images:
-            prompt_messages[0]["images"] = images
-
-        # Convert openai history to qwen vl history
-        deepseek_history = []
-        for h in chat_history or []:
-            role = h["role"]
+
+        prompt = ""
+        deepseek_messages = []
+        for i, message in enumerate(messages):
+            role = message["role"]
+            content = message["content"]
             if role == "user":
-                content, images = self._message_content_to_deepseek(h["content"])
-                msg: Dict[str, Any] = {
-                    "role": "User",
-                    "content": content,
-                }
-                if images:
-                    msg["images"] = images
-                deepseek_history.append(msg)
+                if isinstance(content, str):
+                    deepseek_messages.append({"role": "User", "content": content})
+                else:
+                    content, images = self._message_content_to_deepseek(content)
+                    msg: Dict[str, Any] = {
+                        "role": "User",
+                        "content": content,
+                    }
+                    if images:
+                        msg["images"] = images
+                    deepseek_messages.append(msg)
+                if i == len(messages) - 1:
+                    prompt = content
             elif role == "assistant":
-                deepseek_history.append({"role": "Assistant", "content": h["content"]})
+                deepseek_messages.append({"role": "Assistant", "content": content})
             else:
-                logger.error("Unexpected msg in chat history: %s", h)
-
-        deepseek_history.extend(prompt_messages)
+                logger.error(
+                    f"Unexpected message in messages: role: {role}, message: {message}"
+                )
 
         from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
 
         # load images and prepare for inputs
-        pil_images = load_pil_images(deepseek_history)
+        pil_images = load_pil_images(deepseek_messages)
         prepare_inputs = self._vl_chat_processor(
-            conversations=deepseek_history, images=pil_images, force_batchify=True
+            conversations=deepseek_messages, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)
 
         temperature = generate_config.get("temperature", 0.2)
@@ -226,31 +212,16 @@ class DeepSeekVLChatModel(PytorchChatModel):
             it = self._generate_stream(streamer, stop_str, include_usage, prompt)
             return self._to_chat_completion_chunks(it)
         else:
-            c = self._generate(streamer, stop_str)
-            return self._to_chat_completion(c)
+            return self._generate(streamer, stop_str)
 
-    def _generate(self, streamer, stop_str) -> Completion:
+    def _generate(self, streamer, stop_str) -> ChatCompletion:
        generated_text = ""
        for new_text in streamer:
            if new_text.endswith(stop_str):
                new_text = new_text[: -len(stop_str)]
            generated_text += new_text
 
-        c = Completion(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=generated_text, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
-        return c
+        return generate_chat_completion(self.model_uid, generated_text)
 
     def _generate_stream(
         self, streamer, stop_str, include_usage, prompt
@@ -262,54 +233,40 @@ class DeepSeekVLChatModel(PytorchChatModel):
         for i, new_text in enumerate(streamer):
             if new_text.endswith(stop_str):
                 new_text = new_text[: -len(stop_str)]
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = i
             total_tokens = prompt_tokens + completion_tokens
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=True,
+                has_content=True,
             )
-            chunk["usage"] = completion_usage
-            yield chunk
-
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
+
        if include_usage:
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
             )
-            yield chunk
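
Across the three transformer chat models touched in this diff, chat() drops the old prompt / system_prompt / chat_history triple in favor of a single OpenAI-style messages list: string content is treated as plain text, and list content may carry image_url parts. A minimal caller-side sketch of the new shape follows; the model handle, image URL, and result access are illustrative assumptions, not taken from this diff.

    # Hypothetical caller of the new signature chat(messages: List[Dict], generate_config=None)
    messages = [
        {"role": "user", "content": "Describe the first picture."},
        {"role": "assistant", "content": "It shows a cat on a sofa."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "And this one?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/dog.png"}},
            ],
        },
    ]
    completion = model.chat(messages=messages, generate_config={"stream": False})
    # assumes an OpenAI-style chat completion dict is returned
    print(completion["choices"][0]["message"]["content"])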
xinference/model/llm/transformers/glm4v.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import typing
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -22,18 +21,10 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch
 
 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 from .utils import get_max_src_len
 
@@ -102,66 +93,45 @@ class Glm4VModel(PytorchChatModel):
         self._tokenizer = tokenizer
         self._save_tensorizer()
 
-    def _message_content_to_chat(self, content):
-        if not isinstance(content, str):
-            texts = []
-            image_urls = []
-            for c in content:
-                c_type = c.get("type")
-                if c_type == "text":
-                    texts.append(c["text"])
-                elif c_type == "image_url":
-                    image_urls.append(c["image_url"]["url"])
-            image_futures = []
-            with ThreadPoolExecutor() as executor:
-                for image_url in image_urls:
-                    fut = executor.submit(_decode_image, image_url)
-                    image_futures.append(fut)
-            images = [fut.result() for fut in image_futures]
-            text = " ".join(texts)
-            if len(images) == 0:
-                return text, []
-            elif len(images) == 1:
-                return text, images
+    @staticmethod
+    def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
+        res = []
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if isinstance(content, str):
+                res.append({"role": role, "content": content})
             else:
-                raise RuntimeError("Only one image per message is supported")
-        return content, []
-
-    def _get_chat_msgs(
-        self,
-        prompt: Union[str, List[Dict]],
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-    ):
-        content, images_chat = self._message_content_to_chat(prompt)
-
-        msgs = []
-        query_to_response: List[Dict] = []
-        images_history = []
-        for h in chat_history or []:
-            role = h["role"]
-            content_h, images_tmp = self._message_content_to_chat(h["content"])
-            if images_tmp:
-                images_history = images_tmp
-            if len(query_to_response) == 0 and role == "user":
-                query_to_response.append({"role": "user", "content": content_h})
-            if len(query_to_response) == 1 and role == "assistant":
-                query_to_response.append({"role": "assistant", "content": content_h})
-            if len(query_to_response) == 2:
-                msgs.extend(query_to_response)
-                query_to_response = []
-        image = None
-        if len(images_chat) > 0:
-            image = images_chat[0]
-        elif len(images_history) > 0:
-            image = images_history[0]
-        msgs.append({"role": "user", "content": content, "image": image})
-        return msgs
+                texts = []
+                image_urls = []
+                for c in content:
+                    c_type = c.get("type")
+                    if c_type == "text":
+                        texts.append(c["text"])
+                    else:
+                        assert (
+                            c_type == "image_url"
+                        ), "Please follow the image input of the OpenAI API."
+                        image_urls.append(c["image_url"]["url"])
+                if len(image_urls) > 1:
+                    raise RuntimeError("Only one image per message is supported")
+                image_futures = []
+                with ThreadPoolExecutor() as executor:
+                    for image_url in image_urls:
+                        fut = executor.submit(_decode_image, image_url)
+                        image_futures.append(fut)
+                images = [fut.result() for fut in image_futures]
+                assert len(images) <= 1
+                text = " ".join(texts)
+                if images:
+                    res.append({"role": role, "content": text, "image": images[0]})
+                else:
+                    res.append({"role": role, "content": text})
+        return res
 
     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from transformers import TextIteratorStreamer
@@ -170,7 +140,7 @@ class Glm4VModel(PytorchChatModel):
             generate_config = {}
 
         stream = generate_config.get("stream", False)
-        msgs = self._get_chat_msgs(prompt, chat_history)
+        msgs = self._get_processed_msgs(messages)
 
         inputs = self._tokenizer.apply_chat_template(
             msgs,
@@ -213,64 +183,38 @@ class Glm4VModel(PytorchChatModel):
             response = self._tokenizer.decode(outputs[0])
             if response.endswith(stop_str):
                 response = response[: -len(stop_str)]
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=response, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, response)
 
     def chat_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in streamer:
             if not new_text.endswith(stop_str):
-                completion_choice = CompletionChoice(
-                    text=new_text, index=0, logprobs=None, finish_reason=None
-                )
-                chunk = CompletionChunk(
-                    id=completion_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
+                yield generate_completion_chunk(
+                    chunk_text=new_text,
+                    finish_reason=None,
+                    chunk_id=completion_id,
+                    model_uid=self.model_uid,
                     prompt_tokens=-1,
                     completion_tokens=-1,
                     total_tokens=-1,
+                    has_choice=True,
+                    has_content=True,
                 )
-                chunk["usage"] = completion_usage
-                yield chunk
 
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
 
-    def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
-        msgs = self._get_chat_msgs(prompt, chat_history)
+    def _get_full_prompt(self, messages, tools):
+        msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,
             add_generation_prompt=True,
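
The new Glm4VModel._get_processed_msgs shown above flattens each OpenAI-style message into the {"role", "content"[, "image"]} dict that the GLM-4V chat template consumes, decoding at most one image_url per message via _decode_image. A rough before/after sketch (values are illustrative; the decoded image stands in for the PIL image returned by _decode_image):

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        },
        {"role": "assistant", "content": "A cat."},
        {"role": "user", "content": "What color is it?"},
    ]

    # Approximate result of Glm4VModel._get_processed_msgs(messages):
    # [
    #     {"role": "user", "content": "What is in this image?", "image": <PIL.Image.Image>},
    #     {"role": "assistant", "content": "A cat."},
    #     {"role": "user", "content": "What color is it?"},
    # ]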
xinference/model/llm/transformers/intern_vl.py

@@ -12,24 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union
 
 import torch
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -78,7 +74,7 @@ def _message_content_to_intern(content, image_cnt):
 
 def _get_prompt_and_chat_history(
     prompt: Union[str, List[Dict]],
-    chat_history: Optional[List[ChatCompletionMessage]] = None,
+    chat_history: Optional[List[Dict]] = None,
 ):
     # Convert openai history to intern vl history
     images = []
@@ -332,9 +328,7 @@ class InternVLChatModel(PytorchChatModel):
 
     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from ....thirdparty.internvl.conversation import get_conv_template
@@ -366,6 +360,7 @@ class InternVLChatModel(PytorchChatModel):
             else False
         )
 
+        prompt, _, chat_history = parse_messages(messages)
         content, history, images, videos = _get_prompt_and_chat_history(
             prompt, chat_history
         )
@@ -434,10 +429,9 @@ class InternVLChatModel(PytorchChatModel):
             chunk = self._generate_stream(generate_kwargs, input_ids, include_usage)
             return self._to_chat_completion_chunks(chunk)
         else:
-            chunk = self._generate(generate_kwargs, input_ids, template)
-            return self._to_chat_completion(chunk)
+            return self._generate(generate_kwargs, input_ids, template)
 
-    def _generate(self, generate_kwargs, input_ids, template):
+    def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion:
         prompt_tokens = len(input_ids[0])
         generation_output = self._model.generate(**generate_kwargs)
         completion_tokens = len(generation_output[0])
@@ -445,23 +439,13 @@ class InternVLChatModel(PytorchChatModel):
             generation_output, skip_special_tokens=True
         )[0]
         response = response.split(template.sep)[0].strip()
-        chunk = Completion(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=prompt_tokens + completion_tokens,
-            ),
+        return generate_chat_completion(
+            self.model_uid,
+            response,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
         )
-        return chunk
 
     def _generate_stream(self, generate_kwargs, input_ids, include_usage):
         from threading import Thread
@@ -483,58 +467,43 @@ class InternVLChatModel(PytorchChatModel):
 
         completion_id = str(uuid.uuid1())
         prompt_tokens = len(input_ids[0])
-        completion_tokens = 0
+        total_tokens, completion_tokens = 0, 0
         # Loop through the streamer to get the new text as it is generated
         for i, new_text in enumerate(streamer):
             if new_text == self._model.conv_template.sep:
                 break
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = max(completion_tokens, len(streamer.token_cache))
             total_tokens = prompt_tokens + completion_tokens
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
             )
-            chunk["usage"] = completion_usage
-            yield chunk
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
+
        if include_usage:
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
             )
-            yield chunk
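
intern_vl.py keeps its legacy prompt/history helpers and bridges to the new interface with parse_messages, whose implementation lives in xinference/model/llm/utils.py and is not shown in this diff. Read from its call site above (prompt, _, chat_history = parse_messages(messages)), it appears to split the messages list into the latest user content, the system prompt, and the preceding turns. A purely hypothetical stand-in to illustrate that reading (not the actual implementation):

    from typing import Dict, List, Optional, Tuple

    def split_messages_for_illustration(
        messages: List[Dict],
    ) -> Tuple[object, Optional[str], List[Dict]]:
        # Assumed behavior only: the last non-system message supplies the prompt,
        # a system message (if any) supplies the system prompt, the rest is history.
        system = next((m["content"] for m in messages if m["role"] == "system"), None)
        non_system = [m for m in messages if m["role"] != "system"]
        prompt = non_system[-1]["content"] if non_system else ""
        history = non_system[:-1]
        return prompt, system, history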