xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as possibly problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +5 -39
- xinference/client/restful/restful_client.py +3 -24
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/model.py +82 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +11 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/stable_diffusion/core.py +18 -1
- xinference/model/llm/__init__.py +21 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +619 -1297
- xinference/model/llm/llm_family.py +31 -52
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +573 -1119
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +33 -18
- xinference/model/llm/transformers/chatglm.py +167 -305
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +49 -50
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_vl.py +208 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +195 -489
- xinference/model/llm/vllm/core.py +153 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +7 -49
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
- xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
Diff excerpt for xinference/model/llm/transformers/deepseek_vl.py (removed lines whose text is not preserved in this listing appear as `…`):

```diff
@@ -15,7 +15,6 @@ import base64
 import logging
 import os.path
 import tempfile
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
@@ -25,16 +24,9 @@ import requests
 import torch
 
 from ....model.utils import select_device
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -147,9 +139,7 @@ class DeepSeekVLChatModel(PytorchChatModel):
 
     def chat(
         self,
-        …
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if not generate_config:
@@ -162,44 +152,40 @@ class DeepSeekVLChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
-        …
-        …
-        …
-        …
-        …
-        …
-            {"role": "Assistant", "content": ""},
-        ]
-        if images:
-            prompt_messages[0]["images"] = images
-        …
-        # Convert openai history to qwen vl history
-        deepseek_history = []
-        for h in chat_history or []:
-            role = h["role"]
+
+        prompt = ""
+        deepseek_messages = []
+        for i, message in enumerate(messages):
+            role = message["role"]
+            content = message["content"]
             if role == "user":
-                content,
-                …
-                …
-                …
-                …
-                …
-                …
-                …
+                if isinstance(content, str):
+                    deepseek_messages.append({"role": "User", "content": content})
+                else:
+                    content, images = self._message_content_to_deepseek(content)
+                    msg: Dict[str, Any] = {
+                        "role": "User",
+                        "content": content,
+                    }
+                    if images:
+                        msg["images"] = images
+                    deepseek_messages.append(msg)
+                    if i == len(messages) - 1:
+                        prompt = content
             elif role == "assistant":
-                …
+                deepseek_messages.append({"role": "Assistant", "content": content})
             else:
-                logger.error(
-                    …
-                …
+                logger.error(
+                    f"Unexpected message in messages: role: {role}, message: {message}"
+                )
 
         from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
 
         # load images and prepare for inputs
-        pil_images = load_pil_images(…
+        pil_images = load_pil_images(deepseek_messages)
         prepare_inputs = self._vl_chat_processor(
-            conversations=…
+            conversations=deepseek_messages, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)
 
         temperature = generate_config.get("temperature", 0.2)
@@ -226,31 +212,16 @@ class DeepSeekVLChatModel(PytorchChatModel):
             it = self._generate_stream(streamer, stop_str, include_usage, prompt)
             return self._to_chat_completion_chunks(it)
         else:
-            …
-            return self._to_chat_completion(c)
+            return self._generate(streamer, stop_str)
 
-    def _generate(self, streamer, stop_str) -> …
+    def _generate(self, streamer, stop_str) -> ChatCompletion:
         generated_text = ""
         for new_text in streamer:
             if new_text.endswith(stop_str):
                 new_text = new_text[: -len(stop_str)]
            generated_text += new_text
 
-        …
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=generated_text, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
-        return c
+        return generate_chat_completion(self.model_uid, generated_text)
 
     def _generate_stream(
         self, streamer, stop_str, include_usage, prompt
@@ -262,54 +233,40 @@ class DeepSeekVLChatModel(PytorchChatModel):
         for i, new_text in enumerate(streamer):
             if new_text.endswith(stop_str):
                 new_text = new_text[: -len(stop_str)]
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = i
             total_tokens = prompt_tokens + completion_tokens
-            …
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=True,
+                has_content=True,
             )
-        …
-        …
-        …
-        …
-        …
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
         )
-        …
-        yield chunk
+
         if include_usage:
-            …
-            …
-            …
-            …
-            …
-            choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
             )
-            yield chunk
```
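The hunks above drop the old `(prompt, system_prompt, chat_history)` chat signature in favor of a single OpenAI-style `messages` list. Below is a minimal sketch of what that list looks like for a vision chat model and how it gets mapped onto DeepSeek-VL's "User"/"Assistant" roles; `to_deepseek_messages` is a hypothetical, simplified stand-in for the `_message_content_to_deepseek` plus role-mapping logic shown in the diff, not the package's own API:

```python
from typing import Any, Dict, List


def to_deepseek_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Hypothetical, simplified version of the conversion in the diff above.
    converted: List[Dict[str, Any]] = []
    for message in messages:
        role, content = message["role"], message["content"]
        if role == "user":
            if isinstance(content, str):
                converted.append({"role": "User", "content": content})
            else:
                # Multimodal content: gather text parts and image URLs separately.
                texts = [c["text"] for c in content if c.get("type") == "text"]
                images = [
                    c["image_url"]["url"] for c in content if c.get("type") == "image_url"
                ]
                msg: Dict[str, Any] = {"role": "User", "content": " ".join(texts)}
                if images:
                    msg["images"] = images
                converted.append(msg)
        elif role == "assistant":
            converted.append({"role": "Assistant", "content": content})
    return converted


messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    },
]
print(to_deepseek_messages(messages))
```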
Diff excerpt for xinference/model/llm/transformers/glm4v.py:

```diff
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import typing
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -22,18 +21,10 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch
 
 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 from .utils import get_max_src_len
 
@@ -102,66 +93,45 @@ class Glm4VModel(PytorchChatModel):
         self._tokenizer = tokenizer
         self._save_tensorizer()
 
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-            elif c_type == "image_url":
-                image_urls.append(c["image_url"]["url"])
-        image_futures = []
-        with ThreadPoolExecutor() as executor:
-            for image_url in image_urls:
-                fut = executor.submit(_decode_image, image_url)
-                image_futures.append(fut)
-        images = [fut.result() for fut in image_futures]
-        text = " ".join(texts)
-        if len(images) == 0:
-            return text, []
-        elif len(images) == 1:
-            return text, images
+    @staticmethod
+    def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
+        res = []
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if isinstance(content, str):
+                res.append({"role": role, "content": content})
             else:
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-                …
-        if len(images_chat) > 0:
-            image = images_chat[0]
-        elif len(images_history) > 0:
-            image = images_history[0]
-        msgs.append({"role": "user", "content": content, "image": image})
-        return msgs
+                texts = []
+                image_urls = []
+                for c in content:
+                    c_type = c.get("type")
+                    if c_type == "text":
+                        texts.append(c["text"])
+                    else:
+                        assert (
+                            c_type == "image_url"
+                        ), "Please follow the image input of the OpenAI API."
+                        image_urls.append(c["image_url"]["url"])
+                if len(image_urls) > 1:
+                    raise RuntimeError("Only one image per message is supported")
+                image_futures = []
+                with ThreadPoolExecutor() as executor:
+                    for image_url in image_urls:
+                        fut = executor.submit(_decode_image, image_url)
+                        image_futures.append(fut)
+                images = [fut.result() for fut in image_futures]
+                assert len(images) <= 1
+                text = " ".join(texts)
+                if images:
+                    res.append({"role": role, "content": text, "image": images[0]})
+                else:
+                    res.append({"role": role, "content": text})
+        return res
 
     def chat(
         self,
-        …
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from transformers import TextIteratorStreamer
@@ -170,7 +140,7 @@ class Glm4VModel(PytorchChatModel):
             generate_config = {}
 
         stream = generate_config.get("stream", False)
-        msgs = self.…
+        msgs = self._get_processed_msgs(messages)
 
         inputs = self._tokenizer.apply_chat_template(
             msgs,
@@ -213,64 +183,38 @@ class Glm4VModel(PytorchChatModel):
         response = self._tokenizer.decode(outputs[0])
         if response.endswith(stop_str):
             response = response[: -len(stop_str)]
-        …
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
-        return self._to_chat_completion(c)
+        return generate_chat_completion(self.model_uid, response)
 
     def chat_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in streamer:
             if not new_text.endswith(stop_str):
-                …
-                …
-                …
-                …
-                …
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
+                yield generate_completion_chunk(
+                    chunk_text=new_text,
+                    finish_reason=None,
+                    chunk_id=completion_id,
+                    model_uid=self.model_uid,
                     prompt_tokens=-1,
                     completion_tokens=-1,
                     total_tokens=-1,
+                    has_choice=True,
+                    has_content=True,
                 )
-                chunk["usage"] = completion_usage
-                yield chunk
 
-        …
-        …
-        …
-        …
-        …
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
         )
-        chunk["usage"] = completion_usage
-        yield chunk
 
-    def _get_full_prompt(self, …
-        msgs = self.…
+    def _get_full_prompt(self, messages, tools):
+        msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,
             add_generation_prompt=True,
```
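Both this file and deepseek_vl.py replace the hand-rolled CompletionChunk/CompletionChoice/CompletionUsage boilerplate with `generate_completion_chunk` from `..utils`. The actual helper lives in xinference/model/llm/utils.py and is not shown in this diff; the sketch below is only an inference from the call sites visible in these hunks (the keyword arguments and the `has_choice`/`has_content` switches), using plain dicts instead of the package's typed dicts:

```python
import time
import uuid
from typing import Dict, Optional


def generate_completion_chunk_sketch(
    chunk_text: Optional[str],
    finish_reason: Optional[str],
    chunk_id: str,
    model_uid: str,
    prompt_tokens: int,
    completion_tokens: int,
    total_tokens: int,
    has_choice: bool = True,
    has_content: bool = True,
) -> Dict:
    # Assumed semantics: has_choice=False yields an empty choices list (the trailing
    # usage-only chunk), has_content=False yields a choice carrying only finish_reason.
    choice: Dict = {"index": 0, "logprobs": None, "finish_reason": finish_reason}
    if has_content:
        choice["text"] = chunk_text
    return {
        "id": chunk_id,
        "object": "text_completion",
        "created": int(time.time()),
        "model": model_uid,
        "choices": [choice] if has_choice else [],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        },
    }


# Example: the "stop" chunk emitted after streaming finishes, as in chat_stream above.
print(
    generate_completion_chunk_sketch(
        chunk_text=None,
        finish_reason="stop",
        chunk_id=str(uuid.uuid1()),
        model_uid="glm-4v",
        prompt_tokens=-1,
        completion_tokens=-1,
        total_tokens=-1,
        has_choice=True,
        has_content=False,
    )
)
```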
Diff excerpt for xinference/model/llm/transformers/intern_vl.py:

```diff
@@ -12,24 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union
 
 import torch
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import …
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -78,7 +74,7 @@ def _message_content_to_intern(content, image_cnt):
 
 def _get_prompt_and_chat_history(
     prompt: Union[str, List[Dict]],
-    chat_history: Optional[List[…
+    chat_history: Optional[List[Dict]] = None,
 ):
     # Convert openai history to intern vl history
     images = []
@@ -332,9 +328,7 @@ class InternVLChatModel(PytorchChatModel):
 
     def chat(
         self,
-        …
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from ....thirdparty.internvl.conversation import get_conv_template
@@ -366,6 +360,7 @@ class InternVLChatModel(PytorchChatModel):
             else False
         )
 
+        prompt, _, chat_history = parse_messages(messages)
         content, history, images, videos = _get_prompt_and_chat_history(
             prompt, chat_history
         )
@@ -434,10 +429,9 @@ class InternVLChatModel(PytorchChatModel):
             chunk = self._generate_stream(generate_kwargs, input_ids, include_usage)
             return self._to_chat_completion_chunks(chunk)
         else:
-            …
-            return self._to_chat_completion(chunk)
+            return self._generate(generate_kwargs, input_ids, template)
 
-    def _generate(self, generate_kwargs, input_ids, template):
+    def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion:
         prompt_tokens = len(input_ids[0])
         generation_output = self._model.generate(**generate_kwargs)
         completion_tokens = len(generation_output[0])
@@ -445,23 +439,13 @@ class InternVLChatModel(PytorchChatModel):
             generation_output, skip_special_tokens=True
         )[0]
         response = response.split(template.sep)[0].strip()
-        …
-        …
-        …
-        …
-        …
-        …
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=prompt_tokens + completion_tokens,
-            ),
+        return generate_chat_completion(
+            self.model_uid,
+            response,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
         )
-        return chunk
 
     def _generate_stream(self, generate_kwargs, input_ids, include_usage):
         from threading import Thread
@@ -483,58 +467,43 @@ class InternVLChatModel(PytorchChatModel):
 
         completion_id = str(uuid.uuid1())
         prompt_tokens = len(input_ids[0])
-        completion_tokens = 0
+        total_tokens, completion_tokens = 0, 0
         # Loop through the streamer to get the new text as it is generated
         for i, new_text in enumerate(streamer):
             if new_text == self._model.conv_template.sep:
                 break
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = max(completion_tokens, len(streamer.token_cache))
             total_tokens = prompt_tokens + completion_tokens
-            …
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
             )
-        …
-        …
-        …
-        …
-        …
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
         )
-        …
-        yield chunk
+
         if include_usage:
-            …
-            …
-            …
-            …
-            …
-            choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
             )
-            yield chunk
```