xinference 1.6.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/METADATA +16 -14
  55. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
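
For context on the transformers/multimodal reorganization in the list above: the per-model classes removed below select an implementation via a `match_json` classmethod that inspects the model family. The following is a minimal, hypothetical sketch of that dispatch pattern; `DummyVLModel` and the `dummy-vl-chat` family name are placeholders and do not appear in this release.

    from types import SimpleNamespace
    from typing import Any


    class DummyVLModel:
        @classmethod
        def match_json(
            cls, model_family: Any, model_spec: Any, quantization: str
        ) -> bool:
            # A class claims a request when its model family (or name) matches.
            family = model_family.model_family or model_family.model_name
            return "dummy-vl-chat" in family.lower()


    # Illustrative use with a stand-in family object:
    family = SimpleNamespace(model_family=None, model_name="dummy-vl-chat-7b")
    print(DummyVLModel.match_json(family, model_spec=None, quantization="none"))  # True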
xinference/model/llm/transformers/cogvlm2_video.py
@@ -1,333 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import logging
- import uuid
- from concurrent.futures import ThreadPoolExecutor
- from typing import Dict, Iterator, List, Optional, Tuple, Union
-
- import torch
-
- from ....model.utils import select_device
- from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from ..utils import (
-     _decode_image,
-     generate_chat_completion,
-     generate_completion_chunk,
-     parse_messages,
- )
- from .core import PytorchChatModel, PytorchGenerateConfig
- from .utils import cache_clean
-
- logger = logging.getLogger(__name__)
-
-
- LANGUAGE_TOKEN_TYPE = 0
- VISION_TOKEN_TYPE = 1
-
-
- def recur_move_to(item, tgt, criterion_func):
-     """
-     This function is copied from https://github.com/THUDM/CogVLM2/blob/main/basic_demo/cli_demo_batch_inference.py
-     """
-     if criterion_func(item):
-         device_copy = item.to(tgt)
-         return device_copy
-     elif isinstance(item, list):
-         return [recur_move_to(v, tgt, criterion_func) for v in item]
-     elif isinstance(item, tuple):
-         return tuple([recur_move_to(v, tgt, criterion_func) for v in item])
-     elif isinstance(item, dict):
-         return {k: recur_move_to(v, tgt, criterion_func) for k, v in item.items()}
-     else:
-         return item
-
-
- class CogVLM2VideoModel(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._torch_type = None
-         self._device = None
-         self._tokenizer = None
-         self._model = None
-
-     @classmethod
-     def match_json(
-         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-     ) -> bool:
-         family = model_family.model_family or model_family.model_name
-         if "cogvlm2" in family.lower() and "video" in family.lower():
-             return True
-         return False
-
-     def load(self):
-         from transformers import AutoModelForCausalLM, AutoTokenizer
-         from transformers.generation import GenerationConfig
-
-         device = self._pytorch_model_config.get("device", "auto")
-         self._device = select_device(device)
-         self._torch_type = (
-             torch.bfloat16
-             if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
-             else torch.float16
-         )
-
-         if self._check_tensorizer_integrity():
-             self._model, self._tokenizer = self._load_tensorizer()
-             return
-
-         kwargs = self.apply_bnb_quantization()
-
-         self._tokenizer = AutoTokenizer.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-         )
-
-         self._model = AutoModelForCausalLM.from_pretrained(
-             self.model_path,
-             torch_dtype=self._torch_type,
-             trust_remote_code=True,
-             low_cpu_mem_usage=True,
-             device_map="auto",
-             **kwargs
-         ).eval()
-
-         # Specify hyperparameters for generation
-         self._model.generation_config = GenerationConfig.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-         )
-         self._save_tensorizer()
-
-     def _load_video(self, video_path):
-         import numpy as np
-         from decord import VideoReader, bridge, cpu
-
-         bridge.set_bridge("torch")
-         num_frames = 24
-
-         decord_vr = VideoReader(video_path, ctx=cpu(0))
-         frame_id_list = None
-         total_frames = len(decord_vr)
-         timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
-         timestamps = [i[0] for i in timestamps]
-         max_second = round(max(timestamps)) + 1
-         frame_id_list = []
-         for second in range(max_second):
-             closest_num = min(timestamps, key=lambda x: abs(x - second))
-             index = timestamps.index(closest_num)
-             frame_id_list.append(index)
-             if len(frame_id_list) >= num_frames:
-                 break
-         video_data = decord_vr.get_batch(frame_id_list)
-         video_data = video_data.permute(3, 0, 1, 2)
-         return video_data
-
-     def _message_content_to_cogvlm2(self, content):
-         if not isinstance(content, str):
-             texts = []
-             image_urls = []
-             video_urls = []
-             for c in content:
-                 c_type = c.get("type")
-                 if c_type == "text":
-                     texts.append(c["text"])
-                 elif c_type == "image_url":
-                     image_urls.append(c["image_url"]["url"])
-                 elif c_type == "video_url":
-                     video_urls.append(c["video_url"]["url"])
-             if len(video_urls) > 1:
-                 raise RuntimeError("Only one video per message is supported")
-             image_futures = []
-             video = None
-             with ThreadPoolExecutor() as executor:
-                 for image_url in image_urls:
-                     fut = executor.submit(_decode_image, image_url)
-                     image_futures.append(fut)
-             images = [fut.result() for fut in image_futures]
-             for v in video_urls:
-                 video = self._load_video(v)
-             text = " ".join(texts)
-             return text, images, video
-         return content, [], None
-
-     def _history_content_to_cogvlm2(self, system_prompt: str, chat_history: List[Dict]):
-         query = system_prompt
-         history: List[Tuple] = []
-         pixel_values = None
-         video_urls: List[str] = []
-         for i in range(0, len(chat_history), 2):
-             user = chat_history[i]["content"]
-             if isinstance(user, List):
-                 for content in user:
-                     c_type = content.get("type")
-                     if c_type == "text":
-                         user = content["text"]
-                     elif c_type == "image_url" and not pixel_values:
-                         pixel_values = _decode_image(content["image_url"]["url"])
-                     elif c_type == "video_url":
-                         video_urls.append(content["video_url"]["url"])
-             assistant = chat_history[i + 1]["content"]
-             history.append((user, assistant))
-             query = assistant  # type: ignore
-         if len(video_urls) > 1:
-             raise RuntimeError("Only one video per message is supported")
-         video = None
-         for v in video_urls:
-             video = self._load_video(v)
-         return query, history, [pixel_values], video
-
-     def get_query_and_history(
-         self,
-         prompt: Union[str, List[Dict]],
-         system_prompt: Optional[str] = None,
-         chat_history: Optional[List[Dict]] = None,
-     ):
-         content, image, video = self._message_content_to_cogvlm2(prompt)
-
-         history = []
-         history_image = None
-         history_video = None
-         if chat_history:
-             (
-                 query,
-                 history,
-                 history_image,
-                 history_video,
-             ) = self._history_content_to_cogvlm2(
-                 system_prompt, chat_history  # type: ignore
-             )
-
-         if image and history_image:
-             history = []
-             query = content
-         else:
-             image = image if image else history_image
-             query = content
-
-         if video is not None and history_video is not None:
-             history = []
-             query = content
-         else:
-             video = video if video is not None else history_video
-             query = content
-
-         return query, image, video, history
-
-     @cache_clean
-     def chat(
-         self,
-         messages: List[Dict],
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         system_prompt = ""
-         if messages[0]["role"] == "system":
-             system_prompt = messages[0]["content"]
-         stream = generate_config.get("stream", False) if generate_config else False
-
-         sanitized_config = {
-             "pad_token_id": 128002,
-             "max_new_tokens": generate_config.get("max_tokens", 512)
-             if generate_config
-             else 512,
-         }
-
-         prompt, _, chat_history = parse_messages(messages)
-         query, image, video, history = self.get_query_and_history(
-             prompt, system_prompt=system_prompt, chat_history=chat_history
-         )
-
-         if video is not None:
-             image = [video]
-
-         input_by_model = self._model.build_conversation_input_ids(
-             self._tokenizer,
-             query=query,
-             history=history,
-             images=image,
-             template_version="chat",
-         )
-
-         inputs = {
-             "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self._device),
-             "token_type_ids": input_by_model["token_type_ids"]
-             .unsqueeze(0)
-             .to(self._device),
-             "attention_mask": input_by_model["attention_mask"]
-             .unsqueeze(0)
-             .to(self._device),
-             "images": [
-                 [input_by_model["images"][0].to(self._device).to(self._torch_type)]
-             ]
-             if image is not None
-             else None,
-         }
-
-         if stream:
-             it = self._streaming_chat_response(inputs, sanitized_config)
-             return self._to_chat_completion_chunks(it)
-         else:
-             with torch.no_grad():
-                 outputs = self._model.generate(**inputs, **sanitized_config)
-                 outputs = outputs[:, inputs["input_ids"].shape[1] :]
-                 response = self._tokenizer.decode(outputs[0])
-                 response = response.split("<|end_of_text|>")[0]
-
-             return generate_chat_completion(self.model_uid, response)
-
-     def _streaming_chat_response(
-         self, inputs: Dict, config: Dict
-     ) -> Iterator[CompletionChunk]:
-         from threading import Thread
-
-         from transformers import TextIteratorStreamer
-
-         streamer = TextIteratorStreamer(
-             self._tokenizer, skip_prompt=True, skip_special_tokens=True
-         )
-         generation_kwargs = {
-             "input_ids": inputs["input_ids"],
-             "attention_mask": inputs["attention_mask"],
-             "token_type_ids": inputs["token_type_ids"],
-             "images": inputs["images"],
-             "max_new_tokens": config["max_new_tokens"],
-             "pad_token_id": config["pad_token_id"],
-             "streamer": streamer,
-         }
-
-         thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
-         thread.start()
-
-         completion_id = str(uuid.uuid1())
-         for new_text in streamer:
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=-1,
-                 completion_tokens=-1,
-                 total_tokens=-1,
-             )
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=-1,
-             completion_tokens=-1,
-             total_tokens=-1,
-             has_choice=True,
-             has_content=False,
-         )
xinference/model/llm/transformers/deepseek_vl.py
@@ -1,280 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import base64
- import logging
- import os.path
- import tempfile
- import uuid
- from concurrent.futures import ThreadPoolExecutor
- from io import BytesIO
- from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
-
- import requests
- import torch
-
- from ....model.utils import select_device
- from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from ..utils import generate_chat_completion, generate_completion_chunk
- from .core import PytorchChatModel, PytorchGenerateConfig
- from .utils import cache_clean
-
- logger = logging.getLogger(__name__)
-
-
- class DeepSeekVLChatModel(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._tokenizer = None
-         self._model = None
-         self._vl_chat_processor = None
-         self._type = None
-
-     @classmethod
-     def match_json(
-         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-     ) -> bool:
-         llm_family = model_family.model_family or model_family.model_name
-         if "deepseek-vl-chat" == llm_family.lower():
-             return True
-         return False
-
-     def load(self):
-         from transformers import AutoModelForCausalLM
-
-         from ....thirdparty.deepseek_vl.models import (
-             MultiModalityCausalLM,
-             VLChatProcessor,
-         )
-
-         self._device = self._pytorch_model_config.get("device", "auto")
-         self._device = select_device(self._device)
-         self._type = torch.float16 if self._device == "mps" else torch.bfloat16
-
-         kwargs = self.apply_bnb_quantization()
-
-         # specify the path to the model
-         self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
-             self.model_path
-         )
-         self._tokenizer = self._vl_chat_processor.tokenizer
-
-         vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
-             self.model_path,
-             trust_remote_code=True,
-             device_map=self._device,
-             torch_dtype=self._type,
-             **kwargs,
-         )
-         self._model = vl_gpt.eval()
-
-     @staticmethod
-     def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
-         def _ensure_url(_url):
-             if _url.startswith("data:"):
-                 logging.info("Parse url by base64 decoder.")
-                 # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                 # e.g. f"data:image/jpeg;base64,{base64_image}"
-                 _type, data = _url.split(";")
-                 _, ext = _type.split("/")
-                 data = data[len("base64,") :]
-                 data = base64.b64decode(data.encode("utf-8"))
-
-                 with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
-                     f.write(data)
-                     logging.info("Dump base64 data to %s", f.name)
-                     return f.name
-             else:
-                 if len(_url) > 2048:
-                     raise Exception(f"Image url is too long, {len(_url)} > 2048.")
-
-                 return _url
-
-         def _download(_images):
-             local_images = []
-
-             # To make requests.get works
-             headers = {
-                 "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
-             }
-             with ThreadPoolExecutor() as executor:
-                 for url in images:
-                     try:
-                         if os.path.exists(url):
-                             local_images.append(url)
-                             continue
-                     except Exception as e:
-                         logger.debug("Image is remote: %s, e: %s", url, e)
-                         pass
-                     # Append a placeholder
-                     local_images.append(None)
-
-                     def _fill_placeholder(_url, _index):
-                         response = requests.get(url, headers=headers)
-                         local_images[_index] = BytesIO(response.content)
-
-                     executor.submit(_fill_placeholder, url, len(local_images) - 1)
-             return local_images
-
-         if not isinstance(content, str):
-             # TODO(codingl2k1): Optimize _ensure_url
-
-             images = []
-             new_content = []
-             for c in content:
-                 c_type = c.get("type")
-                 if c_type == "image_url":
-                     images.append(_ensure_url(c["image_url"]["url"]))
-                 elif c_type == "text":
-                     new_content.append(c["text"])
-             if images:
-                 new_content.insert(0, "<image_placeholder>")
-                 images = _download(images)
-             return "".join(new_content), images
-         return content, []
-
-     @cache_clean
-     def chat(
-         self,
-         messages: List[Dict],
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         if not generate_config:
-             generate_config = {}
-
-         stream = generate_config.get("stream", False)
-         stream_options = generate_config.pop("stream_options", None)
-         include_usage = (
-             stream_options["include_usage"]
-             if isinstance(stream_options, dict)
-             else False
-         )
-
-         prompt = ""
-         deepseek_messages = []
-         for i, message in enumerate(messages):
-             role = message["role"]
-             content = message["content"]
-             if role == "user":
-                 if isinstance(content, str):
-                     deepseek_messages.append({"role": "User", "content": content})
-                 else:
-                     content, images = self._message_content_to_deepseek(content)
-                     msg: Dict[str, Any] = {
-                         "role": "User",
-                         "content": content,
-                     }
-                     if images:
-                         msg["images"] = images
-                     deepseek_messages.append(msg)
-                 if i == len(messages) - 1:
-                     prompt = content
-             elif role == "assistant":
-                 deepseek_messages.append({"role": "Assistant", "content": content})
-             else:
-                 logger.error(
-                     f"Unexpected message in messages: role: {role}, message: {message}"
-                 )
-
-         from ....thirdparty.deepseek_vl.serve.inference import generate
-         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
-
-         # load images and prepare for inputs
-         pil_images = load_pil_images(deepseek_messages)
-         prepare_inputs = self._vl_chat_processor(
-             conversations=deepseek_messages, images=pil_images, force_batchify=True
-         ).to(self._model.device, self._model.dtype)
-
-         temperature = generate_config.get("temperature", 0.2)
-         top_p = generate_config.get("top_p", 0.95)
-         max_new_tokens = generate_config.get("max_tokens", 512)
-         repetition_penalty = generate_config.get("repetition_penalty", 1.1)
-
-         conversation = self._vl_chat_processor.new_chat_template()
-         stop_str = conversation.sep2
-         stop_words = [stop_str]
-
-         streamer = generate(
-             vl_gpt=self._model,
-             tokenizer=self._tokenizer,
-             prepare_inputs=prepare_inputs,
-             max_gen_len=max_new_tokens,
-             temperature=temperature,
-             repetition_penalty=repetition_penalty,
-             top_p=top_p,
-             stop_words=stop_words,
-         )
-
-         if stream:
-             it = self._generate_stream(streamer, stop_str, include_usage, prompt)
-             return self._to_chat_completion_chunks(it)
-         else:
-             return self._generate(streamer, stop_str)
-
-     def _generate(self, streamer, stop_str) -> ChatCompletion:
-         generated_text = ""
-         for new_text in streamer:
-             if new_text.endswith(stop_str):
-                 new_text = new_text[: -len(stop_str)]
-             generated_text += new_text
-
-         return generate_chat_completion(self.model_uid, generated_text)
-
-     def _generate_stream(
-         self, streamer, stop_str, include_usage, prompt
-     ) -> Iterator[CompletionChunk]:
-         completion_id = str(uuid.uuid1())
-         prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
-         input_ids = self._tokenizer(prompt).input_ids
-         prompt_tokens = len(input_ids)
-         for i, new_text in enumerate(streamer):
-             if new_text.endswith(stop_str):
-                 new_text = new_text[: -len(stop_str)]
-             completion_tokens = i
-             total_tokens = prompt_tokens + completion_tokens
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=prompt_tokens,
-                 completion_tokens=completion_tokens,
-                 total_tokens=total_tokens,
-                 has_choice=True,
-                 has_content=True,
-             )
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=prompt_tokens,
-             completion_tokens=completion_tokens,
-             total_tokens=total_tokens,
-             has_choice=True,
-             has_content=False,
-         )
-
-         if include_usage:
-             yield generate_completion_chunk(
-                 chunk_text=None,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=prompt_tokens,
-                 completion_tokens=completion_tokens,
-                 total_tokens=total_tokens,
-                 has_choice=False,
-                 has_content=False,
-             )