xinference 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +44 -9
- xinference/core/model.py +4 -4
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +1 -1
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/llm/__init__.py +20 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +448 -1153
- xinference/model/llm/llm_family.py +14 -139
- xinference/model/llm/llm_family_modelscope.json +230 -313
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/core.py +2 -10
- xinference/model/llm/transformers/intern_vl.py +457 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/utils.py +76 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
- xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/METADATA +12 -15
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/{pytorch → transformers}/minicpmv26.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import base64
-import json
 import logging
 import time
 import uuid

@@ -124,29 +123,60 @@ class MiniCPMV26Model(PytorchChatModel):
                 else:
                     return Image.open(BytesIO(response.content)).convert("RGB")
 
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            print("num frames:", len(frames))
+            return frames
+
+        def _load_video(_url):
+            frames = None
+            if _url.startswith("data:"):
+                raise RuntimeError("Only video url format is supported")
+            else:
+                frames = encode_video(_url)
+            return frames
+
         if not isinstance(content, str):
             texts = []
             image_urls = []
+            video_urls = []
             for c in content:
                 c_type = c.get("type")
                 if c_type == "text":
                     texts.append(c["text"])
                 elif c_type == "image_url":
                     image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
             image_futures = []
             with ThreadPoolExecutor() as executor:
                 for image_url in image_urls:
                     fut = executor.submit(_load_image, image_url)
                     image_futures.append(fut)
             images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
             text = " ".join(texts)
-            if len(images) == 0:
-                return text, []
-            elif len(images) == 1:
-                return text, images
-            else:
-                raise RuntimeError("Only one image per message is supported")
-        return content, []
+            return text, images, frames
+        return content, [], []
 
     def chat(
         self,

@@ -156,36 +186,51 @@ class MiniCPMV26Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         stream = generate_config.get("stream", False) if generate_config else False
-        content, images_chat = self._message_content_to_chat(prompt)
+        videoExisted = False
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            videoExisted = True
+            images_chat = video_frames
 
         msgs = []
         query_to_response: List[Dict] = []
-        images_history = []
         for h in chat_history or []:
+            images_history = []
             role = h["role"]
-            content_h, images_tmp = self._message_content_to_chat(
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
             if images_tmp != []:
                 images_history = images_tmp
+            if len(video_frames_h) > 0:
+                videoExisted = True
+                images_history = video_frames_h
             if len(query_to_response) == 0 and role == "user":
-                query_to_response.append(
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
             if len(query_to_response) == 1 and role == "assistant":
-                query_to_response.append(
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
             if len(query_to_response) == 2:
                 msgs.extend(query_to_response)
                 query_to_response = []
-
-
-
-
-
-
+        msgs.append({"role": "user", "content": images_chat + [content]})
+
+        # Set decode params for video
+        params = {}
+        if videoExisted:
+            params = {"use_image_id": False, "max_slice_nums": 1}
 
         chat = self._model.chat(
-            image=
-            msgs=
+            image=None,
+            msgs=msgs,
             tokenizer=self._tokenizer,
             sampling=True,
-            **generate_config
+            **generate_config,
+            **params,
         )
         if stream:
             it = self.chat_stream(chat)
xinference/model/llm/utils.py
CHANGED
@@ -11,14 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import base64
 import functools
 import json
 import logging
 import os
 import time
 import uuid
+from io import BytesIO
 from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast
 
+import requests
+from PIL import Image
+
 from ...types import (
     SPECIAL_TOOL_PROMPT,
     ChatCompletion,

@@ -28,7 +33,7 @@ from ...types import (
     CompletionChunk,
 )
 from .llm_family import (
-
+    LlamaCppLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     PromptStyleV1,

@@ -60,7 +65,7 @@ class ChatModelMixin:
         chat_history: List[ChatCompletionMessage],
         prompt_style: PromptStyleV1,
         tools: Optional[List[Dict]] = None,
-    )
+    ):
        """
        Inspired by FastChat. Format chat history into a prompt according to the prompty style of
        different models.

@@ -92,17 +97,6 @@ class ChatModelMixin:
             else:
                 ret += role + ":"
             return ret
-        elif prompt_style.style_name == "ADD_COLON_TWO":
-            seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
-            ret = prompt_style.system_prompt + seps[0]
-            for i, message in enumerate(chat_history):
-                role = get_role(message["role"])
-                content = message["content"]
-                if content:
-                    ret += role + ": " + content + seps[i % 2]
-                else:
-                    ret += role + ":"
-            return ret
         elif prompt_style.style_name == "NO_COLON_TWO":
             seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
             ret = prompt_style.system_prompt

@@ -144,21 +138,6 @@ class ChatModelMixin:
             else:
                 ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}"
             return ret
-        elif prompt_style.style_name == "FALCON":
-            ret = prompt_style.system_prompt
-            for message in chat_history:
-                role = get_role(message["role"])
-                content = message["content"]
-                if content:
-                    ret += (
-                        role
-                        + ": "
-                        + content.replace("\r\n", "\n").replace("\n\n", "\n")
-                    )
-                    ret += "\n\n"
-                else:
-                    ret += role + ":"
-            return ret
         elif prompt_style.style_name == "MIXTRAL_V01":
             ret = ""
             for i, message in enumerate(chat_history):

@@ -168,22 +147,6 @@ class ChatModelMixin:
             else:  # assistant
                 ret += f"{content} </s>"
             return ret
-        elif prompt_style.style_name == "CHATGLM":
-            round_add_n = 1 if prompt_style.intra_message_sep == "\n\n" else 0
-            if prompt_style.system_prompt:
-                ret = prompt_style.system_prompt + prompt_style.intra_message_sep
-            else:
-                ret = ""
-            for i, message in enumerate(chat_history):
-                role = get_role(message["role"])
-                content = message["content"]
-                if i % 2 == 0:
-                    ret += f"[Round {i // 2 + round_add_n}]{prompt_style.intra_message_sep}"
-                if content:
-                    ret += role + ":" + content + prompt_style.intra_message_sep
-                else:
-                    ret += role + ":"
-            return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
                 [f"<|system|>\n {prompt_style.system_prompt}"]

@@ -323,25 +286,6 @@ Begin!"""
             else:
                 ret += role + "\n"
             return ret
-        elif prompt_style.style_name == "INTERNLM":
-            seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
-            ret = ""
-            for i, message in enumerate(chat_history[:-2]):
-                if i % 2 == 0:
-                    ret += "<s>"
-                role = get_role(message["role"])
-                content = message["content"]
-                ret += role + ":" + str(content) + seps[i % 2]
-            if len(ret) == 0:
-                ret += "<s>"
-            ret += (
-                chat_history[-2]["role"]
-                + ":"
-                + str(chat_history[-2]["content"])
-                + seps[0]
-            )
-            ret += chat_history[-1]["role"] + ":"
-            return ret
         elif prompt_style.style_name == "INTERNLM2":
             ret = (
                 "<s>"

@@ -370,9 +314,6 @@ Begin!"""
             else:
                 ret += role + ": Let's think step by step."
             return ret
-        elif prompt_style.style_name == "INSTRUCTION":
-            message = chat_history[-2]
-            return prompt_style.system_prompt.format(message["content"])
         elif prompt_style.style_name == "DEEPSEEK_CHAT":
             seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
             ret = prompt_style.system_prompt

@@ -504,6 +445,52 @@ Begin!"""
             else:
                 ret += role
             return ret
+        elif prompt_style.style_name == "INTERNVL":
+            ret = (
+                "<s>"
+                if prompt_style.system_prompt == ""
+                else "<s><|im_start|>system\n"
+                + prompt_style.system_prompt
+                + prompt_style.intra_message_sep
+                + "\n"
+            )
+            images = []  # type: ignore
+            for message in chat_history:
+                role = get_role(message["role"])
+                content = message["content"]
+                if isinstance(content, str):
+                    ret += role + "\n" + content + prompt_style.intra_message_sep + "\n"
+                elif isinstance(content, list):
+                    text = ""
+                    image_urls = []
+                    for c in content:
+                        c_type = c.get("type")
+                        if c_type == "text":
+                            text = c["text"]
+                        elif c_type == "image_url":
+                            image_urls.append(c["image_url"]["url"])
+                    image_futures = []
+                    from concurrent.futures import ThreadPoolExecutor
+
+                    with ThreadPoolExecutor() as executor:
+                        for image_url in image_urls:
+                            fut = executor.submit(_decode_image, image_url)
+                            image_futures.append(fut)
+                    images = [fut.result() for fut in image_futures]
+                    if len(image_futures) == 0:
+                        ret += (
+                            role + "\n" + text + prompt_style.intra_message_sep + "\n"
+                        )
+                    else:
+                        ret += (
+                            role
+                            + "\n"
+                            + f"<image>\n{text}"
+                            + prompt_style.intra_message_sep
+                            + "\n"
+                        )
+
+            return (ret, images)
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 

@@ -706,7 +693,7 @@ Begin!"""
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
-        elif family in
+        elif family in GLM4_TOOL_CALL_FAMILY:
             content, func, args = cls._eval_glm_chat_arguments(c, tools)
         elif family in QWEN_TOOL_CALL_FAMILY:
             content, func, args = cls._eval_qwen_chat_arguments(c, tools)

@@ -870,10 +857,10 @@ def get_file_location(
     is_cached = cache_status
     assert isinstance(is_cached, bool)
 
-    if spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         return cache_dir, is_cached
-    elif spec.model_format in ["
-        assert isinstance(spec,
+    elif spec.model_format in ["ggufv2"]:
+        assert isinstance(spec, LlamaCppLLMSpecV1)
         filename = spec.model_file_name_template.format(quantization=quantization)
         model_path = os.path.join(cache_dir, filename)
         return model_path, is_cached

@@ -885,3 +872,22 @@ def get_model_version(
     llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
 ) -> str:
     return f"{llm_family.model_name}--{llm_spec.model_size_in_billions}B--{llm_spec.model_format}--{quantization}"
+
+
+def _decode_image(_url):
+    if _url.startswith("data:"):
+        logging.info("Parse url by base64 decoder.")
+        # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+        # e.g. f"data:image/jpeg;base64,{base64_image}"
+        _type, data = _url.split(";")
+        _, ext = _type.split("/")
+        data = data[len("base64,") :]
+        data = base64.b64decode(data.encode("utf-8"))
+        return Image.open(BytesIO(data)).convert("RGB")
+    else:
+        try:
+            response = requests.get(_url)
+        except requests.exceptions.MissingSchema:
+            return Image.open(_url).convert("RGB")
+        else:
+            return Image.open(BytesIO(response.content)).convert("RGB")
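Note: the _decode_image helper added at the end of utils.py accepts three kinds of image references: a base64 data: URL, a remote URL fetched with requests, and (via the MissingSchema fallback) a plain local path. Below is a small hedged sketch of the three forms; the file name is a hypothetical placeholder.

# Hedged sketch of the three image reference forms _decode_image handles.
# "cat.jpg" is a hypothetical local file.
import base64

with open("cat.jpg", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

image_refs = [
    f"data:image/jpeg;base64,{b64}",  # decoded by the base64 branch
    "https://example.com/cat.jpg",    # fetched with requests.get
    "cat.jpg",                        # MissingSchema -> opened as a local path
]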
xinference/model/llm/vllm/core.py
CHANGED

@@ -21,6 +21,7 @@ import time
 import uuid
 from typing import (
     TYPE_CHECKING,
+    Any,
     AsyncGenerator,
     Dict,
     Iterable,

@@ -88,11 +89,12 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
+    "internvl2",
+]
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
-    "baichuan",
-    "internlm-16k",
     "mistral-v0.1",
     "codestral-v0.1",
     "Yi",

@@ -105,13 +107,7 @@ VLLM_SUPPORTED_MODELS = [
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "llama-3-instruct",
-    "vicuna-v1.3",
-    "vicuna-v1.5",
-    "baichuan-chat",
     "baichuan-2-chat",
-    "internlm-chat-7b",
-    "internlm-chat-8k",
-    "internlm-chat-20b",
     "internlm2-chat",
     "internlm2.5-chat",
     "internlm2.5-chat-1m",

@@ -338,7 +334,7 @@ class VLLMModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):

@@ -421,7 +417,7 @@ class VLLMModel(LLM):
 
     async def async_generate(
         self,
-        prompt: str,
+        prompt: Union[str, Dict[str, Any]],
         generate_config: Optional[Dict] = None,
         tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:

@@ -558,7 +554,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):

@@ -644,3 +640,106 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                     self.model_family, self.model_uid, c, tools
                 )
             return self._to_chat_completion(c)
+
+
+class VLLMVisionModel(VLLMModel, ChatModelMixin):
+    def load(self):
+        try:
+            import vllm
+            from vllm.engine.arg_utils import AsyncEngineArgs
+            from vllm.engine.async_llm_engine import AsyncLLMEngine
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        if vllm.__version__ >= "0.3.1":
+            # from vllm v0.3.1, it uses cupy as NCCL backend
+            # in which cupy will fork a process
+            # only for xoscar >= 0.3.0, new process is allowed in subpool
+            # besides, xinference set start method as forkserver for unix
+            # we need to set it to fork to make cupy NCCL work
+            multiprocessing.set_start_method("fork", force=True)
+
+        self._model_config = self._sanitize_model_config(self._model_config)
+
+        logger.info(
+            f"Loading {self.model_uid} with following model config: {self._model_config}"
+        )
+
+        engine_args = AsyncEngineArgs(
+            model=self.model_path,
+            **self._model_config,
+        )
+        self._engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        if llm_spec.model_format == "pytorch":
+            if quantization != "none" and not (quantization is None):
+                return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+                return False
+        if "vision" not in llm_family.model_ability:
+            return False
+        return VLLM_INSTALLED
+
+    def _sanitize_chat_config(
+        self,
+        generate_config: Optional[Dict] = None,
+    ) -> Dict:
+        if not generate_config:
+            generate_config = {}
+        if self.model_family.prompt_style:
+            if self.model_family.prompt_style.stop_token_ids:
+                generate_config.setdefault(
+                    "stop_token_ids",
+                    self.model_family.prompt_style.stop_token_ids.copy(),
+                )
+        return generate_config
+
+    async def async_chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[Dict] = None,
+    ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        # only support single image, waiting vllm support multi images
+        assert self.model_family.prompt_style is not None
+        prompt_style = self.model_family.prompt_style.copy()
+        chat_history = chat_history or []
+        prompt, images = self.get_prompt(prompt, chat_history, prompt_style)
+        logger.info(f"messages:{prompt}")
+        if len(images) == 0:
+            inputs = {
+                "prompt": prompt,
+            }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images[-1]},  # type: ignore
+            }
+        generate_config = self._sanitize_chat_config(generate_config)
+
+        stream = generate_config.get("stream", None)
+
+        if stream:
+            agen = await self.async_generate(inputs, generate_config)
+            assert isinstance(agen, AsyncGenerator)
+            return self._async_to_chat_completion_chunks(agen)
+        else:
+            c = await self.async_generate(inputs, generate_config)
+            assert not isinstance(c, AsyncGenerator)
+            return self._to_chat_completion(c)
xinference/model/utils.py
CHANGED
@@ -14,13 +14,11 @@
 import json
 import logging
 import os
-import shutil
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Tuple, Union
 
 import huggingface_hub
-from fsspec import AbstractFileSystem
 
 from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
 from ..device_utils import get_available_device, is_device_available

@@ -220,12 +218,7 @@ def is_valid_model_uri(model_uri: Optional[str]) -> bool:
     return True
 
 
-def cache_from_uri(
-    model_spec: CacheableModelSpec,
-    self_hosted_storage: bool = False,
-) -> str:
-    from fsspec import AbstractFileSystem, filesystem
-
+def cache_from_uri(model_spec: CacheableModelSpec) -> str:
     cache_dir = os.path.realpath(
         os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
     )

@@ -247,48 +240,6 @@ def cache_from_uri(
         os.makedirs(XINFERENCE_CACHE_DIR, exist_ok=True)
         os.symlink(src_root, cache_dir, target_is_directory=True)
         return cache_dir
-    elif src_scheme in ["s3"]:
-        # use anonymous connection for self-hosted storage.
-        src_fs: AbstractFileSystem = filesystem(src_scheme, anon=self_hosted_storage)
-        local_fs: AbstractFileSystem = filesystem("file")
-
-        files_to_download = []
-        os.makedirs(cache_dir, exist_ok=True)
-
-        for path, _, files in src_fs.walk(model_spec.model_uri):
-            for file in files:
-                src_path = f"{path}/{file}"
-                local_path = src_path.replace(src_root, cache_dir)
-                files_to_download.append((src_path, local_path))
-
-        from concurrent.futures import ThreadPoolExecutor
-
-        failed = False
-        with ThreadPoolExecutor(max_workers=min(len(files_to_download), 4)) as executor:
-            futures = [
-                (
-                    src_path,
-                    executor.submit(
-                        copy_from_src_to_dst, src_fs, src_path, local_fs, local_path
-                    ),
-                )
-                for src_path, local_path in files_to_download
-            ]
-            for src_path, future in futures:
-                if failed:
-                    future.cancel()
-                else:
-                    try:
-                        future.result()
-                    except:
-                        logger.error(f"Download {src_path} failed", exc_info=True)
-                        failed = True
-
-        if failed:
-            logger.warning(f"Removing cache directory: {cache_dir}")
-            shutil.rmtree(cache_dir, ignore_errors=True)
-            raise RuntimeError(f"Failed to download model '{model_spec.model_name}' ")
-        return cache_dir
     else:
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 

@@ -346,51 +297,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
     return cache_dir
 
 
-def copy_from_src_to_dst(
-    _src_fs: "AbstractFileSystem",
-    _src_path: str,
-    dst_fs: "AbstractFileSystem",
-    dst_path: str,
-    max_attempt: int = 3,
-):
-    from tqdm import tqdm
-
-    for attempt in range(max_attempt):
-        logger.info(f"Copy from {_src_path} to {dst_path}, attempt: {attempt}")
-        try:
-            with _src_fs.open(_src_path, "rb") as src_file:
-                file_size = _src_fs.info(_src_path)["size"]
-
-                dst_fs.makedirs(os.path.dirname(dst_path), exist_ok=True)
-                with dst_fs.open(dst_path, "wb") as dst_file:
-                    chunk_size = 1024 * 1024  # 1 MB
-
-                    with tqdm(
-                        total=file_size,
-                        unit="B",
-                        unit_scale=True,
-                        unit_divisor=1024,
-                        desc=_src_path,
-                    ) as pbar:
-                        while True:
-                            chunk = src_file.read(chunk_size)
-                            if not chunk:
-                                break
-                            dst_file.write(chunk)
-                            pbar.update(len(chunk))
-            logger.info(
-                f"Copy from {_src_path} to {dst_path} finished, attempt: {attempt}"
-            )
-            break
-        except:
-            logger.error(
-                f"Failed to copy from {_src_path} to {dst_path} on attempt {attempt + 1}",
-                exc_info=True,
-            )
-            if attempt + 1 == max_attempt:
-                raise
-
-
 def patch_trust_remote_code():
     """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
     models will fail to load, e.g. jina-embeddings-v2-base-en
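Note: with the fsspec/s3 download path and copy_from_src_to_dst removed, cache_from_uri in model/utils.py now only resolves local URIs by symlinking them into XINFERENCE_CACHE_DIR. Below is a hedged sketch of registering a custom model whose model_uri points at a local directory; the model name, path and spec fields are illustrative assumptions, not taken from this diff.

# Hedged sketch: a custom embedding model registered with a local file:// URI,
# which is the kind of URI cache_from_uri still supports after this change.
# Model name, path and field values are hypothetical.
import json

from xinference.client import Client

custom_embedding = {
    "model_name": "my-local-embedding",
    "dimensions": 768,
    "max_tokens": 512,
    "language": ["en"],
    "model_uri": "file:///data/models/my-local-embedding",
}

client = Client("http://localhost:9997")
client.register_model(
    model_type="embedding", model=json.dumps(custom_embedding), persist=False
)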