xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +132 -0
- xinference/api/restful_api.py +282 -78
- xinference/client/handlers.py +3 -0
- xinference/client/restful/restful_client.py +108 -75
- xinference/constants.py +14 -4
- xinference/core/cache_tracker.py +102 -0
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +56 -0
- xinference/core/model.py +44 -0
- xinference/core/resource.py +19 -12
- xinference/core/status_guard.py +4 -0
- xinference/core/supervisor.py +278 -87
- xinference/core/utils.py +68 -3
- xinference/core/worker.py +98 -8
- xinference/deploy/cmdline.py +6 -3
- xinference/deploy/local.py +2 -2
- xinference/deploy/supervisor.py +2 -2
- xinference/model/audio/__init__.py +27 -0
- xinference/model/audio/core.py +161 -0
- xinference/model/audio/model_spec.json +79 -0
- xinference/model/audio/utils.py +18 -0
- xinference/model/audio/whisper.py +132 -0
- xinference/model/core.py +18 -13
- xinference/model/embedding/__init__.py +27 -2
- xinference/model/embedding/core.py +43 -3
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/utils.py +18 -0
- xinference/model/image/__init__.py +12 -1
- xinference/model/image/core.py +63 -9
- xinference/model/image/utils.py +26 -0
- xinference/model/llm/__init__.py +20 -1
- xinference/model/llm/core.py +43 -2
- xinference/model/llm/ggml/chatglm.py +15 -6
- xinference/model/llm/llm_family.json +197 -6
- xinference/model/llm/llm_family.py +9 -7
- xinference/model/llm/llm_family_modelscope.json +189 -4
- xinference/model/llm/pytorch/chatglm.py +3 -3
- xinference/model/llm/pytorch/core.py +4 -2
- xinference/model/{multimodal → llm/pytorch}/qwen_vl.py +10 -8
- xinference/model/llm/pytorch/utils.py +21 -9
- xinference/model/llm/pytorch/yi_vl.py +246 -0
- xinference/model/llm/utils.py +57 -4
- xinference/model/llm/vllm/core.py +5 -4
- xinference/model/rerank/__init__.py +25 -2
- xinference/model/rerank/core.py +51 -9
- xinference/model/rerank/model_spec.json +6 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -0
- xinference/{api/oauth2/common.py → model/rerank/utils.py} +6 -2
- xinference/model/utils.py +5 -3
- xinference/thirdparty/__init__.py +0 -0
- xinference/thirdparty/llava/__init__.py +1 -0
- xinference/thirdparty/llava/conversation.py +205 -0
- xinference/thirdparty/llava/mm_utils.py +122 -0
- xinference/thirdparty/llava/model/__init__.py +1 -0
- xinference/thirdparty/llava/model/clip_encoder/__init__.py +0 -0
- xinference/thirdparty/llava/model/clip_encoder/builder.py +11 -0
- xinference/thirdparty/llava/model/clip_encoder/clip_encoder.py +86 -0
- xinference/thirdparty/llava/model/constants.py +6 -0
- xinference/thirdparty/llava/model/llava_arch.py +385 -0
- xinference/thirdparty/llava/model/llava_llama.py +163 -0
- xinference/thirdparty/llava/model/multimodal_projector/__init__.py +0 -0
- xinference/thirdparty/llava/model/multimodal_projector/builder.py +64 -0
- xinference/types.py +1 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.15822aeb.js +3 -0
- xinference/web/ui/build/static/js/main.15822aeb.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/139e5e4adf436923107d2b02994c7ff6dba2aac1989e9b6638984f0dfe782c4a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/65ca3ba225b8c8dac907210545b51f2fcdb2591f0feeb7195f1c037f2bc956a0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b80db1012318b97c329c4e3e72454f7512fb107e57c444b437dbe4ba1a3faa5a.json +1 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/METADATA +33 -23
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/RECORD +81 -64
- xinference/api/oauth2/core.py +0 -93
- xinference/model/multimodal/__init__.py +0 -52
- xinference/model/multimodal/core.py +0 -467
- xinference/model/multimodal/model_spec.json +0 -43
- xinference/model/multimodal/model_spec_modelscope.json +0 -45
- xinference/web/ui/build/static/js/main.b83095c2.js +0 -3
- xinference/web/ui/build/static/js/main.b83095c2.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +0 -1
- /xinference/web/ui/build/static/js/{main.b83095c2.js.LICENSE.txt → main.15822aeb.js.LICENSE.txt} +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/LICENSE +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/WHEEL +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/top_level.txt +0 -0
xinference/model/llm/core.py
CHANGED
|
@@ -17,7 +17,8 @@ import logging
|
|
|
17
17
|
import os
|
|
18
18
|
import platform
|
|
19
19
|
from abc import abstractmethod
|
|
20
|
-
from
|
|
20
|
+
from collections import defaultdict
|
|
21
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
|
21
22
|
|
|
22
23
|
from ...core.utils import parse_replica_model_uid
|
|
23
24
|
from ..core import ModelDescription
|
|
@@ -28,6 +29,15 @@ if TYPE_CHECKING:
|
|
|
28
29
|
logger = logging.getLogger(__name__)
|
|
29
30
|
|
|
30
31
|
|
|
32
|
+
LLM_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_llm_model_descriptions():
|
|
36
|
+
import copy
|
|
37
|
+
|
|
38
|
+
return copy.deepcopy(LLM_MODEL_DESCRIPTIONS)
|
|
39
|
+
|
|
40
|
+
|
|
31
41
|
class LLM(abc.ABC):
|
|
32
42
|
def __init__(
|
|
33
43
|
self,
|
|
@@ -107,8 +117,9 @@ class LLMDescription(ModelDescription):
|
|
|
107
117
|
llm_family: "LLMFamilyV1",
|
|
108
118
|
llm_spec: "LLMSpecV1",
|
|
109
119
|
quantization: Optional[str],
|
|
120
|
+
model_path: Optional[str] = None,
|
|
110
121
|
):
|
|
111
|
-
super().__init__(address, devices)
|
|
122
|
+
super().__init__(address, devices, model_path=model_path)
|
|
112
123
|
self._llm_family = llm_family
|
|
113
124
|
self._llm_spec = llm_spec
|
|
114
125
|
self._quantization = quantization
|
|
@@ -124,12 +135,42 @@ class LLMDescription(ModelDescription):
|
|
|
124
135
|
"model_description": self._llm_family.model_description,
|
|
125
136
|
"model_format": self._llm_spec.model_format,
|
|
126
137
|
"model_size_in_billions": self._llm_spec.model_size_in_billions,
|
|
138
|
+
"model_family": self._llm_family.model_family
|
|
139
|
+
or self._llm_family.model_name,
|
|
127
140
|
"quantization": self._quantization,
|
|
128
141
|
"model_hub": self._llm_spec.model_hub,
|
|
129
142
|
"revision": self._llm_spec.model_revision,
|
|
130
143
|
"context_length": self._llm_family.context_length,
|
|
131
144
|
}
|
|
132
145
|
|
|
146
|
+
def to_version_info(self):
|
|
147
|
+
from .utils import get_file_location, get_model_version
|
|
148
|
+
|
|
149
|
+
model_file_location, cache_status = get_file_location(
|
|
150
|
+
self._llm_family, self._llm_spec, self._quantization
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
return {
|
|
154
|
+
"model_version": get_model_version(
|
|
155
|
+
self._llm_family, self._llm_spec, self._quantization
|
|
156
|
+
),
|
|
157
|
+
"model_file_location": model_file_location,
|
|
158
|
+
"cache_status": cache_status,
|
|
159
|
+
"quantization": self._quantization,
|
|
160
|
+
"model_format": self._llm_spec.model_format,
|
|
161
|
+
"model_size_in_billions": self._llm_spec.model_size_in_billions,
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
|
|
166
|
+
res = defaultdict(list)
|
|
167
|
+
for spec in llm_family.model_specs:
|
|
168
|
+
for q in spec.quantizations:
|
|
169
|
+
res[llm_family.model_name].append(
|
|
170
|
+
LLMDescription(None, None, llm_family, spec, q).to_version_info()
|
|
171
|
+
)
|
|
172
|
+
return res
|
|
173
|
+
|
|
133
174
|
|
|
134
175
|
def create_llm_model_instance(
|
|
135
176
|
subpool_addr: str,
|
|
@@ -230,20 +230,28 @@ class ChatglmCppChatModel(LLM):
|
|
|
230
230
|
),
|
|
231
231
|
}
|
|
232
232
|
|
|
233
|
+
@staticmethod
|
|
234
|
+
def _to_chatglm_chat_messages(history_list: List[Any]):
|
|
235
|
+
from chatglm_cpp import ChatMessage
|
|
236
|
+
|
|
237
|
+
return [ChatMessage(role=v["role"], content=v["content"]) for v in history_list]
|
|
238
|
+
|
|
233
239
|
def chat(
|
|
234
240
|
self,
|
|
235
241
|
prompt: str,
|
|
242
|
+
system_prompt: Optional[str] = None,
|
|
236
243
|
chat_history: Optional[List[ChatCompletionMessage]] = None,
|
|
237
244
|
generate_config: Optional[ChatglmCppGenerateConfig] = None,
|
|
238
245
|
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
|
|
246
|
+
chat_history_list = []
|
|
247
|
+
if system_prompt is not None:
|
|
248
|
+
chat_history_list.append({"role": "system", "content": system_prompt})
|
|
239
249
|
if chat_history is not None:
|
|
240
|
-
chat_history_list
|
|
241
|
-
else:
|
|
242
|
-
chat_history_list = []
|
|
250
|
+
chat_history_list.extend(chat_history) # type: ignore
|
|
243
251
|
|
|
244
252
|
tool_message = self._handle_tools(generate_config)
|
|
245
253
|
if tool_message is not None:
|
|
246
|
-
chat_history_list.insert(0, tool_message)
|
|
254
|
+
chat_history_list.insert(0, tool_message) # type: ignore
|
|
247
255
|
|
|
248
256
|
# We drop the message which contains tool calls to walkaround the issue:
|
|
249
257
|
# https://github.com/li-plus/chatglm.cpp/issues/231
|
|
@@ -276,17 +284,18 @@ class ChatglmCppChatModel(LLM):
|
|
|
276
284
|
params = {k: v for k, v in params.items() if v is not None}
|
|
277
285
|
|
|
278
286
|
assert self._llm is not None
|
|
287
|
+
chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
|
|
279
288
|
|
|
280
289
|
if generate_config["stream"]:
|
|
281
290
|
it = self._llm.chat(
|
|
282
|
-
|
|
291
|
+
chat_history_messages,
|
|
283
292
|
**params,
|
|
284
293
|
)
|
|
285
294
|
assert not isinstance(it, str)
|
|
286
295
|
return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
|
|
287
296
|
else:
|
|
288
297
|
c = self._llm.chat(
|
|
289
|
-
|
|
298
|
+
chat_history_messages,
|
|
290
299
|
**params,
|
|
291
300
|
)
|
|
292
301
|
assert not isinstance(c, Iterator)
|
|
@@ -2361,6 +2361,15 @@
|
|
|
2361
2361
|
"model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
2362
2362
|
"model_revision": "125c431e2ff41a156b9f9076f744d2f35dd6e67a"
|
|
2363
2363
|
},
|
|
2364
|
+
{
|
|
2365
|
+
"model_format": "awq",
|
|
2366
|
+
"model_size_in_billions": "46_7",
|
|
2367
|
+
"quantizations": [
|
|
2368
|
+
"4-bit"
|
|
2369
|
+
],
|
|
2370
|
+
"model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
|
|
2371
|
+
"model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
|
|
2372
|
+
},
|
|
2364
2373
|
{
|
|
2365
2374
|
"model_format": "ggufv2",
|
|
2366
2375
|
"model_size_in_billions": "46_7",
|
|
@@ -3184,7 +3193,7 @@
|
|
|
3184
3193
|
"none"
|
|
3185
3194
|
],
|
|
3186
3195
|
"model_id": "internlm/internlm2-chat-7b",
|
|
3187
|
-
"model_revision": "
|
|
3196
|
+
"model_revision": "2292b86b21cb856642782cebed0a453997453b1f"
|
|
3188
3197
|
},
|
|
3189
3198
|
{
|
|
3190
3199
|
"model_format": "pytorch",
|
|
@@ -3193,22 +3202,204 @@
|
|
|
3193
3202
|
"none"
|
|
3194
3203
|
],
|
|
3195
3204
|
"model_id": "internlm/internlm2-chat-20b",
|
|
3196
|
-
"model_revision": "
|
|
3205
|
+
"model_revision": "b666125047cd98c5a7c85ca28720b44a06aed124"
|
|
3197
3206
|
}
|
|
3198
3207
|
],
|
|
3199
3208
|
"prompt_style": {
|
|
3200
3209
|
"style_name": "INTERNLM2",
|
|
3201
3210
|
"system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
|
|
3202
3211
|
"roles": [
|
|
3203
|
-
"
|
|
3204
|
-
"
|
|
3212
|
+
"<|im_start|>user",
|
|
3213
|
+
"<|im_start|>assistant"
|
|
3205
3214
|
],
|
|
3206
|
-
"intra_message_sep": "
|
|
3215
|
+
"intra_message_sep": "<|im_end|>",
|
|
3207
3216
|
"stop_token_ids": [
|
|
3208
3217
|
92542
|
|
3209
3218
|
],
|
|
3210
3219
|
"stop": [
|
|
3211
|
-
"
|
|
3220
|
+
"<|im_end|>"
|
|
3221
|
+
]
|
|
3222
|
+
}
|
|
3223
|
+
},
|
|
3224
|
+
{
|
|
3225
|
+
"version": 1,
|
|
3226
|
+
"context_length": 4096,
|
|
3227
|
+
"model_name": "qwen-vl-chat",
|
|
3228
|
+
"model_lang": [
|
|
3229
|
+
"en",
|
|
3230
|
+
"zh"
|
|
3231
|
+
],
|
|
3232
|
+
"model_ability": [
|
|
3233
|
+
"chat",
|
|
3234
|
+
"vision"
|
|
3235
|
+
],
|
|
3236
|
+
"model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
|
|
3237
|
+
"model_specs": [
|
|
3238
|
+
{
|
|
3239
|
+
"model_format": "pytorch",
|
|
3240
|
+
"model_size_in_billions": 7,
|
|
3241
|
+
"quantizations": [
|
|
3242
|
+
"none"
|
|
3243
|
+
],
|
|
3244
|
+
"model_id": "Qwen/Qwen-VL-Chat",
|
|
3245
|
+
"model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42"
|
|
3246
|
+
},
|
|
3247
|
+
{
|
|
3248
|
+
"model_format": "gptq",
|
|
3249
|
+
"model_size_in_billions": 7,
|
|
3250
|
+
"quantizations": [
|
|
3251
|
+
"Int4"
|
|
3252
|
+
],
|
|
3253
|
+
"model_id": "Qwen/Qwen-VL-Chat-{quantization}",
|
|
3254
|
+
"model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409"
|
|
3255
|
+
}
|
|
3256
|
+
],
|
|
3257
|
+
"prompt_style": {
|
|
3258
|
+
"style_name": "QWEN",
|
|
3259
|
+
"system_prompt": "You are a helpful assistant.",
|
|
3260
|
+
"roles": [
|
|
3261
|
+
"user",
|
|
3262
|
+
"assistant"
|
|
3263
|
+
]
|
|
3264
|
+
}
|
|
3265
|
+
},
|
|
3266
|
+
{
|
|
3267
|
+
"version": 1,
|
|
3268
|
+
"context_length": 4096,
|
|
3269
|
+
"model_name": "orion-chat",
|
|
3270
|
+
"model_lang": [
|
|
3271
|
+
"en",
|
|
3272
|
+
"zh"
|
|
3273
|
+
],
|
|
3274
|
+
"model_ability": [
|
|
3275
|
+
"chat"
|
|
3276
|
+
],
|
|
3277
|
+
"model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
|
|
3278
|
+
"model_specs": [
|
|
3279
|
+
{
|
|
3280
|
+
"model_format": "pytorch",
|
|
3281
|
+
"model_size_in_billions": 14,
|
|
3282
|
+
"quantizations": [
|
|
3283
|
+
"none",
|
|
3284
|
+
"4-bit",
|
|
3285
|
+
"8-bit"
|
|
3286
|
+
],
|
|
3287
|
+
"model_id": "OrionStarAI/Orion-14B-Chat",
|
|
3288
|
+
"model_revision": "ea6fb9b7e1917f3693935accbeb0bfecfd6552a7"
|
|
3289
|
+
},
|
|
3290
|
+
{
|
|
3291
|
+
"model_format": "awq",
|
|
3292
|
+
"model_size_in_billions": 14,
|
|
3293
|
+
"quantizations": [
|
|
3294
|
+
"Int4"
|
|
3295
|
+
],
|
|
3296
|
+
"model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
|
|
3297
|
+
}
|
|
3298
|
+
],
|
|
3299
|
+
"prompt_style": {
|
|
3300
|
+
"style_name": "orion",
|
|
3301
|
+
"roles": [
|
|
3302
|
+
"Human",
|
|
3303
|
+
"assistant"
|
|
3304
|
+
],
|
|
3305
|
+
"stop": [
|
|
3306
|
+
"<s>",
|
|
3307
|
+
"</s>",
|
|
3308
|
+
"<unk>"
|
|
3309
|
+
]
|
|
3310
|
+
}
|
|
3311
|
+
},
|
|
3312
|
+
{
|
|
3313
|
+
"version": 1,
|
|
3314
|
+
"context_length": 4096,
|
|
3315
|
+
"model_name": "orion-chat-rag",
|
|
3316
|
+
"model_lang": [
|
|
3317
|
+
"en",
|
|
3318
|
+
"zh"
|
|
3319
|
+
],
|
|
3320
|
+
"model_ability": [
|
|
3321
|
+
"chat"
|
|
3322
|
+
],
|
|
3323
|
+
"model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
|
|
3324
|
+
"model_specs": [
|
|
3325
|
+
{
|
|
3326
|
+
"model_format": "pytorch",
|
|
3327
|
+
"model_size_in_billions": 14,
|
|
3328
|
+
"quantizations": [
|
|
3329
|
+
"none",
|
|
3330
|
+
"4-bit",
|
|
3331
|
+
"8-bit"
|
|
3332
|
+
],
|
|
3333
|
+
"model_id": "OrionStarAI/Orion-14B-Chat-RAG",
|
|
3334
|
+
"model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2"
|
|
3335
|
+
}
|
|
3336
|
+
],
|
|
3337
|
+
"prompt_style": {
|
|
3338
|
+
"style_name": "orion",
|
|
3339
|
+
"roles": [
|
|
3340
|
+
"Human",
|
|
3341
|
+
"assistant"
|
|
3342
|
+
],
|
|
3343
|
+
"stop": [
|
|
3344
|
+
"<s>",
|
|
3345
|
+
"</s>",
|
|
3346
|
+
"<unk>"
|
|
3347
|
+
]
|
|
3348
|
+
}
|
|
3349
|
+
},
|
|
3350
|
+
{
|
|
3351
|
+
"version": 1,
|
|
3352
|
+
"context_length": 204800,
|
|
3353
|
+
"model_name": "yi-vl-chat",
|
|
3354
|
+
"model_lang": [
|
|
3355
|
+
"en",
|
|
3356
|
+
"zh"
|
|
3357
|
+
],
|
|
3358
|
+
"model_ability": [
|
|
3359
|
+
"chat",
|
|
3360
|
+
"vision"
|
|
3361
|
+
],
|
|
3362
|
+
"model_description": "Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.",
|
|
3363
|
+
"model_specs": [
|
|
3364
|
+
{
|
|
3365
|
+
"model_format": "pytorch",
|
|
3366
|
+
"model_size_in_billions": 6,
|
|
3367
|
+
"quantizations": [
|
|
3368
|
+
"none"
|
|
3369
|
+
],
|
|
3370
|
+
"model_id": "01-ai/Yi-VL-6B",
|
|
3371
|
+
"model_revision": "897c938da1ec860330e2ba2d425ab3004495ba38"
|
|
3372
|
+
},
|
|
3373
|
+
{
|
|
3374
|
+
"model_format": "pytorch",
|
|
3375
|
+
"model_size_in_billions": 34,
|
|
3376
|
+
"quantizations": [
|
|
3377
|
+
"none"
|
|
3378
|
+
],
|
|
3379
|
+
"model_id": "01-ai/Yi-VL-34B",
|
|
3380
|
+
"model_revision": "ea29a9a430f27893e780366dae81d4ca5ebab561"
|
|
3381
|
+
}
|
|
3382
|
+
],
|
|
3383
|
+
"prompt_style": {
|
|
3384
|
+
"style_name": "CHATML",
|
|
3385
|
+
"system_prompt": "",
|
|
3386
|
+
"roles": [
|
|
3387
|
+
"<|im_start|>user",
|
|
3388
|
+
"<|im_start|>assistant"
|
|
3389
|
+
],
|
|
3390
|
+
"intra_message_sep": "<|im_end|>",
|
|
3391
|
+
"inter_message_sep": "",
|
|
3392
|
+
"stop_token_ids": [
|
|
3393
|
+
2,
|
|
3394
|
+
6,
|
|
3395
|
+
7,
|
|
3396
|
+
8
|
|
3397
|
+
],
|
|
3398
|
+
"stop": [
|
|
3399
|
+
"<|endoftext|>",
|
|
3400
|
+
"<|im_start|>",
|
|
3401
|
+
"<|im_end|>",
|
|
3402
|
+
"<|im_sep|>"
|
|
3212
3403
|
]
|
|
3213
3404
|
}
|
|
3214
3405
|
}
|
|
@@ -70,7 +70,7 @@ class GgmlLLMSpecV1(BaseModel):
|
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
class PytorchLLMSpecV1(BaseModel):
|
|
73
|
-
model_format: Literal["pytorch", "gptq"]
|
|
73
|
+
model_format: Literal["pytorch", "gptq", "awq"]
|
|
74
74
|
# Must in order that `str` first, then `int`
|
|
75
75
|
model_size_in_billions: Union[str, int]
|
|
76
76
|
quantizations: List[str]
|
|
@@ -106,7 +106,7 @@ class LLMFamilyV1(BaseModel):
|
|
|
106
106
|
context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
|
|
107
107
|
model_name: str
|
|
108
108
|
model_lang: List[str]
|
|
109
|
-
model_ability: List[Literal["embed", "generate", "chat", "tools"]]
|
|
109
|
+
model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]]
|
|
110
110
|
model_description: Optional[str]
|
|
111
111
|
# reason for not required str here: legacy registration
|
|
112
112
|
model_family: Optional[str]
|
|
@@ -212,6 +212,8 @@ UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
|
|
|
212
212
|
|
|
213
213
|
UD_LLM_FAMILIES_LOCK = Lock()
|
|
214
214
|
|
|
215
|
+
LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
|
|
216
|
+
|
|
215
217
|
|
|
216
218
|
def download_from_self_hosted_storage() -> bool:
|
|
217
219
|
from ...constants import XINFERENCE_ENV_MODEL_SRC
|
|
@@ -449,7 +451,7 @@ def _get_meta_path(
|
|
|
449
451
|
return os.path.join(cache_dir, "__valid_download")
|
|
450
452
|
else:
|
|
451
453
|
return os.path.join(cache_dir, f"__valid_download_{model_hub}")
|
|
452
|
-
elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
|
|
454
|
+
elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
|
|
453
455
|
assert quantization is not None
|
|
454
456
|
if model_hub == "huggingface":
|
|
455
457
|
return os.path.join(cache_dir, f"__valid_download_{quantization}")
|
|
@@ -487,7 +489,7 @@ def _skip_download(
|
|
|
487
489
|
logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
|
|
488
490
|
return True
|
|
489
491
|
return False
|
|
490
|
-
elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
|
|
492
|
+
elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
|
|
491
493
|
assert quantization is not None
|
|
492
494
|
return os.path.exists(
|
|
493
495
|
_get_meta_path(cache_dir, model_format, model_hub, quantization)
|
|
@@ -535,7 +537,7 @@ def cache_from_modelscope(
|
|
|
535
537
|
):
|
|
536
538
|
return cache_dir
|
|
537
539
|
|
|
538
|
-
if llm_spec.model_format in ["pytorch", "gptq"]:
|
|
540
|
+
if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
|
|
539
541
|
download_dir = retry_download(
|
|
540
542
|
snapshot_download,
|
|
541
543
|
llm_family.model_name,
|
|
@@ -596,7 +598,7 @@ def cache_from_huggingface(
|
|
|
596
598
|
):
|
|
597
599
|
return cache_dir
|
|
598
600
|
|
|
599
|
-
if llm_spec.model_format in ["pytorch", "gptq"]:
|
|
601
|
+
if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
|
|
600
602
|
assert isinstance(llm_spec, PytorchLLMSpecV1)
|
|
601
603
|
retry_download(
|
|
602
604
|
huggingface_hub.snapshot_download,
|
|
@@ -677,7 +679,7 @@ def get_cache_status(
|
|
|
677
679
|
]
|
|
678
680
|
return any(revisions)
|
|
679
681
|
# just check meta file for ggml and gptq model
|
|
680
|
-
elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq"]:
|
|
682
|
+
elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
|
|
681
683
|
ret = []
|
|
682
684
|
for q in llm_spec.quantizations:
|
|
683
685
|
assert q is not None
|
|
@@ -1817,15 +1817,200 @@
|
|
|
1817
1817
|
"style_name": "INTERNLM2",
|
|
1818
1818
|
"system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
|
|
1819
1819
|
"roles": [
|
|
1820
|
-
"
|
|
1821
|
-
"
|
|
1820
|
+
"<|im_start|>user",
|
|
1821
|
+
"<|im_start|>assistant"
|
|
1822
1822
|
],
|
|
1823
|
-
"intra_message_sep": "
|
|
1823
|
+
"intra_message_sep": "<|im_end|>",
|
|
1824
1824
|
"stop_token_ids": [
|
|
1825
1825
|
92542
|
|
1826
1826
|
],
|
|
1827
1827
|
"stop": [
|
|
1828
|
-
"
|
|
1828
|
+
"<|im_end|>"
|
|
1829
|
+
]
|
|
1830
|
+
}
|
|
1831
|
+
},
|
|
1832
|
+
{
|
|
1833
|
+
"version": 1,
|
|
1834
|
+
"context_length": 4096,
|
|
1835
|
+
"model_name": "qwen-vl-chat",
|
|
1836
|
+
"model_lang": [
|
|
1837
|
+
"en",
|
|
1838
|
+
"zh"
|
|
1839
|
+
],
|
|
1840
|
+
"model_ability": [
|
|
1841
|
+
"chat",
|
|
1842
|
+
"vision"
|
|
1843
|
+
],
|
|
1844
|
+
"model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
|
|
1845
|
+
"model_specs": [
|
|
1846
|
+
{
|
|
1847
|
+
"model_format": "pytorch",
|
|
1848
|
+
"model_size_in_billions": 7,
|
|
1849
|
+
"quantizations": [
|
|
1850
|
+
"none"
|
|
1851
|
+
],
|
|
1852
|
+
"model_hub": "modelscope",
|
|
1853
|
+
"model_id": "Qwen/Qwen-VL-Chat",
|
|
1854
|
+
"model_revision": "master"
|
|
1855
|
+
},
|
|
1856
|
+
{
|
|
1857
|
+
"model_format": "gptq",
|
|
1858
|
+
"model_size_in_billions": 7,
|
|
1859
|
+
"quantizations": [
|
|
1860
|
+
"Int4"
|
|
1861
|
+
],
|
|
1862
|
+
"model_hub": "modelscope",
|
|
1863
|
+
"model_id": "Qwen/Qwen-VL-Chat-{quantization}",
|
|
1864
|
+
"model_revision": "master"
|
|
1865
|
+
}
|
|
1866
|
+
],
|
|
1867
|
+
"prompt_style": {
|
|
1868
|
+
"style_name": "QWEN",
|
|
1869
|
+
"system_prompt": "You are a helpful assistant.",
|
|
1870
|
+
"roles": [
|
|
1871
|
+
"user",
|
|
1872
|
+
"assistant"
|
|
1873
|
+
]
|
|
1874
|
+
}
|
|
1875
|
+
},
|
|
1876
|
+
{
|
|
1877
|
+
"version": 1,
|
|
1878
|
+
"context_length": 4096,
|
|
1879
|
+
"model_name": "orion-chat",
|
|
1880
|
+
"model_lang": [
|
|
1881
|
+
"en",
|
|
1882
|
+
"zh"
|
|
1883
|
+
],
|
|
1884
|
+
"model_ability": [
|
|
1885
|
+
"chat"
|
|
1886
|
+
],
|
|
1887
|
+
"model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
|
|
1888
|
+
"model_specs": [
|
|
1889
|
+
{
|
|
1890
|
+
"model_format": "pytorch",
|
|
1891
|
+
"model_size_in_billions": 14,
|
|
1892
|
+
"quantizations": [
|
|
1893
|
+
"none",
|
|
1894
|
+
"4-bit",
|
|
1895
|
+
"8-bit"
|
|
1896
|
+
],
|
|
1897
|
+
"model_id": "OrionStarAI/Orion-14B-Chat",
|
|
1898
|
+
"model_hub": "modelscope"
|
|
1899
|
+
},
|
|
1900
|
+
{
|
|
1901
|
+
"model_format": "awq",
|
|
1902
|
+
"model_size_in_billions": 14,
|
|
1903
|
+
"quantizations": [
|
|
1904
|
+
"Int4"
|
|
1905
|
+
],
|
|
1906
|
+
"model_hub": "modelscope",
|
|
1907
|
+
"model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
|
|
1908
|
+
}
|
|
1909
|
+
],
|
|
1910
|
+
"prompt_style": {
|
|
1911
|
+
"style_name": "orion",
|
|
1912
|
+
"roles": [
|
|
1913
|
+
"Human",
|
|
1914
|
+
"assistant"
|
|
1915
|
+
],
|
|
1916
|
+
"stop": [
|
|
1917
|
+
"<s>",
|
|
1918
|
+
"</s>",
|
|
1919
|
+
"<unk>"
|
|
1920
|
+
]
|
|
1921
|
+
}
|
|
1922
|
+
},
|
|
1923
|
+
{
|
|
1924
|
+
"version": 1,
|
|
1925
|
+
"context_length": 4096,
|
|
1926
|
+
"model_name": "orion-chat-rag",
|
|
1927
|
+
"model_lang": [
|
|
1928
|
+
"en",
|
|
1929
|
+
"zh"
|
|
1930
|
+
],
|
|
1931
|
+
"model_ability": [
|
|
1932
|
+
"chat"
|
|
1933
|
+
],
|
|
1934
|
+
"model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
|
|
1935
|
+
"model_specs": [
|
|
1936
|
+
{
|
|
1937
|
+
"model_format": "pytorch",
|
|
1938
|
+
"model_size_in_billions": 14,
|
|
1939
|
+
"quantizations": [
|
|
1940
|
+
"none",
|
|
1941
|
+
"4-bit",
|
|
1942
|
+
"8-bit"
|
|
1943
|
+
],
|
|
1944
|
+
"model_hub": "modelscope",
|
|
1945
|
+
"model_id": "OrionStarAI/Orion-14B-Chat-RAG"
|
|
1946
|
+
}
|
|
1947
|
+
],
|
|
1948
|
+
"prompt_style": {
|
|
1949
|
+
"style_name": "orion",
|
|
1950
|
+
"roles": [
|
|
1951
|
+
"Human",
|
|
1952
|
+
"assistant"
|
|
1953
|
+
],
|
|
1954
|
+
"stop": [
|
|
1955
|
+
"<s>",
|
|
1956
|
+
"</s>",
|
|
1957
|
+
"<unk>"
|
|
1958
|
+
]
|
|
1959
|
+
}
|
|
1960
|
+
},
|
|
1961
|
+
{
|
|
1962
|
+
"version": 1,
|
|
1963
|
+
"context_length": 204800,
|
|
1964
|
+
"model_name": "yi-vl-chat",
|
|
1965
|
+
"model_lang": [
|
|
1966
|
+
"en",
|
|
1967
|
+
"zh"
|
|
1968
|
+
],
|
|
1969
|
+
"model_ability": [
|
|
1970
|
+
"chat",
|
|
1971
|
+
"vision"
|
|
1972
|
+
],
|
|
1973
|
+
"model_description": "Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.",
|
|
1974
|
+
"model_specs": [
|
|
1975
|
+
{
|
|
1976
|
+
"model_format": "pytorch",
|
|
1977
|
+
"model_size_in_billions": 6,
|
|
1978
|
+
"quantizations": [
|
|
1979
|
+
"none"
|
|
1980
|
+
],
|
|
1981
|
+
"model_hub": "modelscope",
|
|
1982
|
+
"model_id": "01ai/Yi-VL-6B"
|
|
1983
|
+
},
|
|
1984
|
+
{
|
|
1985
|
+
"model_format": "pytorch",
|
|
1986
|
+
"model_size_in_billions": 34,
|
|
1987
|
+
"quantizations": [
|
|
1988
|
+
"none"
|
|
1989
|
+
],
|
|
1990
|
+
"model_hub": "modelscope",
|
|
1991
|
+
"model_id": "01ai/Yi-VL-34B"
|
|
1992
|
+
}
|
|
1993
|
+
],
|
|
1994
|
+
"prompt_style": {
|
|
1995
|
+
"style_name": "CHATML",
|
|
1996
|
+
"system_prompt": "",
|
|
1997
|
+
"roles": [
|
|
1998
|
+
"<|im_start|>user",
|
|
1999
|
+
"<|im_start|>assistant"
|
|
2000
|
+
],
|
|
2001
|
+
"intra_message_sep": "<|im_end|>",
|
|
2002
|
+
"inter_message_sep": "",
|
|
2003
|
+
"stop_token_ids": [
|
|
2004
|
+
2,
|
|
2005
|
+
6,
|
|
2006
|
+
7,
|
|
2007
|
+
8
|
|
2008
|
+
],
|
|
2009
|
+
"stop": [
|
|
2010
|
+
"<|endoftext|>",
|
|
2011
|
+
"<|im_start|>",
|
|
2012
|
+
"<|im_end|>",
|
|
2013
|
+
"<|im_sep|>"
|
|
1829
2014
|
]
|
|
1830
2015
|
}
|
|
1831
2016
|
}
|
|
@@ -120,9 +120,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
|
|
|
120
120
|
top_p = generate_config.get("top_p")
|
|
121
121
|
if top_p is not None:
|
|
122
122
|
kwargs["top_p"] = float(top_p)
|
|
123
|
-
|
|
124
|
-
if
|
|
125
|
-
kwargs["
|
|
123
|
+
max_new_tokens = generate_config.get("max_tokens")
|
|
124
|
+
if max_new_tokens is not None:
|
|
125
|
+
kwargs["max_new_tokens"] = int(max_new_tokens)
|
|
126
126
|
# Tool calls only works for non stream, so we call chat directly.
|
|
127
127
|
if prompt == SPECIAL_TOOL_PROMPT and chat_history:
|
|
128
128
|
tool_message = chat_history.pop()
|
|
@@ -190,7 +190,7 @@ class PytorchModel(LLM):
|
|
|
190
190
|
def match(
|
|
191
191
|
cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
|
|
192
192
|
) -> bool:
|
|
193
|
-
if llm_spec.model_format not in ["pytorch", "gptq"]:
|
|
193
|
+
if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
|
|
194
194
|
return False
|
|
195
195
|
model_family = llm_family.model_family or llm_family.model_name
|
|
196
196
|
if model_family in [
|
|
@@ -408,7 +408,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
|
|
|
408
408
|
def match(
|
|
409
409
|
cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
|
|
410
410
|
) -> bool:
|
|
411
|
-
if llm_spec.model_format not in ["pytorch", "gptq"]:
|
|
411
|
+
if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
|
|
412
412
|
return False
|
|
413
413
|
if llm_family.model_name in [
|
|
414
414
|
"baichuan-chat",
|
|
@@ -422,6 +422,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
|
|
|
422
422
|
"llama-2",
|
|
423
423
|
"llama-2-chat",
|
|
424
424
|
"internlm2-chat",
|
|
425
|
+
"qwen-vl-chat",
|
|
426
|
+
"yi-vl-chat",
|
|
425
427
|
]:
|
|
426
428
|
return False
|
|
427
429
|
if "chat" not in llm_family.model_ability:
|