xinference-0.8.1-py3-none-any.whl → xinference-0.8.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.
Files changed (95)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +132 -0
  3. xinference/api/restful_api.py +282 -78
  4. xinference/client/handlers.py +3 -0
  5. xinference/client/restful/restful_client.py +108 -75
  6. xinference/constants.py +14 -4
  7. xinference/core/cache_tracker.py +102 -0
  8. xinference/core/chat_interface.py +10 -4
  9. xinference/core/event.py +56 -0
  10. xinference/core/model.py +44 -0
  11. xinference/core/resource.py +19 -12
  12. xinference/core/status_guard.py +4 -0
  13. xinference/core/supervisor.py +278 -87
  14. xinference/core/utils.py +68 -3
  15. xinference/core/worker.py +98 -8
  16. xinference/deploy/cmdline.py +6 -3
  17. xinference/deploy/local.py +2 -2
  18. xinference/deploy/supervisor.py +2 -2
  19. xinference/model/audio/__init__.py +27 -0
  20. xinference/model/audio/core.py +161 -0
  21. xinference/model/audio/model_spec.json +79 -0
  22. xinference/model/audio/utils.py +18 -0
  23. xinference/model/audio/whisper.py +132 -0
  24. xinference/model/core.py +18 -13
  25. xinference/model/embedding/__init__.py +27 -2
  26. xinference/model/embedding/core.py +43 -3
  27. xinference/model/embedding/model_spec.json +24 -0
  28. xinference/model/embedding/model_spec_modelscope.json +24 -0
  29. xinference/model/embedding/utils.py +18 -0
  30. xinference/model/image/__init__.py +12 -1
  31. xinference/model/image/core.py +63 -9
  32. xinference/model/image/utils.py +26 -0
  33. xinference/model/llm/__init__.py +20 -1
  34. xinference/model/llm/core.py +43 -2
  35. xinference/model/llm/ggml/chatglm.py +15 -6
  36. xinference/model/llm/llm_family.json +197 -6
  37. xinference/model/llm/llm_family.py +9 -7
  38. xinference/model/llm/llm_family_modelscope.json +189 -4
  39. xinference/model/llm/pytorch/chatglm.py +3 -3
  40. xinference/model/llm/pytorch/core.py +4 -2
  41. xinference/model/{multimodal → llm/pytorch}/qwen_vl.py +10 -8
  42. xinference/model/llm/pytorch/utils.py +21 -9
  43. xinference/model/llm/pytorch/yi_vl.py +246 -0
  44. xinference/model/llm/utils.py +57 -4
  45. xinference/model/llm/vllm/core.py +5 -4
  46. xinference/model/rerank/__init__.py +25 -2
  47. xinference/model/rerank/core.py +51 -9
  48. xinference/model/rerank/model_spec.json +6 -0
  49. xinference/model/rerank/model_spec_modelscope.json +7 -0
  50. xinference/{api/oauth2/common.py → model/rerank/utils.py} +6 -2
  51. xinference/model/utils.py +5 -3
  52. xinference/thirdparty/__init__.py +0 -0
  53. xinference/thirdparty/llava/__init__.py +1 -0
  54. xinference/thirdparty/llava/conversation.py +205 -0
  55. xinference/thirdparty/llava/mm_utils.py +122 -0
  56. xinference/thirdparty/llava/model/__init__.py +1 -0
  57. xinference/thirdparty/llava/model/clip_encoder/__init__.py +0 -0
  58. xinference/thirdparty/llava/model/clip_encoder/builder.py +11 -0
  59. xinference/thirdparty/llava/model/clip_encoder/clip_encoder.py +86 -0
  60. xinference/thirdparty/llava/model/constants.py +6 -0
  61. xinference/thirdparty/llava/model/llava_arch.py +385 -0
  62. xinference/thirdparty/llava/model/llava_llama.py +163 -0
  63. xinference/thirdparty/llava/model/multimodal_projector/__init__.py +0 -0
  64. xinference/thirdparty/llava/model/multimodal_projector/builder.py +64 -0
  65. xinference/types.py +1 -1
  66. xinference/web/ui/build/asset-manifest.json +3 -3
  67. xinference/web/ui/build/index.html +1 -1
  68. xinference/web/ui/build/static/js/main.15822aeb.js +3 -0
  69. xinference/web/ui/build/static/js/main.15822aeb.js.map +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/139e5e4adf436923107d2b02994c7ff6dba2aac1989e9b6638984f0dfe782c4a.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/65ca3ba225b8c8dac907210545b51f2fcdb2591f0feeb7195f1c037f2bc956a0.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/b80db1012318b97c329c4e3e72454f7512fb107e57c444b437dbe4ba1a3faa5a.json +1 -0
  75. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/METADATA +33 -23
  76. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/RECORD +81 -64
  77. xinference/api/oauth2/core.py +0 -93
  78. xinference/model/multimodal/__init__.py +0 -52
  79. xinference/model/multimodal/core.py +0 -467
  80. xinference/model/multimodal/model_spec.json +0 -43
  81. xinference/model/multimodal/model_spec_modelscope.json +0 -45
  82. xinference/web/ui/build/static/js/main.b83095c2.js +0 -3
  83. xinference/web/ui/build/static/js/main.b83095c2.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +0 -1
  91. /xinference/web/ui/build/static/js/{main.b83095c2.js.LICENSE.txt → main.15822aeb.js.LICENSE.txt} +0 -0
  92. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/LICENSE +0 -0
  93. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/WHEEL +0 -0
  94. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/entry_points.txt +0 -0
  95. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/top_level.txt +0 -0

xinference/model/llm/core.py
@@ -17,7 +17,8 @@ import logging
 import os
 import platform
 from abc import abstractmethod
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from ...core.utils import parse_replica_model_uid
 from ..core import ModelDescription
@@ -28,6 +29,15 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+LLM_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+
+
+def get_llm_model_descriptions():
+    import copy
+
+    return copy.deepcopy(LLM_MODEL_DESCRIPTIONS)
+
+
 class LLM(abc.ABC):
     def __init__(
         self,
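
The new module-level registry pairs a defaultdict(list) with an accessor that returns a deep copy, so callers can inspect registered model descriptions without mutating shared state. A minimal sketch of the pattern (the register/snapshot names are illustrative, not from the diff):

import copy
from collections import defaultdict
from typing import Dict, List

DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)

def register(name: str, info: Dict) -> None:
    # Writers append to the shared registry in place.
    DESCRIPTIONS[name].append(info)

def snapshot() -> Dict[str, List[Dict]]:
    # Readers get a deep copy; mutating it cannot corrupt the registry.
    return copy.deepcopy(DESCRIPTIONS)

register("my-llm", {"model_format": "pytorch", "quantization": "none"})
view = snapshot()
view["my-llm"].clear()          # only the copy changes
assert DESCRIPTIONS["my-llm"]   # the registry is intact
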
@@ -107,8 +117,9 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        model_path: Optional[str] = None,
     ):
-        super().__init__(address, devices)
+        super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
@@ -124,12 +135,42 @@ class LLMDescription(ModelDescription):
             "model_description": self._llm_family.model_description,
             "model_format": self._llm_spec.model_format,
             "model_size_in_billions": self._llm_spec.model_size_in_billions,
+            "model_family": self._llm_family.model_family
+            or self._llm_family.model_name,
             "quantization": self._quantization,
             "model_hub": self._llm_spec.model_hub,
             "revision": self._llm_spec.model_revision,
             "context_length": self._llm_family.context_length,
         }
 
+    def to_version_info(self):
+        from .utils import get_file_location, get_model_version
+
+        model_file_location, cache_status = get_file_location(
+            self._llm_family, self._llm_spec, self._quantization
+        )
+
+        return {
+            "model_version": get_model_version(
+                self._llm_family, self._llm_spec, self._quantization
+            ),
+            "model_file_location": model_file_location,
+            "cache_status": cache_status,
+            "quantization": self._quantization,
+            "model_format": self._llm_spec.model_format,
+            "model_size_in_billions": self._llm_spec.model_size_in_billions,
+        }
+
+
+def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
+    res = defaultdict(list)
+    for spec in llm_family.model_specs:
+        for q in spec.quantizations:
+            res[llm_family.model_name].append(
+                LLMDescription(None, None, llm_family, spec, q).to_version_info()
+            )
+    return res
+
 
 def create_llm_model_instance(
     subpool_addr: str,
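
generate_llm_description walks every (spec, quantization) pair in a family and collects the to_version_info() dicts under the family's model name; note it builds LLMDescription(None, None, ...), i.e. without an address or devices, since only version metadata is needed. Assuming a hypothetical family with a single 7-billion ggufv2 spec quantized as q4_0, the output would be shaped roughly like this (the paths and the exact version-string format are illustrative):

expected = {
    "my-llm": [
        {
            "model_version": "my-llm--7B--ggufv2--q4_0",
            "model_file_location": "/home/user/.xinference/cache/my-llm-ggufv2-7b",
            "cache_status": False,
            "quantization": "q4_0",
            "model_format": "ggufv2",
            "model_size_in_billions": 7,
        }
    ]
}
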

xinference/model/llm/ggml/chatglm.py
@@ -230,20 +230,28 @@ class ChatglmCppChatModel(LLM):
             ),
         }
 
+    @staticmethod
+    def _to_chatglm_chat_messages(history_list: List[Any]):
+        from chatglm_cpp import ChatMessage
+
+        return [ChatMessage(role=v["role"], content=v["content"]) for v in history_list]
+
     def chat(
         self,
         prompt: str,
+        system_prompt: Optional[str] = None,
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[ChatglmCppGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        chat_history_list = []
+        if system_prompt is not None:
+            chat_history_list.append({"role": "system", "content": system_prompt})
         if chat_history is not None:
-            chat_history_list = chat_history
-        else:
-            chat_history_list = []
+            chat_history_list.extend(chat_history)  # type: ignore
 
         tool_message = self._handle_tools(generate_config)
         if tool_message is not None:
-            chat_history_list.insert(0, tool_message)
+            chat_history_list.insert(0, tool_message)  # type: ignore
 
         # We drop the message which contains tool calls to walkaround the issue:
         # https://github.com/li-plus/chatglm.cpp/issues/231
@@ -276,17 +284,18 @@ class ChatglmCppChatModel(LLM):
         params = {k: v for k, v in params.items() if v is not None}
 
         assert self._llm is not None
+        chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
 
         if generate_config["stream"]:
             it = self._llm.chat(
-                chat_history_list,
+                chat_history_messages,
                 **params,
             )
             assert not isinstance(it, str)
             return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
         else:
             c = self._llm.chat(
-                chat_history_list,
+                chat_history_messages,
                 **params,
             )
             assert not isinstance(c, Iterator)
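
Recent chatglm-cpp releases expect ChatMessage objects rather than the plain role/content dicts xinference tracks internally, which is the gap _to_chatglm_chat_messages bridges. A short sketch of the conversion (assuming chatglm-cpp is installed; ChatMessage(role=..., content=...) is the constructor used in the hunk above):

from chatglm_cpp import ChatMessage

history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi, how can I help?"},
]

# One ChatMessage per history entry, converted at the last moment.
messages = [ChatMessage(role=m["role"], content=m["content"]) for m in history]

Deferring the conversion until just before self._llm.chat(...) lets the system prompt and the tool message be spliced into the history with ordinary list operations.
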

xinference/model/llm/llm_family.json
@@ -2361,6 +2361,15 @@
             "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "model_revision": "125c431e2ff41a156b9f9076f744d2f35dd6e67a"
         },
+        {
+            "model_format": "awq",
+            "model_size_in_billions": "46_7",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
+            "model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
+        },
         {
             "model_format": "ggufv2",
             "model_size_in_billions": "46_7",
@@ -3184,7 +3193,7 @@
                 "none"
             ],
             "model_id": "internlm/internlm2-chat-7b",
-            "model_revision": "5797f79825bab7013932d57e2babaac1b8de6b4f"
+            "model_revision": "2292b86b21cb856642782cebed0a453997453b1f"
         },
         {
             "model_format": "pytorch",
@@ -3193,22 +3202,204 @@
                 "none"
             ],
             "model_id": "internlm/internlm2-chat-20b",
-            "model_revision": "3ccaf3ae82d5d01c0a95eecf40ee550f9c543635"
+            "model_revision": "b666125047cd98c5a7c85ca28720b44a06aed124"
         }
     ],
     "prompt_style": {
         "style_name": "INTERNLM2",
         "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
         "roles": [
-            "[UNUSED_TOKEN_146]user",
-            "[UNUSED_TOKEN_146]assistant"
+            "<|im_start|>user",
+            "<|im_start|>assistant"
         ],
-        "intra_message_sep": "[UNUSED_TOKEN_145]",
+        "intra_message_sep": "<|im_end|>",
         "stop_token_ids": [
             92542
         ],
         "stop": [
-            "[UNUSED_TOKEN_145]"
+            "<|im_end|>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "qwen-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "none"
+            ],
+            "model_id": "Qwen/Qwen-VL-Chat",
+            "model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42"
+        },
+        {
+            "model_format": "gptq",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+            "model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "QWEN",
+        "system_prompt": "You are a helpful assistant.",
+        "roles": [
+            "user",
+            "assistant"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat",
+            "model_revision": "ea6fb9b7e1917f3693935accbeb0bfecfd6552a7"
+        },
+        {
+            "model_format": "awq",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat-RAG",
+            "model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "yi-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 6,
+            "quantizations": [
+                "none"
+            ],
+            "model_id": "01-ai/Yi-VL-6B",
+            "model_revision": "897c938da1ec860330e2ba2d425ab3004495ba38"
+        },
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 34,
+            "quantizations": [
+                "none"
+            ],
+            "model_id": "01-ai/Yi-VL-34B",
+            "model_revision": "ea29a9a430f27893e780366dae81d4ca5ebab561"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "CHATML",
+        "system_prompt": "",
+        "roles": [
+            "<|im_start|>user",
+            "<|im_start|>assistant"
+        ],
+        "intra_message_sep": "<|im_end|>",
+        "inter_message_sep": "",
+        "stop_token_ids": [
+            2,
+            6,
+            7,
+            8
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>",
+            "<|im_sep|>"
         ]
     }
 }
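
The INTERNLM2 entries replace the placeholder [UNUSED_TOKEN_146]/[UNUSED_TOKEN_145] names with the ChatML-style <|im_start|>/<|im_end|> tokens they alias in the tokenizer; the stop token id 92542 is unchanged. As a rough sketch of how roles, intra_message_sep, and system_prompt combine into a prompt under a ChatML-like style (this renderer is illustrative; xinference's real one lives in its prompt-style utilities):

from typing import Dict, List

def render_chatml(system_prompt: str, roles: List[str], sep: str,
                  history: List[Dict[str, str]], prompt: str) -> str:
    # roles[0] is the user tag, roles[1] the assistant tag,
    # e.g. "<|im_start|>user" and "<|im_start|>assistant".
    parts = []
    if system_prompt:
        parts.append(f"<|im_start|>system\n{system_prompt}{sep}\n")
    for m in history:
        tag = roles[0] if m["role"] == "user" else roles[1]
        parts.append(f"{tag}\n{m['content']}{sep}\n")
    # The trailing assistant tag invites the model to generate the reply.
    parts.append(f"{roles[0]}\n{prompt}{sep}\n{roles[1]}\n")
    return "".join(parts)

print(render_chatml(
    "You are InternLM...",
    ["<|im_start|>user", "<|im_start|>assistant"],
    "<|im_end|>",
    [],
    "Hello!",
))
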

xinference/model/llm/llm_family.py
@@ -70,7 +70,7 @@ class GgmlLLMSpecV1(BaseModel):
 
 
 class PytorchLLMSpecV1(BaseModel):
-    model_format: Literal["pytorch", "gptq"]
+    model_format: Literal["pytorch", "gptq", "awq"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
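
With "awq" added to the Literal, pydantic now accepts AWQ specs at registration time while still rejecting unknown formats. A minimal sketch of that validation behavior (pydantic v1 semantics assumed; the class name here is made up):

from typing import List, Literal, Union
from pydantic import BaseModel, ValidationError

class SpecSketch(BaseModel):
    model_format: Literal["pytorch", "gptq", "awq"]
    model_size_in_billions: Union[str, int]
    quantizations: List[str]

SpecSketch(model_format="awq", model_size_in_billions="46_7", quantizations=["Int4"])  # ok

try:
    SpecSketch(model_format="exl2", model_size_in_billions=7, quantizations=[])
except ValidationError as exc:
    print(exc)  # unexpected value; permitted: 'pytorch', 'gptq', 'awq'
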
@@ -106,7 +106,7 @@ class LLMFamilyV1(BaseModel):
     context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
     model_name: str
     model_lang: List[str]
-    model_ability: List[Literal["embed", "generate", "chat", "tools"]]
+    model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]]
     model_description: Optional[str]
     # reason for not required str here: legacy registration
     model_family: Optional[str]
@@ -212,6 +212,8 @@ UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 UD_LLM_FAMILIES_LOCK = Lock()
 
+LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
+
 
 def download_from_self_hosted_storage() -> bool:
     from ...constants import XINFERENCE_ENV_MODEL_SRC
@@ -449,7 +451,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
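
Because these formats cache one artifact per quantization, the download marker file is also written per quantization; that is how _skip_download below can tell a cached 4-bit build apart from a missing one. A sketch reproducing only the branch visible in this hunk (the cache path is illustrative):

import os

def meta_path_sketch(cache_dir: str, model_format: str,
                     model_hub: str, quantization: str) -> str:
    # Mirrors the huggingface branch shown above; other hubs are elided here.
    assert model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]
    assert quantization is not None
    if model_hub == "huggingface":
        return os.path.join(cache_dir, f"__valid_download_{quantization}")
    raise NotImplementedError("non-huggingface hubs omitted from this sketch")

print(meta_path_sketch("/cache/mixtral-instruct-awq", "awq", "huggingface", "4-bit"))
# -> /cache/mixtral-instruct-awq/__valid_download_4-bit
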
@@ -487,7 +489,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -535,7 +537,7 @@ def cache_from_modelscope(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -596,7 +598,7 @@ def cache_from_huggingface(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
         retry_download(
             huggingface_hub.snapshot_download,
@@ -677,7 +679,7 @@ def get_cache_status(
         ]
         return any(revisions)
     # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         ret = []
         for q in llm_spec.quantizations:
             assert q is not None

xinference/model/llm/llm_family_modelscope.json
@@ -1817,15 +1817,200 @@
         "style_name": "INTERNLM2",
         "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
         "roles": [
-            "[UNUSED_TOKEN_146]user",
-            "[UNUSED_TOKEN_146]assistant"
+            "<|im_start|>user",
+            "<|im_start|>assistant"
         ],
-        "intra_message_sep": "[UNUSED_TOKEN_145]",
+        "intra_message_sep": "<|im_end|>",
         "stop_token_ids": [
             92542
         ],
         "stop": [
-            "[UNUSED_TOKEN_145]"
+            "<|im_end|>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "qwen-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "none"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "Qwen/Qwen-VL-Chat",
+            "model_revision": "master"
+        },
+        {
+            "model_format": "gptq",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+            "model_revision": "master"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "QWEN",
+        "system_prompt": "You are a helpful assistant.",
+        "roles": [
+            "user",
+            "assistant"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "awq",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "OrionStarAI/Orion-14B-Chat-RAG"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "yi-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 6,
+            "quantizations": [
+                "none"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "01ai/Yi-VL-6B"
+        },
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 34,
+            "quantizations": [
+                "none"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "01ai/Yi-VL-34B"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "CHATML",
+        "system_prompt": "",
+        "roles": [
+            "<|im_start|>user",
+            "<|im_start|>assistant"
+        ],
+        "intra_message_sep": "<|im_end|>",
+        "inter_message_sep": "",
+        "stop_token_ids": [
+            2,
+            6,
+            7,
+            8
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>",
+            "<|im_sep|>"
         ]
     }
 }

xinference/model/llm/pytorch/chatglm.py
@@ -120,9 +120,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         top_p = generate_config.get("top_p")
         if top_p is not None:
             kwargs["top_p"] = float(top_p)
-        max_length = generate_config.get("max_tokens")
-        if max_length is not None:
-            kwargs["max_length"] = int(max_length)
+        max_new_tokens = generate_config.get("max_tokens")
+        if max_new_tokens is not None:
+            kwargs["max_new_tokens"] = int(max_new_tokens)
         # Tool calls only works for non stream, so we call chat directly.
         if prompt == SPECIAL_TOOL_PROMPT and chat_history:
             tool_message = chat_history.pop()
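
The rename matters because in Hugging Face generate() semantics max_length bounds prompt plus completion, while max_new_tokens bounds the completion alone, so a long chat history could previously consume most or all of the budget. A toy illustration (token counts are made up):

prompt_tokens = 900
budget = 1024

# Old mapping: max_tokens -> max_length, which also counts the prompt.
completion_budget_old = max(0, budget - prompt_tokens)  # only 124 tokens left

# New mapping: max_tokens -> max_new_tokens, all of it for the completion.
completion_budget_new = budget                          # full 1024 tokens

print(completion_budget_old, completion_budget_new)
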

xinference/model/llm/pytorch/core.py
@@ -190,7 +190,7 @@ class PytorchModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in [
@@ -408,7 +408,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_family.model_name in [
             "baichuan-chat",
@@ -422,6 +422,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             "llama-2",
             "llama-2-chat",
             "internlm2-chat",
+            "qwen-vl-chat",
+            "yi-vl-chat",
         ]:
             return False
         if "chat" not in llm_family.model_ability: