xinference 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +10 -4
- xinference/core/model.py +2 -2
- xinference/fields.py +3 -1
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/llm_family.json +132 -3
- xinference/model/llm/llm_family_modelscope.json +139 -3
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +35 -9
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +38 -11
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +42 -14
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +25 -5
- xinference/model/llm/vllm/core.py +82 -3
- xinference/types.py +10 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/METADATA +3 -2
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/RECORD +33 -33
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
- /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

version_json = '''
{
- "date": "2024-05-
+ "date": "2024-05-17T14:10:09+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "0.11.
+ "full-revisionid": "55a0200079eacf4fd6ee10c5868f0eaba244db29",
+ "version": "0.11.1"
}
''' # END VERSION_JSON

xinference/core/chat_interface.py
CHANGED

@@ -109,6 +109,7 @@ class GradioInterface:
            history: List[List[str]],
            max_tokens: int,
            temperature: float,
+           lora_name: str,
        ) -> Generator:
            from ..client import RESTfulClient

@@ -127,6 +128,7 @@ class GradioInterface:
                    "max_tokens": int(max_tokens),
                    "temperature": temperature,
                    "stream": True,
+                   "lora_name": lora_name,
                },
            ):
                assert isinstance(chunk, dict)

@@ -152,6 +154,7 @@ class GradioInterface:
                gr.Slider(
                    minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                ),
+               gr.Text(label="LoRA Name"),
            ],
            title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
            css="""

@@ -331,7 +334,7 @@ class GradioInterface:
                    history: hist,
                }

-       def complete(text, hist, max_tokens, temperature) -> Generator:
+       def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
            from ..client import RESTfulClient

            client = RESTfulClient(self.endpoint)

@@ -349,6 +352,7 @@ class GradioInterface:
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                    "stream": True,
+                   "lora_name": lora_name,
                },
            ):
                assert isinstance(chunk, dict)

@@ -368,7 +372,7 @@ class GradioInterface:
                    history: hist,
                }

-       def retry(text, hist, max_tokens, temperature) -> Generator:
+       def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
            from ..client import RESTfulClient

            client = RESTfulClient(self.endpoint)

@@ -387,6 +391,7 @@ class GradioInterface:
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                    "stream": True,
+                   "lora_name": lora_name,
                },
            ):
                assert isinstance(chunk, dict)

@@ -470,10 +475,11 @@ class GradioInterface:
            temperature = gr.Slider(
                minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
            )
+           lora_name = gr.Text(label="LoRA Name")

            btn_generate.click(
                fn=complete,
-               inputs=[textbox, history, length, temperature],
+               inputs=[textbox, history, length, temperature, lora_name],
                outputs=[textbox, history],
            )

@@ -485,7 +491,7 @@ class GradioInterface:

            btn_retry.click(
                fn=retry,
-               inputs=[textbox, history, length, temperature],
+               inputs=[textbox, history, length, temperature, lora_name],
                outputs=[textbox, history],
            )

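The Gradio changes above only thread a new lora_name value into the generate_config that is sent to the server. Below is a minimal sketch of passing the same key from client code, assuming the usual RESTfulClient workflow and that streamed chat chunks arrive as plain dicts in the ChatCompletionChunk shape; the endpoint, model UID, and adapter name are placeholders, not values from this release.

# Hypothetical client-side use of the new "lora_name" generate_config key.
# Endpoint, model UID, and adapter name are placeholders.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("my-chat-model-uid")

for chunk in model.chat(
    prompt="Summarize this release in one sentence.",
    generate_config={
        "max_tokens": 256,
        "temperature": 0.7,
        "stream": True,
        "lora_name": "my-adapter",  # which launched LoRA adapter to apply
    },
):
    # Each streamed chunk is expected to be a dict with a "choices" delta.
    print(chunk["choices"][0]["delta"].get("content", ""), end="")
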
xinference/core/model.py
CHANGED
@@ -257,7 +257,7 @@ class ModelActor(xo.StatelessActor):
                for v in gen:
                    if time_to_first_token is None:
                        time_to_first_token = (time.time() - start_time) * 1000
-                   final_usage = v.
+                   final_usage = v.get("usage", None)
                    v = dict(data=json.dumps(v))
                    yield sse_starlette.sse.ensure_bytes(v, None)
            except OutOfMemoryError:

@@ -289,7 +289,7 @@ class ModelActor(xo.StatelessActor):
                async for v in gen:
                    if time_to_first_token is None:
                        time_to_first_token = (time.time() - start_time) * 1000
-                   final_usage = v.
+                   final_usage = v.get("usage", None)
                    v = await asyncio.to_thread(json.dumps, v)
                    v = dict(data=v)  # noqa: F821
                    yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)

xinference/fields.py
CHANGED

xinference/model/llm/ggml/chatglm.py
CHANGED

@@ -108,10 +108,11 @@ class ChatglmCppChatModel(LLM):

    @staticmethod
    def _convert_raw_text_chunks_to_chat(
-       tokens: Iterator[Any], model_name: str
+       tokens: Iterator[Any], model_name: str, include_usage: bool, input_ids
    ) -> Iterator[ChatCompletionChunk]:
+       request_id = str(uuid.uuid4())
        yield {
-           "id": "chat" + f"cmpl-{
+           "id": "chat" + f"cmpl-{request_id}",
            "model": model_name,
            "object": "chat.completion.chunk",
            "created": int(time.time()),

@@ -125,9 +126,13 @@ class ChatglmCppChatModel(LLM):
                }
            ],
        }
+       prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
        for token in tokens:
+           prompt_tokens = len(input_ids)
+           completion_tokens = completion_tokens + 1
+           total_tokens = prompt_tokens + completion_tokens
            yield {
-               "id": "chat" + f"cmpl-{
+               "id": "chat" + f"cmpl-{request_id}",
                "model": model_name,
                "object": "chat.completion.chunk",
                "created": int(time.time()),

@@ -143,6 +148,35 @@ class ChatglmCppChatModel(LLM):
                }
            ],
        }
+       # stop
+       yield {
+           "id": "chat" + f"cmpl-{request_id}",
+           "model": model_name,
+           "object": "chat.completion.chunk",
+           "created": int(time.time()),
+           "choices": [
+               {
+                   "index": 0,
+                   "delta": {
+                       "content": "",
+                   },
+                   "finish_reason": "stop",
+               }
+           ],
+       }
+       if include_usage:
+           yield {
+               "id": "chat" + f"cmpl-{request_id}",
+               "model": model_name,
+               "object": "chat.completion.chunk",
+               "created": int(time.time()),
+               "choices": [],
+               "usage": {
+                   "prompt_tokens": prompt_tokens,
+                   "completion_tokens": completion_tokens,
+                   "total_tokens": total_tokens,
+               },
+           }

    @classmethod
    def _convert_raw_text_completion_to_chat(

@@ -273,7 +307,7 @@ class ChatglmCppChatModel(LLM):

        params = {
            "max_length": generate_config.get("max_tokens"),
-           "max_context_length": generate_config.get("max_tokens"),
+           "max_context_length": generate_config.get("max_tokens", 1024),
            "top_k": generate_config.get("top_k"),
            "top_p": generate_config.get("top_p"),
            "temperature": generate_config.get("temperature"),

@@ -286,13 +320,27 @@ class ChatglmCppChatModel(LLM):
        assert self._llm is not None
        chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)

-
+       stream = generate_config.get("stream")
+       stream_options = generate_config.get("stream_options", None)
+       include_usage = (
+           stream_options["include_usage"]
+           if isinstance(stream_options, dict)
+           else False
+       )
+
+       if stream:
            it = self._llm.chat(
                chat_history_messages,
                **params,
            )
            assert not isinstance(it, str)
-
+           input_ids = self._llm.tokenizer.encode_messages(
+               chat_history_messages, params["max_context_length"]
+           )
+           return self._convert_raw_text_chunks_to_chat(
+               it, self.model_uid, include_usage, input_ids
+           )
+
        else:
            c = self._llm.chat(
                chat_history_messages,

@@ -320,11 +368,13 @@ class ChatglmCppChatModel(LLM):

    @staticmethod
    def _convert_str_to_completion_chunk(
-       tokens: Iterator[str], model_name: str
+       tokens: Iterator[str], model_name: str, include_usage: bool, input_ids
    ) -> Iterator[CompletionChunk]:
-
+       request_id = str(uuid.uuid4())
+       prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+       for i, token in enumerate(tokens):
            yield {
-               "id": "generate" + f"-{
+               "id": "generate" + f"-{request_id}",
                "model": model_name,
                "object": "text_completion",
                "created": int(time.time()),

@@ -332,6 +382,32 @@ class ChatglmCppChatModel(LLM):
                    {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
                ],
            }
+           prompt_tokens = len(input_ids)
+           completion_tokens = i
+           total_tokens = prompt_tokens + completion_tokens
+       # stop
+       yield {
+           "id": "chat" + f"cmpl-{request_id}",
+           "model": model_name,
+           "object": "text_completion",
+           "created": int(time.time()),
+           "choices": [
+               {"index": 0, "text": "", "finish_reason": "stop", "logprobs": None}
+           ],
+       }
+       if include_usage:
+           yield {
+               "id": "chat" + f"cmpl-{request_id}",
+               "model": model_name,
+               "object": "text_completion",
+               "created": int(time.time()),
+               "choices": [],
+               "usage": {
+                   "prompt_tokens": prompt_tokens,
+                   "completion_tokens": completion_tokens,
+                   "total_tokens": total_tokens,
+               },
+           }

    def generate(
        self,

@@ -344,7 +420,7 @@ class ChatglmCppChatModel(LLM):

        params = {
            "max_length": generate_config.get("max_tokens"),
-           "max_context_length": generate_config.get("max_tokens"),
+           "max_context_length": generate_config.get("max_tokens", 1024),
            "top_k": generate_config.get("top_k"),
            "top_p": generate_config.get("top_p"),
            "temperature": generate_config.get("temperature"),

@@ -355,14 +431,23 @@ class ChatglmCppChatModel(LLM):
        params = {k: v for k, v in params.items() if v is not None}

        assert self._llm is not None
-
-
+       stream = generate_config.get("stream")
+       stream_options = generate_config.get("stream_options", None)
+       include_usage = (
+           stream_options["include_usage"]
+           if isinstance(stream_options, dict)
+           else False
+       )
+       if stream:
            it = self._llm.generate(
                prompt,
                **params,
            )
            assert not isinstance(it, str)
-
+           input_ids = self._llm.tokenizer.encode(prompt, params["max_context_length"])
+           return self._convert_str_to_completion_chunk(
+               it, self.model_uid, include_usage, input_ids
+           )
        else:
            c = self._llm.generate(
                prompt,

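The chatglm.cpp backend above (and the llama.cpp backend below) now honors the OpenAI-style stream_options={"include_usage": true} flag: when it is set, the stream ends with an extra chunk whose "choices" list is empty and which carries a "usage" dict. Below is a rough sketch of reading that chunk through the RESTful client, assuming the usage chunk is surfaced unchanged by the REST layer; the endpoint and model UID are placeholders.

# Hypothetical sketch: request token usage at the end of a streamed generation.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # placeholder endpoint
model = client.get_model("my-generate-model-uid")  # placeholder model UID

for chunk in model.generate(
    prompt="Tell me a short joke.",
    generate_config={
        "max_tokens": 128,
        "stream": True,
        "stream_options": {"include_usage": True},
    },
):
    if chunk.get("choices"):
        print(chunk["choices"][0]["text"], end="")
    elif "usage" in chunk:
        # Final usage-only chunk added in this release.
        print("\n", chunk["usage"])
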
xinference/model/llm/ggml/llamacpp.py
CHANGED

@@ -14,6 +14,7 @@
import datetime
import logging
import os
+import time
from typing import Iterable, Iterator, List, Optional, Union

from ....types import (

@@ -22,6 +23,7 @@ from ....types import (
    ChatCompletionMessage,
    Completion,
    CompletionChunk,
+   CompletionUsage,
    CreateCompletionLlamaCpp,
    Embedding,
    LlamaCppGenerateConfig,

@@ -100,6 +102,8 @@ class LlamaCppModel(LLM):
            generate_config = LlamaCppGenerateConfig(
                **CreateCompletionLlamaCpp(**generate_config).dict()
            )
+       # Currently, llama.cpp does not support lora
+       generate_config.pop("lora_name", None)  # type: ignore
        return generate_config

    def _convert_ggml_to_gguf(self, model_path: str) -> str:

@@ -195,16 +199,59 @@
            _generate_config: LlamaCppGenerateConfig,
        ) -> Iterator[CompletionChunk]:
            assert self._llm is not None
-
+           prompt_token_ids: List[int] = (
+               (
+                   self._llm.tokenize(prompt.encode("utf-8"), special=True)
+                   if prompt != ""
+                   else [self._llm.token_bos()]
+               )
+               if isinstance(prompt, str)
+               else prompt
+           )
+           prompt_tokens = len(prompt_token_ids)
+           completion_tokens, total_tokens = 0, 0
+           request_id = 0
+           for index, _completion_chunk in enumerate(
+               self._llm(prompt=_prompt, **_generate_config)
+           ):
+               request_id = _completion_chunk["id"]
+               choice = _completion_chunk["choices"][0]
+               if choice["finish_reason"] is not None:
+                   completion_tokens = index
+                   total_tokens = prompt_tokens + completion_tokens
+                   _completion_chunk["usage"] = CompletionUsage(
+                       prompt_tokens=total_tokens,
+                       completion_tokens=completion_tokens,
+                       total_tokens=total_tokens,
+                   )
                yield _completion_chunk
+           if include_usage:
+               chunk = CompletionChunk(
+                   id=request_id,
+                   object="text_completion",
+                   created=int(time.time()),
+                   model=self.model_uid,
+                   choices=[],
+               )
+               chunk["usage"] = CompletionUsage(
+                   prompt_tokens=prompt_tokens,
+                   completion_tokens=completion_tokens,
+                   total_tokens=total_tokens,
+               )
+               yield chunk

        logger.debug(
            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
        )

        generate_config = self._sanitize_generate_config(generate_config)
-
        stream = generate_config.get("stream", False)
+       stream_options = generate_config.pop("stream_options", None)
+       include_usage = (
+           stream_options["include_usage"]
+           if isinstance(stream_options, dict)
+           else False
+       )

        if not stream:
            assert self._llm is not None

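The llama.cpp hunk above computes prompt_tokens by tokenizing the prompt up front, falling back to a single BOS token when the prompt is empty. A small sketch of the same accounting done with llama-cpp-python directly, assuming a local GGUF file (the model path is a placeholder):

# Hypothetical sketch of the prompt-token accounting used above,
# using llama-cpp-python directly; the model path is a placeholder.
from llama_cpp import Llama

llm = Llama(model_path="/path/to/model.gguf")

prompt = "Hello, world"
prompt_token_ids = (
    llm.tokenize(prompt.encode("utf-8"), special=True)
    if prompt != ""
    else [llm.token_bos()]
)
print("prompt_tokens =", len(prompt_token_ids))
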
xinference/model/llm/llm_family.json
CHANGED

@@ -3651,7 +3651,7 @@
  },
  {
    "version": 1,
-   "context_length":
+   "context_length": 262144,
    "model_name": "Yi-200k",
    "model_lang": [
      "en",

@@ -3688,7 +3688,7 @@
  },
  {
    "version": 1,
-   "context_length":
+   "context_length": 4096,
    "model_name": "Yi-chat",
    "model_lang": [
      "en",

@@ -3707,6 +3707,17 @@
      ],
      "model_id": "01-ai/Yi-34B-Chat-{quantization}"
    },
+   {
+     "model_format": "pytorch",
+     "model_size_in_billions": 6,
+     "quantizations": [
+       "4-bit",
+       "8-bit",
+       "none"
+     ],
+     "model_id": "01-ai/Yi-6B-Chat",
+     "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b"
+   },
    {
      "model_format": "pytorch",
      "model_size_in_billions": 34,

@@ -3762,6 +3773,124 @@
      ]
    }
  },
+ {
+   "version": 1,
+   "context_length": 4096,
+   "model_name": "Yi-1.5",
+   "model_lang": [
+     "en",
+     "zh"
+   ],
+   "model_ability": [
+     "generate"
+   ],
+   "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+   "model_specs": [
+     {
+       "model_format": "pytorch",
+       "model_size_in_billions": 6,
+       "quantizations": [
+         "4-bit",
+         "8-bit",
+         "none"
+       ],
+       "model_id": "01-ai/Yi-1.5-6B",
+       "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886"
+     },
+     {
+       "model_format": "pytorch",
+       "model_size_in_billions": 9,
+       "quantizations": [
+         "4-bit",
+         "8-bit",
+         "none"
+       ],
+       "model_id": "01-ai/Yi-1.5-9B",
+       "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2"
+     },
+     {
+       "model_format": "pytorch",
+       "model_size_in_billions": 34,
+       "quantizations": [
+         "4-bit",
+         "8-bit",
+         "none"
+       ],
+       "model_id": "01-ai/Yi-1.5-34B",
+       "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5"
+     }
+   ]
+ },
+ {
+   "version": 1,
+   "context_length": 4096,
+   "model_name": "Yi-1.5-chat",
+   "model_lang": [
+     "en",
+     "zh"
+   ],
+   "model_ability": [
+     "chat"
+   ],
+   "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+   "model_specs": [
+     {
+       "model_format": "pytorch",
+       "model_size_in_billions": 6,
+       "quantizations": [
+         "4-bit",
+         "8-bit",
+         "none"
+       ],
+       "model_id": "01-ai/Yi-1.5-6B-Chat",
+       "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080"
+     },
+     {
+       "model_format": "pytorch",
+       "model_size_in_billions": 9,
+       "quantizations": [
+         "4-bit",
+         "8-bit",
+         "none"
+       ],
+       "model_id": "01-ai/Yi-1.5-9B-Chat",
+       "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6"
+     },
+     {
+       "model_format": "pytorch",
+       "model_size_in_billions": 34,
+       "quantizations": [
+         "4-bit",
+         "8-bit",
+         "none"
+       ],
+       "model_id": "01-ai/Yi-1.5-34B-Chat",
+       "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e"
+     }
+   ],
+   "prompt_style": {
+     "style_name": "CHATML",
+     "system_prompt": "",
+     "roles": [
+       "<|im_start|>user",
+       "<|im_start|>assistant"
+     ],
+     "intra_message_sep": "<|im_end|>",
+     "inter_message_sep": "",
+     "stop_token_ids": [
+       2,
+       6,
+       7,
+       8
+     ],
+     "stop": [
+       "<|endoftext|>",
+       "<|im_start|>",
+       "<|im_end|>",
+       "<|im_sep|>"
+     ]
+   }
+ },
  {
    "version": 1,
    "context_length": 2048,

@@ -4684,7 +4813,7 @@
  },
  {
    "version": 1,
-   "context_length":
+   "context_length": 4096,
    "model_name": "yi-vl-chat",
    "model_lang": [
      "en",