xinference 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (37)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +10 -4
  3. xinference/core/model.py +2 -2
  4. xinference/fields.py +3 -1
  5. xinference/model/llm/ggml/chatglm.py +98 -13
  6. xinference/model/llm/ggml/llamacpp.py +49 -2
  7. xinference/model/llm/llm_family.json +132 -3
  8. xinference/model/llm/llm_family_modelscope.json +139 -3
  9. xinference/model/llm/pytorch/chatglm.py +48 -0
  10. xinference/model/llm/pytorch/core.py +23 -6
  11. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  12. xinference/model/llm/pytorch/internlm2.py +32 -1
  13. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  14. xinference/model/llm/pytorch/utils.py +38 -1
  15. xinference/model/llm/pytorch/yi_vl.py +42 -14
  16. xinference/model/llm/sglang/core.py +31 -9
  17. xinference/model/llm/utils.py +25 -5
  18. xinference/model/llm/vllm/core.py +82 -3
  19. xinference/types.py +10 -1
  20. xinference/web/ui/build/asset-manifest.json +3 -3
  21. xinference/web/ui/build/index.html +1 -1
  22. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  23. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  27. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/METADATA +3 -2
  28. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/RECORD +33 -33
  29. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  30. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  32. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  33. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  34. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  35. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  36. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  37. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
  version_json = '''
  {
- "date": "2024-05-11T17:30:18+0800",
+ "date": "2024-05-17T14:10:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "21be5abd6ff8411015a9b8862cbdb6b070bc2b1c",
- "version": "0.11.0"
+ "full-revisionid": "55a0200079eacf4fd6ee10c5868f0eaba244db29",
+ "version": "0.11.1"
  }
  ''' # END VERSION_JSON
 
xinference/core/chat_interface.py CHANGED
@@ -109,6 +109,7 @@ class GradioInterface:
  history: List[List[str]],
  max_tokens: int,
  temperature: float,
+ lora_name: str,
  ) -> Generator:
  from ..client import RESTfulClient
 
@@ -127,6 +128,7 @@
  "max_tokens": int(max_tokens),
  "temperature": temperature,
  "stream": True,
+ "lora_name": lora_name,
  },
  ):
  assert isinstance(chunk, dict)
@@ -152,6 +154,7 @@
  gr.Slider(
  minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
  ),
+ gr.Text(label="LoRA Name"),
  ],
  title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
  css="""
@@ -331,7 +334,7 @@
  history: hist,
  }
 
- def complete(text, hist, max_tokens, temperature) -> Generator:
+ def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
  from ..client import RESTfulClient
 
  client = RESTfulClient(self.endpoint)
@@ -349,6 +352,7 @@
  "max_tokens": max_tokens,
  "temperature": temperature,
  "stream": True,
+ "lora_name": lora_name,
  },
  ):
  assert isinstance(chunk, dict)
@@ -368,7 +372,7 @@
  history: hist,
  }
 
- def retry(text, hist, max_tokens, temperature) -> Generator:
+ def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
  from ..client import RESTfulClient
 
  client = RESTfulClient(self.endpoint)
@@ -387,6 +391,7 @@
  "max_tokens": max_tokens,
  "temperature": temperature,
  "stream": True,
+ "lora_name": lora_name,
  },
  ):
  assert isinstance(chunk, dict)
@@ -470,10 +475,11 @@
  temperature = gr.Slider(
  minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
  )
+ lora_name = gr.Text(label="LoRA Name")
 
  btn_generate.click(
  fn=complete,
- inputs=[textbox, history, length, temperature],
+ inputs=[textbox, history, length, temperature, lora_name],
  outputs=[textbox, history],
  )
 
@@ -485,7 +491,7 @@
 
  btn_retry.click(
  fn=retry,
- inputs=[textbox, history, length, temperature],
+ inputs=[textbox, history, length, temperature, lora_name],
  outputs=[textbox, history],
  )
 
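Note (not part of the diff): the new "LoRA Name" textbox is simply forwarded as `lora_name` inside `generate_config`. A minimal sketch of passing the same option through the REST client, assuming a model already launched on a local endpoint; the model UID and adapter name are placeholders:

    from xinference.client import RESTfulClient

    # Placeholder endpoint and model UID; adjust to your deployment.
    client = RESTfulClient("http://localhost:9997")
    model = client.get_model("my-model-uid")

    # lora_name rides along in generate_config, exactly as the Gradio UI now does.
    for chunk in model.chat(
        "Hello",
        generate_config={
            "max_tokens": 256,
            "temperature": 0.7,
            "stream": True,
            "lora_name": "my-lora",  # hypothetical adapter registered at launch time
        },
    ):
        assert isinstance(chunk, dict)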
xinference/core/model.py CHANGED
@@ -257,7 +257,7 @@ class ModelActor(xo.StatelessActor):
  for v in gen:
  if time_to_first_token is None:
  time_to_first_token = (time.time() - start_time) * 1000
- final_usage = v.pop("usage", None)
+ final_usage = v.get("usage", None)
  v = dict(data=json.dumps(v))
  yield sse_starlette.sse.ensure_bytes(v, None)
  except OutOfMemoryError:
@@ -289,7 +289,7 @@ class ModelActor(xo.StatelessActor):
  async for v in gen:
  if time_to_first_token is None:
  time_to_first_token = (time.time() - start_time) * 1000
- final_usage = v.pop("usage", None)
+ final_usage = v.get("usage", None)
  v = await asyncio.to_thread(json.dumps, v)
  v = dict(data=v) # noqa: F821
  yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
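Note (not part of the diff): the `pop` to `get` change matters because `dict.pop` stripped the "usage" key before the chunk was JSON-serialized for SSE, so streaming clients never saw the usage payload; `dict.get` records `final_usage` without mutating the chunk. A minimal illustration in plain Python:

    import json

    chunk = {"id": "cmpl-1", "choices": [], "usage": {"total_tokens": 3}}

    # Old behaviour: pop() removes "usage", so the serialized SSE payload loses it.
    popped = dict(chunk)
    popped.pop("usage", None)
    assert "usage" not in json.loads(json.dumps(popped))

    # New behaviour: get() reads it while leaving the chunk intact.
    final_usage = chunk.get("usage", None)
    assert final_usage is not None and "usage" in json.loads(json.dumps(chunk))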
xinference/fields.py CHANGED
@@ -75,7 +75,9 @@ stream_field = Field(
  )
 
  stream_option_field = Field(
- default={},
+ default={
+ "include_usage": False,
+ },
  description="If set, an additional chunk will be streamed before the `data: [DONE]` message.",
  )
 
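Note (not part of the diff): `stream_options` follows the OpenAI convention, so the extra usage chunk can be requested through xinference's OpenAI-compatible endpoint. A sketch assuming a recent `openai` Python client (1.26 or later) and a locally served model; the endpoint and model UID are placeholders:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-needed")

    stream = client.chat.completions.create(
        model="my-model-uid",  # placeholder model UID
        messages=[{"role": "user", "content": "Hi"}],
        stream=True,
        stream_options={"include_usage": True},  # ask for the extra usage chunk
    )
    for chunk in stream:
        if chunk.usage is not None:  # the chunk streamed before [DONE] carries usage
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)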
xinference/model/llm/ggml/chatglm.py CHANGED
@@ -108,10 +108,11 @@ class ChatglmCppChatModel(LLM):
 
  @staticmethod
  def _convert_raw_text_chunks_to_chat(
- tokens: Iterator[Any], model_name: str
+ tokens: Iterator[Any], model_name: str, include_usage: bool, input_ids
  ) -> Iterator[ChatCompletionChunk]:
+ request_id = str(uuid.uuid4())
  yield {
- "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+ "id": "chat" + f"cmpl-{request_id}",
  "model": model_name,
  "object": "chat.completion.chunk",
  "created": int(time.time()),
@@ -125,9 +126,13 @@
  }
  ],
  }
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
  for token in tokens:
+ prompt_tokens = len(input_ids)
+ completion_tokens = completion_tokens + 1
+ total_tokens = prompt_tokens + completion_tokens
  yield {
- "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+ "id": "chat" + f"cmpl-{request_id}",
  "model": model_name,
  "object": "chat.completion.chunk",
  "created": int(time.time()),
@@ -143,6 +148,35 @@
  }
  ],
  }
+ # stop
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "chat.completion.chunk",
+ "created": int(time.time()),
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "content": "",
+ },
+ "finish_reason": "stop",
+ }
+ ],
+ }
+ if include_usage:
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "chat.completion.chunk",
+ "created": int(time.time()),
+ "choices": [],
+ "usage": {
+ "prompt_tokens": prompt_tokens,
+ "completion_tokens": completion_tokens,
+ "total_tokens": total_tokens,
+ },
+ }
 
  @classmethod
  def _convert_raw_text_completion_to_chat(
@@ -273,7 +307,7 @@ class ChatglmCppChatModel(LLM):
 
  params = {
  "max_length": generate_config.get("max_tokens"),
- "max_context_length": generate_config.get("max_tokens"),
+ "max_context_length": generate_config.get("max_tokens", 1024),
  "top_k": generate_config.get("top_k"),
  "top_p": generate_config.get("top_p"),
  "temperature": generate_config.get("temperature"),
@@ -286,13 +320,27 @@
  assert self._llm is not None
  chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
 
- if generate_config["stream"]:
+ stream = generate_config.get("stream")
+ stream_options = generate_config.get("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+
+ if stream:
  it = self._llm.chat(
  chat_history_messages,
  **params,
  )
  assert not isinstance(it, str)
- return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
+ input_ids = self._llm.tokenizer.encode_messages(
+ chat_history_messages, params["max_context_length"]
+ )
+ return self._convert_raw_text_chunks_to_chat(
+ it, self.model_uid, include_usage, input_ids
+ )
+
  else:
  c = self._llm.chat(
  chat_history_messages,
@@ -320,11 +368,13 @@
 
  @staticmethod
  def _convert_str_to_completion_chunk(
- tokens: Iterator[str], model_name: str
+ tokens: Iterator[str], model_name: str, include_usage: bool, input_ids
  ) -> Iterator[CompletionChunk]:
- for token in tokens:
+ request_id = str(uuid.uuid4())
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+ for i, token in enumerate(tokens):
  yield {
- "id": "generate" + f"-{str(uuid.uuid4())}",
+ "id": "generate" + f"-{request_id}",
  "model": model_name,
  "object": "text_completion",
  "created": int(time.time()),
@@ -332,6 +382,32 @@
  {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
  ],
  }
+ prompt_tokens = len(input_ids)
+ completion_tokens = i
+ total_tokens = prompt_tokens + completion_tokens
+ # stop
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "text_completion",
+ "created": int(time.time()),
+ "choices": [
+ {"index": 0, "text": "", "finish_reason": "stop", "logprobs": None}
+ ],
+ }
+ if include_usage:
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "text_completion",
+ "created": int(time.time()),
+ "choices": [],
+ "usage": {
+ "prompt_tokens": prompt_tokens,
+ "completion_tokens": completion_tokens,
+ "total_tokens": total_tokens,
+ },
+ }
 
  def generate(
  self,
@@ -344,7 +420,7 @@ class ChatglmCppChatModel(LLM):
 
  params = {
  "max_length": generate_config.get("max_tokens"),
- "max_context_length": generate_config.get("max_tokens"),
+ "max_context_length": generate_config.get("max_tokens", 1024),
  "top_k": generate_config.get("top_k"),
  "top_p": generate_config.get("top_p"),
  "temperature": generate_config.get("temperature"),
@@ -355,14 +431,23 @@
  params = {k: v for k, v in params.items() if v is not None}
 
  assert self._llm is not None
-
- if generate_config["stream"]:
+ stream = generate_config.get("stream")
+ stream_options = generate_config.get("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+ if stream:
  it = self._llm.generate(
  prompt,
  **params,
  )
  assert not isinstance(it, str)
- return self._convert_str_to_completion_chunk(it, self.model_uid)
+ input_ids = self._llm.tokenizer.encode(prompt, params["max_context_length"])
+ return self._convert_str_to_completion_chunk(
+ it, self.model_uid, include_usage, input_ids
+ )
  else:
  c = self._llm.generate(
  prompt,
xinference/model/llm/ggml/llamacpp.py CHANGED
@@ -14,6 +14,7 @@
  import datetime
  import logging
  import os
+ import time
  from typing import Iterable, Iterator, List, Optional, Union
 
  from ....types import (
@@ -22,6 +23,7 @@ from ....types import (
  ChatCompletionMessage,
  Completion,
  CompletionChunk,
+ CompletionUsage,
  CreateCompletionLlamaCpp,
  Embedding,
  LlamaCppGenerateConfig,
@@ -100,6 +102,8 @@ class LlamaCppModel(LLM):
  generate_config = LlamaCppGenerateConfig(
  **CreateCompletionLlamaCpp(**generate_config).dict()
  )
+ # Currently, llama.cpp does not support lora
+ generate_config.pop("lora_name", None) # type: ignore
  return generate_config
 
  def _convert_ggml_to_gguf(self, model_path: str) -> str:
@@ -195,16 +199,59 @@
  _generate_config: LlamaCppGenerateConfig,
  ) -> Iterator[CompletionChunk]:
  assert self._llm is not None
- for _completion_chunk in self._llm(prompt=_prompt, **_generate_config):
+ prompt_token_ids: List[int] = (
+ (
+ self._llm.tokenize(prompt.encode("utf-8"), special=True)
+ if prompt != ""
+ else [self._llm.token_bos()]
+ )
+ if isinstance(prompt, str)
+ else prompt
+ )
+ prompt_tokens = len(prompt_token_ids)
+ completion_tokens, total_tokens = 0, 0
+ request_id = 0
+ for index, _completion_chunk in enumerate(
+ self._llm(prompt=_prompt, **_generate_config)
+ ):
+ request_id = _completion_chunk["id"]
+ choice = _completion_chunk["choices"][0]
+ if choice["finish_reason"] is not None:
+ completion_tokens = index
+ total_tokens = prompt_tokens + completion_tokens
+ _completion_chunk["usage"] = CompletionUsage(
+ prompt_tokens=total_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
  yield _completion_chunk
+ if include_usage:
+ chunk = CompletionChunk(
+ id=request_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[],
+ )
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield chunk
 
  logger.debug(
  "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
  )
 
  generate_config = self._sanitize_generate_config(generate_config)
-
  stream = generate_config.get("stream", False)
+ stream_options = generate_config.pop("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
 
  if not stream:
  assert self._llm is not None
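Note (not part of the diff): `prompt_tokens` is obtained by tokenizing the prompt with llama-cpp-python itself, the same call the new code uses. A minimal sketch, assuming llama-cpp-python is installed; the GGUF path is a placeholder:

    from llama_cpp import Llama

    # vocab_only loads only the tokenizer/vocabulary; the path is a placeholder.
    llm = Llama(model_path="/path/to/model.gguf", vocab_only=True)

    prompt = "Hello, world"
    prompt_token_ids = (
        llm.tokenize(prompt.encode("utf-8"), special=True)
        if prompt != ""
        else [llm.token_bos()]
    )
    print(len(prompt_token_ids))  # becomes prompt_tokens in the usage chunk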
xinference/model/llm/llm_family.json CHANGED
@@ -3651,7 +3651,7 @@
  },
  {
  "version": 1,
- "context_length": 204800,
+ "context_length": 262144,
  "model_name": "Yi-200k",
  "model_lang": [
  "en",
@@ -3688,7 +3688,7 @@
  },
  {
  "version": 1,
- "context_length": 204800,
+ "context_length": 4096,
  "model_name": "Yi-chat",
  "model_lang": [
  "en",
@@ -3707,6 +3707,17 @@
  ],
  "model_id": "01-ai/Yi-34B-Chat-{quantization}"
  },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-6B-Chat",
+ "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 34,
@@ -3762,6 +3773,124 @@
  ]
  }
  },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "Yi-1.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-6B",
+ "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-9B",
+ "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-34B",
+ "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "Yi-1.5-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-6B-Chat",
+ "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-9B-Chat",
+ "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-34B-Chat",
+ "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "CHATML",
+ "system_prompt": "",
+ "roles": [
+ "<|im_start|>user",
+ "<|im_start|>assistant"
+ ],
+ "intra_message_sep": "<|im_end|>",
+ "inter_message_sep": "",
+ "stop_token_ids": [
+ 2,
+ 6,
+ 7,
+ 8
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|im_sep|>"
+ ]
+ }
+ },
  {
  "version": 1,
  "context_length": 2048,
@@ -4684,7 +4813,7 @@
  },
  {
  "version": 1,
- "context_length": 204800,
+ "context_length": 4096,
  "model_name": "yi-vl-chat",
  "model_lang": [
  "en",