xinference 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/METADATA +12 -15
  46. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
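
Note on the rename entries above: the xinference.model.llm.pytorch package is relocated to xinference.model.llm.transformers, and ggml/llamacpp.py becomes llama_cpp/core.py. A minimal sketch of the resulting import change for downstream code, assuming the 0.14.1 class names survive the move unchanged (PytorchChatModel here is illustrative, not confirmed by this diff):

    # Hypothetical downstream import touched by the 0.14.2 relocation.
    # 0.14.1: from xinference.model.llm.pytorch.core import PytorchChatModel
    # 0.14.2: same module, now under "transformers"
    from xinference.model.llm.transformers.core import PytorchChatModel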
@@ -1,103 +1,4 @@
 [
-  {
-    "version": 1,
-    "context_length": 4096,
-    "model_name": "baichuan",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "Baichuan is an open-source Transformer based LLM that is trained on both Chinese and English data.",
-    "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/baichuan-llama-7B-GGML",
-        "model_file_name_template": "baichuan-llama-7b.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "baichuan-inc/Baichuan-7B",
-        "model_revision": "c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "baichuan-inc/Baichuan-13B-Base",
-        "model_revision": "0ef0739c7bdd34df954003ef76d80f3dabca2ff9"
-      }
-    ]
-  },
-  {
-    "version": 1,
-    "context_length": 4096,
-    "model_name": "baichuan-chat",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Baichuan-chat is a fine-tuned version of the Baichuan LLM, specializing in chatting.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "baichuan-inc/Baichuan-13B-Chat",
-        "model_revision": "19ef51ba5bad8935b03acd20ff04a269210983bc"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "NO_COLON_TWO",
-      "system_prompt": "",
-      "roles": [
-        " <reserved_102> ",
-        " <reserved_103> "
-      ],
-      "intra_message_sep": "",
-      "inter_message_sep": "</s>",
-      "stop_token_ids": [
-        2,
-        195
-      ]
-    }
-  },
   {
     "version": 1,
     "context_length": 8194,
@@ -164,258 +65,6 @@
       ]
     }
   },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "wizardlm-v1.0",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "WizardLM is an open-source LLM trained by fine-tuning LLaMA with Evol-Instruct.",
-    "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/WizardLM-7B-V1.0-Uncensored-GGML",
-        "model_file_name_template": "wizardlm-7b-v1.0-uncensored.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/WizardLM-13B-V1.0-Uncensored-GGML",
-        "model_file_name_template": "wizardlm-13b-v1.0-uncensored.ggmlv3.{quantization}.bin"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_SINGLE",
-      "system_prompt": "You are a helpful AI assistant.",
-      "roles": [
-        "USER",
-        "ASSISTANT"
-      ],
-      "intra_message_sep": "\n"
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "vicuna-v1.3",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Vicuna is an open-source LLM trained by fine-tuning LLaMA on data collected from ShareGPT.",
-    "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/vicuna-7B-v1.3-GGML",
-        "model_file_name_template": "vicuna-7b-v1.3.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/vicuna-13b-v1.3.0-GGML",
-        "model_file_name_template": "vicuna-13b-v1.3.0.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 33,
-        "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/vicuna-33B-GGML",
-        "model_file_name_template": "vicuna-33b.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 33,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-33b-v1.3",
-        "model_revision": "ef8d6becf883fb3ce52e3706885f761819477ab4"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-13b-v1.3",
-        "model_revision": "6566e9cb1787585d1147dcf4f9bc48f29e1328d2"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-7b-v1.3",
-        "model_revision": "236eeeab96f0dc2e463f2bebb7bb49809279c6d6"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_TWO",
-      "system_prompt": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
-      "roles": [
-        "USER",
-        "ASSISTANT"
-      ],
-      "intra_message_sep": " ",
-      "inter_message_sep": "</s>"
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "orca",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Orca is an LLM trained by fine-tuning LLaMA on explanation traces obtained from GPT-4.",
-    "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 3,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/orca_mini_3B-GGML",
-        "model_file_name_template": "orca-mini-3b.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/orca_mini_7B-GGML",
-        "model_file_name_template": "orca-mini-7b.ggmlv3.{quantization}.bin"
-      },
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/orca_mini_13B-GGML",
-        "model_file_name_template": "orca-mini-13b.ggmlv3.{quantization}.bin"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_SINGLE",
-      "system_prompt": "You are an AI assistant that follows instruction extremely well. Help as much as you can.",
-      "roles": [
-        "User",
-        "Response"
-      ],
-      "intra_message_sep": "\n\n### "
-    }
-  },
   {
     "version": 1,
     "context_length": 2048,
@@ -561,111 +210,6 @@
       ]
     }
   },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "chatglm",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "THUDM/chatglm-6b",
-        "model_revision": "8b7d33596d18c5e83e2da052d05ca4db02e60620"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "CHATGLM",
-      "system_prompt": "",
-      "roles": [
-        "问",
-        "答"
-      ],
-      "intra_message_sep": "\n"
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 8192,
-    "model_name": "chatglm2",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "THUDM/chatglm2-6b",
-        "model_revision": "7fabe56db91e085c9c027f56f1c654d137bdba40"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "CHATGLM",
-      "system_prompt": "",
-      "roles": [
-        "问",
-        "答"
-      ],
-      "intra_message_sep": "\n\n"
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 32768,
-    "model_name": "chatglm2-32k",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "ChatGLM2-32k is a special version of ChatGLM2, with a context window of 32k tokens instead of 8k.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "THUDM/chatglm2-6b-32k",
-        "model_revision": "a2065f5dc8253f036a209e642d7220a942d92765"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "CHATGLM",
-      "system_prompt": "",
-      "roles": [
-        "问",
-        "答"
-      ],
-      "intra_message_sep": "\n\n"
-    }
-  },
   {
     "version": 1,
     "context_length": 8192,
@@ -819,7 +363,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4-9b-chat",
-        "model_revision": "76f3474a854145aa4a9ed2612fee9bc8d4a8966b"
+        "model_revision": "aae8bd74af5c6dff63a49d7fbdcc89349ebf87aa"
       },
       {
         "model_format": "ggufv2",
@@ -890,7 +434,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4-9b-chat-1m",
-        "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+        "model_revision": "0aa722c7e0745dd21453427dd44c257dd253304f"
       },
       {
         "model_format": "ggufv2",
@@ -1148,70 +692,73 @@
     "model_description": "Llama-2-Chat is a fine-tuned version of the Llama-2 LLM, specializing in chatting.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 7,
         "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
         ],
-        "model_id": "TheBloke/Llama-2-7B-Chat-GGML",
-        "model_file_name_template": "llama-2-7b-chat.ggmlv3.{quantization}.bin"
+        "model_id": "TheBloke/Llama-2-7B-Chat-GGUF",
+        "model_file_name_template": "llama-2-7b-chat.{quantization}.gguf"
       },
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 13,
         "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
         ],
-        "model_id": "TheBloke/Llama-2-13B-chat-GGML",
-        "model_file_name_template": "llama-2-13b-chat.ggmlv3.{quantization}.bin"
+        "model_id": "TheBloke/Llama-2-13B-chat-GGUF",
+        "model_file_name_template": "llama-2-13b-chat.{quantization}.gguf"
       },
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 70,
         "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M"
         ],
-        "model_id": "TheBloke/Llama-2-70B-Chat-GGML",
-        "model_file_name_template": "llama-2-70b-chat.ggmlv3.{quantization}.bin"
+        "quantization_parts": {
+          "Q6_K": [
+            "split-a",
+            "split-b"
+          ],
+          "Q8_0": [
+            "split-a",
+            "split-b"
+          ]
+        },
+        "model_id": "TheBloke/Llama-2-70B-Chat-GGUF",
+        "model_file_name_template": "llama-2-70b-chat.{quantization}.gguf",
+        "model_file_name_split_template": "llama-2-70b-chat.{quantization}.gguf-{part}"
       },
       {
         "model_format": "pytorch",
@@ -1293,64 +840,6 @@
         ],
         "model_id": "meta-llama/Llama-2-70b-chat-hf",
         "model_revision": "36d9a7388cc80e5f4b3e9701ca2f250d21a96c30"
-      },
-      {
-        "model_format": "ggufv2",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "Q2_K",
-          "Q3_K_S",
-          "Q3_K_M",
-          "Q3_K_L",
-          "Q4_0",
-          "Q4_K_S",
-          "Q4_K_M",
-          "Q5_0",
-          "Q5_K_S",
-          "Q5_K_M",
-          "Q6_K",
-          "Q8_0"
-        ],
-        "model_id": "TheBloke/Llama-2-7B-Chat-GGUF",
-        "model_file_name_template": "llama-2-7b-chat.{quantization}.gguf"
-      },
-      {
-        "model_format": "ggufv2",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "Q2_K",
-          "Q3_K_S",
-          "Q3_K_M",
-          "Q3_K_L",
-          "Q4_0",
-          "Q4_K_S",
-          "Q4_K_M",
-          "Q5_0",
-          "Q5_K_S",
-          "Q5_K_M",
-          "Q6_K",
-          "Q8_0"
-        ],
-        "model_id": "TheBloke/Llama-2-13B-chat-GGUF",
-        "model_file_name_template": "llama-2-13b-chat.{quantization}.gguf"
-      },
-      {
-        "model_format": "ggufv2",
-        "model_size_in_billions": 70,
-        "quantizations": [
-          "Q2_K",
-          "Q3_K_S",
-          "Q3_K_M",
-          "Q3_K_L",
-          "Q4_0",
-          "Q4_K_S",
-          "Q4_K_M",
-          "Q5_0",
-          "Q5_K_S",
-          "Q5_K_M"
-        ],
-        "model_id": "TheBloke/Llama-2-70B-Chat-GGUF",
-        "model_file_name_template": "llama-2-70b-chat.{quantization}.gguf"
       }
     ],
     "prompt_style": {
@@ -1383,26 +872,24 @@
     "model_description": "Llama-2 is the second generation of Llama, open-source and trained on a larger amount of data.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 7,
         "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
         ],
-        "model_id": "TheBloke/Llama-2-7B-GGML",
-        "model_file_name_template": "llama-2-7b.ggmlv3.{quantization}.bin"
+        "model_id": "TheBloke/Llama-2-7B-GGUF",
+        "model_file_name_template": "llama-2-7b.{quantization}.gguf"
       },
       {
         "model_format": "gptq",
@@ -1421,48 +908,53 @@
         "model_id": "TheBloke/Llama-2-7B-AWQ"
       },
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 13,
         "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
         ],
-        "model_id": "TheBloke/Llama-2-13B-GGML",
-        "model_file_name_template": "llama-2-13b.ggmlv3.{quantization}.bin"
+        "model_id": "TheBloke/Llama-2-13B-GGUF",
+        "model_file_name_template": "llama-2-13b.{quantization}.gguf"
       },
       {
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "model_size_in_billions": 70,
         "quantizations": [
-          "q2_K",
-          "q3_K_L",
-          "q3_K_M",
-          "q3_K_S",
-          "q4_0",
-          "q4_1",
-          "q4_K_M",
-          "q4_K_S",
-          "q5_0",
-          "q5_1",
-          "q5_K_M",
-          "q5_K_S",
-          "q6_K",
-          "q8_0"
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M"
         ],
-        "model_id": "TheBloke/Llama-2-70B-GGML",
-        "model_file_name_template": "llama-2-70b.ggmlv3.{quantization}.bin"
+        "quantization_parts": {
+          "Q6_K": [
+            "split-a",
+            "split-b"
+          ],
+          "Q8_0": [
+            "split-a",
+            "split-b"
+          ]
+        },
+        "model_id": "TheBloke/Llama-2-70B-GGUF",
+        "model_file_name_template": "llama-2-70b.{quantization}.gguf",
+        "model_file_name_split_template": "llama-2-70b.{quantization}.gguf-{part}"
       },
       {
         "model_format": "pytorch",
@@ -2026,199 +1518,36 @@
         128001,
         128009
       ],
-      "stop": [
-        "<|end_of_text|>",
-        "<|eot_id|>"
-      ]
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "opt",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "Opt is an open-source, decoder-only, Transformer based LLM that was designed to replicate GPT-3.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 1,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "facebook/opt-125m",
-        "model_revision": "3d2b5f275bdf882b8775f902e1bfdb790e2cfc32"
-      }
-    ]
-  },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "falcon",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "Falcon is an open-source Transformer based LLM trained on the RefinedWeb dataset.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 40,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "tiiuae/falcon-40b",
-        "model_revision": "561820f7eef0cc56a31ea38af15ca1acb07fab5d"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "tiiuae/falcon-7b",
-        "model_revision": "378337427557d1df3e742264a2901a49f25d4eb1"
-      }
-    ]
-  },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "falcon-instruct",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "tiiuae/falcon-7b-instruct",
-        "model_revision": "eb410fb6ffa9028e97adb801f0d6ec46d02f8b07"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 40,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "tiiuae/falcon-40b-instruct",
-        "model_revision": "ca78eac0ed45bf64445ff0687fabba1598daebf3"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "FALCON",
-      "system_prompt": "",
-      "roles": [
-        "User",
-        "Assistant"
-      ],
-      "intra_message_sep": "\n",
-      "inter_message_sep": "<|endoftext|>",
-      "stop": [
-        "\nUser"
-      ],
-      "stop_token_ids": [
-        0,
-        1,
-        2,
-        3,
-        4,
-        5,
-        6,
-        7,
-        8,
-        9,
-        10,
-        11
-      ]
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 8192,
-    "model_name": "starcoderplus",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "Starcoderplus is an open-source LLM trained by fine-tuning Starcoder on RedefinedWeb and StarCoderData datasets.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 16,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "bigcode/starcoderplus",
-        "model_revision": "95be82087c33f14ee9941c812a154a9dd66efe72"
-      }
-    ],
-    "prompt_style": null
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
   },
   {
     "version": 1,
-    "context_length": 8192,
-    "model_name": "starchat-beta",
+    "context_length": 2048,
+    "model_name": "opt",
     "model_lang": [
       "en"
     ],
     "model_ability": [
-      "chat"
+      "generate"
     ],
-    "model_description": "Starchat-beta is a fine-tuned version of the Starcoderplus LLM, specializing in coding assistance.",
+    "model_description": "Opt is an open-source, decoder-only, Transformer based LLM that was designed to replicate GPT-3.",
     "model_specs": [
       {
         "model_format": "pytorch",
-        "model_size_in_billions": 16,
+        "model_size_in_billions": 1,
         "quantizations": [
           "4-bit",
           "8-bit",
           "none"
         ],
-        "model_id": "HuggingFaceH4/starchat-beta",
-        "model_revision": "b1bcda690655777373f57ea6614eb095ec2c886f"
+        "model_id": "facebook/opt-125m",
+        "model_revision": "3d2b5f275bdf882b8775f902e1bfdb790e2cfc32"
       }
-    ],
-    "prompt_style": {
-      "style_name": "CHATML",
-      "system_prompt": "<system>{system_message}\n",
-      "roles": [
-        "<|user|>",
-        "<|assistant|>"
-      ],
-      "intra_message_sep": "<|end|>",
-      "stop_token_ids": [
-        0,
-        49155
-      ]
-    }
+    ]
   },
   {
     "version": 1,
@@ -2984,6 +2313,46 @@
         ],
         "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
       },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "neuralmagic/Qwen2-0.5B-Instruct-FP8"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "neuralmagic/Qwen2-0.5B-Instruct-FP8"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "neuralmagic/Qwen2-1.5B-Instruct-FP8"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "neuralmagic/Qwen2-7B-Instruct-FP8"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "fp8"
+        ],
+        "model_id": "neuralmagic/Qwen2-72B-Instruct-FP8"
+      },
       {
         "model_format": "mlx",
         "model_size_in_billions": "0_5",
@@ -3211,33 +2580,6 @@
       ]
     }
   },
-  {
-    "version": 1,
-    "context_length": 8192,
-    "model_name": "starcoder",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "Starcoder is an open-source Transformer based LLM that is trained on permissively licensed data from GitHub.",
-    "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 16,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "TheBloke/starcoder-GGML",
-        "model_file_name_template": "starcoder.ggmlv3.{quantization}.bin"
-      }
-    ]
-  },
   {
     "version": 1,
     "context_length": 1024,
@@ -3254,242 +2596,12 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_5",
         "quantizations": [
-          "none"
-        ],
-        "model_id": "openai-community/gpt2",
-        "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
-      }
-    ]
-  },
-  {
-    "version": 1,
-    "context_length": 8192,
-    "model_name": "internlm-7b",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "InternLM is a Transformer-based LLM that is trained on both Chinese and English data, focusing on practical scenarios.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "internlm/internlm-7b",
-        "model_revision": "592b0efc83be3eb1cba8990c4caf41ce604b958c"
-      }
-    ]
-  },
-  {
-    "version": 1,
-    "context_length": 4096,
-    "model_name": "internlm-chat-7b",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "internlm/internlm-chat-7b",
-        "model_revision": "d4fa2dbcbd2fa4edfa6735aa2ba0f0577fed6a62"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "INTERNLM",
-      "system_prompt": "",
-      "roles": [
-        "<|User|>",
-        "<|Bot|>"
-      ],
-      "intra_message_sep": "<eoh>\n",
-      "inter_message_sep": "<eoa>\n",
-      "stop_token_ids": [
-        1,
-        103028
-      ],
-      "stop": [
-        "<eoa>"
-      ]
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 16384,
-    "model_name": "internlm-20b",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "generate"
-    ],
-    "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 20,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "internlm/internlm-20b",
-        "model_revision": "c56a72957239b490ea206ea857e86611b3f65f3a"
-      }
-    ]
-  },
-  {
-    "version": 1,
-    "context_length": 16384,
-    "model_name": "internlm-chat-20b",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 20,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "internlm/internlm-chat-20b",
-        "model_revision": "c67e80e42c4950ebae18a955c9fe138c5ceb5b10"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "INTERNLM",
-      "system_prompt": "",
-      "roles": [
-        "<|User|>",
-        "<|Bot|>"
-      ],
-      "intra_message_sep": "<eoh>\n",
-      "inter_message_sep": "<eoa>\n",
-      "stop_token_ids": [
-        1,
-        103028
-      ],
-      "stop": [
-        "<eoa>"
-      ]
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 4096,
-    "model_name": "vicuna-v1.5",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Vicuna is an open-source LLM trained by fine-tuning LLaMA on data collected from ShareGPT.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-7b-v1.5",
-        "model_revision": "de56c35b1763eaae20f4d60efd64af0a9091ebe5"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-13b-v1.5",
-        "model_revision": "3deb0106f72a3a433f0c6ea0cb978bdf14bcd3a6"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_TWO",
-      "system_prompt": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
-      "roles": [
-        "USER",
-        "ASSISTANT"
-      ],
-      "intra_message_sep": " ",
-      "inter_message_sep": "</s>"
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 16384,
-    "model_name": "vicuna-v1.5-16k",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "Vicuna-v1.5-16k is a special version of Vicuna-v1.5, with a context window of 16k tokens instead of 4k.",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-7b-v1.5-16k",
-        "model_revision": "9a93d7d11fac7f3f9074510b80092b53bc1a5bec"
-      },
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "lmsys/vicuna-13b-v1.5-16k",
-        "model_revision": "277697af19d4b267626ebc9f4e078d19a9a0fddf"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "ADD_COLON_TWO",
-      "system_prompt": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
-      "roles": [
-        "USER",
-        "ASSISTANT"
-      ],
-      "intra_message_sep": " ",
-      "inter_message_sep": "</s>"
-    }
+          "none"
+        ],
+        "model_id": "openai-community/gpt2",
+        "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
+      }
+    ]
   },
   {
     "version": 1,
@@ -5504,93 +4616,6 @@
       ]
     }
   },
-  {
-    "version": 1,
-    "context_length": 2048,
-    "model_name": "OpenBuddy",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_description": "OpenBuddy is a powerful open multilingual chatbot model aimed at global users.",
-    "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 13,
-        "quantizations": [
-          "Q2_K",
-          "Q3_K_S",
-          "Q3_K_M",
-          "Q3_K_L",
-          "Q4_0",
-          "Q4_1",
-          "Q4_K_S",
-          "Q4_K_M",
-          "Q5_0",
-          "Q5_1",
-          "Q5_K_S",
-          "Q5_K_M",
-          "Q6_K",
-          "Q8_0"
-        ],
-        "model_id": "TheBloke/OpenBuddy-Llama2-13B-v11.1-GGML",
-        "model_file_name_template": "openbuddy-llama2-13b-v11.1.ggmlv3.{quantization}.bin"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "INSTRUCTION",
-      "system_prompt": "You are a professional translator. Be faithful or accurate in translation. Make the translation readable or intelligible. Be elegant or natural in translation. Do not translate person's name. Do not add any additional text to the translation. Do not give me any comments or suggestions.\nUser:\n\n{0}\nAssistant:",
-      "roles": [
-        "User",
-        "Assistant"
-      ],
-      "intra_message_sep": "",
-      "inter_message_sep": ""
-    }
-  },
-  {
-    "version": 1,
-    "context_length": 16384,
-    "model_name": "glaive-coder",
-    "model_description": "A code model trained on a dataset of ~140k programming related problems and solutions generated from Glaive’s synthetic data generation platform.",
-    "model_lang": [
-      "en"
-    ],
-    "model_ability": [
-      "chat"
-    ],
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": 7,
-        "quantizations": [
-          "4-bit",
-          "8-bit",
-          "none"
-        ],
-        "model_id": "glaiveai/glaive-coder-7b",
-        "model_revision": "72a255a58480ef0713eed988312fe82f77f94f37"
-      }
-    ],
-    "prompt_style": {
-      "style_name": "LLAMA2",
-      "system_prompt": "<s>[INST] <<SYS>>\nWrite code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:\n<</SYS>>\n\n",
-      "roles": [
-        "[INST]",
-        "[/INST]"
-      ],
-      "intra_message_sep": " ",
-      "inter_message_sep": " </s><s>",
-      "stop_token_ids": [
-        2
-      ],
-      "stop": [
-        "</s>"
-      ]
-    }
-  },
   {
     "version": 1,
     "context_length": 100000,
@@ -6624,6 +5649,15 @@
     ],
     "model_description": "InternLM2.5 series of the InternLM model.",
     "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_8",
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2_5-1_8b-chat",
+        "model_revision": "4426f00b854561fa60d555d2b628064b56bcb758"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 7,
@@ -6633,6 +5667,15 @@
         "model_id": "internlm/internlm2_5-7b-chat",
         "model_revision": "9dc8536a922ab4954726aad1b37fa199004a291a"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2_5-20b-chat",
+        "model_revision": "ef17bde929761255fee76d95e2c25969ccd93b0d"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 7,
@@ -6642,6 +5685,23 @@
         "model_id": "ModelCloud/internlm-2.5-7b-chat-gptq-4bit",
         "model_revision": "2e2dda735c326544921a4035bbeb6c6e316a8254"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_8",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "internlm/internlm2_5-1_8b-chat-gguf",
+        "model_file_name_template": "internlm2_5-1_8b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -6659,6 +5719,23 @@
         "model_id": "internlm/internlm2_5-7b-chat-gguf",
         "model_file_name_template": "internlm2_5-7b-chat-{quantization}.gguf"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "internlm/internlm2_5-20b-chat-gguf",
+        "model_file_name_template": "internlm2_5-20b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "mlx",
         "model_size_in_billions": 7,
@@ -7142,6 +6219,16 @@
     ],
     "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
     "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-2b-it"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 9,
@@ -7162,6 +6249,23 @@
         ],
         "model_id": "google/gemma-2-27b-it"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "Q3_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "f32"
+        ],
+        "model_id": "bartowski/gemma-2-2b-it-GGUF",
+        "model_file_name_template": "gemma-2-2b-it-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 9,
@@ -7208,6 +6312,30 @@
         "model_id": "bartowski/gemma-2-27b-it-GGUF",
         "model_file_name_template": "gemma-2-27b-it-{quantization}.gguf"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-2b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-2b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-2b-it"
+      },
       {
         "model_format": "mlx",
         "model_size_in_billions": 9,
@@ -7955,32 +7083,195 @@
         "model_format": "pytorch",
         "model_size_in_billions": 2,
         "quantizations": [
-          "none"
+          "4-bit",
+          "8-bit",
+          "none"
         ],
         "model_id": "OpenGVLab/Mini-InternVL-Chat-2B-V1-5",
-        "model_revision": "ce3f67acff17281bacbf4b156f402a0580fb9605"
+        "model_revision": "ecbbd21dcf38caa74d925967b997167b0c7b3f47"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/Mini-InternVL-Chat-4B-V1-5",
+        "model_revision": "ce1559ddf9d87f5130aa5233b0e93b95e4e4161a"
       },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 26,
         "quantizations": [
-          "none"
+          "4-bit",
+          "8-bit",
+          "none"
         ],
         "model_id": "OpenGVLab/InternVL-Chat-V1-5",
-        "model_revision": "e822119e5806946ce128043023a73d715ecabf8d"
+        "model_revision": "9db32d9127cac0c85961e169d75da57a18a847b1"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNVL",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92543,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>",
+        "<|im_start|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internvl2",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "InternVL 2 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-1B",
+        "model_revision": "a9fc14aea824b6ea1d44f8778cad6b35512c4ce1"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-2B",
+        "model_revision": "422ad7c6335917bfb514958233955512338485a6"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OpenGVLab/InternVL2-2B-AWQ",
+        "model_revision": "701bc3fc098a8a3b686b3b4135cfb77202be89e0"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-4B",
+        "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+        "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-8B",
+        "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
       },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 26,
         "quantizations": [
-          "Int8"
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-26B",
+        "model_revision": "b9f3c7e6d575b0115e076a3ffc46fd20b7586899"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 26,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OpenGVLab/InternVL2-26B-AWQ",
+        "model_revision": "469e0019ffd251e22ff6501a5c2321964e86ef0d"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 40,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-40B",
+        "model_revision": "725a12063bb855c966e30a0617d0ccd9e870d772"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 40,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OpenGVLab/InternVL2-40B-AWQ",
+        "model_revision": "d92e140f6dfe8ea9679924c6a31898f42c4e1846"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 76,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "OpenGVLab/InternVL2-Llama3-76B",
+        "model_revision": "cf7914905f78e9e3560ddbd6f5dfc39becac494f"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 76,
+        "quantizations": [
+          "Int4"
         ],
-        "model_id": "OpenGVLab/InternVL-Chat-V1-5-{quantization}",
-        "model_revision": "acaaed06937c603ab04f084216ecb0268160f538"
+        "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
+        "model_revision": "1bc796bf80f2ebc7d6a14c15f55217a4600d50a4"
       }
     ],
     "prompt_style": {
-      "style_name": "INTERNLM2",
+      "style_name": "INTERNVL",
       "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
       "roles": [
         "<|im_start|>user",
@@ -7988,10 +7279,14 @@
       ],
       "intra_message_sep": "<|im_end|>",
      "stop_token_ids": [
+        2,
+        92543,
         92542
       ],
       "stop": [
-        "<|im_end|>"
+        "</s>",
+        "<|im_end|>",
+        "<|im_start|>"
       ]
     }
   },