xinference 0.14.1.post1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/METADATA +5 -8
  46. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -503,78 +503,6 @@
  }
  ]
  },
- {
- "version": 1,
- "context_length": 8192,
- "model_name": "chatglm2",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": 6,
- "quantizations": [
- "4-bit",
- "8-bit",
- "none"
- ],
- "model_hub": "modelscope",
- "model_id": "ZhipuAI/chatglm2-6b",
- "model_revision": "v1.0.12"
- }
- ],
- "prompt_style": {
- "style_name": "CHATGLM",
- "system_prompt": "",
- "roles": [
- "问",
- "答"
- ],
- "intra_message_sep": "\n\n"
- }
- },
- {
- "version": 1,
- "context_length": 32768,
- "model_name": "chatglm2-32k",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "ChatGLM2-32k is a special version of ChatGLM2, with a context window of 32k tokens instead of 8k.",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": 6,
- "quantizations": [
- "4-bit",
- "8-bit",
- "none"
- ],
- "model_hub": "modelscope",
- "model_id": "ZhipuAI/chatglm2-6b-32k",
- "model_revision": "v1.0.2"
- }
- ],
- "prompt_style": {
- "style_name": "CHATGLM",
- "system_prompt": "",
- "roles": [
- "问",
- "答"
- ],
- "intra_message_sep": "\n\n"
- }
- },
  {
  "version": 1,
  "context_length": 8192,
@@ -1060,166 +988,60 @@
  },
  {
  "version": 1,
- "context_length": 8192,
- "model_name": "internlm-7b",
+ "context_length": 32768,
+ "model_name": "internlm2.5-chat",
  "model_lang": [
  "en",
  "zh"
  ],
  "model_ability": [
- "generate"
+ "chat"
  ],
- "model_description": "InternLM is a Transformer-based LLM that is trained on both Chinese and English data, focusing on practical scenarios.",
+ "model_description": "InternLM2.5 series of the InternLM model.",
  "model_specs": [
  {
  "model_format": "pytorch",
- "model_size_in_billions": 7,
+ "model_size_in_billions": "1_8",
  "quantizations": [
- "4-bit",
- "8-bit",
  "none"
  ],
- "model_id": "Shanghai_AI_Laboratory/internlm-7b",
- "model_hub": "modelscope",
- "model_revision": "v1.0.1"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 4096,
- "model_name": "internlm-chat-7b",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting.",
- "model_specs": [
+ "model_id": "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 7,
  "quantizations": [
- "4-bit",
- "8-bit",
  "none"
  ],
- "model_id": "Shanghai_AI_Laboratory/internlm-chat-7b",
- "model_hub": "modelscope",
- "model_revision": "v1.0.1"
- }
- ],
- "prompt_style": {
- "style_name": "INTERNLM",
- "system_prompt": "",
- "roles": [
- "<|User|>",
- "<|Bot|>"
- ],
- "intra_message_sep": "<eoh>\n",
- "inter_message_sep": "<eoa>\n",
- "stop_token_ids": [
- 1,
- 103028
- ],
- "stop": [
- "<eoa>"
- ]
- }
- },
- {
- "version": 1,
- "context_length": 16384,
- "model_name": "internlm-20b",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "generate"
- ],
- "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data.",
- "model_specs": [
+ "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+ "model_hub": "modelscope"
+ },
  {
- "model_format": "pytorch",
- "model_size_in_billions": 20,
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
  "quantizations": [
- "4-bit",
- "8-bit",
- "none"
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
  ],
- "model_id": "Shanghai_AI_Laboratory/internlm-20b",
- "model_hub": "modelscope",
- "model_revision": "v1.0.1"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 16384,
- "model_name": "internlm-chat-20b",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training.",
- "model_specs": [
+ "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat-gguf",
+ "model_file_name_template": "internlm2_5-7b-chat-{quantization}.gguf",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 20,
  "quantizations": [
- "4-bit",
- "8-bit",
  "none"
  ],
- "model_id": "Shanghai_AI_Laboratory/internlm-chat-20b",
- "model_hub": "modelscope",
- "model_revision": "v1.0.1"
- }
- ],
- "prompt_style": {
- "style_name": "INTERNLM",
- "system_prompt": "",
- "roles": [
- "<|User|>",
- "<|Bot|>"
- ],
- "intra_message_sep": "<eoh>\n",
- "inter_message_sep": "<eoa>\n",
- "stop_token_ids": [
- 1,
- 103028
- ],
- "stop": [
- "<eoa>"
- ]
- }
- },
- {
- "version": 1,
- "context_length": 32768,
- "model_name": "internlm2.5-chat",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "InternLM2.5 series of the InternLM model.",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": 7,
- "quantizations": [
- "none"
- ],
- "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+ "model_id": "Shanghai_AI_Laboratory/internlm2_5-20b-chat",
  "model_hub": "modelscope"
  }
  ],
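
The ggufv2 spec added above is what makes the 7B InternLM2.5 chat model launchable as a GGUF download in this release. A minimal sketch of launching it through the Python client, assuming a running Xinference server at the default endpoint; the engine name and quantization choice are illustrative assumptions, not taken from the diff:

from xinference.client import Client

# Assumes a local Xinference server at the default endpoint.
client = Client("http://localhost:9997")

# "q4_k_m" is one of the quantizations listed in the new spec above;
# "llama.cpp" as the engine serving ggufv2 models is an assumption.
model_uid = client.launch_model(
    model_name="internlm2.5-chat",
    model_engine="llama.cpp",
    model_format="ggufv2",
    model_size_in_billions=7,
    quantization="q4_k_m",
)
chat_model = client.get_model(model_uid)
print(chat_model.chat("Hello!"))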
@@ -2403,59 +2225,6 @@
  ]
  }
  },
- {
- "version": 1,
- "context_length": 2048,
- "model_name": "falcon-instruct",
- "model_lang": [
- "en"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting.",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": 7,
- "quantizations": [
- "4-bit",
- "8-bit",
- "none"
- ],
- "model_hub": "modelscope",
- "model_id": "Xorbits/falcon-7b-instruct",
- "model_revision": "v1.0.0"
- }
- ],
- "prompt_style": {
- "style_name": "FALCON",
- "system_prompt": "",
- "roles": [
- "User",
- "Assistant"
- ],
- "intra_message_sep": "\n",
- "inter_message_sep": "<|endoftext|>",
- "stop": [
- "\nUser"
- ],
- "stop_token_ids": [
- 0,
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7,
- 8,
- 9,
- 10,
- 11
- ]
- }
- },
  {
  "version": 1,
  "context_length": 8192,
@@ -2540,53 +2309,6 @@
  ]
  }
  },
- {
- "version": 1,
- "context_length": 2048,
- "model_name": "OpenBuddy",
- "model_lang": [
- "en"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "OpenBuddy is a powerful open multilingual chatbot model aimed at global users.",
- "model_specs": [
- {
- "model_format": "ggmlv3",
- "model_size_in_billions": 13,
- "quantizations": [
- "Q2_K",
- "Q3_K_S",
- "Q3_K_M",
- "Q3_K_L",
- "Q4_0",
- "Q4_1",
- "Q4_K_S",
- "Q4_K_M",
- "Q5_0",
- "Q5_1",
- "Q5_K_S",
- "Q5_K_M",
- "Q6_K",
- "Q8_0"
- ],
- "model_hub": "modelscope",
- "model_id": "Xorbits/OpenBuddy-Llama2-13B-v11.1-GGML",
- "model_file_name_template": "openbuddy-llama2-13b-v11.1.ggmlv3.{quantization}.bin"
- }
- ],
- "prompt_style": {
- "style_name": "INSTRUCTION",
- "system_prompt": "You are a professional translator. Be faithful or accurate in translation. Make the translation readable or intelligible. Be elegant or natural in translation. Do not translate person's name. Do not add any additional text to the translation. Do not give me any comments or suggestions.\nUser:\n\n{0}\nAssistant:",
- "roles": [
- "User",
- "Assistant"
- ],
- "intra_message_sep": "",
- "inter_message_sep": ""
- }
- },
  {
  "version": 1,
  "context_length": 32768,
@@ -3416,6 +3138,24 @@
  "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
  "model_hub": "modelscope"
  },
+ {
+ "model_format": "fp8",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "fp8"
+ ],
+ "model_id": "liuzhenghua/Qwen2-7B-FP8-Instruct",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_format": "fp8",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "fp8"
+ ],
+ "model_id": "liuzhenghua/Qwen2-72B-FP8-Instruct",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "mlx",
  "model_size_in_billions": "0_5",
@@ -4245,6 +3985,17 @@
  ],
  "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
  "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 2,
+ "quantizations": [
+ "none",
+ "4-bit",
+ "8-bit"
+ ],
+ "model_id": "LLM-Research/gemma-2-2b-it",
+ "model_hub": "modelscope"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 9,
@@ -4958,25 +4709,187 @@
  "model_format": "pytorch",
  "model_size_in_billions": 26,
  "quantizations": [
- "none"
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL-Chat-V1-5",
+ "model_revision": "master"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "INTERNVL",
+ "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+ "roles": [
+ "<|im_start|>user",
+ "<|im_start|>assistant"
+ ],
+ "intra_message_sep": "<|im_end|>",
+ "stop_token_ids": [
+ 2,
+ 92543,
+ 92542
+ ],
+ "stop": [
+ "</s>",
+ "<|im_end|>",
+ "<|im_start|>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "internvl2",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "InternVL 2 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+ "model_specs": [
+
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 1,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-1B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 2,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-2B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 2,
+ "quantizations": [
+ "none"
  ],
- "model_hub": "modelscope",
- "model_id": "AI-ModelScope/InternVL-Chat-V1-5",
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-2B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 4,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-4B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 8,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-8B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 8,
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 26,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-26B",
  "model_revision": "master"
  },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 26,
  "quantizations": [
- "Int8"
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-26B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 40,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-40B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 40,
+ "quantizations": [
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-40B-AWQ",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 76,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-Llama3-76B",
+ "model_revision": "master"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 76,
+ "quantizations": [
+ "none"
  ],
- "model_hub": "modelscope",
- "model_id": "AI-ModelScope/InternVL-Chat-V1-5-{quantization}",
+ "model_hub": "modelscope",
+ "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
  "model_revision": "master"
  }
  ],
  "prompt_style": {
- "style_name": "INTERNLM2",
+ "style_name": "INTERNVL",
  "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
  "roles": [
  "<|im_start|>user",
@@ -4984,10 +4897,14 @@
  ],
  "intra_message_sep": "<|im_end|>",
  "stop_token_ids": [
+ 2,
+ 92543,
  92542
  ],
  "stop": [
- "<|im_end|>"
+ "</s>",
+ "<|im_end|>",
+ "<|im_start|>"
  ]
  }
  },
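
For orientation, the INTERNVL prompt_style above is ChatML-like. A rough sketch of the prompt shape these fields imply; the actual assembly lives in xinference/model/llm/utils.py, and the system-role tag used here is an assumption:

# Rough illustration only; the real rendering in xinference may differ.
SYSTEM = (
    "You are InternLM (书生·浦语), a helpful, honest, and harmless AI "
    "assistant developed by Shanghai AI Laboratory (上海人工智能实验室)."
)
ROLES = ("<|im_start|>user", "<|im_start|>assistant")
SEP = "<|im_end|>"  # intra_message_sep

def render(question: str) -> str:
    # Ends with an open assistant tag; generation halts on one of the
    # configured "stop" strings such as <|im_end|>.
    return (
        f"<|im_start|>system\n{SYSTEM}{SEP}"  # system tag is assumed
        f"{ROLES[0]}\n{question}{SEP}"
        f"{ROLES[1]}\n"
    )

print(render("Describe the image."))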
xinference/model/llm/memory.py

@@ -61,7 +61,7 @@ class ModelMemInfo:
 
  QUANT_NORMALIZE = {"int4": "4-bit", "int8": "8-bit", "4-bit": "4-bit", "8-bit": "8-bit"}
 
- GGML_MULTI_FACTOR_DICT = {
+ GGUF_MULTI_FACTOR_DICT = {
  "q4_0": 18,
  "q4_1": 20,
  "q5_0": 22,
@@ -70,14 +70,14 @@ GGML_MULTI_FACTOR_DICT = {
  "q8_1": 40,
  }
 
- GGML_MULTI_FACTOR_DICT_64 = {
+ GGUF_MULTI_FACTOR_DICT_64 = {
  "q6_K": 54.0,
  "q3": 26.0,
  "q4": 38.0,
  "q5": 46.0,
  }
 
- GGML_MULTI_FACTOR_DICT_COMBINE = {
+ GGUF_MULTI_FACTOR_DICT_COMBINE = {
  "q3_K_L": [38.0, 26.0],
  "q3_K_M": [46.0, 26.0],
  "q4_K_S": [46.0, 38.0],
@@ -136,9 +136,9 @@ def estimate_llm_gpu_memory_details(
  else:
  kv_dtype_size = 4
  overhead = 650.0
- if model_format == "ggmlv3":
+ if model_format == "ggufv2":
  assert quantization is not None and quantization != "none"
- model_size_in_mb = _compute_model_size_ggml(info, quantization)
+ model_size_in_mb = _compute_model_size_gguf(info, quantization)
  inference_mem = float(
  context_length * kv_dtype_size * info.hidden_dim * info.num_layers
  )
@@ -291,7 +291,7 @@ def _compute_inference_only_activation_memory(
  return ret
 
 
- def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
+ def _compute_model_size_gguf(info: ModelLayersInfo, quantization: str) -> float:
  assert quantization is not None
  vocab_size = info.vocab_size
  num_layers = info.num_layers
@@ -310,13 +310,13 @@ def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
  )
 
  total = 0.0
- v1 = GGML_MULTI_FACTOR_DICT.get(quantization)
+ v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
  if v1 is not None:
  total = (v1 * total_params) / (32 * 1024 * 1024)
- v2 = GGML_MULTI_FACTOR_DICT_64.get(quantization)
+ v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
  if v2 is not None:
  total = (v2 * total_params) / (64 * 1024 * 1024)
- v3 = GGML_MULTI_FACTOR_DICT_COMBINE.get(quantization)
+ v3 = GGUF_MULTI_FACTOR_DICT_COMBINE.get(quantization)
  if v3 is not None:
  factors = v3
  if quantization == "q2_K":
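
The renamed factor tables encode bytes per block of quantized weights: entries in GGUF_MULTI_FACTOR_DICT are bytes per 32-weight block (so q4_0 at 18 bytes per 32 weights is 4.5 bits per weight), and the _64 table uses 64-weight blocks, which is why the divisors are 32 * 1024 * 1024 and 64 * 1024 * 1024, yielding MiB. A standalone sketch of the same arithmetic; the 7e9 parameter count is an assumed round number for illustration:

# Standalone sketch of the block-size arithmetic in _compute_model_size_gguf;
# factors are bytes per block of weights (subset of the tables above).
GGUF_MULTI_FACTOR_DICT = {"q4_0": 18, "q4_1": 20, "q5_0": 22}  # per 32 weights
GGUF_MULTI_FACTOR_DICT_64 = {"q6_K": 54.0, "q4": 38.0}         # per 64 weights

def model_size_mb(total_params: float, quantization: str) -> float:
    v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
    if v1 is not None:
        # v1 bytes for every 32 weights, converted to MiB.
        return (v1 * total_params) / (32 * 1024 * 1024)
    v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
    if v2 is not None:
        return (v2 * total_params) / (64 * 1024 * 1024)
    raise ValueError(f"unknown quantization: {quantization}")

# An assumed 7e9-parameter model in q4_0: 18 / 32 = 0.5625 bytes per weight,
# about 3.9 GB of weights.
print(round(model_size_mb(7e9, "q4_0")))  # ~3755 MiB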