xinference 0.14.1.post1__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +44 -9
- xinference/core/model.py +4 -4
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +1 -1
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/llm/__init__.py +20 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +448 -1153
- xinference/model/llm/llm_family.py +14 -139
- xinference/model/llm/llm_family_modelscope.json +230 -313
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/core.py +2 -10
- xinference/model/llm/transformers/intern_vl.py +457 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/utils.py +76 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
- xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/METADATA +5 -8
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
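The rename list above also captures a package restructure: the `pytorch` backend package becomes `transformers`, and the `ggml` llama.cpp backend moves to `llama_cpp`. A minimal sketch of how downstream import paths shift, using only the module paths shown in the file list (class names inside these modules are not visible in this diff, so none are assumed):

# Sketch of the import-path change implied by the renames above.
# Only module paths come from the file list; nothing else is assumed.

# 0.14.1.post1:
#   import xinference.model.llm.pytorch.core       # HF transformers backend
#   import xinference.model.llm.ggml.llamacpp      # llama.cpp backend

# 0.14.2:
import xinference.model.llm.transformers.core     # was ...llm.pytorch.core
import xinference.model.llm.llama_cpp.core        # was ...llm.ggml.llamacpp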
xinference/model/llm/llm_family_modelscope.json CHANGED

@@ -503,78 +503,6 @@
             }
         ]
     },
-    {
-        "version": 1,
-        "context_length": 8192,
-        "model_name": "chatglm2",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "4-bit",
-                    "8-bit",
-                    "none"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "ZhipuAI/chatglm2-6b",
-                "model_revision": "v1.0.12"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "CHATGLM",
-            "system_prompt": "",
-            "roles": [
-                "问",
-                "答"
-            ],
-            "intra_message_sep": "\n\n"
-        }
-    },
-    {
-        "version": 1,
-        "context_length": 32768,
-        "model_name": "chatglm2-32k",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "ChatGLM2-32k is a special version of ChatGLM2, with a context window of 32k tokens instead of 8k.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "4-bit",
-                    "8-bit",
-                    "none"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "ZhipuAI/chatglm2-6b-32k",
-                "model_revision": "v1.0.2"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "CHATGLM",
-            "system_prompt": "",
-            "roles": [
-                "问",
-                "答"
-            ],
-            "intra_message_sep": "\n\n"
-        }
-    },
     {
         "version": 1,
         "context_length": 8192,
@@ -1060,166 +988,60 @@
     },
     {
         "version": 1,
-        "context_length":
-        "model_name": "
+        "context_length": 32768,
+        "model_name": "internlm2.5-chat",
         "model_lang": [
             "en",
             "zh"
         ],
         "model_ability": [
-            "
+            "chat"
         ],
-        "model_description": "
+        "model_description": "InternLM2.5 series of the InternLM model.",
         "model_specs": [
             {
                 "model_format": "pytorch",
-                "model_size_in_billions":
+                "model_size_in_billions": "1_8",
                 "quantizations": [
-                    "4-bit",
-                    "8-bit",
                     "none"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "model_hub": "modelscope"
-
-            }
-        ]
-    },
-    {
-        "version": 1,
-        "context_length": 4096,
-        "model_name": "internlm-chat-7b",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting.",
-        "model_specs": [
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 7,
                 "quantizations": [
-                    "4-bit",
-                    "8-bit",
                     "none"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "model_hub": "modelscope"
-
-            }
-        ],
-        "prompt_style": {
-            "style_name": "INTERNLM",
-            "system_prompt": "",
-            "roles": [
-                "<|User|>",
-                "<|Bot|>"
-            ],
-            "intra_message_sep": "<eoh>\n",
-            "inter_message_sep": "<eoa>\n",
-            "stop_token_ids": [
-                1,
-                103028
-            ],
-            "stop": [
-                "<eoa>"
-            ]
-        }
-    },
-    {
-        "version": 1,
-        "context_length": 16384,
-        "model_name": "internlm-20b",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "generate"
-        ],
-        "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data.",
-        "model_specs": [
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+                "model_hub": "modelscope"
+            },
             {
-                "model_format": "
-                "model_size_in_billions":
+                "model_format": "ggufv2",
+                "model_size_in_billions": 7,
                 "quantizations": [
-                    "
-                    "
-                    "
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "
-                "
-            }
-        ]
-    },
-    {
-        "version": 1,
-        "context_length": 16384,
-        "model_name": "internlm-chat-20b",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training.",
-        "model_specs": [
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat-gguf",
+                "model_file_name_template": "internlm2_5-7b-chat-{quantization}.gguf",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 20,
                 "quantizations": [
-                    "4-bit",
-                    "8-bit",
                     "none"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "model_hub": "modelscope",
-                "model_revision": "v1.0.1"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "INTERNLM",
-            "system_prompt": "",
-            "roles": [
-                "<|User|>",
-                "<|Bot|>"
-            ],
-            "intra_message_sep": "<eoh>\n",
-            "inter_message_sep": "<eoa>\n",
-            "stop_token_ids": [
-                1,
-                103028
-            ],
-            "stop": [
-                "<eoa>"
-            ]
-        }
-    },
-    {
-        "version": 1,
-        "context_length": 32768,
-        "model_name": "internlm2.5-chat",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "InternLM2.5 series of the InternLM model.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 7,
-                "quantizations": [
-                    "none"
-                ],
-                "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-20b-chat",
                 "model_hub": "modelscope"
             }
         ],
@@ -2403,59 +2225,6 @@
             ]
         }
     },
-    {
-        "version": 1,
-        "context_length": 2048,
-        "model_name": "falcon-instruct",
-        "model_lang": [
-            "en"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 7,
-                "quantizations": [
-                    "4-bit",
-                    "8-bit",
-                    "none"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/falcon-7b-instruct",
-                "model_revision": "v1.0.0"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "FALCON",
-            "system_prompt": "",
-            "roles": [
-                "User",
-                "Assistant"
-            ],
-            "intra_message_sep": "\n",
-            "inter_message_sep": "<|endoftext|>",
-            "stop": [
-                "\nUser"
-            ],
-            "stop_token_ids": [
-                0,
-                1,
-                2,
-                3,
-                4,
-                5,
-                6,
-                7,
-                8,
-                9,
-                10,
-                11
-            ]
-        }
-    },
     {
         "version": 1,
         "context_length": 8192,
@@ -2540,53 +2309,6 @@
             ]
         }
     },
-    {
-        "version": 1,
-        "context_length": 2048,
-        "model_name": "OpenBuddy",
-        "model_lang": [
-            "en"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "OpenBuddy is a powerful open multilingual chatbot model aimed at global users.",
-        "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 13,
-                "quantizations": [
-                    "Q2_K",
-                    "Q3_K_S",
-                    "Q3_K_M",
-                    "Q3_K_L",
-                    "Q4_0",
-                    "Q4_1",
-                    "Q4_K_S",
-                    "Q4_K_M",
-                    "Q5_0",
-                    "Q5_1",
-                    "Q5_K_S",
-                    "Q5_K_M",
-                    "Q6_K",
-                    "Q8_0"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/OpenBuddy-Llama2-13B-v11.1-GGML",
-                "model_file_name_template": "openbuddy-llama2-13b-v11.1.ggmlv3.{quantization}.bin"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "INSTRUCTION",
-            "system_prompt": "You are a professional translator. Be faithful or accurate in translation. Make the translation readable or intelligible. Be elegant or natural in translation. Do not translate person's name. Do not add any additional text to the translation. Do not give me any comments or suggestions.\nUser:\n\n{0}\nAssistant:",
-            "roles": [
-                "User",
-                "Assistant"
-            ],
-            "intra_message_sep": "",
-            "inter_message_sep": ""
-        }
-    },
     {
         "version": 1,
         "context_length": 32768,
@@ -3416,6 +3138,24 @@
                 "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "fp8",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "fp8"
+                ],
+                "model_id": "liuzhenghua/Qwen2-7B-FP8-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "fp8",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "fp8"
+                ],
+                "model_id": "liuzhenghua/Qwen2-72B-FP8-Instruct",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "mlx",
                 "model_size_in_billions": "0_5",
@@ -4245,6 +3985,17 @@
         ],
         "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
         "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 2,
+                "quantizations": [
+                    "none",
+                    "4-bit",
+                    "8-bit"
+                ],
+                "model_id": "LLM-Research/gemma-2-2b-it",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 9,
@@ -4958,25 +4709,187 @@
                 "model_format": "pytorch",
                 "model_size_in_billions": 26,
                 "quantizations": [
-
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL-Chat-V1-5",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "INTERNVL",
+            "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+            "roles": [
+                "<|im_start|>user",
+                "<|im_start|>assistant"
+            ],
+            "intra_message_sep": "<|im_end|>",
+            "stop_token_ids": [
+                2,
+                92543,
+                92542
+            ],
+            "stop": [
+                "</s>",
+                "<|im_end|>",
+                "<|im_start|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "internvl2",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "InternVL 2 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+        "model_specs": [
+
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 1,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-1B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 2,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-2B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 2,
+                "quantizations": [
+                    "none"
                 ],
-
-                "model_id": "
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-2B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-4B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-8B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-26B",
                 "model_revision": "master"
             },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 26,
                 "quantizations": [
-
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-26B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 40,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-40B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 40,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-40B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 76,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-Llama3-76B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 76,
+                "quantizations": [
+                    "none"
                 ],
-
-                "model_id": "
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
                 "model_revision": "master"
             }
         ],
         "prompt_style": {
-            "style_name": "
+            "style_name": "INTERNVL",
             "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
             "roles": [
                 "<|im_start|>user",
@@ -4984,10 +4897,14 @@
             ],
             "intra_message_sep": "<|im_end|>",
             "stop_token_ids": [
+                2,
+                92543,
                 92542
             ],
             "stop": [
-                "
+                "</s>",
+                "<|im_end|>",
+                "<|im_start|>"
             ]
         }
     },
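The JSON hunks above drop the legacy ChatGLM2, InternLM, Falcon, and ggmlv3 OpenBuddy entries and register new modelscope specs: internlm2.5-chat (1.8B/7B/20B pytorch plus a 7B ggufv2 build), fp8 Qwen2 variants, gemma-2-2b-it, and the InternVL family. A minimal launch sketch for one of the new entries, assuming a running Xinference endpoint; the URL, the engine name, and the chat call are illustrative assumptions, not part of this diff:

from xinference.client import Client

# Endpoint URL is illustrative; point this at your running supervisor.
client = Client("http://localhost:9997")

# Launch the ggufv2 spec registered above; the quantization values
# ("q4_k_m", ...) come from the new "quantizations" list in the diff.
model_uid = client.launch_model(
    model_name="internlm2.5-chat",
    model_engine="llama.cpp",      # assumed engine name for gguf specs
    model_format="ggufv2",
    model_size_in_billions=7,
    quantization="q4_k_m",
)
model = client.get_model(model_uid)
print(model.chat("Briefly introduce InternLM2.5."))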
xinference/model/llm/memory.py CHANGED

@@ -61,7 +61,7 @@ class ModelMemInfo:
 
 QUANT_NORMALIZE = {"int4": "4-bit", "int8": "8-bit", "4-bit": "4-bit", "8-bit": "8-bit"}
 
-GGML_MULTI_FACTOR_DICT = {
+GGUF_MULTI_FACTOR_DICT = {
     "q4_0": 18,
     "q4_1": 20,
     "q5_0": 22,
@@ -70,14 +70,14 @@ GGML_MULTI_FACTOR_DICT = {
     "q8_1": 40,
 }
 
-
+GGUF_MULTI_FACTOR_DICT_64 = {
     "q6_K": 54.0,
     "q3": 26.0,
     "q4": 38.0,
     "q5": 46.0,
 }
 
-
+GGUF_MULTI_FACTOR_DICT_COMBINE = {
     "q3_K_L": [38.0, 26.0],
     "q3_K_M": [46.0, 26.0],
     "q4_K_S": [46.0, 38.0],
@@ -136,9 +136,9 @@ def estimate_llm_gpu_memory_details(
     else:
         kv_dtype_size = 4
     overhead = 650.0
-    if model_format == "
+    if model_format == "ggufv2":
         assert quantization is not None and quantization != "none"
-        model_size_in_mb =
+        model_size_in_mb = _compute_model_size_gguf(info, quantization)
         inference_mem = float(
             context_length * kv_dtype_size * info.hidden_dim * info.num_layers
         )
@@ -291,7 +291,7 @@ def _compute_inference_only_activation_memory(
     return ret
 
 
-def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
+def _compute_model_size_gguf(info: ModelLayersInfo, quantization: str) -> float:
     assert quantization is not None
     vocab_size = info.vocab_size
     num_layers = info.num_layers
@@ -310,13 +310,13 @@ def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
     )
 
     total = 0.0
-    v1 =
+    v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
     if v1 is not None:
         total = (v1 * total_params) / (32 * 1024 * 1024)
-    v2 =
+    v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
     if v2 is not None:
         total = (v2 * total_params) / (64 * 1024 * 1024)
-    v3 =
+    v3 = GGUF_MULTI_FACTOR_DICT_COMBINE.get(quantization)
     if v3 is not None:
         factors = v3
         if quantization == "q2_K":
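The memory.py hunk is a pure rename from GGML_* to GGUF_* terminology; the per-quantization byte factors are unchanged. A self-contained sketch of the size arithmetic those tables encode, with factor values copied from the diff and a 7B parameter count as a made-up example input:

# Sketch of the GGUF size estimate behind _compute_model_size_gguf, reduced
# to the arithmetic visible in this diff. The factors are bytes per weight
# block (e.g. q4_0 stores a 32-weight block in 18 bytes); total_params is a
# toy input, not taken from the diff.
GGUF_MULTI_FACTOR_DICT = {"q4_0": 18, "q4_1": 20, "q5_0": 22}                   # per 32 weights
GGUF_MULTI_FACTOR_DICT_64 = {"q6_K": 54.0, "q3": 26.0, "q4": 38.0, "q5": 46.0}  # per 64 weights

def estimate_size_mb(total_params: float, quantization: str) -> float:
    """Approximate model size in MB for a GGUF quantization."""
    v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
    if v1 is not None:
        return (v1 * total_params) / (32 * 1024 * 1024)   # bytes per 32-weight block -> MB
    v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
    if v2 is not None:
        return (v2 * total_params) / (64 * 1024 * 1024)   # bytes per 64-weight block -> MB
    raise ValueError(f"unsupported quantization: {quantization}")

print(round(estimate_size_mb(7e9, "q4_0")))   # ~3755 MB for a 7B model at q4_0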