xinference 0.16.3__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_compat.py +22 -2
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +148 -12
- xinference/client/restful/restful_client.py +47 -2
- xinference/constants.py +1 -0
- xinference/core/model.py +45 -15
- xinference/core/supervisor.py +8 -2
- xinference/core/utils.py +67 -2
- xinference/model/audio/__init__.py +12 -0
- xinference/model/audio/core.py +21 -4
- xinference/model/audio/fish_speech.py +70 -35
- xinference/model/audio/model_spec.json +81 -1
- xinference/model/audio/whisper_mlx.py +208 -0
- xinference/model/embedding/core.py +259 -4
- xinference/model/embedding/model_spec.json +1 -1
- xinference/model/embedding/model_spec_modelscope.json +1 -1
- xinference/model/image/stable_diffusion/core.py +5 -2
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +485 -6
- xinference/model/llm/llm_family_modelscope.json +519 -0
- xinference/model/llm/mlx/core.py +45 -3
- xinference/model/llm/sglang/core.py +1 -0
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/glm_edge_v.py +230 -0
- xinference/model/llm/utils.py +19 -0
- xinference/model/llm/vllm/core.py +84 -2
- xinference/model/rerank/core.py +11 -4
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +254 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +76 -11
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +32 -1
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
- xinference/thirdparty/fish_speech/tools/api.py +578 -75
- xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +393 -9
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +90 -29
- xinference/thirdparty/fish_speech/tools/post_api.py +37 -15
- xinference/thirdparty/fish_speech/tools/schema.py +187 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
- xinference/thirdparty/fish_speech/tools/webui.py +138 -75
- xinference/types.py +2 -1
- {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/METADATA +30 -6
- {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/RECORD +58 -63
- {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/WHEEL +1 -1
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/commons.py +0 -35
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/LICENSE +0 -0
- {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json (CHANGED)

@@ -5907,6 +5907,18 @@
     ],
     "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
     "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-0.5B",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": "1_5",
@@ -5919,6 +5931,18 @@
         "model_revision": "master",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "3",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-3B",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 7,
@@ -5930,6 +5954,30 @@
         "model_id": "qwen/Qwen2.5-Coder-7B",
         "model_revision": "master",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-14B",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-32B",
+        "model_revision": "master",
+        "model_hub": "modelscope"
       }
     ]
   },
@@ -5947,6 +5995,18 @@
     ],
     "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
     "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": "1_5",
@@ -5958,6 +6018,17 @@
         "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct",
         "model_revision": "master",
         "model_hub": "modelscope"
+      }, {
+        "model_format": "pytorch",
+        "model_size_in_billions": "3",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-3B-Instruct",
+        "model_revision": "master",
+        "model_hub": "modelscope"
       },
       {
         "model_format": "pytorch",
@@ -5971,6 +6042,63 @@
         "model_revision": "master",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-14B-Instruct",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-32B-Instruct",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-{quantization}",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-{quantization}",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-{quantization}",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 7,
@@ -5982,6 +6110,89 @@
         "model_revision": "master",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-{quantization}",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-AWQ",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-AWQ",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-AWQ",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+      },
+
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "1_5",
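The hunks above register the new Qwen2.5-Coder and Qwen2.5-Coder-Instruct checkpoints (0.5B, 3B, 14B and 32B, plus GPTQ and AWQ variants) for the ModelScope hub. As a rough illustration of how such a spec is consumed, the sketch below launches one of the newly added sizes through the RESTful client; the endpoint URL, the registered family name `qwen2.5-coder-instruct`, the engine choice, and the exact `chat()` signature are assumptions, not facts taken from this diff.

```python
# Hypothetical usage sketch: launch one of the newly added Qwen2.5-Coder-Instruct
# sizes through the xinference RESTful client. Endpoint, family name, engine and
# chat() shape are assumed; adjust to your deployment and xinference version.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")      # assumed local supervisor endpoint
model_uid = client.launch_model(
    model_name="qwen2.5-coder-instruct",      # assumed registered family name
    model_engine="transformers",              # assumed engine choice
    model_format="pytorch",
    model_size_in_billions=3,                 # one of the sizes added in this diff
    quantization="none",
)
model = client.get_model(model_uid)
# Newer clients take an OpenAI-style messages list; older ones take a prompt string.
print(model.chat(messages=[{"role": "user", "content": "Write a Python hello world."}]))
```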
@@ -6056,5 +6267,313 @@
       "<|im_start|>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "QwQ-32B-Preview",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/QwQ-32B-Preview",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Q3_K_L",
+          "Q4_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
+        "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-edge-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/glm-edge-1.5b-chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "4",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/glm-edge-4b-chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Q4_0",
+          "Q4_1",
+          "Q4_K",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_file_name_template": "ggml-model-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "F16"
+        ],
+        "model_file_name_template": "glm-edge-1.5B-chat-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "4",
+        "quantizations": [
+          "Q4_0",
+          "Q4_1",
+          "Q4_K",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_file_name_template": "ggml-model-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "4",
+        "quantizations": [
+          "F16"
+        ],
+        "model_file_name_template": "glm-edge-4B-chat-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+      }
+    ],
+    "chat_template": "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+    "stop_token_ids": [
+      59246,
+      59253,
+      59255
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|user|>",
+      "<|observation|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-edge-v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/glm-edge-v-2b",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/glm-edge-v-5b",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "2",
+        "quantizations": [
+          "Q4_0",
+          "Q4_1",
+          "Q4_K",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_file_name_template": "ggml-model-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "2",
+        "quantizations": [
+          "F16"
+        ],
+        "model_file_name_template": "glm-edge-v-2B-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "2",
+        "quantizations": [
+          "f16"
+        ],
+        "model_file_name_template": "mmproj-model-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "5",
+        "quantizations": [
+          "Q4_0",
+          "Q4_1",
+          "Q4_K",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_file_name_template": "ggml-model-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "5",
+        "quantizations": [
+          "F16"
+        ],
+        "model_file_name_template": "glm-edge-v-5B-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "5",
+        "quantizations": [
+          "f16"
+        ],
+        "model_file_name_template": "mmproj-model-{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+      }
+    ],
+    "chat_template": "{% for item in messages %}{% if item['role'] != 'system' %}<|{{ item['role'] }}|>\n{% for content in item['content'] %}{% if content['type'] == 'image' %}{% for _ in range(578) %}<|begin_of_image|>{% endfor %}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+    "stop_token_ids": [
+      59246,
+      59253,
+      59255
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|user|>",
+      "<|observation|>"
+    ]
   }
 ]
xinference/model/llm/mlx/core.py (CHANGED)
@@ -17,7 +17,8 @@ import platform
 import sys
 import time
 import uuid
-from
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union
 
 from ....fields import max_tokens_field
 from ....types import (
@@ -53,6 +54,14 @@ class MLXGenerateConfig(TypedDict, total=False):
     stream: bool
     stream_options: Optional[Union[dict, None]]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+
+
+@dataclass
+class PromptCache:
+    cache: List[Any] = field(default_factory=list)
+    model_key: Tuple[str, Optional[str]] = ("", None)
+    tokens: List[int] = field(default_factory=list)
 
 
 class MLXModel(LLM):
@@ -69,6 +78,8 @@ class MLXModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+        self._max_kv_size = None
+        self._prompt_cache = None
         if peft_model is not None:
             raise ValueError("MLX engine has not supported lora yet")
 
@@ -127,6 +138,9 @@ class MLXModel(LLM):
             logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
             mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)
 
+        self._max_kv_size = kwargs.get("max_kv_size", None)
+        self._prompt_cache = PromptCache()
+
         return load(
             self.model_path,
             tokenizer_config=tokenizer_config,
@@ -156,6 +170,27 @@ class MLXModel(LLM):
             return False
         return True
 
+    def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+        from mlx_lm.models.cache import make_prompt_cache
+
+        assert self._prompt_cache is not None
+        cache_len = len(self._prompt_cache.tokens)
+        model_key = (self.model_path, lora_name)
+        if (
+            self._prompt_cache.model_key != model_key
+            or cache_len >= len(prompt)
+            or self._prompt_cache.tokens != prompt[:cache_len]
+        ):
+            self._prompt_cache.model_key = model_key
+            self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+            self._prompt_cache.tokens = []
+            logger.debug("Making new prompt cache for %s", self.model_uid)
+        else:
+            prompt = prompt[cache_len:]
+            logger.debug("Cache hit for %s", self.model_uid)
+        self._prompt_cache.tokens.extend(prompt)
+        return prompt
+
     def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig):
         import mlx.core as mx
         from mlx_lm.utils import generate_step
@@ -167,6 +202,7 @@ class MLXModel(LLM):
         chunk_id = str(uuid.uuid4())
         stop_token_ids = kwargs.get("stop_token_ids", [])
         stream = kwargs.get("stream", False)
+        lora_name = kwargs.get("lora_name")
         stream_options = kwargs.pop("stream_options", None)
         include_usage = (
             stream_options["include_usage"]
@@ -174,12 +210,15 @@ class MLXModel(LLM):
             else False
         )
 
-
+        prompt_token_ids = tokenizer.encode(prompt)
+        prompt_token_ids = self._get_prompt_cache(prompt_token_ids, lora_name)
+        prompt_tokens = mx.array(prompt_token_ids)
         input_echo_len = len(prompt_tokens)
 
         i = 0
         start = time.time()
         output = ""
+        tokens = []
         for (token, _), i in zip(
             generate_step(
                 prompt_tokens,
@@ -188,10 +227,11 @@ class MLXModel(LLM):
                 repetition_penalty=kwargs["repetition_penalty"],
                 repetition_context_size=kwargs["repetition_context_size"],
                 top_p=kwargs["top_p"],
-
+                prompt_cache=self._prompt_cache.cache,  # type: ignore
             ),
             range(max_tokens),
         ):
+            tokens.append(token)
             if token == tokenizer.eos_token_id or token in stop_token_ids:  # type: ignore
                 break
 
@@ -230,6 +270,8 @@ class MLXModel(LLM):
             f"Average generation speed: {i / (time.time() - start):.2f} tokens/s."
         )
 
+        self._prompt_cache.tokens.extend(tokens)  # type: ignore
+
         if i == max_tokens - 1:
             finish_reason = "length"
         else:
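The MLX changes above introduce prompt caching: `PromptCache` holds the KV cache, the `(model_path, lora_name)` key, and the tokens it covers; `_get_prompt_cache` reuses the cache when the incoming prompt strictly extends the cached token sequence and rebuilds it with `make_prompt_cache` otherwise; after generation the emitted tokens are appended so the next turn can hit the cache. A dependency-free sketch of just that reuse rule (illustration only, not the MLX code path):

```python
# Minimal sketch of the prefix-reuse rule used by _get_prompt_cache above
# (pure Python, no MLX): the cache is reusable only if the model/LoRA key matches
# and the cached tokens are a strict prefix of the incoming prompt.
from typing import List, Optional, Tuple

Key = Tuple[str, Optional[str]]  # (model_path, lora_name)

def tokens_to_prefill(cached_tokens: List[int], cached_key: Key,
                      prompt: List[int], key: Key) -> List[int]:
    n = len(cached_tokens)
    if cached_key != key or n >= len(prompt) or cached_tokens != prompt[:n]:
        return prompt          # cache miss: the whole prompt must be prefilled
    return prompt[n:]          # cache hit: only the new suffix is prefilled

# A second turn that extends the first prompt only pays for the new tokens.
first = [1, 2, 3, 4]
second = [1, 2, 3, 4, 9, 10]
assert tokens_to_prefill([], ("m", None), first, ("m", None)) == first
assert tokens_to_prefill(first, ("m", None), second, ("m", None)) == [9, 10]
```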
|