xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +34 -15
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +40 -18
- xinference/core/supervisor.py +48 -9
- xinference/core/worker.py +13 -8
- xinference/deploy/cmdline.py +22 -9
- xinference/model/audio/__init__.py +40 -1
- xinference/model/audio/core.py +25 -45
- xinference/model/audio/custom.py +148 -0
- xinference/model/core.py +6 -9
- xinference/model/embedding/core.py +1 -2
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/image/core.py +12 -4
- xinference/model/image/stable_diffusion/core.py +8 -7
- xinference/model/llm/__init__.py +0 -6
- xinference/model/llm/core.py +9 -14
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +507 -7
- xinference/model/llm/llm_family.py +41 -4
- xinference/model/llm/llm_family_modelscope.json +260 -0
- xinference/model/llm/pytorch/baichuan.py +4 -3
- xinference/model/llm/pytorch/chatglm.py +5 -2
- xinference/model/llm/pytorch/core.py +37 -41
- xinference/model/llm/pytorch/falcon.py +6 -5
- xinference/model/llm/pytorch/internlm2.py +5 -2
- xinference/model/llm/pytorch/llama_2.py +6 -5
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/vicuna.py +4 -3
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +42 -4
- xinference/model/llm/vllm/core.py +54 -6
- xinference/model/rerank/core.py +26 -12
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +25 -1
- xinference/model/utils.py +12 -1
- xinference/thirdparty/omnilmm/chat.py +1 -1
- xinference/types.py +70 -19
- xinference/utils.py +1 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.98516614.js +0 -3
- xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -1825,6 +1825,17 @@
                 "model_id": "qwen/Qwen1.5-14B-Chat",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen1.5-32B-Chat",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 72,
@@ -1886,6 +1897,15 @@
                 "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "gptq",
                 "model_size_in_billions": 72,
@@ -1941,6 +1961,15 @@
                 "model_id": "qwen/Qwen1.5-14B-Chat-AWQ",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen1.5-32B-Chat-AWQ",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "awq",
                 "model_size_in_billions": 72,
@@ -2035,6 +2064,23 @@
                 "model_hub": "modelscope",
                 "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
             },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0"
+                ],
+                "model_id": "qwen/Qwen1.5-32B-Chat-GGUF",
+                "model_hub": "modelscope",
+                "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+            },
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": 72,
@@ -2075,6 +2121,131 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen1.5-moe-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "2_7",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "2_7",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 65536,
+        "model_name": "codeqwen1.5-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+        "model_specs": [
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
+                "model_hub": "modelscope",
+                "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B-Chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 4096,
@@ -2945,5 +3116,94 @@
                 "</s>"
             ]
         }
+    },
+    {
+        "version": 1,
+        "context_length": 131072,
+        "model_name": "c4ai-command-r-v01",
+        "model_lang": [
+            "en",
+            "fr",
+            "de",
+            "es",
+            "it",
+            "pt",
+            "ja",
+            "ko",
+            "zh",
+            "ar"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 35,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/c4ai-command-r-v01",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 35,
+                "quantizations": [
+                    "Q2_K",
+                    "Q4_K_M",
+                    "Q5_K_M"
+                ],
+                "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
+                "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 104,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/c4ai-command-r-plus",
+                "model_revision": "master"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 131072,
+        "model_name": "c4ai-command-r-v01-4bit",
+        "model_lang": [
+            "en",
+            "fr",
+            "de",
+            "es",
+            "it",
+            "pt",
+            "ja",
+            "ko",
+            "zh",
+            "ar"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 35,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "mirror013/c4ai-command-r-v01-4bit",
+                "model_revision": "master"
+            }
+        ]
+    }
     }
 ]
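Once these ModelScope specs ship, the new variants are launchable like any other built-in model. A hedged usage sketch against the RESTful client of this release series; the endpoint and the prompt are placeholders, not values from the diff:

from xinference.client import RESTfulClient

# Placeholder endpoint for a locally running xinference supervisor.
client = RESTfulClient("http://127.0.0.1:9997")
# Launch the newly added Qwen1.5 32B GPTQ spec.
model_uid = client.launch_model(
    model_name="qwen1.5-chat",
    model_format="gptq",
    model_size_in_billions=32,
    quantization="Int4",
)
model = client.get_model(model_uid)
print(model.chat("Hello"))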
xinference/model/llm/pytorch/baichuan.py

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
 
@@ -27,7 +28,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/chatglm.py

@@ -24,6 +24,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -39,7 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -48,7 +49,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
@@ -135,6 +136,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         chat_history = [h for h in chat_history if not h.get("tool_calls")]
         if not chat_history:
             chat_history = []
+        if system_prompt:
+            chat_history.append({"role": "system", "content": system_prompt})
         if tools:
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
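The ChatGLM change above threads a caller-supplied system prompt into the history list handed to the model. A standalone sketch of the resulting message shape; the sample messages are hypothetical, not from the package:

# Mirror of the appended-system-message logic from the hunk above.
chat_history = [{"role": "user", "content": "Bonjour"}]
system_prompt = "Always answer in French."

if not chat_history:
    chat_history = []
if system_prompt:
    chat_history.append({"role": "system", "content": system_prompt})

# The system prompt travels as one more message in the history list.
assert chat_history == [
    {"role": "user", "content": "Bonjour"},
    {"role": "system", "content": "Always answer in French."},
]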
xinference/model/llm/pytorch/core.py

@@ -32,6 +32,7 @@ from ....types import (
     Embedding,
     EmbeddingData,
     EmbeddingUsage,
+    LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
 )
@@ -42,6 +43,25 @@ from ..utils import ChatModelMixin
 
 logger = logging.getLogger(__name__)
 
+NON_DEFAULT_MODEL_LIST: List[str] = [
+    "baichuan-chat",
+    "baichuan-2-chat",
+    "vicuna-v1.3",
+    "falcon",
+    "falcon-instruct",
+    "chatglm",
+    "chatglm2",
+    "chatglm2-32k",
+    "chatglm2-128k",
+    "llama-2",
+    "llama-2-chat",
+    "internlm2-chat",
+    "qwen-vl-chat",
+    "OmniLMM",
+    "yi-vl-chat",
+    "deepseek-vl-chat",
+]
+
 
 class PytorchModel(LLM):
     def __init__(
@@ -52,14 +72,14 @@
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
            pytorch_model_config
         )
-        self._peft_model_path = peft_model_path
+        self._peft_model = peft_model
 
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -115,7 +135,7 @@
         return model, tokenizer
 
     def _apply_lora(self):
-        if self._peft_model_path is not None:
+        if self._peft_model is not None:
             try:
                 from peft import PeftModel
             except ImportError:
@@ -123,14 +143,15 @@
                     f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
                 )
 
-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                self._peft_model_path,
-            )
-            logger.info(
-                f"PEFT adaptor successfully loaded for model '{self.model_uid}'."
-            )
+            for peft_model in self._peft_model:
+                # Apply LoRA
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                )
+                logger.info(
+                    f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
+                )
 
     def load(self):
         try:
@@ -233,17 +254,7 @@
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
-        if model_family in [
-            "baichuan-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-        ]:
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -412,7 +423,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -421,7 +432,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
-            peft_model_path,
+            peft_model,
         )
 
     def _sanitize_generate_config(
@@ -452,23 +463,8 @@
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
-        if llm_family.model_name in [
-            "baichuan-chat",
-            "baichuan-2-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-            "internlm2-chat",
-            "qwen-vl-chat",
-            "OmniLMM",
-            "yi-vl-chat",
-            "deepseek-vl-chat",
-        ]:
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "chat" not in llm_family.model_ability:
             return False
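The core change replaces the single peft_model_path string with a list of LoRA records (the LoRA type is added to xinference/types.py in this release), so several adapters can be stacked onto the base model during load(). A minimal sketch of building that list, assuming LoRA accepts keyword arguments for the two fields the loop above reads (lora_name and local_path); the adapter names and paths are placeholders:

from xinference.types import LoRA

# Placeholder adapter records; _apply_lora() iterates this list and wraps the
# base model with peft.PeftModel.from_pretrained() once per entry, in order.
peft_model = [
    LoRA(lora_name="summarize-lora", local_path="/models/lora/summarize"),
    LoRA(lora_name="sql-lora", local_path="/models/lora/sql"),
]
# The list is passed through PytorchModel.__init__(..., peft_model=peft_model).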
xinference/model/llm/pytorch/falcon.py

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
 
@@ -27,7 +28,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
@@ -86,7 +87,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -95,7 +96,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py

@@ -23,6 +23,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -38,7 +39,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,7 +48,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
@@ -114,6 +115,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             ]
         else:
             input_history = []
+        if system_prompt:
+            kwargs["meta_instruction"] = system_prompt
         if stream:
 
             def _stream_generator():
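The InternLM2 change forwards a caller-supplied system prompt through the meta_instruction keyword of the model's chat API. A hedged sketch of the equivalent direct call, assuming the upstream InternLM2 remote-code interface; model and tokenizer loading is abbreviated and the prompt text is a placeholder:

from transformers import AutoModelForCausalLM, AutoTokenizer

# InternLM2's chat()/stream_chat() are provided by the model's remote code.
tokenizer = AutoTokenizer.from_pretrained(
    "internlm/internlm2-chat-7b", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "internlm/internlm2-chat-7b", trust_remote_code=True
)
response, history = model.chat(
    tokenizer,
    "What is xinference?",
    history=[],
    meta_instruction="You are a concise technical assistant.",  # the system prompt
)
print(response)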
xinference/model/llm/pytorch/llama_2.py

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
 
@@ -27,7 +28,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
@@ -69,8 +70,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -78,7 +79,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/qwen_vl.py

@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
 
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/pytorch/vicuna.py

@@ -26,8 +26,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from .. import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
 
@@ -41,7 +42,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -50,7 +51,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
 