xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +25 -6
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +8 -2
- xinference/core/supervisor.py +16 -0
- xinference/model/embedding/core.py +1 -2
- xinference/model/llm/__init__.py +0 -6
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +244 -7
- xinference/model/llm/llm_family.py +15 -0
- xinference/model/llm/llm_family_modelscope.json +100 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +22 -28
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +42 -4
- xinference/model/llm/vllm/core.py +51 -6
- xinference/model/rerank/core.py +3 -0
- xinference/thirdparty/omnilmm/chat.py +1 -1
- xinference/types.py +15 -19
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/METADATA +10 -10
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/RECORD +50 -56
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.98516614.js +0 -3
- xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -1825,6 +1825,17 @@
         "model_id": "qwen/Qwen1.5-14B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 72,

@@ -1886,6 +1897,15 @@
         "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 72,

@@ -1941,6 +1961,15 @@
         "model_id": "qwen/Qwen1.5-14B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": 72,

@@ -2035,6 +2064,23 @@
         "model_hub": "modelscope",
         "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat-GGUF",
+        "model_hub": "modelscope",
+        "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 72,

@@ -2075,6 +2121,60 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
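The entries above make the Qwen1.5 32B chat variants (pytorch, GPTQ, AWQ, GGUF) and Qwen1.5-MoE-A2.7B-Chat resolvable from ModelScope. A minimal usage sketch, assuming a running local supervisor at 127.0.0.1:9997 and models sourced from ModelScope (e.g. XINFERENCE_MODEL_SRC=modelscope); the endpoint and launch parameters here are illustrative, not taken from this diff:

```python
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# Launch the newly added 32B spec; quantization "none" should resolve to the
# "qwen/Qwen1.5-32B-Chat" ModelScope repo registered above.
model_uid = client.launch_model(
    model_name="qwen1.5-chat",
    model_format="pytorch",
    model_size_in_billions=32,
    quantization="none",
)

model = client.get_model(model_uid)
print(model.chat("Briefly introduce Qwen1.5."))
```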
xinference/model/llm/pytorch/chatglm.py
CHANGED

@@ -135,6 +135,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         chat_history = [h for h in chat_history if not h.get("tool_calls")]
         if not chat_history:
             chat_history = []
+        if system_prompt:
+            chat_history.append({"role": "system", "content": system_prompt})
         if tools:
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
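The two added lines make a caller-supplied system prompt reach ChatGLM's native chat API instead of being dropped. A minimal sketch of the history the patched code now builds (values are illustrative):

```python
# Sketch only: mirrors the appended system message from the diff above.
system_prompt = "You are a helpful assistant."
chat_history = []  # tool_call messages already filtered out

if system_prompt:
    chat_history.append({"role": "system", "content": system_prompt})

# chat_history is then handed to self._model.chat(tokenizer, prompt, ...),
# so the model sees the system role explicitly.
print(chat_history)
```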
xinference/model/llm/pytorch/core.py
CHANGED

@@ -42,6 +42,25 @@ from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

+NON_DEFAULT_MODEL_LIST: List[str] = [
+    "baichuan-chat",
+    "baichuan-2-chat",
+    "vicuna-v1.3",
+    "falcon",
+    "falcon-instruct",
+    "chatglm",
+    "chatglm2",
+    "chatglm2-32k",
+    "chatglm2-128k",
+    "llama-2",
+    "llama-2-chat",
+    "internlm2-chat",
+    "qwen-vl-chat",
+    "OmniLMM",
+    "yi-vl-chat",
+    "deepseek-vl-chat",
+]
+

 class PytorchModel(LLM):
     def __init__(

@@ -233,17 +252,7 @@ class PytorchModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
-        if model_family in [
-            "baichuan-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-        ]:
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "generate" not in llm_family.model_ability:
             return False

@@ -452,23 +461,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
-
-
-            "baichuan-2-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-            "internlm2-chat",
-            "qwen-vl-chat",
-            "OmniLMM",
-            "yi-vl-chat",
-            "deepseek-vl-chat",
-        ]:
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "chat" not in llm_family.model_ability:
             return False
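Both match() overrides now consult the shared NON_DEFAULT_MODEL_LIST instead of two diverging inline lists. An illustrative paraphrase of the gate (not the library code itself; the real list lives in xinference/model/llm/pytorch/core.py):

```python
# Families with a dedicated subclass (vision models, chatglm, ...) opt out of
# the generic PytorchModel / PytorchChatModel implementations.
NON_DEFAULT_MODEL_LIST = ["baichuan-chat", "qwen-vl-chat", "yi-vl-chat"]  # excerpt

def default_pytorch_engine_handles(model_family: str, abilities: list) -> bool:
    if model_family in NON_DEFAULT_MODEL_LIST:
        return False
    return "chat" in abilities

print(default_pytorch_engine_handles("qwen1.5-chat", ["generate", "chat"]))  # True
print(default_pytorch_engine_handles("qwen-vl-chat", ["chat", "vision"]))    # False
```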
xinference/model/llm/pytorch/qwen_vl.py
CHANGED

@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):

         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device

         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/pytorch/yi_vl.py
CHANGED

@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):

         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device

         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.

@@ -190,7 +192,7 @@ class YiVLChatModel(PytorchChatModel):
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self.
+            .to(self._model.device)
         )

         images = state.get_images(return_pil=True)

@@ -215,7 +217,7 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self.
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
xinference/model/llm/utils.py
CHANGED

@@ -163,7 +163,7 @@ class ChatModelMixin:

             for i, message in enumerate(chat_history):
                 role = get_role(message["role"])
-                content = message
+                content = message.get("content")
                 tool_calls = message.get("tool_calls")
                 if tool_calls:
                     content = tool_calls[0]["function"]

@@ -248,7 +248,7 @@ Begin!"""
             ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
             for message in chat_history:
                 role = get_role(message["role"])
-                content = message
+                content = message.get("content")

                 ret += prompt_style.intra_message_sep
                 if tools:

@@ -446,6 +446,11 @@ Begin!"""
                     "index": i,
                     "delta": {
                         "content": choice["text"],
+                        **(
+                            {"tool_calls": choice["tool_calls"]}
+                            if "tool_calls" in choice
+                            else {}
+                        ),
                     },
                     "finish_reason": choice["finish_reason"],
                 }

@@ -592,8 +597,7 @@ Begin!"""
         return text, None, None

     @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)

@@ -606,7 +610,41 @@ Begin!"""
                 f"Model {model_family.model_name} is not support tool calls."
             )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
+
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".

+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
         if func:
             m = {
                 "role": "assistant",
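The new _tools_token_filter classmethod returns a per-stream closure. A self-contained paraphrase of that closure (not an import of the library code), showing why tokens from Qwen's ReAct scratchpad are suppressed until "\nFinal Answer:" has been seen, after which everything passes through:

```python
# Paraphrase of the closure added above; `found` is captured per stream.
def make_qwen_final_answer_filter():
    found = False

    def process_token(tokens: str) -> bool:
        nonlocal found
        if found:                           # once the marker was seen, emit everything
            return True
        if tokens.endswith("\nFinal Answer:"):
            found = True                    # the marker chunk itself is still skipped
        return False

    return process_token


flt = make_qwen_final_answer_filter()
print(flt("Thought: no tool needed"))                    # False -> chunk skipped
print(flt("Thought: no tool needed\nFinal Answer:"))     # False -> marker skipped
print(flt("Thought: no tool needed\nFinal Answer: 42"))  # True  -> delta emitted
```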
xinference/model/llm/vllm/core.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import json
 import logging
 import multiprocessing
 import time

@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1

@@ -80,7 +83,15 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_MODELS = [
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",

@@ -90,16 +101,16 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]

@@ -113,6 +124,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")

+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+

 class VLLMModel(LLM):
     def __init__(

@@ -293,6 +307,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams

@@ -319,16 +334,46 @@ class VLLMModel(LLM):

         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs

@@ -416,7 +461,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat"
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]

@@ -429,7 +474,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)

         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
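When the final streamed chunk resolves to a function call, the branch above rewrites the first choice in place. A sketch of the resulting choice shape; the field names follow the diff, while the tool name and arguments are made-up examples:

```python
import json
import uuid

args = {"location": "Hangzhou"}          # hypothetical tool arguments
choice = {
    "text": None,                        # content is cleared for tool calls
    "index": 0,
    "finish_reason": "tool_calls",
    "tool_calls": [
        {
            "id": str(uuid.uuid4()),
            "type": "function",
            "function": {
                "name": "get_weather",   # hypothetical tool name
                "arguments": json.dumps(args, ensure_ascii=False),
            },
        }
    ],
}
print(choice["tool_calls"][0]["function"]["arguments"])
```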
xinference/model/rerank/core.py
CHANGED

@@ -134,8 +134,11 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
xinference/thirdparty/omnilmm/chat.py
CHANGED

@@ -207,7 +207,7 @@ class OmniLMM3B:

 class OmniLMMChat:
     def __init__(self, model_path, device_map) -> None:
-        if "
+        if "12b" in model_path:
             self.model = OmniLMM12B(model_path, device_map)
         else:
             self.model = OmniLMM3B(model_path, device_map)
xinference/types.py
CHANGED

@@ -91,11 +91,23 @@ class CompletionLogprobs(TypedDict):
     top_logprobs: List[Optional[Dict[str, float]]]


+class ToolCallFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ToolCalls(TypedDict):
+    id: str
+    type: Literal["function"]
+    function: ToolCallFunction
+
+
 class CompletionChoice(TypedDict):
     text: str
     index: int
     logprobs: Optional[CompletionLogprobs]
     finish_reason: Optional[str]
+    tool_calls: NotRequired[List[ToolCalls]]


 class CompletionUsage(TypedDict):

@@ -147,6 +159,7 @@ class ChatCompletion(TypedDict):
 class ChatCompletionChunkDelta(TypedDict):
     role: NotRequired[str]
     content: NotRequired[str]
+    tool_calls: NotRequired[List[ToolCalls]]


 class ChatCompletionChunkChoice(TypedDict):

@@ -232,6 +245,8 @@ class LlamaCppModelConfig(TypedDict, total=False):
     n_ctx: int
     n_parts: int
     n_gpu_layers: int
+    split_mode: int
+    main_gpu: int
     seed: int
     f16_kv: bool
     logits_all: bool

@@ -355,21 +370,6 @@ try:
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")

-CreateCompletionCTransformers: BaseModel
-try:
-    from ctransformers.llm import LLM
-
-    CreateCompletionCTransformers = get_pydantic_model_from_method(
-        LLM.generate,
-        exclude_fields=["tokens"],
-        include_fields={
-            "max_tokens": (Optional[int], max_tokens_field),
-            "stream": (Optional[bool], stream_field),
-        },
-    )
-except ImportError:
-    CreateCompletionCTransformers = create_model("CreateCompletionCTransformers")
-

 # This type is for openai API compatibility
 CreateCompletionOpenAI: BaseModel

@@ -415,7 +415,6 @@ class CreateCompletion(
     ModelAndPrompt,
     CreateCompletionTorch,
     CreateCompletionLlamaCpp,
-    CreateCompletionCTransformers,
     CreateCompletionOpenAI,
 ):
     pass

@@ -428,8 +427,6 @@ class CreateChatModel(BaseModel):
 # Currently, chat calls generates, so the params share the same one.
 CreateChatCompletionTorch = CreateCompletionTorch
 CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp
-CreateChatCompletionCTransformers: BaseModel = CreateCompletionCTransformers
-

 # This type is for openai API compatibility
 CreateChatCompletionOpenAI: BaseModel

@@ -450,7 +447,6 @@ class CreateChatCompletion(
     CreateChatModel,
     CreateChatCompletionTorch,
     CreateChatCompletionLlamaCpp,
-    CreateChatCompletionCTransformers,
     CreateChatCompletionOpenAI,
 ):
     pass
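ToolCallFunction and ToolCalls are plain TypedDicts, so a tool call is just a dict that type-checks against them. A standalone sketch (the class definitions are restated here so the snippet runs without importing xinference; the id and function payload are illustrative):

```python
from typing import Literal, TypedDict


class ToolCallFunction(TypedDict):
    name: str
    arguments: str


class ToolCalls(TypedDict):
    id: str
    type: Literal["function"]
    function: ToolCallFunction


call: ToolCalls = {
    "id": "call-0",  # illustrative id
    "type": "function",
    "function": {"name": "search", "arguments": '{"query": "xinference"}'},
}
print(call["function"]["name"])
```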
xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.98516614.js",
+    "main.js": "./static/js/main.76ef2b17.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.98516614.js.map": "./static/js/main.98516614.js.map"
+    "main.76ef2b17.js.map": "./static/js/main.76ef2b17.js.map"
   },
   "entrypoints": [
-    "static/js/main.98516614.js"
+    "static/js/main.76ef2b17.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.98516614.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.76ef2b17.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>