xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +25 -6
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +8 -2
  5. xinference/core/supervisor.py +16 -0
  6. xinference/model/embedding/core.py +1 -2
  7. xinference/model/llm/__init__.py +0 -6
  8. xinference/model/llm/ggml/llamacpp.py +2 -10
  9. xinference/model/llm/llm_family.json +244 -7
  10. xinference/model/llm/llm_family.py +15 -0
  11. xinference/model/llm/llm_family_modelscope.json +100 -0
  12. xinference/model/llm/pytorch/chatglm.py +2 -0
  13. xinference/model/llm/pytorch/core.py +22 -28
  14. xinference/model/llm/pytorch/internlm2.py +2 -0
  15. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  16. xinference/model/llm/pytorch/yi_vl.py +4 -2
  17. xinference/model/llm/utils.py +42 -4
  18. xinference/model/llm/vllm/core.py +51 -6
  19. xinference/model/rerank/core.py +3 -0
  20. xinference/thirdparty/omnilmm/chat.py +1 -1
  21. xinference/types.py +15 -19
  22. xinference/web/ui/build/asset-manifest.json +3 -3
  23. xinference/web/ui/build/index.html +1 -1
  24. xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
  25. xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  27. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
  44. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/METADATA +10 -10
  45. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/RECORD +50 -56
  46. xinference/model/llm/ggml/ctransformers.py +0 -281
  47. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  48. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  49. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  52. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
  73. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
  74. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
  75. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
  76. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -1825,6 +1825,17 @@
       "model_id": "qwen/Qwen1.5-14B-Chat",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 72,
@@ -1886,6 +1897,15 @@
       "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "gptq",
       "model_size_in_billions": 72,
@@ -1941,6 +1961,15 @@
       "model_id": "qwen/Qwen1.5-14B-Chat-AWQ",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "awq",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat-AWQ",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "awq",
       "model_size_in_billions": 72,
@@ -2035,6 +2064,23 @@
       "model_hub": "modelscope",
       "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
     },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "q2_k",
+        "q3_k_m",
+        "q4_0",
+        "q4_k_m",
+        "q5_0",
+        "q5_k_m",
+        "q6_k",
+        "q8_0"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat-GGUF",
+      "model_hub": "modelscope",
+      "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+    },
     {
       "model_format": "ggufv2",
       "model_size_in_billions": 72,
@@ -2075,6 +2121,60 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
xinference/model/llm/pytorch/chatglm.py CHANGED
@@ -135,6 +135,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             chat_history = [h for h in chat_history if not h.get("tool_calls")]
         if not chat_history:
             chat_history = []
+        if system_prompt:
+            chat_history.append({"role": "system", "content": system_prompt})
         if tools:
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
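A hedged sketch of exercising the new system-prompt handling from the client side; the endpoint and model uid below are hypothetical and assume a chatglm model is already launched:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")   # hypothetical endpoint
    model = client.get_model("my-chatglm3")    # hypothetical model uid
    # The system prompt is now prepended to chat_history as a "system" message.
    completion = model.chat(
        prompt="Summarize the latest release in one sentence.",
        system_prompt="You are a concise release-notes assistant.",
        generate_config={"max_tokens": 128},
    )
    print(completion["choices"][0]["message"]["content"])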
xinference/model/llm/pytorch/core.py CHANGED
@@ -42,6 +42,25 @@ from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

+NON_DEFAULT_MODEL_LIST: List[str] = [
+    "baichuan-chat",
+    "baichuan-2-chat",
+    "vicuna-v1.3",
+    "falcon",
+    "falcon-instruct",
+    "chatglm",
+    "chatglm2",
+    "chatglm2-32k",
+    "chatglm2-128k",
+    "llama-2",
+    "llama-2-chat",
+    "internlm2-chat",
+    "qwen-vl-chat",
+    "OmniLMM",
+    "yi-vl-chat",
+    "deepseek-vl-chat",
+]
+

 class PytorchModel(LLM):
     def __init__(
@@ -233,17 +252,7 @@ class PytorchModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
-        if model_family in [
-            "baichuan-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-        ]:
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -452,23 +461,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
-        if llm_family.model_name in [
-            "baichuan-chat",
-            "baichuan-2-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-            "internlm2-chat",
-            "qwen-vl-chat",
-            "OmniLMM",
-            "yi-vl-chat",
-            "deepseek-vl-chat",
-        ]:
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "chat" not in llm_family.model_ability:
             return False
xinference/model/llm/pytorch/internlm2.py CHANGED
@@ -114,6 +114,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             ]
         else:
             input_history = []
+        if system_prompt:
+            kwargs["meta_instruction"] = system_prompt
         if stream:

             def _stream_generator():
xinference/model/llm/pytorch/qwen_vl.py CHANGED
@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):

         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device

         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/pytorch/yi_vl.py CHANGED
@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):

         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device

         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self._device)
+            .to(self._model.device)
         )

         images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._device),
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
xinference/model/llm/utils.py CHANGED
@@ -163,7 +163,7 @@ class ChatModelMixin:

         for i, message in enumerate(chat_history):
             role = get_role(message["role"])
-            content = message["content"]
+            content = message.get("content")
             tool_calls = message.get("tool_calls")
             if tool_calls:
                 content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
         ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
         for message in chat_history:
             role = get_role(message["role"])
-            content = message["content"]
+            content = message.get("content")

             ret += prompt_style.intra_message_sep
             if tools:
@@ -446,6 +446,11 @@ Begin!"""
                     "index": i,
                     "delta": {
                         "content": choice["text"],
+                        **(
+                            {"tool_calls": choice["tool_calls"]}
+                            if "tool_calls" in choice
+                            else {}
+                        ),
                     },
                     "finish_reason": choice["finish_reason"],
                 }
@@ -592,8 +597,7 @@ Begin!"""
         return text, None, None

     @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
@@ -606,7 +610,41 @@ Begin!"""
                 f"Model {model_family.model_name} is not support tool calls."
             )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
+
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".

+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
         if func:
             m = {
                 "role": "assistant",
xinference/model/llm/vllm/core.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import json
 import logging
 import multiprocessing
 import time
@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
@@ -80,7 +83,15 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k", "mistral-v0.1"]
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
@@ -90,16 +101,16 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]
@@ -113,6 +124,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")

+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+

 class VLLMModel(LLM):
     def __init__(
@@ -293,6 +307,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -319,16 +334,46 @@

         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -416,7 +461,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat" == model_family:
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -429,7 +474,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)

         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
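For orientation, a minimal sketch of the chunk-delta shape these changes produce once a tool call is detected mid-stream; the function name and arguments are purely illustrative, and ToolCallFunction/ToolCalls are the TypedDicts added to xinference/types.py later in this diff:

    import json

    from xinference.types import ToolCallFunction, ToolCalls

    # Illustrative only: mirrors what the vLLM backend now attaches to the final
    # streamed choice when finish_reason flips to "tool_calls".
    tool_call = ToolCalls(
        id="call-0",  # real ids are uuid4 strings
        type="function",
        function=ToolCallFunction(
            name="get_current_weather",  # hypothetical tool name
            arguments=json.dumps({"location": "Beijing"}, ensure_ascii=False),
        ),
    )
    chunk_delta = {"content": None, "tool_calls": [tool_call]}
    print(chunk_delta)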
xinference/model/rerank/core.py CHANGED
@@ -134,8 +134,11 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
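A brief hedged sketch of a plain rerank call under the new guard; the endpoint and model uid are hypothetical:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")     # hypothetical endpoint
    model = client.get_model("my-rerank-model")  # hypothetical rerank model uid
    result = model.rerank(
        documents=["Paris is in France.", "Berlin is in Germany."],
        query="Where is Paris?",
        top_n=1,
    )
    # Unsupported extra keyword arguments now raise ValueError instead of being silently ignored.
    print(result)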
xinference/thirdparty/omnilmm/chat.py CHANGED
@@ -207,7 +207,7 @@ class OmniLMM3B:

 class OmniLMMChat:
     def __init__(self, model_path, device_map) -> None:
-        if "12B" in model_path:
+        if "12b" in model_path:
             self.model = OmniLMM12B(model_path, device_map)
         else:
             self.model = OmniLMM3B(model_path, device_map)
xinference/types.py CHANGED
@@ -91,11 +91,23 @@ class CompletionLogprobs(TypedDict):
     top_logprobs: List[Optional[Dict[str, float]]]


+class ToolCallFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ToolCalls(TypedDict):
+    id: str
+    type: Literal["function"]
+    function: ToolCallFunction
+
+
 class CompletionChoice(TypedDict):
     text: str
     index: int
     logprobs: Optional[CompletionLogprobs]
     finish_reason: Optional[str]
+    tool_calls: NotRequired[List[ToolCalls]]


 class CompletionUsage(TypedDict):
@@ -147,6 +159,7 @@ class ChatCompletion(TypedDict):
 class ChatCompletionChunkDelta(TypedDict):
     role: NotRequired[str]
     content: NotRequired[str]
+    tool_calls: NotRequired[List[ToolCalls]]


 class ChatCompletionChunkChoice(TypedDict):
@@ -232,6 +245,8 @@ class LlamaCppModelConfig(TypedDict, total=False):
     n_ctx: int
     n_parts: int
     n_gpu_layers: int
+    split_mode: int
+    main_gpu: int
     seed: int
     f16_kv: bool
     logits_all: bool
@@ -355,21 +370,6 @@ try:
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")

-CreateCompletionCTransformers: BaseModel
-try:
-    from ctransformers.llm import LLM
-
-    CreateCompletionCTransformers = get_pydantic_model_from_method(
-        LLM.generate,
-        exclude_fields=["tokens"],
-        include_fields={
-            "max_tokens": (Optional[int], max_tokens_field),
-            "stream": (Optional[bool], stream_field),
-        },
-    )
-except ImportError:
-    CreateCompletionCTransformers = create_model("CreateCompletionCTransformers")
-

 # This type is for openai API compatibility
 CreateCompletionOpenAI: BaseModel
@@ -415,7 +415,6 @@ class CreateCompletion(
     ModelAndPrompt,
     CreateCompletionTorch,
     CreateCompletionLlamaCpp,
-    CreateCompletionCTransformers,
     CreateCompletionOpenAI,
 ):
     pass
@@ -428,8 +427,6 @@ class CreateChatModel(BaseModel):
 # Currently, chat calls generates, so the params share the same one.
 CreateChatCompletionTorch = CreateCompletionTorch
 CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp
-CreateChatCompletionCTransformers: BaseModel = CreateCompletionCTransformers
-

 # This type is for openai API compatibility
 CreateChatCompletionOpenAI: BaseModel
@@ -450,7 +447,6 @@ class CreateChatCompletion(
     CreateChatModel,
     CreateChatCompletionTorch,
     CreateChatCompletionLlamaCpp,
-    CreateChatCompletionCTransformers,
     CreateChatCompletionOpenAI,
 ):
     pass
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.98516614.js",
+    "main.js": "./static/js/main.76ef2b17.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.98516614.js.map": "./static/js/main.98516614.js.map"
+    "main.76ef2b17.js.map": "./static/js/main.76ef2b17.js.map"
   },
   "entrypoints": [
-    "static/js/main.98516614.js"
+    "static/js/main.76ef2b17.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.98516614.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.76ef2b17.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>