xinference 0.9.2__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.
Files changed (48)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +111 -13
  3. xinference/client/restful/restful_client.py +2 -1
  4. xinference/conftest.py +18 -15
  5. xinference/constants.py +2 -0
  6. xinference/core/image_interface.py +252 -0
  7. xinference/core/supervisor.py +3 -10
  8. xinference/deploy/cmdline.py +69 -4
  9. xinference/deploy/local.py +1 -1
  10. xinference/deploy/supervisor.py +1 -1
  11. xinference/model/image/__init__.py +13 -7
  12. xinference/model/image/core.py +17 -1
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/ggml/llamacpp.py +1 -5
  15. xinference/model/llm/llm_family.json +98 -13
  16. xinference/model/llm/llm_family_modelscope.json +98 -7
  17. xinference/model/llm/pytorch/chatglm.py +2 -1
  18. xinference/model/llm/pytorch/internlm2.py +2 -1
  19. xinference/model/llm/sglang/__init__.py +13 -0
  20. xinference/model/llm/sglang/core.py +365 -0
  21. xinference/model/llm/utils.py +35 -12
  22. xinference/model/llm/vllm/core.py +17 -0
  23. xinference/web/ui/build/asset-manifest.json +3 -3
  24. xinference/web/ui/build/index.html +1 -1
  25. xinference/web/ui/build/static/js/{main.78829790.js → main.66b1c4fb.js} +3 -3
  26. xinference/web/ui/build/static/js/main.66b1c4fb.js.map +1 -0
  27. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +1 -0
  34. {xinference-0.9.2.dist-info → xinference-0.9.4.dist-info}/METADATA +8 -5
  35. {xinference-0.9.2.dist-info → xinference-0.9.4.dist-info}/RECORD +40 -37
  36. {xinference-0.9.2.dist-info → xinference-0.9.4.dist-info}/WHEEL +1 -1
  37. xinference/web/ui/build/static/js/main.78829790.js.map +0 -1
  38. xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +0 -1
  39. xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +0 -1
  40. xinference/web/ui/node_modules/.cache/babel-loader/396f7ce6ae6900bfdb00e369ade8a05045dc1df025610057ff7436d9e58af81c.json +0 -1
  41. xinference/web/ui/node_modules/.cache/babel-loader/5282ee05e064b3a80bc991e9003ddef6a4958471d8f4fc65589dc64553365cdd.json +0 -1
  42. xinference/web/ui/node_modules/.cache/babel-loader/83beb31daa7169fb0057453d4f86411f1effd3e3f7af97472cbd22accbfc65bb.json +0 -1
  43. xinference/web/ui/node_modules/.cache/babel-loader/ddf597663270471b31251b2abb36e3fa093efe20489387d996f993d2c61be112.json +0 -1
  44. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +0 -1
  45. /xinference/web/ui/build/static/js/{main.78829790.js.LICENSE.txt → main.66b1c4fb.js.LICENSE.txt} +0 -0
  46. {xinference-0.9.2.dist-info → xinference-0.9.4.dist-info}/LICENSE +0 -0
  47. {xinference-0.9.2.dist-info → xinference-0.9.4.dist-info}/entry_points.txt +0 -0
  48. {xinference-0.9.2.dist-info → xinference-0.9.4.dist-info}/top_level.txt +0 -0
xinference/model/llm/sglang/core.py (new file)
@@ -0,0 +1,365 @@
+ # Copyright 2022-2024 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ import time
+ import uuid
+ from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
+
+ from ....constants import XINFERENCE_ENABLE_SGLANG
+ from ....types import (
+ ChatCompletion,
+ ChatCompletionChunk,
+ ChatCompletionMessage,
+ Completion,
+ CompletionChoice,
+ CompletionChunk,
+ CompletionUsage,
+ )
+ from .. import LLM, LLMFamilyV1, LLMSpecV1
+ from ..llm_family import CustomLLMFamilyV1
+ from ..utils import ChatModelMixin
+
+ logger = logging.getLogger(__name__)
+
+
+ class SGLANGModelConfig(TypedDict, total=False):
+ tokenizer_mode: str
+ trust_remote_code: bool
+ tp_size: int
+ mem_fraction_static: float
+ log_level: str
+ attention_reduce_in_fp32: bool # For gemma
+
+
+ class SGLANGGenerateConfig(TypedDict, total=False):
+ presence_penalty: float
+ frequency_penalty: float
+ temperature: float
+ top_p: float
+ top_k: int
+ max_new_tokens: int
+ stop: Optional[Union[str, List[str]]]
+ ignore_eos: bool
+ stream: bool
+
+
+ try:
+ import sglang # noqa: F401
+
+ SGLANG_INSTALLED = True
+ except ImportError:
+ SGLANG_INSTALLED = False
+
+ SGLANG_SUPPORTED_MODELS = ["llama-2", "mistral-v0.1", "mixtral-v0.1"]
+ SGLANG_SUPPORTED_CHAT_MODELS = [
+ "llama-2-chat",
+ "qwen-chat",
+ "qwen1.5-chat",
+ "mistral-instruct-v0.1",
+ "mistral-instruct-v0.2",
+ "mixtral-instruct-v0.1",
+ "gemma-it",
+ ]
+
+
+ class SGLANGModel(LLM):
+ def __init__(
+ self,
+ model_uid: str,
+ model_family: "LLMFamilyV1",
+ model_spec: "LLMSpecV1",
+ quantization: str,
+ model_path: str,
+ model_config: Optional[SGLANGModelConfig],
+ ):
+ super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+ self._model_config = model_config
+ self._engine = None
+
+ def load(self):
+ try:
+ import sglang as sgl
+ except ImportError:
+ error_message = "Failed to import module 'sglang'"
+ installation_guide = [
+ "Please make sure 'sglang' is installed. ",
+ "You can install it by `pip install 'sglang[all]'`\n",
+ ]
+
+ raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+ self._model_config = self._sanitize_model_config(self._model_config)
+ logger.info(
+ f"Loading {self.model_uid} with following model config: {self._model_config}"
+ )
+
+ self._engine = sgl.Runtime(
+ model_path=self.model_path,
+ tokenizer_path=self.model_path,
+ **self._model_config,
+ )
+
+ def _sanitize_model_config(
+ self, model_config: Optional[SGLANGModelConfig]
+ ) -> SGLANGModelConfig:
+ if model_config is None:
+ model_config = SGLANGModelConfig()
+
+ cuda_count = self._get_cuda_count()
+ model_config.setdefault("tokenizer_mode", "auto")
+ model_config.setdefault("trust_remote_code", True)
+ model_config.setdefault("tp_size", cuda_count)
+ # See https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py#L37
+ mem_fraction_static = model_config.pop("mem_fraction_static", None)
+ if mem_fraction_static is None:
+ tp_size = model_config.get("tp_size", cuda_count)
+ if tp_size >= 8:
+ model_config["mem_fraction_static"] = 0.80
+ elif tp_size >= 4:
+ model_config["mem_fraction_static"] = 0.82
+ elif tp_size >= 2:
+ model_config["mem_fraction_static"] = 0.85
+ else:
+ model_config["mem_fraction_static"] = 0.90
+ model_config.setdefault("log_level", "info")
+ model_config.setdefault("attention_reduce_in_fp32", False)
+
+ return model_config
+
+ @staticmethod
+ def _sanitize_generate_config(
+ generate_config: Optional[SGLANGGenerateConfig] = None,
+ ) -> SGLANGGenerateConfig:
+ if generate_config is None:
+ generate_config = SGLANGGenerateConfig()
+
+ generate_config.setdefault("presence_penalty", 0.0)
+ generate_config.setdefault("frequency_penalty", 0.0)
+ generate_config.setdefault("temperature", 1.0)
+ generate_config.setdefault("top_p", 1.0)
+ generate_config.setdefault("top_k", -1)
+ # See https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/ir.py#L120
+ # 16 is too less, so here set 256 by default
+ generate_config.setdefault(
+ "max_new_tokens", generate_config.pop("max_tokens", 256) # type: ignore
+ )
+ generate_config.setdefault("stop", [])
+ generate_config.setdefault("stream", False)
+ generate_config.setdefault("ignore_eos", False)
+
+ return generate_config
+
+ @classmethod
+ def match(
+ cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+ ) -> bool:
+ if not XINFERENCE_ENABLE_SGLANG:
+ return False
+ if not cls._has_cuda_device():
+ return False
+ if not cls._is_linux():
+ return False
+ if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+ return False
+ if llm_spec.model_format == "pytorch":
+ if quantization != "none" and not (quantization is None):
+ return False
+ if llm_spec.model_format in ["gptq", "awq"]:
+ # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
+ if "4" not in quantization:
+ return False
+ if isinstance(llm_family, CustomLLMFamilyV1):
+ if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
+ return False
+ else:
+ if llm_family.model_name not in SGLANG_SUPPORTED_MODELS:
+ return False
+ if "generate" not in llm_family.model_ability:
+ return False
+ return SGLANG_INSTALLED
+
+ @staticmethod
+ def _convert_state_to_completion_chunk(
+ request_id: str, model: str, output_text: str, meta_info: Dict
+ ) -> CompletionChunk:
+ choices: List[CompletionChoice] = [
+ CompletionChoice(
+ text=output_text,
+ index=0,
+ logprobs=None,
+ finish_reason=None,
+ )
+ ]
+ chunk = CompletionChunk(
+ id=request_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=model,
+ choices=choices,
+ )
+ prompt_tokens = meta_info["prompt_tokens"]
+ completion_tokens = meta_info["completion_tokens"]
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=prompt_tokens + completion_tokens,
+ )
+ return chunk
+
+ @staticmethod
+ def _convert_state_to_completion(
+ request_id: str, model: str, output_text: str, meta_info: Dict
+ ) -> Completion:
+ choices = [
+ CompletionChoice(
+ text=output_text,
+ index=0,
+ logprobs=None,
+ finish_reason=None,
+ )
+ ]
+
+ usage = CompletionUsage(
+ prompt_tokens=meta_info["prompt_tokens"],
+ completion_tokens=meta_info["completion_tokens"],
+ total_tokens=meta_info["prompt_tokens"] + meta_info["completion_tokens"],
+ )
+ return Completion(
+ id=request_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=model,
+ choices=choices,
+ usage=usage,
+ )
+
+ async def async_generate(
+ self,
+ prompt: str,
+ generate_config: Optional[SGLANGGenerateConfig] = None,
+ ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
+ try:
+ import sglang as sgl
+ from sglang import assistant, gen, user
+ except ImportError:
+ error_message = "Failed to import module 'sglang'"
+ installation_guide = [
+ "Please make sure 'sglang' is installed. ",
+ "You can install it by `pip install sglang[all]`\n",
+ ]
+
+ raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+ @sgl.function
+ def pipeline(s, question):
+ s += user(question)
+ s += assistant(gen("answer"))
+
+ sanitized_generate_config = self._sanitize_generate_config(generate_config)
+ logger.debug(
+ "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
+ )
+ stream = sanitized_generate_config.pop("stream")
+ request_id = str(uuid.uuid1())
+ state = pipeline.run(
+ question=prompt,
+ backend=self._engine,
+ stream=stream,
+ **sanitized_generate_config,
+ )
+ if not stream:
+ return self._convert_state_to_completion(
+ request_id,
+ model=self.model_uid,
+ output_text=state["answer"],
+ meta_info=state.get_meta_info(name="answer"),
+ )
+ else:
+
+ async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
+ async for out, meta_info in state.text_async_iter(
+ var_name="answer", return_meta_data=True
+ ):
+ chunk = self._convert_state_to_completion_chunk(
+ request_id, self.model_uid, output_text=out, meta_info=meta_info
+ )
+ yield chunk
+
+ return stream_results()
+
+
+ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
+ @classmethod
+ def match(
+ cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+ ) -> bool:
+ if not XINFERENCE_ENABLE_SGLANG:
+ return False
+ if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+ return False
+ if llm_spec.model_format == "pytorch":
+ if quantization != "none" and not (quantization is None):
+ return False
+ if llm_spec.model_format in ["gptq", "awq"]:
+ # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
+ if "4" not in quantization:
+ return False
+ if isinstance(llm_family, CustomLLMFamilyV1):
+ if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
+ return False
+ else:
+ if llm_family.model_name not in SGLANG_SUPPORTED_CHAT_MODELS:
+ return False
+ if "chat" not in llm_family.model_ability:
+ return False
+ return SGLANG_INSTALLED
+
+ def _sanitize_chat_config(
+ self,
+ generate_config: Optional[Dict] = None,
+ ) -> Dict:
+ if not generate_config:
+ generate_config = {}
+ if self.model_family.prompt_style:
+ if (
+ not generate_config.get("stop")
+ ) and self.model_family.prompt_style.stop:
+ generate_config["stop"] = self.model_family.prompt_style.stop.copy()
+ return generate_config
+
+ async def async_chat(
+ self,
+ prompt: str,
+ system_prompt: Optional[str] = None,
+ chat_history: Optional[List[ChatCompletionMessage]] = None,
+ generate_config: Optional[Dict] = None,
+ ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+ assert self.model_family.prompt_style is not None
+ prompt_style = self.model_family.prompt_style.copy()
+ if system_prompt:
+ prompt_style.system_prompt = system_prompt
+ chat_history = chat_history or []
+ full_prompt = self.get_prompt(prompt, chat_history, prompt_style)
+
+ generate_config = self._sanitize_chat_config(generate_config)
+ stream = generate_config.get("stream", None)
+ if stream:
+ agen = await self.async_generate(full_prompt, generate_config) # type: ignore
+ assert isinstance(agen, AsyncGenerator)
+ return self._async_to_chat_completion_chunks(agen)
+ else:
+ c = await self.async_generate(full_prompt, generate_config) # type: ignore
+ assert not isinstance(c, AsyncGenerator)
+ return self._to_chat_completion(c)
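
The new backend above loads a checkpoint into an `sgl.Runtime` and answers each request through a tiny user/assistant program. A minimal standalone sketch of the same pattern, assuming `sglang[all]` is installed, a CUDA device is available, and a local checkpoint sits at the placeholder path (neither the path nor the question comes from xinference):

```python
# Sketch of the sgl.Runtime + @sgl.function pattern used by SGLANGModel.load()
# and SGLANGModel.async_generate() above.
import sglang as sgl
from sglang import assistant, gen, user

runtime = sgl.Runtime(
    model_path="/path/to/llama-2-7b-chat",      # placeholder local checkpoint
    tokenizer_path="/path/to/llama-2-7b-chat",
    tp_size=1,                 # the backend defaults this to the CUDA device count
    mem_fraction_static=0.90,  # the backend's default for a single GPU
)

@sgl.function
def pipeline(s, question):
    # Same program the backend builds per request: one user turn, one generated answer.
    s += user(question)
    s += assistant(gen("answer"))

state = pipeline.run(
    question="What is the capital of France?",
    backend=runtime,
    max_new_tokens=256,  # same default _sanitize_generate_config applies
)
print(state["answer"])
runtime.shutdown()
```

Note that `SGLANGModel.match()` only selects this backend when the new `XINFERENCE_ENABLE_SGLANG` constant is truthy, CUDA is available, the platform is Linux, and the model family appears in `SGLANG_SUPPORTED_MODELS` (or `SGLANG_SUPPORTED_CHAT_MODELS` for the chat variant).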
xinference/model/llm/utils.py
@@ -411,6 +411,16 @@ Begin!"""
  if content:
  ret += content + "<end_of_turn>\n"
  return ret
+ elif prompt_style.style_name == "CodeShell":
+ ret = ""
+ for message in chat_history:
+ content = message["content"]
+ role = get_role(message["role"])
+ if content:
+ ret += f"{role}{content}|<end>|"
+ else:
+ ret += f"{role}".rstrip()
+ return ret
  else:
  raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

@@ -451,6 +461,7 @@ Begin!"""
  "index": i,
  "delta": {
  "role": "assistant",
+ "content": "",
  },
  "finish_reason": None,
  }
@@ -535,26 +546,39 @@ Begin!"""
  # Refer to:
  # https://github.com/QwenLM/Qwen/blob/main/examples/react_prompt.md
  # https://github.com/QwenLM/Qwen/blob/main/openai_api.py#L297
- func_name, func_args = "", ""
+ func_name, func_args, content = "", "", ""
  i = text.rfind("\nAction:")
  j = text.rfind("\nAction Input:")
  k = text.rfind("\nObservation:")
+ t = max(
+ text.rfind("\nThought:", 0, i), text.rfind("Thought:", 0, i)
+ ) # find the last thought just before Action, considering the Thought at the very beginning
  if 0 <= i < j: # If the text has `Action` and `Action input`,
  if k < j: # but does not contain `Observation`,
  # then it is likely that `Observation` is omitted by the LLM,
  # because the output text may have discarded the stop word.
  text = text.rstrip() + "\nObservation:" # Add it back.
  k = text.rfind("\nObservation:")
- if 0 <= i < j < k:
+ if 0 <= t < i < j < k:
  func_name = text[i + len("\nAction:") : j].strip()
  func_args = text[j + len("\nAction Input:") : k].strip()
+ content = text[
+ t + len("\nThought:") : i
+ ].strip() # len("\nThought:") and len("Thought:") both are OK since there is a space after :
  if func_name:
- return None, func_name, json.loads(func_args)
- z = text.rfind("\nFinal Answer: ")
- if z >= 0:
- text = text[z + len("\nFinal Answer: ") :]
+ return content, func_name, json.loads(func_args)
  except Exception as e:
  logger.error("Eval tool calls completion failed: %s", e)
+ t = max(text.rfind("\nThought:"), text.rfind("Thought:"))
+ z = max(text.rfind("\nFinal Answer:"), text.rfind("Final Answer:"))
+ if z >= 0:
+ text = text[
+ z + len("\nFinal Answer:") :
+ ] # len("\nFinal Answer::") and len("Final Answer::") both are OK since there is a space after :
+ else:
+ text = text[
+ t + len("\nThought:") :
+ ] # There is only Thought: no Final Answer:
  return text, None, None

  @classmethod
@@ -573,13 +597,10 @@ Begin!"""
  )
  logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)

- if content:
- m = {"role": "assistant", "content": content, "tool_calls": []}
- finish_reason = "stop"
- else:
+ if func:
  m = {
  "role": "assistant",
- "content": None,
+ "content": content,
  "tool_calls": [
  {
  "id": f"call_{_id}",
@@ -592,7 +613,9 @@ Begin!"""
  ],
  }
  finish_reason = "tool_calls"
-
+ else:
+ m = {"role": "assistant", "content": content, "tool_calls": []}
+ finish_reason = "stop"
  return {
  "id": "chat" + f"cmpl-{_id}",
  "model": model_uid,
xinference/model/llm/vllm/core.py
@@ -13,6 +13,7 @@
  # limitations under the License.

  import logging
+ import multiprocessing
  import time
  import uuid
  from typing import (
@@ -102,6 +103,13 @@ VLLM_SUPPORTED_CHAT_MODELS = [
  if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")

+ if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
+ VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
+
+ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+ VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
+ VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")
+

  class VLLMModel(LLM):
  def __init__(
@@ -119,6 +127,7 @@ class VLLMModel(LLM):

  def load(self):
  try:
+ import vllm
  from vllm.engine.arg_utils import AsyncEngineArgs
  from vllm.engine.async_llm_engine import AsyncLLMEngine
  except ImportError:
@@ -130,6 +139,14 @@ class VLLMModel(LLM):

  raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

+ if vllm.__version__ >= "0.3.1":
+ # from vllm v0.3.1, it uses cupy as NCCL backend
+ # in which cupy will fork a process
+ # only for xoscar >= 0.3.0, new process is allowed in subpool
+ # besides, xinference set start method as forkserver for unix
+ # we need to set it to fork to make cupy NCCL work
+ multiprocessing.set_start_method("fork", force=True)
+
  self._model_config = self._sanitize_model_config(self._model_config)
  logger.info(
  f"Loading {self.model_uid} with following model config: {self._model_config}"
xinference/web/ui/build/asset-manifest.json
@@ -1,11 +1,11 @@
  {
  "files": {
- "main.js": "./static/js/main.78829790.js",
+ "main.js": "./static/js/main.66b1c4fb.js",
  "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
  "index.html": "./index.html",
- "main.78829790.js.map": "./static/js/main.78829790.js.map"
+ "main.66b1c4fb.js.map": "./static/js/main.66b1c4fb.js.map"
  },
  "entrypoints": [
- "static/js/main.78829790.js"
+ "static/js/main.66b1c4fb.js"
  ]
  }
xinference/web/ui/build/index.html
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.78829790.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.66b1c4fb.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>