xinference 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +35 -1
- xinference/client/oscar/actor_client.py +2 -2
- xinference/client/restful/restful_client.py +2 -2
- xinference/conftest.py +5 -1
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +148 -8
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +132 -13
- xinference/deploy/cmdline.py +57 -4
- xinference/deploy/local.py +32 -6
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +70 -3
- xinference/model/llm/llm_family.py +11 -1
- xinference/model/llm/llm_family_modelscope.json +72 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/METADATA +2 -1
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/RECORD +35 -31
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family.json:

@@ -535,7 +535,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
@@ -609,6 +610,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -1139,14 +1149,15 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 32768,
     "model_name": "qwen-chat",
     "model_lang": [
       "en",
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
@@ -1172,6 +1183,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_8",
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-1_8B-Chat",
@@ -1181,6 +1194,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-7B-Chat",
@@ -1190,6 +1205,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 14,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-14B-Chat",
@@ -1199,6 +1216,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 72,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-72B-Chat",
@@ -3144,5 +3163,53 @@
         "model_revision": "70d1740208c8ba39f9ba250b22117ec25311ab33"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "internlm2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The second generation of the InternLM model, InternLM2.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2-chat-7b",
+        "model_revision": "5797f79825bab7013932d57e2babaac1b8de6b4f"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2-chat-20b",
+        "model_revision": "3ccaf3ae82d5d01c0a95eecf40ee550f9c543635"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "[UNUSED_TOKEN_146]user",
+        "[UNUSED_TOKEN_146]assistant"
+      ],
+      "intra_message_sep": "[UNUSED_TOKEN_145]",
+      "stop_token_ids": [
+        92542
+      ],
+      "stop": [
+        "[UNUSED_TOKEN_145]"
+      ]
+    }
   }
 ]
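
The hunks above register internlm2-chat as a new built-in family, add 4-bit/8-bit quantization options and an explicit 32768 context length for qwen-chat, and declare the new "tools" ability plus stop tokens for chatglm3. For orientation, here is a minimal sketch of launching one of the newly added models through the Python client; the endpoint URL and the launch_model/get_model/chat call pattern are assumptions based on the 0.8-series RESTful client, not something introduced by this diff.

# Sketch only: assumes a local supervisor at the default endpoint and the
# 0.8-series RESTful client API (launch_model / get_model / chat).
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="internlm2-chat",   # built-in family added by this release
    model_format="pytorch",
    model_size_in_billions=7,
    quantization="none",
)
model = client.get_model(model_uid)
print(model.chat("Briefly introduce yourself."))
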

xinference/model/llm/llm_family.py:

@@ -43,6 +43,7 @@ DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
 BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
 BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
 
 
 class GgmlLLMSpecV1(BaseModel):
@@ -105,7 +106,7 @@ class LLMFamilyV1(BaseModel):
     context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
     model_name: str
     model_lang: List[str]
-    model_ability: List[Literal["embed", "generate", "chat"]]
+    model_ability: List[Literal["embed", "generate", "chat", "tools"]]
     model_description: Optional[str]
     # reason for not required str here: legacy registration
     model_family: Optional[str]
@@ -155,6 +156,15 @@ class CustomLLMFamilyV1(LLMFamilyV1):
                 f"`model_family` for chat model must be `other` or one of the following values: \n"
                 f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
             )
+        if (
+            llm_spec.model_family != "other"
+            and "tool_call" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for tool call model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES))}"
+            )
         if (
             llm_spec.model_family != "other"
             and "chat" not in llm_spec.model_ability
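
The new "tools" literal and the BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES check above constrain custom registrations that declare tool-call support. A custom family that opts in would look roughly like the sketch below, mirroring the built-in JSON entries; the name, context length, size, and model_uri are illustrative placeholders, not values from this release.

# Illustrative custom-family payload; placeholder values throughout.
custom_family = {
    "version": 1,
    "context_length": 8192,
    "model_name": "my-chatglm3-finetune",
    "model_lang": ["en", "zh"],
    "model_ability": ["chat", "tools"],  # "tools" is newly accepted by the Literal above
    "model_family": "chatglm3",          # must be "other" or a built-in family per the validator
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 6,
            "quantizations": ["none"],
            "model_uri": "file:///path/to/model",
        }
    ],
    # a prompt_style block would normally accompany a chat-capable custom family
}
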

xinference/model/llm/llm_family_modelscope.json:

@@ -297,7 +297,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
@@ -375,6 +376,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -1461,14 +1471,15 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 32768,
     "model_name": "qwen-chat",
     "model_lang": [
       "en",
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
    ],
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
@@ -1498,6 +1509,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_8",
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1508,6 +1521,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1518,6 +1533,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 72,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1528,6 +1545,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 14,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "qwen/Qwen-14B-Chat",
@@ -1759,5 +1778,55 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "internlm2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The second generation of the InternLM model, InternLM2.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2-chat-7b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2-chat-20b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "[UNUSED_TOKEN_146]user",
+        "[UNUSED_TOKEN_146]assistant"
+      ],
+      "intra_message_sep": "[UNUSED_TOKEN_145]",
+      "stop_token_ids": [
+        92542
+      ],
+      "stop": [
+        "[UNUSED_TOKEN_145]"
+      ]
+    }
   }
 ]

xinference/model/llm/pytorch/chatglm.py:

@@ -11,13 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import time
+import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
 from ....types import (
     SPECIAL_TOOL_PROMPT,
     ChatCompletion,
+    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -106,38 +112,74 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         tools = self._handle_tools(generate_config)
+        kwargs: Dict[str, Any] = {}
+        generate_config = generate_config or {}
+        temperature = generate_config.get("temperature")
+        if temperature is not None:
+            kwargs["temperature"] = float(temperature)
+        top_p = generate_config.get("top_p")
+        if top_p is not None:
+            kwargs["top_p"] = float(top_p)
+        max_length = generate_config.get("max_tokens")
+        if max_length is not None:
+            kwargs["max_length"] = int(max_length)
+        # Tool calls only works for non stream, so we call chat directly.
+        if prompt == SPECIAL_TOOL_PROMPT and chat_history:
+            tool_message = chat_history.pop()
+            content = tool_message.get("content")
+            assert content is not None
+            prompt = content
+            kwargs["role"] = "observation"
+            chat_history = [h for h in chat_history if not h.get("tool_calls")]
+            if not chat_history:
+                chat_history = []
         if tools:
-            # Tool calls only works for non stream, so we call chat directly.
-            kwargs: Dict[str, Any] = {}
-            generate_config = generate_config or {}
-            temperature = generate_config.get("temperature")
-            if temperature is not None:
-                kwargs["temperature"] = float(temperature)
-            top_p = generate_config.get("top_p")
-            if top_p is not None:
-                kwargs["top_p"] = float(top_p)
-            max_length = generate_config.get("max_tokens")
-            if max_length is not None:
-                kwargs["max_length"] = int(max_length)
-            if prompt == SPECIAL_TOOL_PROMPT and chat_history:
-                tool_message = chat_history.pop()
-                content = tool_message.get("content")
-                assert content is not None
-                prompt = content
-                kwargs["role"] = "observation"
-                chat_history = [h for h in chat_history if not h.get("tool_calls")]
-                if not chat_history:
-                    chat_history = []
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
             )
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, msg, tools
             )
         else:
-
-
-
-
-
-
+            stream = generate_config.get("stream", False)
+            if stream:
+
+                def _stream_generator():
+                    last_chunk_text_length = 0
+                    for chunk_text, _ in self._model.stream_chat(
+                        self._tokenizer, prompt, chat_history, **kwargs
+                    ):
+                        chunk_text = chunk_text[last_chunk_text_length:]
+                        last_chunk_text_length += len(chunk_text)
+                        completion_choice = CompletionChoice(
+                            text=chunk_text, index=0, logprobs=None, finish_reason=None
+                        )
+                        yield CompletionChunk(
+                            id=str(uuid.uuid1()),
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[completion_choice],
+                        )
+
+                return self._to_chat_completion_chunks(_stream_generator())
+            else:
+                response, _ = self._model.chat(
+                    self._tokenizer, prompt, chat_history, **kwargs
+                )
+                return ChatCompletion(
+                    id="chat" + str(uuid.uuid1()),
+                    object="chat.completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[
+                        ChatCompletionChoice(
+                            index=0,
+                            message={"role": "assistant", "content": response},
+                            finish_reason="stop",
+                        )
+                    ],
+                    usage=CompletionUsage(
+                        prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                    ),
+                )
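
The streaming branch added above converts ChatGLM's stream_chat output, which yields the full response accumulated so far, into incremental chunks by tracking how much text has already been emitted. The same delta logic in isolation (a standalone sketch, not code from the release):

# Standalone sketch of the delta computation used in the streaming branch above.
def iter_deltas(cumulative_texts):
    emitted = 0
    for text in cumulative_texts:
        delta = text[emitted:]    # only the part not yielded yet
        emitted += len(delta)
        yield delta

assert list(iter_deltas(["He", "Hello", "Hello!"])) == ["He", "llo", "!"]
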

xinference/model/llm/pytorch/core.py:

@@ -192,7 +192,8 @@ class PytorchModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq"]:
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in [
             "baichuan-chat",
             "vicuna-v1.3",
             "falcon",
@@ -211,11 +212,7 @@
     def generate(
         self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        from .utils import
-            generate_stream,
-            generate_stream_chatglm,
-            generate_stream_falcon,
-        )
+        from .utils import generate_stream, generate_stream_falcon
 
         model_family_name = self.model_family.model_name.lower()
 
@@ -223,17 +220,7 @@
             prompt: str, generate_config: PytorchGenerateConfig
         ) -> Iterator[CompletionChunk]:
             if "falcon" in model_family_name:
-                for completion_chunk,
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    yield completion_chunk
-            elif "chatglm" in model_family_name:
-                for completion_chunk, _ in generate_stream_chatglm(
+                for completion_chunk, completion_usage in generate_stream_falcon(
                     self.model_uid,
                     self._model,
                     self._tokenizer,
@@ -241,9 +228,10 @@
                     self._device,
                     generate_config,
                 ):
+                    completion_chunk["usage"] = completion_usage
                     yield completion_chunk
             else:
-                for completion_chunk,
+                for completion_chunk, completion_usage in generate_stream(
                     self.model_uid,
                     self._model,
                     self._tokenizer,
@@ -251,6 +239,7 @@
                     self._device,
                     generate_config,
                 ):
+                    completion_chunk["usage"] = completion_usage
                     yield completion_chunk
 
         logger.debug(
@@ -274,16 +263,6 @@
                 generate_config,
             ):
                 pass
-            elif "chatglm" in model_family_name:
-                for completion_chunk, completion_usage in generate_stream_chatglm(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    pass
             else:
                 for completion_chunk, completion_usage in generate_stream(
                     self.model_uid,
@@ -442,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             "chatglm2-32k",
             "llama-2",
             "llama-2-chat",
+            "internlm2-chat",
         ]:
             return False
         if "chat" not in llm_family.model_ability:
@@ -465,7 +445,8 @@
 
         generate_config = self._sanitize_generate_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
-
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and "qwen-chat" == model_family:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -485,6 +466,6 @@
         assert not isinstance(c, Iterator)
         if tools:
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, c, tools
             )
         return self._to_chat_completion(c)
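
Two recurring patterns in the core.py hunks: streamed chunks now carry their CompletionUsage under the "usage" key, and family matching resolves `llm_family.model_family or llm_family.model_name`, so a custom registration that points model_family at a built-in name is matched like that built-in. A tiny sketch of the fallback (class and function names here are illustrative, not the project's own):

# Minimal sketch of the family-resolution fallback used by match()/chat() above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class FamilyInfo:
    model_name: str
    model_family: Optional[str] = None


def resolve_family(info: FamilyInfo) -> str:
    # custom models set model_family to a built-in name; built-ins leave it unset
    return info.model_family or info.model_name


assert resolve_family(FamilyInfo("qwen-chat")) == "qwen-chat"
assert resolve_family(FamilyInfo("my-qwen-finetune", "qwen-chat")) == "qwen-chat"
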

xinference/model/llm/pytorch/internlm2.py (new file):

@@ -0,0 +1,155 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import uuid
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
+    PytorchGenerateConfig,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchModelConfig
+
+
+class Internlm2PytorchChatModel(PytorchChatModel):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        pytorch_model_config: Optional[PytorchModelConfig] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            pytorch_model_config=pytorch_model_config,
+        )
+
+    def _load_model(self, **kwargs):
+        try:
+            from transformers import AutoModel, AutoTokenizer
+        except ImportError:
+            error_message = "Failed to import module 'transformers'"
+            installation_guide = [
+                "Please make sure 'transformers' is installed. ",
+                "You can install it by `pip install transformers`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path,
+            trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
+            revision=kwargs["revision"],
+        )
+        model = AutoModel.from_pretrained(
+            self.model_path,
+            **kwargs,
+        )
+        return model, tokenizer
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family != "internlm2-chat":
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        kwargs: Dict[str, Any] = {}
+        generate_config = generate_config or {}
+        temperature = generate_config.get("temperature")
+        if temperature is not None:
+            kwargs["temperature"] = float(temperature)
+        top_p = generate_config.get("top_p")
+        if top_p is not None:
+            kwargs["top_p"] = float(top_p)
+        max_new_tokens = generate_config.get("max_tokens")
+        if max_new_tokens is not None:
+            kwargs["max_length"] = int(max_new_tokens)
+
+        stream = generate_config.get("stream", False)
+        if chat_history:
+            input_history = [
+                (chat_history[i]["content"], (chat_history[i + 1]["content"]))
+                for i in range(0, len(chat_history), 2)
+            ]
+        else:
+            input_history = []
+        if stream:
+
+            def _stream_generator():
+                last_chunk_text_length = 0
+                for chunk_text, _ in self._model.stream_chat(
+                    self._tokenizer, prompt, input_history, **kwargs
+                ):
+                    chunk_text = chunk_text[last_chunk_text_length:]
+                    last_chunk_text_length += len(chunk_text)
+                    completion_choice = CompletionChoice(
+                        text=chunk_text, index=0, logprobs=None, finish_reason=None
+                    )
+                    yield CompletionChunk(
+                        id=str(uuid.uuid1()),
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+
+            return self._to_chat_completion_chunks(_stream_generator())
+        else:
+            response, _ = self._model.chat(
+                self._tokenizer, prompt, input_history, **kwargs
+            )
+            return ChatCompletion(
+                id="chat" + str(uuid.uuid1()),
+                object="chat.completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    ChatCompletionChoice(
+                        index=0,
+                        message={"role": "assistant", "content": response},
+                        finish_reason="stop",
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
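
For clarity, the chat() method of the new Internlm2PytorchChatModel pairs consecutive OpenAI-style history messages into the (user, assistant) tuples that InternLM2's chat/stream_chat interface expects. A standalone sketch of that pairing with illustrative messages:

# Standalone sketch of the history pairing done in Internlm2PytorchChatModel.chat.
chat_history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "Tell me about InternLM2."},
    {"role": "assistant", "content": "It is the second-generation InternLM model."},
]
input_history = [
    (chat_history[i]["content"], chat_history[i + 1]["content"])
    for i in range(0, len(chat_history), 2)
]
assert input_history == [
    ("Hi", "Hello! How can I help?"),
    ("Tell me about InternLM2.", "It is the second-generation InternLM model."),
]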