xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (103)
  1. xinference/__init__.py +0 -1
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +30 -5
  4. xinference/client/restful/restful_client.py +18 -3
  5. xinference/constants.py +0 -4
  6. xinference/core/chat_interface.py +2 -2
  7. xinference/core/image_interface.py +6 -3
  8. xinference/core/model.py +9 -4
  9. xinference/core/scheduler.py +4 -4
  10. xinference/core/supervisor.py +2 -0
  11. xinference/core/worker.py +7 -0
  12. xinference/deploy/utils.py +6 -0
  13. xinference/model/audio/core.py +9 -4
  14. xinference/model/audio/cosyvoice.py +136 -0
  15. xinference/model/audio/model_spec.json +24 -0
  16. xinference/model/audio/model_spec_modelscope.json +27 -0
  17. xinference/model/core.py +25 -4
  18. xinference/model/embedding/core.py +88 -13
  19. xinference/model/embedding/model_spec.json +8 -0
  20. xinference/model/embedding/model_spec_modelscope.json +8 -0
  21. xinference/model/flexible/core.py +8 -2
  22. xinference/model/flexible/launchers/__init__.py +1 -0
  23. xinference/model/flexible/launchers/image_process_launcher.py +70 -0
  24. xinference/model/image/core.py +8 -5
  25. xinference/model/image/model_spec.json +36 -5
  26. xinference/model/image/model_spec_modelscope.json +21 -3
  27. xinference/model/image/stable_diffusion/core.py +36 -28
  28. xinference/model/llm/core.py +6 -4
  29. xinference/model/llm/ggml/llamacpp.py +7 -5
  30. xinference/model/llm/llm_family.json +802 -82
  31. xinference/model/llm/llm_family.py +6 -6
  32. xinference/model/llm/llm_family_csghub.json +39 -0
  33. xinference/model/llm/llm_family_modelscope.json +295 -47
  34. xinference/model/llm/mlx/core.py +7 -0
  35. xinference/model/llm/pytorch/chatglm.py +246 -5
  36. xinference/model/llm/pytorch/cogvlm2.py +1 -1
  37. xinference/model/llm/pytorch/deepseek_vl.py +2 -1
  38. xinference/model/llm/pytorch/falcon.py +2 -1
  39. xinference/model/llm/pytorch/llama_2.py +4 -2
  40. xinference/model/llm/pytorch/omnilmm.py +2 -1
  41. xinference/model/llm/pytorch/qwen_vl.py +2 -1
  42. xinference/model/llm/pytorch/vicuna.py +2 -1
  43. xinference/model/llm/pytorch/yi_vl.py +2 -1
  44. xinference/model/llm/sglang/core.py +12 -6
  45. xinference/model/llm/utils.py +78 -1
  46. xinference/model/llm/vllm/core.py +9 -5
  47. xinference/model/rerank/core.py +4 -3
  48. xinference/thirdparty/cosyvoice/__init__.py +0 -0
  49. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  50. xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
  51. xinference/thirdparty/cosyvoice/bin/train.py +136 -0
  52. xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
  53. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
  54. xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
  55. xinference/thirdparty/cosyvoice/cli/model.py +60 -0
  56. xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
  57. xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
  58. xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
  59. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
  61. xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
  63. xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
  64. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  65. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
  66. xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
  67. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  68. xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
  69. xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
  70. xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
  71. xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
  72. xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
  73. xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
  74. xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
  77. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
  78. xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
  79. xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  80. xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
  81. xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
  82. xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
  83. xinference/thirdparty/cosyvoice/utils/common.py +103 -0
  84. xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
  85. xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
  86. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
  87. xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
  88. xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
  89. xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
  90. xinference/web/ui/build/asset-manifest.json +3 -3
  91. xinference/web/ui/build/index.html +1 -1
  92. xinference/web/ui/build/static/js/{main.95c1d652.js → main.af906659.js} +3 -3
  93. xinference/web/ui/build/static/js/main.af906659.js.map +1 -0
  94. xinference/web/ui/node_modules/.cache/babel-loader/2cd5e4279ad7e13a1f41d486e9fca7756295bfad5bd77d90992f4ac3e10b496d.json +1 -0
  95. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/METADATA +39 -11
  96. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/RECORD +101 -57
  97. xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
  99. /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.af906659.js.LICENSE.txt} +0 -0
  100. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/LICENSE +0 -0
  101. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/WHEEL +0 -0
  102. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/entry_points.txt +0 -0
  103. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/top_level.txt +0 -0

xinference/model/llm/pytorch/chatglm.py
@@ -11,10 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
+import json
+import threading
 import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
+import torch
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList
+
 from ....core.scheduler import InferenceRequest
 from ....types import (
     SPECIAL_TOOL_PROMPT,
@@ -33,6 +40,16 @@ from ..utils import GLM4_TOOL_CALL_FAMILY
 from .core import PytorchChatModel, PytorchModelConfig
 
 
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 198] = 5e4
+        return scores
+
+
 class ChatglmPytorchChatModel(PytorchChatModel):
     def __init__(
         self,
@@ -103,9 +120,11 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tools = generate_config.pop("tools", None)
         if tools is None:
             return False
+        # Convert a iterable to a list
+        tools = list(tools)
         tool_choice = generate_config.pop("tool_choice", "none")
         if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
-            chat_history[:] = self.process_messages(
+            chat_history[:] = self._process_messages(
                 chat_history, tools=tools, tool_choice=tool_choice
             )
         return True
@@ -124,7 +143,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         return True
 
     @staticmethod
-    def process_messages(messages, tools=None, tool_choice="none"):
+    def _process_messages(messages, tools=None, tool_choice="none"):
         # This method is adapted from https://github.com/THUDM/GLM-4/blob/main/basic_demo/openai_api_server.py
         _messages = messages
         processed_messages = []
@@ -210,6 +229,212 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                     break
         return processed_messages
 
+    @staticmethod
+    def _process_response(output, history, tools, end=False):
+        # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py
+        content = ""
+        history = copy.deepcopy(history)
+        if not tools and end:
+            return None, None
+        for response in output.split("<|assistant|>"):
+            if "\n" in response:
+                metadata, content = response.split("\n", maxsplit=1)
+            else:
+                metadata, content = "", response
+            if not metadata.strip():
+                if tools and any(t.startswith(response) for t in tools) and not end:
+                    # Waiting for tool call complete.
+                    return None, None
+                content = content.strip()
+                history.append(
+                    {"role": "assistant", "metadata": metadata, "content": content}
+                )
+                content = content.replace("[[训练时间]]", "2023年")
+            else:
+                if tools and metadata in tools and not end:
+                    return None, None
+                history.append(
+                    {"role": "assistant", "metadata": metadata, "content": content}
+                )
+                metadata = metadata.strip()
+                if tools and metadata in tools and end:
+                    try:
+                        parameters = json.loads(content)
+                        content = {"name": metadata.strip(), "parameters": parameters}
+                    except json.JSONDecodeError:
+                        content = {"name": metadata.strip(), "content": content}
+                else:
+                    content = {"name": metadata.strip(), "content": content}
+        return content, history
+
+    def _get_generate_args(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Dict]] = None,
+        role: str = "user",
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        eos_token_id = [
+            tokenizer.eos_token_id,
+            tokenizer.convert_tokens_to_ids("<|user|>"),
+            tokenizer.convert_tokens_to_ids("<|observation|>"),
+        ]
+        gen_kwargs = {
+            "max_length": max_length,
+            "do_sample": do_sample,
+            "top_p": top_p,
+            "temperature": temperature,
+            "logits_processor": logits_processor,
+            **kwargs,
+        }
+        if past_key_values is None:
+            inputs = tokenizer.apply_chat_template(
+                history + [{"role": role, "content": query}],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_tensors="pt",
+                return_dict=True,
+            )
+        else:
+            inputs = tokenizer.apply_chat_template(
+                [{"role": role, "content": query}],
+                add_special_tokens=False,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_tensors="pt",
+                return_dict=True,
+            )
+        inputs = inputs.to(self._model.device)
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[2]
+            inputs.position_ids += past_length
+            attention_mask = inputs.attention_mask
+            attention_mask = torch.cat(
+                (attention_mask.new_ones(1, past_length), attention_mask), dim=1
+            )
+            inputs["attention_mask"] = attention_mask
+        history.append({"role": role, "content": query})
+        tools = history[0]["role"] == "system" and history[0].get("tools")
+        tools = (
+            [
+                t.get("function", {}).get("name", "")
+                for t in tools
+                if isinstance(t, dict)
+            ]
+            if tools
+            else []
+        )
+        kwargs = dict(inputs)
+        kwargs["past_key_values"] = past_key_values
+        kwargs["eos_token_id"] = eos_token_id
+        kwargs.update(gen_kwargs)
+        return kwargs, tools
+
+    @torch.inference_mode()
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Dict]] = None,
+        role: str = "user",
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        from transformers import TextIteratorStreamer
+
+        kwargs, tools = self._get_generate_args(
+            tokenizer=tokenizer,
+            query=query,
+            history=history,
+            role=role,
+            past_key_values=past_key_values,
+            max_length=max_length,
+            do_sample=do_sample,
+            top_p=top_p,
+            temperature=temperature,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+
+        streamer = TextIteratorStreamer(
+            tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        kwargs["streamer"] = streamer
+        thread = threading.Thread(target=self._model.generate, kwargs=kwargs)
+        thread.start()
+
+        response = ""
+        for token in streamer:
+            response += token
+            if response and response[-1] != "�":
+                new_response, new_history = self._process_response(
+                    response, history, tools, end=False
+                )
+                if new_response is None:
+                    continue
+                yield new_response, new_history
+        if tools:
+            new_response, new_history = self._process_response(
+                response, history, tools, end=True
+            )
+            if new_response:
+                yield new_response, new_history
+
+    @torch.inference_mode()
+    def non_stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Dict]] = None,
+        role: str = "user",
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        kwargs, tools = self._get_generate_args(
+            tokenizer=tokenizer,
+            query=query,
+            history=history,
+            role=role,
+            past_key_values=past_key_values,
+            max_length=max_length,
+            do_sample=do_sample,
+            top_p=top_p,
+            temperature=temperature,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+
+        outputs = self._model.generate(**kwargs)
+        outputs = outputs[:, kwargs["input_ids"].shape[1] :]
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        if tools:
+            return self._process_response(response, history, tools, end=True)
+        else:
+            return self._process_response(response, history, tools)
+
     def chat(
         self,
         prompt: str,
@@ -247,7 +472,13 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
-        if stream and not tools:
+        if stream and (
+            not tools or self.model_family.model_name in GLM4_TOOL_CALL_FAMILY
+        ):
+            if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
+                stream_chat = self.stream_chat
+            else:
+                stream_chat = self._model.stream_chat
 
             def _stream_generator():
                 last_chunk_text_length = 0
@@ -256,9 +487,14 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 inputs = self._tokenizer([prompt], return_tensors="pt")
                 inputs = inputs.to(self._model.device)
                 prompt_tokens = len(inputs["input_ids"][0])
-                for chunk_text, _ in self._model.stream_chat(
+                for chunk_text, _ in stream_chat(
                     self._tokenizer, prompt, chat_history, **kwargs
                 ):
+                    if tools and isinstance(chunk_text, dict):
+                        yield self._tool_calls_completion_chunk(
+                            self.model_family, self.model_uid, [chunk_text, _], tools
+                        )
+                        return
                     completion_tokens = completion_tokens + 1
                     total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
@@ -312,7 +548,12 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
             return self._to_chat_completion_chunks(_stream_generator())
         else:
-            response = self._model.chat(self._tokenizer, prompt, chat_history, **kwargs)
+            if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
+                chat = self.non_stream_chat
+            else:
+                chat = self._model.chat
+
+            response = chat(self._tokenizer, prompt, chat_history, **kwargs)
             if tools:
                 return self._tool_calls_completion(
                     self.model_family, self.model_uid, response, tools
                 )
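
Taken together, the chatglm.py changes let GLM-4 family models combine stream=True with tools: chat() now routes GLM-4 models through the new stream_chat/non_stream_chat helpers, and a completed tool call is emitted as a single chat.completion.chunk carrying tool_calls. A minimal usage sketch via the Python client (endpoint, engine name and tool schema are illustrative assumptions, not taken from this diff):

# A hedged sketch: exercising the new GLM-4 tool-call streaming path.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(model_name="glm4-chat", model_engine="Transformers")
model = client.get_model(uid)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# Before 0.13.4 the PyTorch chatglm backend streamed only when no tools were
# given; now GLM-4 models stream and finish with one chunk containing tool_calls.
for chunk in model.chat(
    "What is the weather like in Beijing?",
    generate_config={"stream": True, "tools": tools},
):
    print(chunk["choices"][0]["delta"])
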

xinference/model/llm/pytorch/cogvlm2.py
@@ -387,7 +387,7 @@ class CogVLM2Model(PytorchChatModel):
            prompt, system_prompt=system_prompt, chat_history=chat_history
        )
 
-        input_by_model: dict = self._model.build_conversation_input_ids(
+        input_by_model: dict = self._model.build_conversation_input_ids(  # type: ignore
            self._tokenizer,
            query=query,
            history=history,

xinference/model/llm/pytorch/deepseek_vl.py
@@ -52,7 +52,8 @@ class DeepSeekVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if "deepseek" in model_family.model_name:
+        llm_family = model_family.model_family or model_family.model_name
+        if "deepseek-vl" in llm_family:
             return True
         return False
 

xinference/model/llm/pytorch/falcon.py
@@ -71,7 +71,8 @@ class FalconPytorchModel(PytorchModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "falcon" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "falcon" not in model_family:
             return False
         if "generate" not in llm_family.model_ability:
             return False

xinference/model/llm/pytorch/llama_2.py
@@ -55,7 +55,8 @@ class LlamaPytorchModel(PytorchModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "llama-2" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "llama-2" not in model_family:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -99,7 +100,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "llama-2" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "llama-2" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False

xinference/model/llm/pytorch/omnilmm.py
@@ -44,7 +44,8 @@ class OmniLMMModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if "OmniLMM" in model_family.model_name:
+        llm_family = model_family.model_family or model_family.model_name
+        if "OmniLMM" in llm_family:
             return True
         return False
 

xinference/model/llm/pytorch/qwen_vl.py
@@ -52,7 +52,8 @@ class QwenVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if "qwen" in model_family.model_name and "vision" in model_family.model_ability:
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen" in llm_family and "vision" in model_family.model_ability:
             return True
         return False
 

xinference/model/llm/pytorch/vicuna.py
@@ -61,7 +61,8 @@ class VicunaPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "vicuna" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "vicuna" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False

xinference/model/llm/pytorch/yi_vl.py
@@ -51,7 +51,8 @@ class YiVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if "yi" in model_family.model_name:
+        llm_family = model_family.model_family or model_family.model_name
+        if "yi-vl" in llm_family:
             return True
         return False
 

xinference/model/llm/sglang/core.py
@@ -17,7 +17,6 @@ import time
 import uuid
 from typing import AsyncGenerator, Dict, List, Optional, TypedDict, Union
 
-from ....constants import XINFERENCE_ENABLE_SGLANG
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -63,15 +62,26 @@ try:
 except ImportError:
     SGLANG_INSTALLED = False
 
-SGLANG_SUPPORTED_MODELS = ["llama-2", "mistral-v0.1", "mixtral-v0.1"]
+SGLANG_SUPPORTED_MODELS = [
+    "llama-2",
+    "llama-3",
+    "llama-3.1",
+    "mistral-v0.1",
+    "mixtral-v0.1",
+]
 SGLANG_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
+    "llama-3-instruct",
+    "llama-3.1-instruct",
     "qwen-chat",
     "qwen1.5-chat",
+    "qwen2-instruct",
+    "qwen2-moe-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "gemma-it",
+    "gemma-2-it",
 ]
 
 
@@ -168,8 +178,6 @@ class SGLANGModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if not XINFERENCE_ENABLE_SGLANG:
-            return False
         if not cls._has_cuda_device():
             return False
         if not cls._is_linux():
@@ -332,8 +340,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if not XINFERENCE_ENABLE_SGLANG:
-            return False
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_spec.model_format == "pytorch":

xinference/model/llm/utils.py
@@ -483,11 +483,40 @@ Begin!"""
                 else:
                     ret += role
             return ret
+        elif prompt_style.style_name == "mistral-nemo":
+            seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
+            ret = "<s>"
+            for i, message in enumerate(chat_history):
+                role = get_role(message["role"])
+                content = message["content"]
+                if content:
+                    if i == len(chat_history) - 2 and prompt_style.system_prompt:
+                        ret += (
+                            role
+                            + " "
+                            + prompt_style.system_prompt
+                            + "\n\n"
+                            + content
+                            + seps[i % 2]
+                        )
+                    else:
+                        ret += role + " " + content + seps[i % 2]
+                else:
+                    ret += role
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
     @classmethod
     def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
+        choices = chunk.get("choices")
+        if (
+            chunk.get("object") == "chat.completion.chunk"
+            and choices
+            and "delta" in choices[0]
+        ):
+            # Already a ChatCompletionChunk, we don't need to convert chunk.
+            return cast(ChatCompletionChunk, chunk)
         chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
@@ -497,7 +526,7 @@ Begin!"""
                 {
                     "index": i,
                     "delta": {
-                        "content": choice["text"],
+                        "content": choice.get("text"),
                         **(
                             {"tool_calls": choice["tool_calls"]}
                             if "tool_calls" in choice
@@ -718,6 +747,54 @@ Begin!"""
         else:
             return lambda tokens, delta: delta
 
+    @classmethod
+    def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
+        if func:
+            d = {
+                "role": "assistant",
+                "content": content,
+                "tool_calls": [
+                    {
+                        "id": f"call_{_id}",
+                        "type": "function",
+                        "function": {
+                            "name": func,
+                            "arguments": json.dumps(args),
+                        },
+                    }
+                ],
+            }
+            finish_reason = "tool_calls"
+        else:
+            d = {"role": "assistant", "content": content, "tool_calls": []}
+            finish_reason = "stop"
+        try:
+            usage = c.get("usage")
+            assert "prompt_tokens" in usage
+        except Exception:
+            usage = {
+                "prompt_tokens": -1,
+                "completion_tokens": -1,
+                "total_tokens": -1,
+            }
+        return {
+            "id": "chat" + f"cmpl-{_id}",
+            "model": model_uid,
+            "object": "chat.completion.chunk",
+            "created": int(time.time()),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": d,
+                    "logprobs": None,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": usage,
+        }
+
     @classmethod
     def _tool_calls_completion(cls, model_family, model_uid, c, tools):
         _id = str(uuid.uuid4())

xinference/model/llm/vllm/core.py
@@ -28,7 +28,6 @@ from typing import (
     Union,
 )
 
-from ....constants import XINFERENCE_DISABLE_VLLM
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -151,6 +150,15 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
+    VLLM_SUPPORTED_CHAT_MODELS.append("gemma-2-it")
+    VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("mistral-large-instruct")
+
+if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
+    VLLM_SUPPORTED_MODELS.append("llama-3.1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -288,8 +296,6 @@ class VLLMModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if XINFERENCE_DISABLE_VLLM:
-            return False
         if not cls._has_cuda_device():
             return False
         if not cls._is_linux():
@@ -514,8 +520,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match(
        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if XINFERENCE_DISABLE_VLLM:
-            return False
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_spec.model_format == "pytorch":

xinference/model/rerank/core.py
@@ -107,7 +107,7 @@ class RerankModel:
         self,
         model_spec: RerankModelSpec,
         model_uid: str,
-        model_path: str,
+        model_path: Optional[str] = None,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
@@ -290,6 +290,7 @@ def create_rerank_model_instance(
     model_uid: str,
     model_name: str,
     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[RerankModel, RerankModelDescription]:
     from ..utils import download_from_modelscope
@@ -321,8 +322,8 @@ def create_rerank_model_instance(
             f"Huggingface: {BUILTIN_RERANK_MODELS.keys()}"
             f"ModelScope: {MODELSCOPE_RERANK_MODELS.keys()}"
         )
-
-    model_path = cache(model_spec)
+    if not model_path:
+        model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
     model = RerankModel(
         model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
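
The rerank changes make model_path optional at construction time and thread it through create_rerank_model_instance, so a locally downloaded rerank model can be served without going through the cache/download path. A minimal sketch, assuming the RESTful API/client changes in this release forward model_path through launch_model kwargs (path and model name are illustrative):

# A hedged sketch: serving a rerank model from a local directory.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="bge-reranker-base",
    model_type="rerank",
    model_path="/data/models/bge-reranker-base",  # skips cache(model_spec)
)
rerank = client.get_model(uid)
result = rerank.rerank(
    documents=["Gestation lasts about 11 months in whales.", "Apples are red."],
    query="How long is a whale pregnant?",
)
print(result["results"][0])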