PyPI - xinference - Versions diffs - 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (258) hide show

xinference/model/llm/llm_family_modelscope.json CHANGED Viewed

@@ -957,7 +957,7 @@
     "model_ability": [
       "generate"
     ],
-    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time.",
+    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
     "model_specs": [
       {
         "model_format": "pytorch",
@@ -1024,6 +1024,55 @@
       }
     ]
   },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "Yi-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-34B-Chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -1329,6 +1378,18 @@
         "model_id": "qwen/Qwen-7B-Chat",
         "model_revision": "v1.1.7"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "qwen/Qwen-72B-Chat",
+        "model_revision": "master"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 14,
@@ -1360,6 +1421,17 @@
         "model_id": "qwen/Qwen-14B-Chat-{quantization}",
         "model_hub": "modelscope",
         "model_revision": "v1.0.7"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen-72B-Chat-{quantization}",
+        "model_hub": "modelscope",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -1371,7 +1443,14 @@
       ],
       "intra_message_sep": "\n",
       "stop_token_ids": [
-        151643
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   }

xinference/model/llm/pytorch/chatglm.py CHANGED Viewed

@@ -11,9 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
+import time
+import uuid
+from typing import Iterator, List, Optional, Union
-from typing import Optional
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    PytorchGenerateConfig,
+)
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
@@ -71,3 +79,88 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         if "chat" not in llm_family.model_ability:
             return False
         return True
+    @staticmethod
+    def _handle_tools(generate_config) -> Optional[dict]:
+        """Convert openai tools to ChatGLM tools."""
+        if generate_config is None:
+            return None
+        tools = generate_config.pop("tools", None)
+        if tools is None:
+            return None
+        chatglm_tools = []
+        for elem in tools:
+            if elem.get("type") != "function" or "function" not in elem:
+                raise ValueError("ChatGLM tools only support function type.")
+            chatglm_tools.append(elem["function"])
+        return {
+            "role": "system",
+            "content": f"Answer the following questions as best as you can. You have access to the following tools:",
+            "tools": chatglm_tools,
+        }
+    @staticmethod
+    def _tool_calls_completion(msg, model_name) -> ChatCompletion:
+        _id = str(uuid.uuid4())
+        return {
+            "id": "chat" + f"cmpl-{_id}",
+            "model": model_name,
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": f"call_{_id}",
+                                "type": "function",
+                                "function": {
+                                    "name": msg["name"],
+                                    "arguments": json.dumps(msg["parameters"]),
+                                },
+                            }
+                        ],
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": -1,
+                "completion_tokens": -1,
+                "total_tokens": -1,
+            },
+        }
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        tools = self._handle_tools(generate_config)
+        if tools:
+            # Tool calls only works for non stream, so we call chat directly.
+            kwargs = {}
+            generate_config = generate_config or {}
+            temperature = generate_config.get("temperature")
+            if temperature is not None:
+                kwargs["temperature"] = float(temperature)
+            top_p = generate_config.get("top_p")
+            if top_p is not None:
+                kwargs["top_p"] = float(top_p)
+            max_length = generate_config.get("max_tokens")
+            if max_length is not None:
+                kwargs["max_length"] = int(max_length)
+            msg = self._model.chat(self._tokenizer, prompt, [tools], **kwargs)
+            return self._tool_calls_completion(msg[0], self.model_uid)
+        else:
+            return super().chat(
+                prompt=prompt,
+                system_prompt=system_prompt,
+                chat_history=chat_history,
+                generate_config=generate_config,
+            )

xinference/model/llm/utils.py CHANGED Viewed

@@ -122,19 +122,20 @@ class ChatModelMixin:
                     ret += role + "："
             return ret
         elif prompt_style.style_name == "CHATGLM3":
-            ret = (
-                f"<|system|> \n {prompt_style.system_prompt}"
+            prompts = (
+                [f"<|system|>\n{prompt_style.system_prompt}"]
                 if prompt_style.system_prompt
-                else ""
+                else []
             )
             for i, message in enumerate(chat_history):
                 role = message["role"]
                 content = message["content"]
                 if content:
-                    ret += f"<|{role}|> \n {content}"
+                    prompts.append(f"<|{role}|>\n{content}")
                 else:
-                    ret += f"<|{role}|>"
-            return ret
+                    prompts.append(f"<|{role}|>")
+            return "\n".join(prompts)
         elif prompt_style.style_name == "XVERSE":
             ret = (
                 f"<|system|> \n {prompt_style.system_prompt}"
@@ -184,11 +185,14 @@ class ChatModelMixin:
                     ret += "<s>"
                 role = message["role"]
                 content = message["content"]
-                ret += role + ":" + content + seps[i % 2]
+                ret += role + ":" + str(content) + seps[i % 2]
             if len(ret) == 0:
                 ret += "<s>"
             ret += (
-                chat_history[-2]["role"] + ":" + chat_history[-2]["content"] + seps[0]
+                chat_history[-2]["role"]
+                + ":"
+                + str(chat_history[-2]["content"])
+                + seps[0]
             )
             ret += chat_history[-1]["role"] + ":"
             return ret

xinference/model/llm/vllm/core.py CHANGED Viewed

@@ -17,6 +17,7 @@ import time
 import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, TypedDict, Union
+from ....constants import XINFERENCE_DISABLE_VLLM
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -44,6 +45,7 @@ class VLLMModelConfig(TypedDict, total=False):
     gpu_memory_utilization: float
     max_num_batched_tokens: int
     max_num_seqs: int
+    quantization: Optional[str]
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -54,6 +56,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
     temperature: float
     top_p: float
     max_tokens: int
+    stop_token_ids: Optional[List[int]]
     stop: Optional[Union[str, List[str]]]
     stream: bool  # non-sampling param, should not be passed to the engine.
@@ -65,7 +68,7 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
-VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k"]
+VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k", "mistral-v0.1"]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
@@ -74,6 +77,10 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "qwen-chat",
+    "Yi",
+    "mistral-instruct-v0.1",
+    "chatglm3",
 ]
@@ -127,6 +134,7 @@ class VLLMModel(LLM):
         model_config.setdefault("swap_space", 4)
         model_config.setdefault("gpu_memory_utilization", 0.90)
         model_config.setdefault("max_num_seqs", 256)
+        model_config.setdefault("quantization", None)
         return model_config
@@ -150,6 +158,9 @@ class VLLMModel(LLM):
         sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
         sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 16))
         sanitized.setdefault("stop", generate_config.get("stop", None))
+        sanitized.setdefault(
+            "stop_token_ids", generate_config.get("stop_token_ids", None)
+        )
         sanitized.setdefault("stream", generate_config.get("stream", None))
         return sanitized
@@ -158,6 +169,8 @@ class VLLMModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if XINFERENCE_DISABLE_VLLM:
+            return False
         if not cls._has_cuda_device():
             return False
         if not cls._is_linux():
@@ -287,6 +300,8 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if XINFERENCE_DISABLE_VLLM:
+            return False
         if quantization != "none":
             return False
         if llm_spec.model_format != "pytorch":
@@ -303,10 +318,16 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
-        if self.model_family.prompt_style and self.model_family.prompt_style.stop:
-            generate_config.setdefault(
-                "stop", self.model_family.prompt_style.stop.copy()
-            )
+        if self.model_family.prompt_style:
+            if (
+                not generate_config.get("stop")
+            ) and self.model_family.prompt_style.stop:
+                generate_config["stop"] = self.model_family.prompt_style.stop.copy()
+            if self.model_family.prompt_style.stop_token_ids:
+                generate_config.setdefault(
+                    "stop_token_ids",
+                    self.model_family.prompt_style.stop_token_ids.copy(),
+                )
         return generate_config
     async def async_chat(

xinference/model/utils.py CHANGED Viewed

@@ -211,3 +211,28 @@ def copy_from_src_to_dst(
             )
             if attempt + 1 == max_attempt:
                 raise
+def patch_trust_remote_code():
+    """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
+    models will fail to load, e.g. jina-embeddings-v2-base-en
+    :return:
+    """
+    try:
+        from transformers.dynamic_module_utils import resolve_trust_remote_code
+    except ImportError:
+        logger.error("Patch transformers trust_remote_code failed.")
+    else:
+        def _patched_resolve_trust_remote_code(*args, **kwargs):
+            logger.info("Patched resolve_trust_remote_code: %s %s", args, kwargs)
+            return True
+        if (
+            resolve_trust_remote_code.__code__
+            != _patched_resolve_trust_remote_code.__code__
+        ):
+            resolve_trust_remote_code.__code__ = (
+                _patched_resolve_trust_remote_code.__code__
+            )

xinference/types.py CHANGED Viewed

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, ForwardRef, Iterable, List, Optional, Union
 from pydantic import (
     BaseModel,
@@ -121,8 +121,9 @@ class Completion(TypedDict):
 class ChatCompletionMessage(TypedDict):
     role: str
-    content: str
+    content: Optional[str]
     user: NotRequired[str]
+    tool_calls: NotRequired[List]
 class ChatCompletionChoice(TypedDict):
@@ -288,13 +289,33 @@ def get_pydantic_model_from_method(
         model.__fields__.pop(key)
     if exclude_fields is not None:
         for key in exclude_fields:
-            model.__fields__.pop(key)
+            model.__fields__.pop(key, None)
     if include_fields is not None:
         dummy_model = create_model("DummyModel", **include_fields)
         model.__fields__.update(dummy_model.__fields__)
     return model
+def fix_forward_ref(model):
+    """
+    pydantic in Python 3.8 generates ForwardRef field, we replace them
+    by the Optional[Any]
+    """
+    exclude_fields = []
+    include_fields = {}
+    for key, field in model.__fields__.items():
+        if isinstance(field.annotation, ForwardRef):
+            exclude_fields.append(key)
+            include_fields[key] = (Optional[Any], None)
+    if exclude_fields:
+        for key in exclude_fields:
+            model.__fields__.pop(key, None)
+    if include_fields:
+        dummy_model = create_model("DummyModel", **include_fields)
+        model.__fields__.update(dummy_model.__fields__)
+    return model
 class ModelAndPrompt(BaseModel):
     model: str
     prompt: str
@@ -318,7 +339,9 @@ try:
     from llama_cpp import Llama
     CreateCompletionLlamaCpp = get_pydantic_model_from_method(
-        Llama.create_completion, exclude_fields=["model", "prompt"]
+        Llama.create_completion,
+        exclude_fields=["model", "prompt", "grammar"],
+        include_fields={"grammar": (Optional[Any], None)},
     )
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")
@@ -330,7 +353,7 @@ try:
     CreateCompletionCTransformers = get_pydantic_model_from_method(
         LLM.generate,
         exclude_fields=["tokens"],
-        include_fields={"max_tokens": (int, max_tokens_field)},
+        include_fields={"max_tokens": (Optional[int], max_tokens_field)},
     )
 except ImportError:
     CreateCompletionCTransformers = create_model("CreateCompletionCTransformers")
@@ -370,6 +393,7 @@ try:
     CreateCompletionOpenAI = create_model_from_typeddict(
         CompletionCreateParamsNonStreaming,
     )
+    CreateCompletionOpenAI = fix_forward_ref(CreateCompletionOpenAI)
 except ImportError:
     # TODO(codingl2k1): Remove it if openai < 1 is dropped.
     CreateCompletionOpenAI = _CreateCompletionOpenAIFallback
@@ -383,3 +407,38 @@ class CreateCompletion(
     CreateCompletionOpenAI,
 ):
     pass
+class CreateChatModel(BaseModel):
+    model: str
+# Currently, chat calls generates, so the params share the same one.
+CreateChatCompletionTorch = CreateCompletionTorch
+CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp
+CreateChatCompletionCTransformers: BaseModel = CreateCompletionCTransformers
+# This type is for openai API compatibility
+CreateChatCompletionOpenAI: BaseModel
+# Only support openai > 1
+from openai.types.chat.completion_create_params import (
+    CompletionCreateParamsNonStreaming,
+)
+CreateChatCompletionOpenAI = create_model_from_typeddict(
+    CompletionCreateParamsNonStreaming,
+)
+CreateChatCompletionOpenAI = fix_forward_ref(CreateChatCompletionOpenAI)
+class CreateChatCompletion(
+    CreateChatModel,
+    CreateChatCompletionTorch,
+    CreateChatCompletionLlamaCpp,
+    CreateChatCompletionCTransformers,
+    CreateChatCompletionOpenAI,
+):
+    pass

xinference/utils.py ADDED Viewed

@@ -0,0 +1,20 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+def cuda_count():
+    # even if install torch cpu, this interface would return 0.
+    return torch.cuda.device_count()

xinference/web/ui/build/asset-manifest.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.8ae3b6d9.js",
+    "main.js": "./static/js/main.8126d441.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.8ae3b6d9.js.map": "./static/js/main.8ae3b6d9.js.map"
+    "main.8126d441.js.map": "./static/js/main.8126d441.js.map"
   },
   "entrypoints": [
-    "static/js/main.8ae3b6d9.js"
+    "static/js/main.8126d441.js"
   ]
 }

xinference/web/ui/build/index.html CHANGED Viewed

	@@ -1 +1 @@
1	- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8ae3b6d9.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
1	+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8126d441.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>

xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

Potentially problematic release.

xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl